LLVM 23.0.0git
AMDGPULegalizerInfo.cpp
Go to the documentation of this file.
1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
22#include "SIInstrInfo.h"
24#include "SIRegisterInfo.h"
26#include "llvm/ADT/ScopeExit.h"
37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
39
40#define DEBUG_TYPE "amdgpu-legalinfo"
41
42using namespace llvm;
43using namespace LegalizeActions;
44using namespace LegalizeMutations;
45using namespace LegalityPredicates;
46using namespace MIPatternMatch;
47
48// Hack until load/store selection patterns support any tuple of legal types.
50 "amdgpu-global-isel-new-legality",
51 cl::desc("Use GlobalISel desired legality, rather than try to use"
52 "rules compatible with selection patterns"),
53 cl::init(false),
55
56static constexpr unsigned MaxRegisterSize = 1024;
57
58// Round the number of elements to the next power of two elements
60 unsigned NElts = Ty.getNumElements();
61 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
62 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
63}
64
65// Round the number of bits to the next power of two bits
67 unsigned Bits = Ty.getSizeInBits();
68 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
69 return LLT::scalar(Pow2Bits);
70}
71
72/// \returns true if this is an odd sized vector which should widen by adding an
73/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
74/// excludes s1 vectors, which should always be scalarized.
75static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 if (!Ty.isVector())
79 return false;
80
81 const LLT EltTy = Ty.getElementType();
82 const unsigned EltSize = EltTy.getSizeInBits();
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
86 };
87}
88
89static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
90 return [=](const LegalityQuery &Query) {
91 const LLT Ty = Query.Types[TypeIdx];
92 return Ty.getSizeInBits() % 32 == 0;
93 };
94}
95
96static LegalityPredicate isWideVec16(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
99 const LLT EltTy = Ty.getScalarType();
100 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
101 };
102}
103
104static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 return std::pair(TypeIdx,
109 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
110 };
111}
112
114 return [=](const LegalityQuery &Query) {
115 const LLT Ty = Query.Types[TypeIdx];
116 const LLT EltTy = Ty.getElementType();
117 unsigned Size = Ty.getSizeInBits();
118 unsigned Pieces = (Size + 63) / 64;
119 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
120 return std::pair(TypeIdx, LLT::scalarOrVector(
121 ElementCount::getFixed(NewNumElts), EltTy));
122 };
123}
124
125// Increase the number of vector elements to reach the next multiple of 32-bit
126// type.
127static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
128 return [=](const LegalityQuery &Query) {
129 const LLT Ty = Query.Types[TypeIdx];
130
131 const LLT EltTy = Ty.getElementType();
132 const int Size = Ty.getSizeInBits();
133 const int EltSize = EltTy.getSizeInBits();
134 const int NextMul32 = (Size + 31) / 32;
135
136 assert(EltSize < 32);
137
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
139 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
140 };
141}
142
143// Retrieves the scalar type that's the same size as the mem desc
145 return [=](const LegalityQuery &Query) {
146 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
147 return std::make_pair(TypeIdx, LLT::scalar(MemSize));
148 };
149}
150
151// Increase the number of vector elements to reach the next legal RegClass.
153 return [=](const LegalityQuery &Query) {
154 const LLT Ty = Query.Types[TypeIdx];
155 const unsigned NumElts = Ty.getNumElements();
156 const unsigned EltSize = Ty.getElementType().getSizeInBits();
157 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
158
159 assert(EltSize == 32 || EltSize == 64);
160 assert(Ty.getSizeInBits() < MaxRegisterSize);
161
162 unsigned NewNumElts;
163 // Find the nearest legal RegClass that is larger than the current type.
164 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
165 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
166 break;
167 }
168 return std::pair(TypeIdx,
169 LLT::fixed_vector(NewNumElts, Ty.getElementType()));
170 };
171}
172
174 if (!Ty.isVector())
175 return LLT::scalar(128);
176 const ElementCount NumElems = Ty.getElementCount();
177 return LLT::vector(NumElems, LLT::scalar(128));
178}
179
181 if (!Ty.isVector())
182 return LLT::fixed_vector(4, LLT::scalar(32));
183 const unsigned NumElems = Ty.getElementCount().getFixedValue();
184 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
185}
186
188 const unsigned Size = Ty.getSizeInBits();
189
190 if (Size <= 32) {
191 // <2 x s8> -> s16
192 // <4 x s8> -> s32
193 return LLT::scalar(Size);
194 }
195
197}
198
199static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
200 return [=](const LegalityQuery &Query) {
201 const LLT Ty = Query.Types[TypeIdx];
202 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
203 };
204}
205
207 return [=](const LegalityQuery &Query) {
208 const LLT Ty = Query.Types[TypeIdx];
209 unsigned Size = Ty.getSizeInBits();
210 assert(Size % 32 == 0);
211 return std::pair(
213 };
214}
215
216static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
217 return [=](const LegalityQuery &Query) {
218 const LLT QueryTy = Query.Types[TypeIdx];
219 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
220 };
221}
222
223static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
224 return [=](const LegalityQuery &Query) {
225 const LLT QueryTy = Query.Types[TypeIdx];
226 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
227 };
228}
229
230static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
231 return [=](const LegalityQuery &Query) {
232 const LLT QueryTy = Query.Types[TypeIdx];
233 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
234 };
235}
236
237static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
238 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
240}
241
243 const int EltSize = EltTy.getSizeInBits();
244 return EltSize == 16 || EltSize % 32 == 0;
245}
246
247static bool isRegisterVectorType(LLT Ty) {
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
252}
253
254// TODO: replace all uses of isRegisterType with isRegisterClassType
255static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
256 if (!isRegisterSize(ST, Ty.getSizeInBits()))
257 return false;
258
259 if (Ty.isVector())
260 return isRegisterVectorType(Ty);
261
262 return true;
263}
264
265// Any combination of 32 or 64-bit elements up to the maximum register size,
266// and multiples of v2s16.
268 unsigned TypeIdx) {
269 return [=, &ST](const LegalityQuery &Query) {
270 return isRegisterType(ST, Query.Types[TypeIdx]);
271 };
272}
273
274// RegisterType that doesn't have a corresponding RegClass.
275// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
276// should be removed.
278 unsigned TypeIdx) {
279 return [=, &ST](const LegalityQuery &Query) {
280 LLT Ty = Query.Types[TypeIdx];
281 return isRegisterType(ST, Ty) &&
282 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
283 };
284}
285
286static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
287 return [=](const LegalityQuery &Query) {
288 const LLT QueryTy = Query.Types[TypeIdx];
289 if (!QueryTy.isVector())
290 return false;
291 const LLT EltTy = QueryTy.getElementType();
292 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
293 };
294}
295
296constexpr LLT S1 = LLT::scalar(1);
297constexpr LLT S8 = LLT::scalar(8);
298constexpr LLT S16 = LLT::scalar(16);
299constexpr LLT S32 = LLT::scalar(32);
300constexpr LLT F32 = LLT::scalar(32); // TODO: Expected float32
301constexpr LLT S64 = LLT::scalar(64);
302constexpr LLT F64 = LLT::scalar(64); // TODO: Expected float64
303constexpr LLT S96 = LLT::scalar(96);
304constexpr LLT S128 = LLT::scalar(128);
305constexpr LLT S160 = LLT::scalar(160);
306constexpr LLT S192 = LLT::scalar(192);
307constexpr LLT S224 = LLT::scalar(224);
308constexpr LLT S256 = LLT::scalar(256);
309constexpr LLT S512 = LLT::scalar(512);
310constexpr LLT S1024 = LLT::scalar(1024);
312
313constexpr LLT V2S8 = LLT::fixed_vector(2, 8);
314constexpr LLT V2S16 = LLT::fixed_vector(2, 16);
315constexpr LLT V4S16 = LLT::fixed_vector(4, 16);
316constexpr LLT V6S16 = LLT::fixed_vector(6, 16);
317constexpr LLT V8S16 = LLT::fixed_vector(8, 16);
318constexpr LLT V10S16 = LLT::fixed_vector(10, 16);
319constexpr LLT V12S16 = LLT::fixed_vector(12, 16);
320constexpr LLT V16S16 = LLT::fixed_vector(16, 16);
321
322// TODO: Expected LLT::fixed_vector(2, LLT::float16())
324constexpr LLT V2BF16 = V2F16; // FIXME
325
326constexpr LLT V2S32 = LLT::fixed_vector(2, 32);
327constexpr LLT V3S32 = LLT::fixed_vector(3, 32);
328constexpr LLT V4S32 = LLT::fixed_vector(4, 32);
329constexpr LLT V5S32 = LLT::fixed_vector(5, 32);
330constexpr LLT V6S32 = LLT::fixed_vector(6, 32);
331constexpr LLT V7S32 = LLT::fixed_vector(7, 32);
332constexpr LLT V8S32 = LLT::fixed_vector(8, 32);
333constexpr LLT V9S32 = LLT::fixed_vector(9, 32);
334constexpr LLT V10S32 = LLT::fixed_vector(10, 32);
335constexpr LLT V11S32 = LLT::fixed_vector(11, 32);
336constexpr LLT V12S32 = LLT::fixed_vector(12, 32);
337constexpr LLT V16S32 = LLT::fixed_vector(16, 32);
338constexpr LLT V32S32 = LLT::fixed_vector(32, 32);
339
340constexpr LLT V2S64 = LLT::fixed_vector(2, 64);
341constexpr LLT V3S64 = LLT::fixed_vector(3, 64);
342constexpr LLT V4S64 = LLT::fixed_vector(4, 64);
343constexpr LLT V5S64 = LLT::fixed_vector(5, 64);
344constexpr LLT V6S64 = LLT::fixed_vector(6, 64);
345constexpr LLT V7S64 = LLT::fixed_vector(7, 64);
346constexpr LLT V8S64 = LLT::fixed_vector(8, 64);
347constexpr LLT V16S64 = LLT::fixed_vector(16, 64);
348
349constexpr LLT V2S128 = LLT::fixed_vector(2, 128);
350constexpr LLT V4S128 = LLT::fixed_vector(4, 128);
351
352constexpr std::initializer_list<LLT> AllScalarTypes = {
354
355constexpr std::initializer_list<LLT> AllS16Vectors{
357
358constexpr std::initializer_list<LLT> AllS32Vectors = {
361
362constexpr std::initializer_list<LLT> AllS64Vectors = {
364
370
371// Checks whether a type is in the list of legal register types.
372static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
373 if (Ty.isPointerOrPointerVector())
374 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
375
378 (ST.useRealTrue16Insts() && Ty == S16) ||
380}
381
383 unsigned TypeIdx) {
384 return [&ST, TypeIdx](const LegalityQuery &Query) {
385 return isRegisterClassType(ST, Query.Types[TypeIdx]);
386 };
387}
388
389// If we have a truncating store or an extending load with a data size larger
390// than 32-bits, we need to reduce to a 32-bit type.
392 return [=](const LegalityQuery &Query) {
393 const LLT Ty = Query.Types[TypeIdx];
394 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
395 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
396 };
397}
398
399// If we have a truncating store or an extending load with a data size larger
400// than 32-bits and mem location is a power of 2
402 return [=](const LegalityQuery &Query) {
403 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
404 return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
405 isPowerOf2_64(MemSize);
406 };
407}
408
409// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
410// handle some operations by just promoting the register during
411// selection. There are also d16 loads on GFX9+ which preserve the high bits.
412static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
413 bool IsLoad, bool IsAtomic) {
414 switch (AS) {
416 // FIXME: Private element size.
417 return ST.hasFlatScratchEnabled() ? 128 : 32;
419 return ST.useDS128() ? 128 : 64;
424 // Treat constant and global as identical. SMRD loads are sometimes usable for
425 // global loads (ideally constant address space should be eliminated)
426 // depending on the context. Legality cannot be context dependent, but
427 // RegBankSelect can split the load as necessary depending on the pointer
428 // register bank/uniformity and if the memory is invariant or not written in a
429 // kernel.
430 return IsLoad ? 512 : 128;
431 default:
432 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
433 // if they may alias scratch depending on the subtarget. This needs to be
434 // moved to custom handling to use addressMayBeAccessedAsPrivate
435 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
436 }
437}
438
439static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
440 const LegalityQuery &Query) {
441 const LLT Ty = Query.Types[0];
442
443 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
444 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
445
446 unsigned RegSize = Ty.getSizeInBits();
447 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
448 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
449 unsigned AS = Query.Types[1].getAddressSpace();
450
451 // All of these need to be custom lowered to cast the pointer operand.
453 return false;
454
455 // Do not handle extending vector loads.
456 if (Ty.isVector() && MemSize != RegSize)
457 return false;
458
459 // TODO: We should be able to widen loads if the alignment is high enough, but
460 // we also need to modify the memory access size.
461#if 0
462 // Accept widening loads based on alignment.
463 if (IsLoad && MemSize < Size)
464 MemSize = std::max(MemSize, Align);
465#endif
466
467 // Only 1-byte and 2-byte to 32-bit extloads are valid.
468 if (MemSize != RegSize && RegSize != 32)
469 return false;
470
471 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
472 Query.MMODescrs[0].Ordering !=
474 return false;
475
476 switch (MemSize) {
477 case 8:
478 case 16:
479 case 32:
480 case 64:
481 case 128:
482 break;
483 case 96:
484 if (!ST.hasDwordx3LoadStores())
485 return false;
486 break;
487 case 256:
488 case 512:
489 // These may contextually need to be broken down.
490 break;
491 default:
492 return false;
493 }
494
495 assert(RegSize >= MemSize);
496
497 if (AlignBits < MemSize) {
498 const SITargetLowering *TLI = ST.getTargetLowering();
499 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
500 Align(AlignBits / 8)))
501 return false;
502 }
503
504 return true;
505}
506
507// The newer buffer intrinsic forms take their resource arguments as
508// pointers in address space 8, aka s128 values. However, in order to not break
509// SelectionDAG, the underlying operations have to continue to take v4i32
510// arguments. Therefore, we convert resource pointers - or vectors of them
511// to integer values here.
512static bool hasBufferRsrcWorkaround(const LLT Ty) {
513 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
514 return true;
515 if (Ty.isVector()) {
516 const LLT ElemTy = Ty.getElementType();
517 return hasBufferRsrcWorkaround(ElemTy);
518 }
519 return false;
520}
521
522// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
523// work around this. Eventually it should ignore the type for loads and only
524// care about the size. Return true in cases where we will work around this for
525// now by bitcasting.
526static bool loadStoreBitcastWorkaround(const LLT Ty) {
528 return false;
529
530 const unsigned Size = Ty.getSizeInBits();
531 if (Ty.isPointerVector())
532 return true;
533 if (Size <= 64)
534 return false;
535 // Address space 8 pointers get their own workaround.
537 return false;
538 if (!Ty.isVector())
539 return true;
540
541 unsigned EltSize = Ty.getScalarSizeInBits();
542 return EltSize != 32 && EltSize != 64;
543}
544
545static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
546 const LLT Ty = Query.Types[0];
547 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
549}
550
551/// Return true if a load or store of the type should be lowered with a bitcast
552/// to a different type.
553static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
554 const LLT MemTy) {
555 const unsigned MemSizeInBits = MemTy.getSizeInBits();
556 const unsigned Size = Ty.getSizeInBits();
557 if (Size != MemSizeInBits)
558 return Size <= 32 && Ty.isVector();
559
561 return true;
562
563 // Don't try to handle bitcasting vector ext loads for now.
564 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
565 (Size <= 32 || isRegisterSize(ST, Size)) &&
566 !isRegisterVectorElementType(Ty.getElementType());
567}
568
569/// Return true if we should legalize a load by widening an odd sized memory
570/// access up to the alignment. Note this is the case where the memory access
571/// itself changes, not the size of the result register.
572static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
573 uint64_t AlignInBits, unsigned AddrSpace,
574 unsigned Opcode) {
575 unsigned SizeInBits = MemoryTy.getSizeInBits();
576 // We don't want to widen cases that are naturally legal.
577 if (isPowerOf2_32(SizeInBits))
578 return false;
579
580 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
581 // end up widening these for a scalar load during RegBankSelect, if we don't
582 // have 96-bit scalar loads.
583 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
584 return false;
585
586 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
587 return false;
588
589 // A load is known dereferenceable up to the alignment, so it's legal to widen
590 // to it.
591 //
592 // TODO: Could check dereferenceable for less aligned cases.
593 unsigned RoundedSize = NextPowerOf2(SizeInBits);
594 if (AlignInBits < RoundedSize)
595 return false;
596
597 // Do not widen if it would introduce a slow unaligned load.
598 const SITargetLowering *TLI = ST.getTargetLowering();
599 unsigned Fast = 0;
601 RoundedSize, AddrSpace, Align(AlignInBits / 8),
603 Fast;
604}
605
606static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
607 unsigned Opcode) {
608 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
609 return false;
610
611 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
612 Query.MMODescrs[0].AlignInBits,
613 Query.Types[1].getAddressSpace(), Opcode);
614}
615
616/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
617/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
618/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
620 MachineRegisterInfo &MRI, unsigned Idx) {
621 MachineOperand &MO = MI.getOperand(Idx);
622
623 const LLT PointerTy = MRI.getType(MO.getReg());
624
625 // Paranoidly prevent us from doing this multiple times.
627 return PointerTy;
628
629 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
630 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
631 if (!PointerTy.isVector()) {
632 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
633 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
634 const LLT S32 = LLT::scalar(32);
635
636 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
637 std::array<Register, 4> VectorElems;
638 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
639 for (unsigned I = 0; I < NumParts; ++I)
640 VectorElems[I] =
641 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
642 B.buildMergeValues(MO, VectorElems);
643 MO.setReg(VectorReg);
644 return VectorTy;
645 }
646 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
647 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
648 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
649 B.buildIntToPtr(MO, Scalar);
650 MO.setReg(BitcastReg);
651
652 return VectorTy;
653}
654
655/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
656/// the form in which the value must be in order to be passed to the low-level
657/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
658/// needed in order to account for the fact that we can't define a register
659/// class for s128 without breaking SelectionDAG.
661 MachineRegisterInfo &MRI = *B.getMRI();
662 const LLT PointerTy = MRI.getType(Pointer);
663 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
664 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
665
666 if (!PointerTy.isVector()) {
667 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
668 SmallVector<Register, 4> PointerParts;
669 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
670 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
671 for (unsigned I = 0; I < NumParts; ++I)
672 PointerParts.push_back(Unmerged.getReg(I));
673 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
674 }
675 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
676 return B.buildBitcast(VectorTy, Scalar).getReg(0);
677}
678
680 unsigned Idx) {
681 MachineOperand &MO = MI.getOperand(Idx);
682
683 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
684 // Paranoidly prevent us from doing this multiple times.
686 return;
688}
689
691 const GCNTargetMachine &TM)
692 : ST(ST_) {
693 using namespace TargetOpcode;
694
695 auto GetAddrSpacePtr = [&TM](unsigned AS) {
696 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
697 };
698
699 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
700 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
701 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
702 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
703 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
704 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
705 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
706 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
707 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
708 const LLT BufferStridedPtr =
709 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
710
711 const LLT CodePtr = FlatPtr;
712
713 const std::initializer_list<LLT> AddrSpaces64 = {
714 GlobalPtr, ConstantPtr, FlatPtr
715 };
716
717 const std::initializer_list<LLT> AddrSpaces32 = {
718 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
719 };
720
721 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
722
723 const std::initializer_list<LLT> FPTypesBase = {
724 S32, S64
725 };
726
727 const std::initializer_list<LLT> FPTypes16 = {
728 S32, S64, S16
729 };
730
731 const std::initializer_list<LLT> FPTypesPK16 = {
732 S32, S64, S16, V2S16
733 };
734
735 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
736
738
739 // s1 for VCC branches, s32 for SCC branches.
741
742 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
743 // elements for v3s16
746 .legalFor(AllS32Vectors)
748 .legalFor(AddrSpaces64)
749 .legalFor(AddrSpaces32)
750 .legalFor(AddrSpaces128)
751 .legalIf(isPointer(0))
752 .clampScalar(0, S16, S256)
754 .clampMaxNumElements(0, S32, 16)
756 .scalarize(0);
757
758 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
759 // Full set of gfx9 features.
760 if (ST.hasScalarAddSub64()) {
761 getActionDefinitionsBuilder({G_ADD, G_SUB})
762 .legalFor({S64, S32, S16, V2S16})
763 .clampMaxNumElementsStrict(0, S16, 2)
764 .scalarize(0)
765 .minScalar(0, S16)
767 .maxScalar(0, S32);
768 } else {
769 getActionDefinitionsBuilder({G_ADD, G_SUB})
770 .legalFor({S32, S16, V2S16})
771 .clampMaxNumElementsStrict(0, S16, 2)
772 .scalarize(0)
773 .minScalar(0, S16)
775 .maxScalar(0, S32);
776 }
777
778 if (ST.hasScalarSMulU64()) {
780 .legalFor({S64, S32, S16, V2S16})
781 .clampMaxNumElementsStrict(0, S16, 2)
782 .scalarize(0)
783 .minScalar(0, S16)
785 .custom();
786 } else {
788 .legalFor({S32, S16, V2S16})
789 .clampMaxNumElementsStrict(0, S16, 2)
790 .scalarize(0)
791 .minScalar(0, S16)
793 .custom();
794 }
795 assert(ST.hasMad64_32());
796
797 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
798 .legalFor({S32, S16, V2S16}) // Clamp modifier
799 .minScalarOrElt(0, S16)
801 .scalarize(0)
803 .lower();
804 } else if (ST.has16BitInsts()) {
805 getActionDefinitionsBuilder({G_ADD, G_SUB})
806 .legalFor({S32, S16})
807 .minScalar(0, S16)
809 .maxScalar(0, S32)
810 .scalarize(0);
811
813 .legalFor({S32, S16})
814 .scalarize(0)
815 .minScalar(0, S16)
817 .custom();
818 assert(ST.hasMad64_32());
819
820 // Technically the saturating operations require clamp bit support, but this
821 // was introduced at the same time as 16-bit operations.
822 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
823 .legalFor({S32, S16}) // Clamp modifier
824 .minScalar(0, S16)
825 .scalarize(0)
827 .lower();
828
829 // We're just lowering this, but it helps get a better result to try to
830 // coerce to the desired type first.
831 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
832 .minScalar(0, S16)
833 .scalarize(0)
834 .lower();
835 } else {
836 getActionDefinitionsBuilder({G_ADD, G_SUB})
837 .legalFor({S32})
838 .widenScalarToNextMultipleOf(0, 32)
839 .clampScalar(0, S32, S32)
840 .scalarize(0);
841
842 auto &Mul = getActionDefinitionsBuilder(G_MUL)
843 .legalFor({S32})
844 .scalarize(0)
845 .minScalar(0, S32)
847
848 if (ST.hasMad64_32())
849 Mul.custom();
850 else
851 Mul.maxScalar(0, S32);
852
853 if (ST.hasIntClamp()) {
854 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
855 .legalFor({S32}) // Clamp modifier.
856 .scalarize(0)
858 .lower();
859 } else {
860 // Clamp bit support was added in VI, along with 16-bit operations.
861 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
862 .minScalar(0, S32)
863 .scalarize(0)
864 .lower();
865 }
866
867 // FIXME: DAG expansion gets better results. The widening uses the smaller
868 // range values and goes for the min/max lowering directly.
869 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
870 .minScalar(0, S32)
871 .scalarize(0)
872 .lower();
873 }
874
876 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
877 .customFor({S32, S64})
878 .clampScalar(0, S32, S64)
880 .scalarize(0);
881
882 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
883 .legalFor({S32})
884 .maxScalar(0, S32);
885
886 if (ST.hasVOP3PInsts()) {
887 Mulh
888 .clampMaxNumElements(0, S8, 2)
889 .lowerFor({V2S8});
890 }
891
892 Mulh
893 .scalarize(0)
894 .lower();
895
896 // Report legal for any types we can handle anywhere. For the cases only legal
897 // on the SALU, RegBankSelect will be able to re-legalize.
898 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
899 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
900 .clampScalar(0, S32, S64)
906 .scalarize(0);
907
909 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
910 .legalFor({{S32, S1}, {S32, S32}})
911 .clampScalar(0, S32, S32)
912 .scalarize(0);
913
915 // Don't worry about the size constraint.
917 .lower();
918
920 .legalFor({S1, S32, S64, S16, GlobalPtr,
921 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
922 .legalIf(isPointer(0))
923 .clampScalar(0, S32, S64)
925
926 getActionDefinitionsBuilder(G_FCONSTANT)
927 .legalFor({S32, S64, S16})
928 .clampScalar(0, S16, S64);
929
930 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
931 .legalIf(isRegisterClassType(ST, 0))
932 // s1 and s16 are special cases because they have legal operations on
933 // them, but don't really occupy registers in the normal way.
934 .legalFor({S1, S16})
935 .clampNumElements(0, V16S32, V32S32)
939 .clampMaxNumElements(0, S32, 16);
940
941 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
942
943 // If the amount is divergent, we have to do a wave reduction to get the
944 // maximum value, so this is expanded during RegBankSelect.
945 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
946 .legalFor({{PrivatePtr, S32}});
947
948 getActionDefinitionsBuilder(G_STACKSAVE)
949 .customFor({PrivatePtr});
950 getActionDefinitionsBuilder(G_STACKRESTORE)
951 .legalFor({PrivatePtr});
952
953 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
954
955 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
956 .customIf(typeIsNot(0, PrivatePtr));
957
958 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
959
960 auto &FPOpActions = getActionDefinitionsBuilder(
961 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
962 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
963 .legalFor({S32, S64});
964 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
965 .customFor({S32, S64});
966 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
967 .customFor({S32, S64});
968
969 if (ST.has16BitInsts()) {
970 if (ST.hasVOP3PInsts())
971 FPOpActions.legalFor({S16, V2S16});
972 else
973 FPOpActions.legalFor({S16});
974
975 TrigActions.customFor({S16});
976 FDIVActions.customFor({S16});
977 }
978
979 if (ST.hasPackedFP32Ops()) {
980 FPOpActions.legalFor({V2S32});
981 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
982 }
983
984 auto &MinNumMaxNumIeee =
985 getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
986
987 if (ST.hasVOP3PInsts()) {
988 MinNumMaxNumIeee.legalFor(FPTypesPK16)
989 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
990 .clampMaxNumElements(0, S16, 2)
991 .clampScalar(0, S16, S64)
992 .scalarize(0);
993 } else if (ST.has16BitInsts()) {
994 MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
995 } else {
996 MinNumMaxNumIeee.legalFor(FPTypesBase)
997 .clampScalar(0, S32, S64)
998 .scalarize(0);
999 }
1000
1001 auto &MinNumMaxNum = getActionDefinitionsBuilder(
1002 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1003
1004 if (ST.hasVOP3PInsts()) {
1005 MinNumMaxNum.customFor(FPTypesPK16)
1006 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1007 .clampMaxNumElements(0, S16, 2)
1008 .clampScalar(0, S16, S64)
1009 .scalarize(0);
1010 } else if (ST.has16BitInsts()) {
1011 MinNumMaxNum.customFor(FPTypes16)
1012 .clampScalar(0, S16, S64)
1013 .scalarize(0);
1014 } else {
1015 MinNumMaxNum.customFor(FPTypesBase)
1016 .clampScalar(0, S32, S64)
1017 .scalarize(0);
1018 }
1019
1020 if (ST.hasVOP3PInsts())
1021 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
1022
1023 FPOpActions
1024 .scalarize(0)
1025 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1026
1027 TrigActions
1028 .scalarize(0)
1029 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1030
1031 FDIVActions
1032 .scalarize(0)
1033 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1034
1035 auto &FNegAbs = getActionDefinitionsBuilder({G_FNEG, G_FABS});
1036 FNegAbs.legalFor(FPTypesPK16)
1037 .legalFor(ST.hasPackedFP32Ops(), {V2S32})
1039 if (ST.hasPackedFP32Ops())
1040 FNegAbs.clampMaxNumElementsStrict(0, S32, 2);
1041 FNegAbs.scalarize(0).clampScalar(0, S16, S64);
1042
1043 if (ST.has16BitInsts()) {
1045 .legalFor({S16})
1046 .customFor({S32, S64})
1047 .scalarize(0)
1048 .unsupported();
1050 .legalFor({S32, S64, S16})
1051 .scalarize(0)
1052 .clampScalar(0, S16, S64);
1053
1054 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1055 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
1056 .scalarize(0)
1057 .maxScalarIf(typeIs(0, S16), 1, S16)
1058 .clampScalar(1, S32, S32)
1059 .lower();
1060
1062 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1063 .scalarize(0)
1064 .lower();
1065
1067 .lowerFor({S16, S32, S64})
1068 .scalarize(0)
1069 .lower();
1070 } else {
1072 .customFor({S32, S64, S16})
1073 .scalarize(0)
1074 .unsupported();
1075
1076
1077 if (ST.hasFractBug()) {
1079 .customFor({S64})
1080 .legalFor({S32, S64})
1081 .scalarize(0)
1082 .clampScalar(0, S32, S64);
1083 } else {
1085 .legalFor({S32, S64})
1086 .scalarize(0)
1087 .clampScalar(0, S32, S64);
1088 }
1089
1090 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1091 .legalFor({{S32, S32}, {S64, S32}})
1092 .scalarize(0)
1093 .clampScalar(0, S32, S64)
1094 .clampScalar(1, S32, S32)
1095 .lower();
1096
1098 .customFor({{S32, S32}, {S64, S32}})
1099 .scalarize(0)
1100 .minScalar(0, S32)
1101 .clampScalar(1, S32, S32)
1102 .lower();
1103
1105 .lowerFor({S32, S64})
1106 .scalarize(0)
1107 .lower();
1108 }
1109
1110 auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1111 if (ST.hasCvtPkF16F32Inst()) {
1112 FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1113 .clampMaxNumElements(0, S16, 2);
1114 } else {
1115 FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
1116 }
1117 FPTruncActions.scalarize(0).lower();
1118
1120 .legalFor({{S64, S32}, {S32, S16}})
1121 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1122 .scalarize(0);
1123
1124 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1125 if (ST.has16BitInsts()) {
1126 FSubActions
1127 // Use actual fsub instruction
1128 .legalFor({S32, S16})
1129 // Must use fadd + fneg
1130 .lowerFor({S64, V2S16});
1131 } else {
1132 FSubActions
1133 // Use actual fsub instruction
1134 .legalFor({S32})
1135 // Must use fadd + fneg
1136 .lowerFor({S64, S16, V2S16});
1137 }
1138
1139 if (ST.hasPackedFP32Ops())
1140 FSubActions.lowerFor({V2S32}).clampMaxNumElements(0, S32, 2);
1141
1142 FSubActions
1143 .clampMaxNumElements(0, S16, 2)
1144 .scalarize(0)
1145 .clampScalar(0, S32, S64);
1146
1147 // Whether this is legal depends on the floating point mode for the function.
1148 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1149 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1150 FMad.customFor({S32, S16});
1151 else if (ST.hasMadMacF32Insts())
1152 FMad.customFor({S32});
1153 else if (ST.hasMadF16())
1154 FMad.customFor({S16});
1155 FMad.scalarize(0)
1156 .lower();
1157
1158 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1159 if (ST.has16BitInsts()) {
1160 FRem.customFor({S16, S32, S64});
1161 } else {
1162 FRem.minScalar(0, S32)
1163 .customFor({S32, S64});
1164 }
1165 FRem.scalarize(0);
1166
1167 // TODO: Do we need to clamp maximum bitwidth?
1169 .legalIf(isScalar(0))
1170 .legalFor({{V2S16, V2S32}})
1171 .clampMaxNumElements(0, S16, 2)
1172 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1173 // situations (like an invalid implicit use), we don't want to infinite loop
1174 // in the legalizer.
1176 .alwaysLegal();
1177
1178 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1179 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1180 {S32, S1}, {S64, S1}, {S16, S1}})
1181 .scalarize(0)
1182 .clampScalar(0, S32, S64)
1183 .widenScalarToNextPow2(1, 32);
1184
1185 // TODO: Split s1->s64 during regbankselect for VALU.
1186 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1187 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1188 .lowerIf(typeIs(1, S1))
1189 .customFor({{S32, S64}, {S64, S64}});
1190 if (ST.has16BitInsts())
1191 IToFP.legalFor({{S16, S16}});
1192 IToFP.clampScalar(1, S32, S64)
1193 .minScalar(0, S32)
1194 .scalarize(0)
1196
1197 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1198 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1199 .customFor({{S64, S32}, {S64, S64}})
1200 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1201 if (ST.has16BitInsts())
1202 FPToI.legalFor({{S16, S16}});
1203 else
1204 FPToI.minScalar(1, S32);
1205
1206 FPToI.minScalar(0, S32)
1207 .widenScalarToNextPow2(0, 32)
1208 .scalarize(0)
1209 .lower();
1210
1211 // clang-format off
1212 auto &FPToISat = getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
1213 .legalFor({{S32, S32}, {S32, S64}})
1214 .legalFor(ST.has16BitInsts(),{{S16, S16}})
1215 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1216
1217 // If available, widen width <16 to i16, instead of i32 so v_cvt_i16/u16_f16 can be used.
1218 if (ST.has16BitInsts())
1219 FPToISat.minScalarIf(typeIs(1, S16), 0, S16);
1220
1221 FPToISat.minScalar(1, S32);
1222 FPToISat.minScalar(0, S32)
1223 .widenScalarToNextPow2(0, 32)
1224 .scalarize(0)
1225 .lower();
1226 // clang-format on
1227
1228 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1229 .clampScalar(0, S16, S64)
1230 .scalarize(0)
1231 .lower();
1232
1233 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1234 .legalFor({S16, S32})
1235 .scalarize(0)
1236 .lower();
1237
1238 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1239 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1240 .scalarize(0)
1241 .lower();
1242
1243 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1244 .clampScalar(0, S16, S64)
1245 .scalarize(0)
1246 .lower();
1247
1248 if (ST.has16BitInsts()) {
1249 getActionDefinitionsBuilder(
1250 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1251 .legalFor({S16, S32, S64})
1252 .clampScalar(0, S16, S64)
1253 .scalarize(0);
1254 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1255 getActionDefinitionsBuilder(
1256 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1257 .legalFor({S32, S64})
1258 .clampScalar(0, S32, S64)
1259 .scalarize(0);
1260 } else {
1261 getActionDefinitionsBuilder(
1262 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1263 .legalFor({S32})
1264 .customFor({S64})
1265 .clampScalar(0, S32, S64)
1266 .scalarize(0);
1267 }
1268
1269 getActionDefinitionsBuilder(G_PTR_ADD)
1270 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1271 .legalIf(all(isPointer(0), sameSize(0, 1)))
1272 .scalarize(0)
1273 .scalarSameSizeAs(1, 0);
1274
1275 getActionDefinitionsBuilder(G_PTRMASK)
1276 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1277 .scalarSameSizeAs(1, 0)
1278 .scalarize(0);
1279
1280 auto &CmpBuilder =
1281 getActionDefinitionsBuilder(G_ICMP)
1282 // The compare output type differs based on the register bank of the output,
1283 // so make both s1 and s32 legal.
1284 //
1285 // Scalar compares producing output in scc will be promoted to s32, as that
1286 // is the allocatable register type that will be needed for the copy from
1287 // scc. This will be promoted during RegBankSelect, and we assume something
1288 // before that won't try to use s32 result types.
1289 //
1290 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1291 // bank.
1293 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1294 .legalForCartesianProduct(
1295 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1296 if (ST.has16BitInsts()) {
1297 CmpBuilder.legalFor({{S1, S16}});
1298 }
1299
1300 CmpBuilder
1302 .clampScalar(1, S32, S64)
1303 .scalarize(0)
1304 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1305
1306 auto &FCmpBuilder =
1307 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1308 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1309
1310 if (ST.hasSALUFloatInsts())
1311 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1312
1313 FCmpBuilder
1315 .clampScalar(1, S32, S64)
1316 .scalarize(0);
1317
1318 // FIXME: fpow has a selection pattern that should move to custom lowering.
1319 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1320 if (ST.has16BitInsts())
1321 ExpOps.customFor({{S32}, {S16}});
1322 else
1323 ExpOps.customFor({S32});
1324 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1325 .scalarize(0);
1326
1327 getActionDefinitionsBuilder(G_FPOWI)
1328 .clampScalar(0, MinScalarFPTy, S32)
1329 .lower();
1330
1331 getActionDefinitionsBuilder(G_FLOG2)
1332 .legalFor(ST.has16BitInsts(), {S16})
1333 .customFor({S32, S16})
1334 .scalarize(0)
1335 .lower();
1336
1337 getActionDefinitionsBuilder(G_FEXP2)
1338 .legalFor(ST.has16BitInsts(), {S16})
1339 .customFor({S32, S64, S16})
1340 .scalarize(0)
1341 .lower();
1342
1343 auto &LogOps =
1344 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1345 LogOps.customFor({S32, S16, S64});
1346 LogOps.clampScalar(0, MinScalarFPTy, S32)
1347 .scalarize(0);
1348
1349 // The 64-bit versions produce 32-bit results, but only on the SALU.
1350 getActionDefinitionsBuilder(G_CTPOP)
1351 .legalFor({{S32, S32}, {S32, S64}})
1352 .clampScalar(0, S32, S32)
1353 .widenScalarToNextPow2(1, 32)
1354 .clampScalar(1, S32, S64)
1355 .scalarize(0)
1356 .widenScalarToNextPow2(0, 32);
1357
1358 // If no 16 bit instr is available, lower into different instructions.
1359 if (ST.has16BitInsts())
1360 getActionDefinitionsBuilder(G_IS_FPCLASS)
1361 .legalForCartesianProduct({S1}, FPTypes16)
1362 .widenScalarToNextPow2(1)
1363 .scalarize(0)
1364 .lower();
1365 else
1366 getActionDefinitionsBuilder(G_IS_FPCLASS)
1367 .legalForCartesianProduct({S1}, FPTypesBase)
1368 .lowerFor({S1, S16})
1369 .widenScalarToNextPow2(1)
1370 .scalarize(0)
1371 .lower();
1372
1373 // The hardware instructions return a different result on 0 than the generic
1374 // instructions expect. The hardware produces -1, but these produce the
1375 // bitwidth.
1376 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1377 .scalarize(0)
1378 .clampScalar(0, S32, S32)
1379 .clampScalar(1, S32, S64)
1380 .widenScalarToNextPow2(0, 32)
1381 .widenScalarToNextPow2(1, 32)
1382 .custom();
1383
1384 // The 64-bit versions produce 32-bit results, but only on the SALU.
1385 getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON)
1386 .legalFor({{S32, S32}, {S32, S64}})
1387 .customIf(scalarNarrowerThan(1, 32))
1388 .clampScalar(0, S32, S32)
1389 .clampScalar(1, S32, S64)
1390 .scalarize(0)
1391 .widenScalarToNextPow2(0, 32)
1392 .widenScalarToNextPow2(1, 32);
1393
1394 getActionDefinitionsBuilder(G_CTTZ_ZERO_POISON)
1395 .legalFor({{S32, S32}, {S32, S64}})
1396 .clampScalar(0, S32, S32)
1397 .clampScalar(1, S32, S64)
1398 .scalarize(0)
1399 .widenScalarToNextPow2(0, 32)
1400 .widenScalarToNextPow2(1, 32);
1401
1402 getActionDefinitionsBuilder(G_CTLS)
1403 .customFor({{S32, S32}})
1404 .scalarize(0)
1405 .clampScalar(0, S32, S32)
1406 .clampScalar(1, S32, S32);
1407
1408 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1409 // RegBankSelect.
1410 getActionDefinitionsBuilder(G_BITREVERSE)
1411 .legalFor({S32, S64})
1412 .clampScalar(0, S32, S64)
1413 .scalarize(0)
1414 .widenScalarToNextPow2(0);
1415
1416 if (ST.has16BitInsts()) {
1417 getActionDefinitionsBuilder(G_BSWAP)
1418 .legalFor({S16, S32, V2S16})
1419 .clampMaxNumElementsStrict(0, S16, 2)
1420 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1421 // narrowScalar limitation.
1422 .widenScalarToNextPow2(0)
1423 .clampScalar(0, S16, S32)
1424 .scalarize(0);
1425
1426 if (ST.hasVOP3PInsts()) {
1427 getActionDefinitionsBuilder(G_ABS)
1428 .legalFor({S32, S16, V2S16})
1429 .clampMaxNumElements(0, S16, 2)
1430 .minScalar(0, S16)
1431 .widenScalarToNextPow2(0)
1432 .scalarize(0)
1433 .lower();
1434 if (ST.hasIntMinMax64()) {
1435 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1436 .legalFor({S32, S16, S64, V2S16})
1437 .clampMaxNumElements(0, S16, 2)
1438 .minScalar(0, S16)
1439 .widenScalarToNextPow2(0)
1440 .scalarize(0)
1441 .lower();
1442 } else {
1443 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1444 .legalFor({S32, S16, V2S16})
1445 .clampMaxNumElements(0, S16, 2)
1446 .minScalar(0, S16)
1447 .widenScalarToNextPow2(0)
1448 .scalarize(0)
1449 .lower();
1450 }
1451 } else {
1452 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1453 .legalFor({S32, S16})
1454 .widenScalarToNextPow2(0)
1455 .minScalar(0, S16)
1456 .scalarize(0)
1457 .lower();
1458 }
1459 } else {
1460 // TODO: Should have same legality without v_perm_b32
1461 getActionDefinitionsBuilder(G_BSWAP)
1462 .legalFor({S32})
1463 .lowerIf(scalarNarrowerThan(0, 32))
1464 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1465 // narrowScalar limitation.
1466 .widenScalarToNextPow2(0)
1467 .maxScalar(0, S32)
1468 .scalarize(0)
1469 .lower();
1470
1471 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1472 .legalFor({S32})
1473 .minScalar(0, S32)
1474 .widenScalarToNextPow2(0)
1475 .scalarize(0)
1476 .lower();
1477 }
1478
1479 getActionDefinitionsBuilder(G_INTTOPTR)
1480 // List the common cases
1481 .legalForCartesianProduct(AddrSpaces64, {S64})
1482 .legalForCartesianProduct(AddrSpaces32, {S32})
1483 .scalarize(0)
1484 // Accept any address space as long as the size matches
1485 .legalIf(sameSize(0, 1))
1486 .widenScalarIf(smallerThan(1, 0),
1487 [](const LegalityQuery &Query) {
1488 return std::pair(
1489 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1490 })
1491 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1492 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1493 });
1494
1495 getActionDefinitionsBuilder(G_PTRTOINT)
1496 // List the common cases
1497 .legalForCartesianProduct(AddrSpaces64, {S64})
1498 .legalForCartesianProduct(AddrSpaces32, {S32})
1499 .scalarize(0)
1500 // Accept any address space as long as the size matches
1501 .legalIf(sameSize(0, 1))
1502 .widenScalarIf(smallerThan(0, 1),
1503 [](const LegalityQuery &Query) {
1504 return std::pair(
1505 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1506 })
1507 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1508 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1509 });
1510
1511 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1512 .scalarize(0)
1513 .custom();
1514
1515 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1516 bool IsLoad) -> bool {
1517 const LLT DstTy = Query.Types[0];
1518
1519 // Split vector extloads.
1520 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1521
1522 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1523 return true;
1524
1525 const LLT PtrTy = Query.Types[1];
1526 unsigned AS = PtrTy.getAddressSpace();
1527 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1528 Query.MMODescrs[0].Ordering !=
1530 return true;
1531
1532 // Catch weird sized loads that don't evenly divide into the access sizes
1533 // TODO: May be able to widen depending on alignment etc.
1534 unsigned NumRegs = (MemSize + 31) / 32;
1535 if (NumRegs == 3) {
1536 if (!ST.hasDwordx3LoadStores())
1537 return true;
1538 } else {
1539 // If the alignment allows, these should have been widened.
1540 if (!isPowerOf2_32(NumRegs))
1541 return true;
1542 }
1543
1544 return false;
1545 };
1546
1547 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1548 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1549 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1550
1551 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1552 // LDS
1553 // TODO: Unsupported flat for SI.
1554
1555 for (unsigned Op : {G_LOAD, G_STORE}) {
1556 const bool IsStore = Op == G_STORE;
1557
1558 auto &Actions = getActionDefinitionsBuilder(Op);
1559 // Explicitly list some common cases.
1560 // TODO: Does this help compile time at all?
1561 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1562 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1563 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1564 {S64, GlobalPtr, S64, GlobalAlign32},
1565 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1566 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1567 {S32, GlobalPtr, S8, GlobalAlign8},
1568 {S32, GlobalPtr, S16, GlobalAlign16},
1569
1570 {S32, LocalPtr, S32, 32},
1571 {S64, LocalPtr, S64, 32},
1572 {V2S32, LocalPtr, V2S32, 32},
1573 {S32, LocalPtr, S8, 8},
1574 {S32, LocalPtr, S16, 16},
1575 {V2S16, LocalPtr, S32, 32},
1576
1577 {S32, PrivatePtr, S32, 32},
1578 {S32, PrivatePtr, S8, 8},
1579 {S32, PrivatePtr, S16, 16},
1580 {V2S16, PrivatePtr, S32, 32},
1581
1582 {S32, ConstantPtr, S32, GlobalAlign32},
1583 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1584 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1585 {S64, ConstantPtr, S64, GlobalAlign32},
1586 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1587
1588 Actions.legalForTypesWithMemDesc(ST.useRealTrue16Insts(), /* Pred */
1589 {{S16, GlobalPtr, S8, GlobalAlign8},
1590 {S16, GlobalPtr, S16, GlobalAlign16},
1591 {S16, LocalPtr, S8, 8},
1592 {S16, LocalPtr, S16, 16},
1593 {S16, PrivatePtr, S8, 8},
1594 {S16, PrivatePtr, S16, 16}});
1595
1596 Actions.legalIf(
1597 [=](const LegalityQuery &Query) -> bool {
1598 return isLoadStoreLegal(ST, Query);
1599 });
1600
1601 // The custom pointers (fat pointers, buffer resources) don't work with load
1602 // and store at this level. Fat pointers should have been lowered to
1603 // intrinsics before the translation to MIR.
1604 Actions.unsupportedIf(
1605 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1606
1607 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1608 // ptrtoint. This is needed to account for the fact that we can't have i128
1609 // as a register class for SelectionDAG reasons.
1610 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1611 return hasBufferRsrcWorkaround(Query.Types[0]);
1612 });
1613
1614 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1615 // 64-bits.
1616 //
1617 // TODO: Should generalize bitcast action into coerce, which will also cover
1618 // inserting addrspacecasts.
1619 Actions.customIf(typeIs(1, Constant32Ptr));
1620
1621 // Turn any illegal element vectors into something easier to deal
1622 // with. These will ultimately produce 32-bit scalar shifts to extract the
1623 // parts anyway.
1624 //
1625 // For odd 16-bit element vectors, prefer to split those into pieces with
1626 // 16-bit vector parts.
1627 Actions.bitcastIf(
1628 [=](const LegalityQuery &Query) -> bool {
1629 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1630 Query.MMODescrs[0].MemoryTy);
1631 }, bitcastToRegisterType(0));
1632
1633 if (!IsStore) {
1634 // Widen suitably aligned loads by loading extra bytes. The standard
1635 // legalization actions can't properly express widening memory operands.
1636 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1637 return shouldWidenLoad(ST, Query, G_LOAD);
1638 });
1639 }
1640
1641 // FIXME: load/store narrowing should be moved to lower action
1642 Actions
1643 .narrowScalarIf(
1644 [=](const LegalityQuery &Query) -> bool {
1645 return !Query.Types[0].isVector() &&
1646 needToSplitMemOp(Query, Op == G_LOAD);
1647 },
1648 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1649 const LLT DstTy = Query.Types[0];
1650 const LLT PtrTy = Query.Types[1];
1651
1652 const unsigned DstSize = DstTy.getSizeInBits();
1653 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1654
1655 // Split extloads.
1656 if (DstSize > MemSize)
1657 return std::pair(0, LLT::scalar(MemSize));
1658
1659 unsigned MaxSize = maxSizeForAddrSpace(
1660 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1661 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1662 if (MemSize > MaxSize)
1663 return std::pair(0, LLT::scalar(MaxSize));
1664
1665 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1666 return std::pair(0, LLT::scalar(Align));
1667 })
1668 .fewerElementsIf(
1669 [=](const LegalityQuery &Query) -> bool {
1670 return Query.Types[0].isVector() &&
1671 needToSplitMemOp(Query, Op == G_LOAD);
1672 },
1673 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1674 const LLT DstTy = Query.Types[0];
1675 const LLT PtrTy = Query.Types[1];
1676
1677 LLT EltTy = DstTy.getElementType();
1678 unsigned MaxSize = maxSizeForAddrSpace(
1679 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1680 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1681
1682 // FIXME: Handle widened to power of 2 results better. This ends
1683 // up scalarizing.
1684 // FIXME: 3 element stores scalarized on SI
1685
1686 // Split if it's too large for the address space.
1687 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1688 if (MemSize > MaxSize) {
1689 unsigned NumElts = DstTy.getNumElements();
1690 unsigned EltSize = EltTy.getSizeInBits();
1691
1692 if (MaxSize % EltSize == 0) {
1693 return std::pair(
1695 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1696 }
1697
1698 unsigned NumPieces = MemSize / MaxSize;
1699
1700 // FIXME: Refine when odd breakdowns handled
1701 // The scalars will need to be re-legalized.
1702 if (NumPieces == 1 || NumPieces >= NumElts ||
1703 NumElts % NumPieces != 0)
1704 return std::pair(0, EltTy);
1705
1706 return std::pair(0,
1707 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1708 }
1709
1710 // FIXME: We could probably handle weird extending loads better.
1711 if (DstTy.getSizeInBits() > MemSize)
1712 return std::pair(0, EltTy);
1713
1714 unsigned EltSize = EltTy.getSizeInBits();
1715 unsigned DstSize = DstTy.getSizeInBits();
1716 if (!isPowerOf2_32(DstSize)) {
1717 // We're probably decomposing an odd sized store. Try to split
1718 // to the widest type. TODO: Account for alignment. As-is it
1719 // should be OK, since the new parts will be further legalized.
1720 unsigned FloorSize = llvm::bit_floor(DstSize);
1721 return std::pair(
1723 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1724 }
1725
1726 // May need relegalization for the scalars.
1727 return std::pair(0, EltTy);
1728 })
1729 .minScalar(0, S32)
1730 .narrowScalarIf(isTruncStoreToSizePowerOf2(0),
1732 .widenScalarToNextPow2(0)
1733 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1734 .lower();
1735 }
1736
1737 // FIXME: Unaligned accesses not lowered.
1738 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1739 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1740 {S32, GlobalPtr, S16, 2 * 8},
1741 {S32, LocalPtr, S8, 8},
1742 {S32, LocalPtr, S16, 16},
1743 {S32, PrivatePtr, S8, 8},
1744 {S32, PrivatePtr, S16, 16},
1745 {S32, ConstantPtr, S8, 8},
1746 {S32, ConstantPtr, S16, 2 * 8}})
1747 .legalIf(
1748 [=](const LegalityQuery &Query) -> bool {
1749 return isLoadStoreLegal(ST, Query);
1750 });
1751
1752 if (ST.hasFlatAddressSpace()) {
1753 ExtLoads.legalForTypesWithMemDesc(
1754 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1755 }
1756
1757 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1758 // 64-bits.
1759 //
1760 // TODO: Should generalize bitcast action into coerce, which will also cover
1761 // inserting addrspacecasts.
1762 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1763
1764 ExtLoads.narrowScalarIf(
1765 [](const LegalityQuery &Query) {
1766 LLT MemTy = Query.MMODescrs[0].MemoryTy;
1767 return MemTy.isAnyScalar() && MemTy.getSizeInBits() > 32 &&
1768 Query.Types[0].getSizeInBits() > MemTy.getSizeInBits();
1769 }, // For large MemSize, narrowscalar to MemSize (load MemSize + ext)
1771 ExtLoads.clampScalar(0, S32, S32)
1772 .widenScalarToNextPow2(0)
1773 .lower();
1774
1775 auto &Atomics = getActionDefinitionsBuilder(
1776 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1777 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1778 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1779 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1780 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1781 {S64, GlobalPtr}, {S64, LocalPtr},
1782 {S32, RegionPtr}, {S64, RegionPtr}});
1783 if (ST.hasFlatAddressSpace()) {
1784 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1785 }
1786
1787 auto &Atomics32 =
1788 getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1789 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
1790 if (ST.hasFlatAddressSpace()) {
1791 Atomics32.legalFor({{S32, FlatPtr}});
1792 }
1793
1794 // TODO: v2bf16 operations, and fat buffer pointer support.
1795 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1796 if (ST.hasLDSFPAtomicAddF32()) {
1797 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1798 if (ST.hasLdsAtomicAddF64())
1799 Atomic.legalFor({{S64, LocalPtr}});
1800 if (ST.hasAtomicDsPkAdd16Insts())
1801 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1802 }
1803 if (ST.hasAtomicFaddInsts())
1804 Atomic.legalFor({{S32, GlobalPtr}});
1805 if (ST.hasFlatAtomicFaddF32Inst())
1806 Atomic.legalFor({{S32, FlatPtr}});
1807
1808 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1809 // These are legal with some caveats, and should have undergone expansion in
1810 // the IR in most situations
1811 // TODO: Move atomic expansion into legalizer
1812 Atomic.legalFor({
1813 {S32, GlobalPtr},
1814 {S64, GlobalPtr},
1815 {S64, FlatPtr}
1816 });
1817 }
1818
1819 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1820 ST.hasAtomicBufferGlobalPkAddF16Insts())
1821 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1822 if (ST.hasAtomicGlobalPkAddBF16Inst())
1823 Atomic.legalFor({{V2BF16, GlobalPtr}});
1824 if (ST.hasAtomicFlatPkAdd16Insts())
1825 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1826
1827
1828 // Most of the legalization work here is done by AtomicExpand. We could
1829 // probably use a simpler legality rule that just assumes anything is OK.
1830 auto &AtomicFMinFMax =
1831 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1832 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1833
1834 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1835 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1836 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1837 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1838 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1839 AtomicFMinFMax.legalFor({F32, FlatPtr});
1840 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1841 AtomicFMinFMax.legalFor({F64, FlatPtr});
1842
1843 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1844 // demarshalling
1845 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1846 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1847 {S32, FlatPtr}, {S64, FlatPtr}})
1848 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1849 {S32, RegionPtr}, {S64, RegionPtr}});
1850 // TODO: Pointer types, any 32-bit or 64-bit vector
1851
1852 // Condition should be s32 for scalar, s1 for vector.
1853 getActionDefinitionsBuilder(G_SELECT)
1854 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1855 LocalPtr, FlatPtr, PrivatePtr,
1856 LLT::fixed_vector(2, LocalPtr),
1857 LLT::fixed_vector(2, PrivatePtr)},
1858 {S1, S32})
1859 .clampScalar(0, S16, S64)
1860 .scalarize(1)
1861 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1862 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1863 .clampMaxNumElements(0, S32, 2)
1864 .clampMaxNumElements(0, LocalPtr, 2)
1865 .clampMaxNumElements(0, PrivatePtr, 2)
1866 .scalarize(0)
1867 .widenScalarToNextPow2(0)
1868 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1869
1870 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1871 // be more flexible with the shift amount type.
1872 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1873 .legalFor({{S32, S32}, {S64, S32}});
1874 if (ST.has16BitInsts()) {
1875 if (ST.hasVOP3PInsts()) {
1876 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1877 .clampMaxNumElements(0, S16, 2);
1878 } else
1879 Shifts.legalFor({{S16, S16}});
1880
1881 // TODO: Support 16-bit shift amounts for all types
1882 Shifts.widenScalarIf(
1883 [=](const LegalityQuery &Query) {
1884 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1885 // 32-bit amount.
1886 const LLT ValTy = Query.Types[0];
1887 const LLT AmountTy = Query.Types[1];
1888 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1889 AmountTy.getSizeInBits() < 16;
1890 }, changeTo(1, S16));
1891 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1892 Shifts.clampScalar(1, S32, S32);
1893 Shifts.widenScalarToNextPow2(0, 16);
1894 Shifts.clampScalar(0, S16, S64);
1895
1896 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1897 .minScalar(0, S16)
1898 .scalarize(0)
1899 .lower();
1900 } else {
1901 // Make sure we legalize the shift amount type first, as the general
1902 // expansion for the shifted type will produce much worse code if it hasn't
1903 // been truncated already.
1904 Shifts.clampScalar(1, S32, S32);
1905 Shifts.widenScalarToNextPow2(0, 32);
1906 Shifts.clampScalar(0, S32, S64);
1907
1908 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1909 .minScalar(0, S32)
1910 .scalarize(0)
1911 .lower();
1912 }
1913 Shifts.scalarize(0);
1914
1915 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1916 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1917 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1918 unsigned IdxTypeIdx = 2;
1919
1920 getActionDefinitionsBuilder(Op)
1921 .customIf([=](const LegalityQuery &Query) {
1922 const LLT EltTy = Query.Types[EltTypeIdx];
1923 const LLT VecTy = Query.Types[VecTypeIdx];
1924 const LLT IdxTy = Query.Types[IdxTypeIdx];
1925 const unsigned EltSize = EltTy.getSizeInBits();
1926 const bool isLegalVecType =
1928 // Address space 8 pointers are 128-bit wide values, but the logic
1929 // below will try to bitcast them to 2N x s64, which will fail.
1930 // Therefore, as an intermediate step, wrap extracts/insertions by
1931 // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
1932 // extraction result) in order to produce a vector operation that can
1933 // be handled by the logic below.
1934 if (EltTy.isPointer() && EltSize > 64)
1935 return true;
1936 return (EltSize == 32 || EltSize == 64) &&
1937 VecTy.getSizeInBits() % 32 == 0 &&
1938 VecTy.getSizeInBits() <= MaxRegisterSize &&
1939 IdxTy.getSizeInBits() == 32 &&
1940 isLegalVecType;
1941 })
1942 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1943 scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1944 bitcastToVectorElement32(VecTypeIdx))
1945 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1946 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1947 scalarOrEltWiderThan(VecTypeIdx, 64)),
1948 [=](const LegalityQuery &Query) {
1949 // For > 64-bit element types, try to turn this into a
1950 // 64-bit element vector since we may be able to do better
1951 // indexing if this is scalar. If not, fall back to 32.
1952 const LLT EltTy = Query.Types[EltTypeIdx];
1953 const LLT VecTy = Query.Types[VecTypeIdx];
1954 const unsigned DstEltSize = EltTy.getSizeInBits();
1955 const unsigned VecSize = VecTy.getSizeInBits();
1956
1957 const unsigned TargetEltSize =
1958 DstEltSize % 64 == 0 ? 64 : 32;
1959 return std::pair(VecTypeIdx,
1960 LLT::fixed_vector(VecSize / TargetEltSize,
1961 TargetEltSize));
1962 })
1963 .clampScalar(EltTypeIdx, S32, S64)
1964 .clampScalar(VecTypeIdx, S32, S64)
1965 .clampScalar(IdxTypeIdx, S32, S32)
1966 .clampMaxNumElements(VecTypeIdx, S32, 32)
1967 // TODO: Clamp elements for 64-bit vectors?
1968 .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx),
1970 // It should only be necessary with variable indexes.
1971 // As a last resort, lower to the stack
1972 .lower();
1973 }
1974
1975 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1976 .unsupportedIf([=](const LegalityQuery &Query) {
1977 const LLT &EltTy = Query.Types[1].getElementType();
1978 return Query.Types[0] != EltTy;
1979 });
1980
1981 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1982 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1983 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1984 getActionDefinitionsBuilder(Op)
1985 .widenScalarIf(
1986 [=](const LegalityQuery &Query) {
1987 const LLT BigTy = Query.Types[BigTyIdx];
1988 return (BigTy.getScalarSizeInBits() < 16);
1989 },
1991 .widenScalarIf(
1992 [=](const LegalityQuery &Query) {
1993 const LLT LitTy = Query.Types[LitTyIdx];
1994 return (LitTy.getScalarSizeInBits() < 16);
1995 },
1997 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1998 .widenScalarToNextPow2(BigTyIdx, 32)
1999 .customIf([=](const LegalityQuery &Query) {
2000 // Generic lower operates on the full-width value, producing
2001 // shift+trunc/mask sequences. For simple cases where extract/insert
2002 // values are 32-bit aligned, we can instead unmerge/merge and work on
2003 // the 32-bit components. However, we can't check the offset here so
2004 // custom lower function will have to call generic lowering if offset
2005 // is not 32-bit aligned.
2006 const LLT BigTy = Query.Types[BigTyIdx];
2007 const LLT LitTy = Query.Types[LitTyIdx];
2008 return !BigTy.isVector() && BigTy.getSizeInBits() % 32 == 0 &&
2009 LitTy.getSizeInBits() % 32 == 0;
2010 })
2011 .lower();
2012 }
2013
2014 auto &BuildVector =
2015 getActionDefinitionsBuilder(G_BUILD_VECTOR)
2016 .legalForCartesianProduct(AllS32Vectors, {S32})
2017 .legalForCartesianProduct(AllS64Vectors, {S64})
2018 .clampNumElements(0, V16S32, V32S32)
2019 .clampNumElements(0, V2S64, V16S64)
2020 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
2021 .moreElementsIf(isIllegalRegisterType(ST, 0),
2023
2024 if (ST.hasScalarPackInsts()) {
2025 BuildVector
2026 // FIXME: Should probably widen s1 vectors straight to s32
2027 .minScalarOrElt(0, S16)
2028 .minScalar(1, S16);
2029
2030 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2031 .legalFor({V2S16, S32})
2032 .lower();
2033 } else {
2034 BuildVector.customFor({V2S16, S16});
2035 BuildVector.minScalarOrElt(0, S32);
2036
2037 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2038 .customFor({V2S16, S32})
2039 .lower();
2040 }
2041
2042 BuildVector.legalIf(isRegisterType(ST, 0));
2043
2044 // FIXME: Clamp maximum size
2045 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
2046 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2047 .clampMaxNumElements(0, S32, 32)
2048 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
2049 .clampMaxNumElements(0, S16, 64);
2050
2051 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
2052
2053 // Merge/Unmerge
2054 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2055 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
2056 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
2057
2058 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
2059 const LLT Ty = Query.Types[TypeIdx];
2060 if (Ty.isVector()) {
2061 const LLT &EltTy = Ty.getElementType();
2062 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
2063 return true;
2065 return true;
2066 }
2067 return false;
2068 };
2069
2070 auto &Builder =
2071 getActionDefinitionsBuilder(Op)
2072 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2073 .lowerFor({{S16, V2S16}})
2074 .lowerIf([=](const LegalityQuery &Query) {
2075 const LLT BigTy = Query.Types[BigTyIdx];
2076 return BigTy.getSizeInBits() == 32;
2077 })
2078 // Try to widen to s16 first for small types.
2079 // TODO: Only do this on targets with legal s16 shifts
2080 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
2081 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
2082 .moreElementsIf(isSmallOddVector(BigTyIdx),
2083 oneMoreElement(BigTyIdx))
2084 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
2085 elementTypeIs(1, S16)),
2086 changeTo(1, V2S16))
2087 // Clamp the little scalar to s32-s512 and make it a power of 2. It's
2088 // not worth considering the multiples of 64 since 2*192 and 2*384
2089 // are not valid.
2090 .clampScalar(LitTyIdx, S32, S512)
2091 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
2092 // Break up vectors with weird elements into scalars
2093 .fewerElementsIf(
2094 [=](const LegalityQuery &Query) {
2095 return notValidElt(Query, LitTyIdx);
2096 },
2097 scalarize(0))
2098 .fewerElementsIf(
2099 [=](const LegalityQuery &Query) {
2100 return notValidElt(Query, BigTyIdx);
2101 },
2102 scalarize(1))
2103 .clampScalar(BigTyIdx, S32, MaxScalar);
2104
2105 if (Op == G_MERGE_VALUES) {
2106 Builder.widenScalarIf(
2107 // TODO: Use 16-bit shifts if legal for 8-bit values?
2108 [=](const LegalityQuery &Query) {
2109 const LLT Ty = Query.Types[LitTyIdx];
2110 return Ty.getSizeInBits() < 32;
2111 },
2112 changeTo(LitTyIdx, S32));
2113 }
2114
2115 Builder.widenScalarIf(
2116 [=](const LegalityQuery &Query) {
2117 const LLT Ty = Query.Types[BigTyIdx];
2118 return Ty.getSizeInBits() % 16 != 0;
2119 },
2120 [=](const LegalityQuery &Query) {
2121 // Pick the next power of 2, or a multiple of 64 over 128.
2122 // Whichever is smaller.
2123 const LLT &Ty = Query.Types[BigTyIdx];
2124 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
2125 if (NewSizeInBits >= 256) {
2126 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
2127 if (RoundedTo < NewSizeInBits)
2128 NewSizeInBits = RoundedTo;
2129 }
2130 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
2131 })
2132 // Any vectors left are the wrong size. Scalarize them.
2133 .scalarize(0)
2134 .scalarize(1);
2135 }
2136
2137 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2138 // RegBankSelect.
2139 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2140 .legalFor({{S32}, {S64}})
2141 .clampScalar(0, S32, S64);
2142
2143 if (ST.hasVOP3PInsts()) {
2144 SextInReg.lowerFor({{V2S16}})
2145 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2146 // get more vector shift opportunities, since we'll get those when
2147 // expanded.
2148 .clampMaxNumElementsStrict(0, S16, 2);
2149 } else if (ST.has16BitInsts()) {
2150 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2151 } else {
2152 // Prefer to promote to s32 before lowering if we don't have 16-bit
2153 // shifts. This avoids a lot of intermediate truncate and extend operations.
2154 SextInReg.lowerFor({{S32}, {S64}});
2155 }
2156
2157 SextInReg
2158 .scalarize(0)
2159 .clampScalar(0, S32, S64)
2160 .lower();
2161
2162 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2163 .scalarize(0)
2164 .lower();
2165
2166 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2167 FSHRActionDefs.legalFor({{S32, S32}})
2168 .clampMaxNumElementsStrict(0, S16, 2);
2169 if (ST.hasVOP3PInsts())
2170 FSHRActionDefs.lowerFor({{V2S16, V2S16}});
2171 FSHRActionDefs.scalarize(0).lower();
2172
2173 if (ST.hasVOP3PInsts()) {
2174 getActionDefinitionsBuilder(G_FSHL)
2175 .lowerFor({{V2S16, V2S16}})
2176 .clampMaxNumElementsStrict(0, S16, 2)
2177 .scalarize(0)
2178 .lower();
2179 } else {
2180 getActionDefinitionsBuilder(G_FSHL)
2181 .scalarize(0)
2182 .lower();
2183 }
2184
2185 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2186 .legalFor({S64});
2187
2188 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2189
2190 getActionDefinitionsBuilder(G_FENCE)
2191 .alwaysLegal();
2192
2193 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2194 .scalarize(0)
2195 .minScalar(0, S32)
2196 .lower();
2197
2198 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2199 .legalFor({{S32, S32}, {S64, S32}})
2200 .clampScalar(1, S32, S32)
2201 .clampScalar(0, S32, S64)
2202 .widenScalarToNextPow2(0)
2203 .scalarize(0);
2204
2205 getActionDefinitionsBuilder(
2206 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2207 G_FCOPYSIGN,
2208
2209 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2210 G_READ_REGISTER, G_WRITE_REGISTER,
2211
2212 G_SADDO, G_SSUBO})
2213 .lower();
2214
2215 if (ST.hasIEEEMinimumMaximumInsts()) {
2216 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2217 .legalFor(FPTypesPK16)
2218 .clampMaxNumElements(0, S16, 2)
2219 .scalarize(0);
2220 } else if (ST.hasVOP3PInsts()) {
2221 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2222 .lowerFor({V2S16})
2223 .clampMaxNumElementsStrict(0, S16, 2)
2224 .scalarize(0)
2225 .lower();
2226 } else {
2227 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2228 .scalarize(0)
2229 .clampScalar(0, S32, S64)
2230 .lower();
2231 }
2232
2233 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2234 .lower();
2235
2236 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2237
2238 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2239 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2240 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2241 .unsupported();
2242
2243 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2244
2245 getActionDefinitionsBuilder(
2246 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2247 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2248 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2249 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2250 .legalFor(AllVectors)
2251 .scalarize(1)
2252 .lower();
2253
2254 getActionDefinitionsBuilder({G_INTRINSIC, G_INTRINSIC_W_SIDE_EFFECTS,
2255 G_INTRINSIC_CONVERGENT,
2256 G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS})
2257 .alwaysLegal();
2258
2259 getLegacyLegalizerInfo().computeTables();
2260 verify(*ST.getInstrInfo());
2261}
2262
2265 LostDebugLocObserver &LocObserver) const {
// Dispatcher for instructions that need target-specific legalization: the
// opcode of MI selects one of the helpers below and that helper's boolean
// result is returned unchanged. Opcodes without a case return false.
// NOTE(review): the first lines of this signature are missing from this
// listing; `Helper` and `MI` are presumably its leading parameters.
2266 MachineIRBuilder &B = Helper.MIRBuilder;
2267 MachineRegisterInfo &MRI = *B.getMRI();
2268
2269 switch (MI.getOpcode()) {
2270 case TargetOpcode::G_ADDRSPACE_CAST:
2271 return legalizeAddrSpaceCast(MI, MRI, B);
2272 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2273 return legalizeFroundeven(MI, MRI, B);
2274 case TargetOpcode::G_FCEIL:
2275 return legalizeFceil(MI, MRI, B);
2276 case TargetOpcode::G_FREM:
2277 return legalizeFrem(MI, MRI, B);
2278 case TargetOpcode::G_INTRINSIC_TRUNC:
2279 return legalizeIntrinsicTrunc(MI, MRI, B);
// The int<->fp helpers are shared between the signed and unsigned opcodes;
// the trailing flag is true for the signed opcode.
2280 case TargetOpcode::G_SITOFP:
2281 return legalizeITOFP(MI, MRI, B, true);
2282 case TargetOpcode::G_UITOFP:
2283 return legalizeITOFP(MI, MRI, B, false);
2284 case TargetOpcode::G_FPTOSI:
2285 return legalizeFPTOI(MI, MRI, B, true);
2286 case TargetOpcode::G_FPTOUI:
2287 return legalizeFPTOI(MI, MRI, B, false);
2288 case TargetOpcode::G_FMINNUM:
2289 case TargetOpcode::G_FMAXNUM:
2290 case TargetOpcode::G_FMINIMUMNUM:
2291 case TargetOpcode::G_FMAXIMUMNUM:
2292 return legalizeMinNumMaxNum(Helper, MI);
2293 case TargetOpcode::G_EXTRACT:
2294 return legalizeExtract(Helper, MI);
2295 case TargetOpcode::G_INSERT:
2296 return legalizeInsert(Helper, MI);
2297 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2298 return legalizeExtractVectorElt(MI, MRI, B);
2299 case TargetOpcode::G_INSERT_VECTOR_ELT:
2300 return legalizeInsertVectorElt(MI, MRI, B);
2301 case TargetOpcode::G_FSIN:
2302 case TargetOpcode::G_FCOS:
2303 return legalizeSinCos(MI, MRI, B);
2304 case TargetOpcode::G_GLOBAL_VALUE:
2305 return legalizeGlobalValue(MI, MRI, B);
// All three load flavours share one helper.
2306 case TargetOpcode::G_LOAD:
2307 case TargetOpcode::G_SEXTLOAD:
2308 case TargetOpcode::G_ZEXTLOAD:
2309 return legalizeLoad(Helper, MI);
2310 case TargetOpcode::G_STORE:
2311 return legalizeStore(Helper, MI);
2312 case TargetOpcode::G_FMAD:
2313 return legalizeFMad(MI, MRI, B);
2314 case TargetOpcode::G_FDIV:
2315 return legalizeFDIV(MI, MRI, B);
2316 case TargetOpcode::G_FFREXP:
2317 return legalizeFFREXP(MI, MRI, B);
2318 case TargetOpcode::G_FSQRT:
2319 return legalizeFSQRT(MI, MRI, B);
2320 case TargetOpcode::G_UDIV:
2321 case TargetOpcode::G_UREM:
2322 case TargetOpcode::G_UDIVREM:
2323 return legalizeUnsignedDIV_REM(MI, MRI, B);
2324 case TargetOpcode::G_SDIV:
2325 case TargetOpcode::G_SREM:
2326 case TargetOpcode::G_SDIVREM:
2327 return legalizeSignedDIV_REM(MI, MRI, B);
2328 case TargetOpcode::G_ATOMIC_CMPXCHG:
2329 return legalizeAtomicCmpXChg(MI, MRI, B);
2330 case TargetOpcode::G_FLOG2:
2331 return legalizeFlog2(MI, B);
2332 case TargetOpcode::G_FLOG:
2333 case TargetOpcode::G_FLOG10:
2334 return legalizeFlogCommon(MI, B);
2335 case TargetOpcode::G_FEXP2:
2336 return legalizeFExp2(MI, B);
2337 case TargetOpcode::G_FEXP:
2338 case TargetOpcode::G_FEXP10:
2339 return legalizeFExp(MI, B);
2340 case TargetOpcode::G_FPOW:
2341 return legalizeFPow(MI, B);
2342 case TargetOpcode::G_FFLOOR:
2343 return legalizeFFloor(MI, MRI, B);
2344 case TargetOpcode::G_BUILD_VECTOR:
2345 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2346 return legalizeBuildVector(MI, MRI, B);
2347 case TargetOpcode::G_MUL:
2348 return legalizeMul(Helper, MI);
2349 case TargetOpcode::G_CTLZ:
2350 case TargetOpcode::G_CTTZ:
2351 return legalizeCTLZ_CTTZ(MI, MRI, B);
2352 case TargetOpcode::G_CTLS:
2353 return legalizeCTLS(MI, MRI, B);
2354 case TargetOpcode::G_CTLZ_ZERO_POISON:
2355 return legalizeCTLZ_ZERO_POISON(MI, MRI, B);
2356 case TargetOpcode::G_STACKSAVE:
2357 return legalizeStackSave(MI, B);
2358 case TargetOpcode::G_GET_FPENV:
2359 return legalizeGetFPEnv(MI, MRI, B);
2360 case TargetOpcode::G_SET_FPENV:
2361 return legalizeSetFPEnv(MI, MRI, B);
2362 case TargetOpcode::G_TRAP:
2363 return legalizeTrap(MI, MRI, B);
2364 case TargetOpcode::G_DEBUGTRAP:
2365 return legalizeDebugTrap(MI, MRI, B);
// No handler exists for this opcode: report failure to the caller.
2366 default:
2367 return false;
2368 }
2369
// Every arm of the switch returns, so this point is never reached.
2370 llvm_unreachable("expected switch to return");
2371}
2372
2374 unsigned AS,
2376 MachineIRBuilder &B) const {
// Emits code that yields an s32 register holding the aperture value for the
// segment selected by AS (LOCAL_ADDRESS vs. anything else, which the visible
// code treats as the private segment). Three sources are tried in order: the
// hardware aperture registers, an implicit kernel argument, and the queue
// pointer. The visible failure paths return an invalid Register().
// NOTE(review): several lines of this definition (start of the signature,
// some declarations and conditions) are missing from this listing.
2377 MachineFunction &MF = B.getMF();
2378 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2379 const LLT S32 = LLT::scalar(32);
2380 const LLT S64 = LLT::scalar(64);
2381
2383
2384 if (ST.hasApertureRegs()) {
2385 // Note: this register is somewhat broken. When used as a 32-bit operand,
2386 // it only returns zeroes. The real value is in the upper 32 bits.
2387 // Thus, we must extract the high 32 bits.
2388 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2389 ? AMDGPU::SRC_SHARED_BASE
2390 : AMDGPU::SRC_PRIVATE_BASE;
2391 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2392 !ST.hasGloballyAddressableScratch()) &&
2393 "Cannot use src_private_base with globally addressable scratch!");
// Copy the aperture register into a 64-bit SGPR-class vreg, then split it
// into two s32 halves and return the second (high) one.
2395 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2396 B.buildCopy({Dst}, {Register(ApertureRegNo)});
2397 return B.buildUnmerge(S32, Dst).getReg(1);
2398 }
2399
2402 // For code object version 5, private_base and shared_base are passed through
2403 // implicit kernargs.
2407
2412 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2413
2414 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2416
// Without the kernarg segment pointer there is nothing to load from.
2417 if (!loadInputValue(KernargPtrReg, B,
2419 return Register();
2420
2422 PtrInfo.getWithOffset(Offset),
2426
2427 // Pointer address: kernarg segment pointer plus the implicit parameter offset.
2428 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2429 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2430 // Load the 32-bit value stored at that address.
2431 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2432 }
2433
2436
2438 return Register();
2439
2440 // TODO: Use custom PseudoSourceValue
2442
2443 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2444 // private_segment_aperture_base_hi.
2445 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2446
2448 PtrInfo,
2451 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2452
// Fallback: load the 32-bit field at QueuePtr + StructOffset.
2453 B.buildObjectPtrOffset(
2454 LoadAddr, QueuePtr,
2455 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2456 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2457}
2458
2459 /// Return true if the value is a known valid address, such that a null check is
2460 /// not necessary.
///
/// Only the opcode of the instruction defining \p Val is inspected: frame
/// indexes, global values and block addresses count as non-null, and a
/// G_CONSTANT counts as non-null when its sign-extended value differs from the
/// null pointer value of \p AddrSpace. Every other defining opcode is
/// conservatively reported as possibly null. \p TM is not used by the visible
/// body.
2462 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
// NOTE(review): Def is dereferenced without a null check; this assumes Val
// always has a defining instruction - confirm against callers.
2463 MachineInstr *Def = MRI.getVRegDef(Val);
2464 switch (Def->getOpcode()) {
2465 case AMDGPU::G_FRAME_INDEX:
2466 case AMDGPU::G_GLOBAL_VALUE:
2467 case AMDGPU::G_BLOCK_ADDR:
2468 return true;
2469 case AMDGPU::G_CONSTANT: {
2470 const ConstantInt *CI = Def->getOperand(1).getCImm();
2471 return CI->getSExtValue() != AMDGPU::getNullPointerValue(AddrSpace);
2472 }
2473 default:
2474 return false;
2475 }
2476
// Not reached: every arm of the switch above returns.
2477 return false;
2478}
2479
2482 MachineIRBuilder &B) const {
// Custom lowering for address space casts. In order, the visible cases are:
//  - casts the target machine reports as no-ops are retagged as G_BITCAST;
//  - flat -> local/private keeps the low 32 bits (or subtracts the flat
//    scratch base), with a null check unless the source is known non-null;
//  - local/private -> flat builds a 64-bit pointer from the 32-bit source and
//    an aperture (or the flat scratch base), again with an optional null check;
//  - 64-bit -> CONSTANT_ADDRESS_32BIT truncates;
//  - CONSTANT_ADDRESS_32BIT -> 64-bit extends with the configured high bits;
//  - anything else defines Dst as undef.
// Every visible path returns true.
// NOTE(review): the first lines of this signature are missing from this
// listing.
2483 MachineFunction &MF = B.getMF();
2484
2485 // MI can either be a G_ADDRSPACE_CAST or a
2486 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2487 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2488 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2489 Intrinsic::amdgcn_addrspacecast_nonnull));
2490
2491 const LLT S32 = LLT::scalar(32);
2492 Register Dst = MI.getOperand(0).getReg();
// The intrinsic form reads its source from operand 2, the plain cast from
// operand 1.
2493 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2494 : MI.getOperand(1).getReg();
2495 LLT DstTy = MRI.getType(Dst);
2496 LLT SrcTy = MRI.getType(Src);
2497 unsigned DestAS = DstTy.getAddressSpace();
2498 unsigned SrcAS = SrcTy.getAddressSpace();
2499
2500 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2501 // vector element.
2502 assert(!DstTy.isVector());
2503
2504 const AMDGPUTargetMachine &TM
2505 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2506
2507 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
// No code is emitted: the existing instruction is turned into a bitcast in
// place.
2508 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2509 return true;
2510 }
2511
2512 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2513 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2514 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
// Emits the pointer narrowing into Dst (a register or a type for a fresh
// register) and returns the defined register; no null handling here.
2515 auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
2516 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2517 ST.hasGloballyAddressableScratch()) {
2518 // flat -> private with globally addressable scratch: subtract
2519 // src_flat_scratch_base_lo.
2520 const LLT S32 = LLT::scalar(32);
2521 Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
2522 Register FlatScratchBaseLo =
2523 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
2524 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2525 .getReg(0);
2526 MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2527 Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
2528 return B.buildIntToPtr(Dst, Sub).getReg(0);
2529 }
2530
2531 // Extract low 32-bits of the pointer.
2532 return B.buildExtract(Dst, Src, 0).getReg(0);
2533 };
2534
2535 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2536 // G_ADDRSPACE_CAST we need to guess.
2537 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2538 castFlatToLocalOrPrivate(Dst);
2539 MI.eraseFromParent();
2540 return true;
2541 }
2542
2543 unsigned NullVal = AMDGPU::getNullPointerValue(DestAS);
2544
2545 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2546 auto FlatNull = B.buildConstant(SrcTy, 0);
2547
2548 // Extract low 32-bits of the pointer.
2549 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2550
// Dst = (Src != 0) ? narrowed pointer : null value of the destination space.
2551 auto CmpRes =
2552 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2553 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2554
2555 MI.eraseFromParent();
2556 return true;
2557 }
2558
2559 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2560 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2561 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
// Emits the widening into Dst (a register or a type for a fresh register)
// and returns the defined register; no null handling here.
2562 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2563 // Coerce the type of the low half of the result so we can use
2564 // merge_values.
2565 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2566
2567 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
2568 ST.hasGloballyAddressableScratch()) {
2569 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
2570 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
2571 Register AllOnes = B.buildConstant(S32, -1).getReg(0);
2572 Register ThreadID = B.buildConstant(S32, 0).getReg(0);
// mbcnt_lo (plus mbcnt_hi on wave64) with an all-ones mask, starting from 0,
// yields the thread ID used in the formulas above.
2573 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
2574 .addUse(AllOnes)
2575 .addUse(ThreadID)
2576 .getReg(0);
2577 if (ST.isWave64()) {
2578 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
2579 .addUse(AllOnes)
2580 .addUse(ThreadID)
2581 .getReg(0);
2582 }
// The shift is applied to the high 32-bit half only, hence the "- 32".
// With a wavefront size log2 of 5 or 6, 57 - log2 gives the 52 / 51 shifts
// documented above.
2583 Register ShAmt =
2584 B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2585 Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
2586 Register CvtPtr =
2587 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
2588 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
2589 // 64-bit hi:lo value.
2590 Register FlatScratchBase =
2591 B.buildInstr(AMDGPU::S_MOV_B64, {S64},
2592 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2593 .getReg(0);
2594 MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2595 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2596 }
2597
2598 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
// NOTE(review): this lambda is declared to return Register, so `false` is
// converted to a Register here (presumably the invalid register - confirm)
// rather than signalling failure of the legalization. Neither call site
// below checks the returned register before erasing MI and returning true.
2599 if (!ApertureReg.isValid())
2600 return false;
2601
2602 // TODO: Should we allow mismatched types but matching sizes in merges to
2603 // avoid the ptrtoint?
2604 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2605 };
2606
2607 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2608 // G_ADDRSPACE_CAST we need to guess.
2609 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2610 castLocalOrPrivateToFlat(Dst);
2611 MI.eraseFromParent();
2612 return true;
2613 }
2614
2615 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2616
2617 auto SegmentNull =
2618 B.buildConstant(SrcTy, AMDGPU::getNullPointerValue(SrcAS));
2619 auto FlatNull = B.buildConstant(DstTy, AMDGPU::getNullPointerValue(DestAS));
2620
2621 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2622 SegmentNull.getReg(0));
2623
// Dst = (Src != segment null) ? widened pointer : flat null.
2624 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2625
2626 MI.eraseFromParent();
2627 return true;
2628 }
2629
2630 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2631 SrcTy.getSizeInBits() == 64) {
2632 // Truncate.
2633 B.buildExtract(Dst, Src, 0);
2634 MI.eraseFromParent();
2635 return true;
2636 }
2637
2638 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2639 DstTy.getSizeInBits() == 64) {
2641 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2642 auto PtrLo = B.buildPtrToInt(S32, Src);
// Zero high bits: a zero-extension is enough. Otherwise pair the low half
// with a constant holding the configured high bits.
2643 if (AddrHiVal == 0) {
2644 auto Zext = B.buildZExt(LLT::scalar(64), PtrLo);
2645 B.buildIntToPtr(Dst, Zext);
2646 } else {
2647 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2648 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2649 }
2650
2651 MI.eraseFromParent();
2652 return true;
2653 }
2654
2655 // Invalid casts are poison.
2656 // TODO: Should return poison
2657 B.buildUndef(Dst);
2658 MI.eraseFromParent();
2659 return true;
2660}
2661
2664 MachineIRBuilder &B) const {
// Expands a 64-bit scalar round-to-even: 2^52 carrying the sign of Src is
// added to Src and then subtracted again; inputs whose magnitude exceeds
// 0x1.fffffffffffffp+51 are passed through unchanged. Always returns true.
// NOTE(review): presumably relies on the add/sub pair rounding away the
// fraction bits in round-to-nearest-even mode - confirm.
2665 Register Src = MI.getOperand(1).getReg();
2666 LLT Ty = MRI.getType(Src);
2667 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2668
2669 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2670 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2671
2672 auto C1 = B.buildFConstant(Ty, C1Val);
2673 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2674
2675 // TODO: Should this propagate fast-math-flags?
2676 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2677 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2678
2679 auto C2 = B.buildFConstant(Ty, C2Val);
2680 auto Fabs = B.buildFAbs(Ty, Src);
2681
// |Src| > C2 selects Src itself; otherwise the add/sub result is used.
2682 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2683 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2684 MI.eraseFromParent();
2685 return true;
2686}
2687
2690 MachineIRBuilder &B) const {
// Expands a 64-bit ceil in terms of trunc: the truncated value is bumped by
// 1.0 when the source is positive and not already integral. Always returns
// true.
2691
2692 const LLT S1 = LLT::scalar(1);
2693 const LLT S64 = LLT::scalar(64);
2694
2695 Register Src = MI.getOperand(1).getReg();
2696 assert(MRI.getType(Src) == S64);
2697
2698 // result = trunc(src)
2699 // if (src > 0.0 && src != result)
2700 // result += 1.0
2701
2702 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2703
2704 const auto Zero = B.buildFConstant(S64, 0.0);
2705 const auto One = B.buildFConstant(S64, 1.0);
// Despite its name, Lt0 holds the ordered "Src > 0" comparison, matching the
// pseudocode above.
2706 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2707 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2708 auto And = B.buildAnd(S1, Lt0, NeTrunc);
// Branch-free form of the conditional increment: add 1.0 or 0.0.
2709 auto Add = B.buildSelect(S64, And, One, Zero);
2710
2711 // TODO: Should this propagate fast-math-flags?
2712 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2713 MI.eraseFromParent();
2714 return true;
2715}
2716
2719 MachineIRBuilder &B) const {
// Expands frem(x, y) as fma(-trunc(x / y), y, x), i.e. x - trunc(x / y) * y,
// copying MI's flags onto every emitted instruction. Always returns true.
2720 Register DstReg = MI.getOperand(0).getReg();
2721 Register Src0Reg = MI.getOperand(1).getReg();
2722 Register Src1Reg = MI.getOperand(2).getReg();
2723 auto Flags = MI.getFlags();
2724 LLT Ty = MRI.getType(DstReg);
2725
2726 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2727 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2728 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2729 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2730 MI.eraseFromParent();
2731 return true;
2732}
2733
// Builds the unbiased exponent of a 64-bit float whose high 32 bits are in Hi:
// an amdgcn_ubfe of Hi with the operands (FractBits - 32 = 20, ExpBits = 11),
// followed by subtracting 1023.
// NOTE(review): the ubfe operands are presumably (offset, width) - confirm
// against the intrinsic definition. The signature and opening brace of this
// definition are missing from this listing.
2736 const unsigned FractBits = 52;
2737 const unsigned ExpBits = 11;
2738 LLT S32 = LLT::scalar(32);
2739
2740 auto Const0 = B.buildConstant(S32, FractBits - 32);
2741 auto Const1 = B.buildConstant(S32, ExpBits);
2742
2743 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2744 .addUse(Hi)
2745 .addUse(Const0.getReg(0))
2746 .addUse(Const1.getReg(0));
2747
2748 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2749}
2750
2753 MachineIRBuilder &B) const {
// Expands a 64-bit trunc with integer operations: the exponent decides which
// low fraction bits to clear, with separate results for negative exponents
// and for exponents above 51. Always returns true.
2754 const LLT S1 = LLT::scalar(1);
2755 const LLT S32 = LLT::scalar(32);
2756 const LLT S64 = LLT::scalar(64);
2757
2758 Register Src = MI.getOperand(1).getReg();
2759 assert(MRI.getType(Src) == S64);
2760
2761 // TODO: Should this use extract since the low half is unused?
2762 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2763 Register Hi = Unmerge.getReg(1);
2764
2765 // Extract the upper half, since this is where we will find the sign and
2766 // exponent.
2767 auto Exp = extractF64Exponent(Hi, B);
2768
2769 const unsigned FractBits = 52;
2770
2771 // Extract the sign bit.
2772 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2773 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2774
2775 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2776
2777 const auto Zero32 = B.buildConstant(S32, 0);
2778
2779 // Extend back to 64-bits.
2780 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2781
// Shifting the 52-bit fraction mask right by the exponent leaves the bits
// that must be cleared; Tmp0 is Src with those bits cleared.
2782 auto Shr = B.buildAShr(S64, FractMask, Exp);
2783 auto Not = B.buildNot(S64, Shr);
2784 auto Tmp0 = B.buildAnd(S64, Src, Not);
2785 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2786
2787 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2788 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2789
// Exp < 0: only the sign bit survives. Exp > 51: Src is returned unchanged.
// Otherwise the masked value Tmp0 is the result.
2790 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2791 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2792 MI.eraseFromParent();
2793 return true;
2794}
2795
2798 MachineIRBuilder &B, bool Signed) const {
// Expands a conversion from a 64-bit integer to a 64-bit or 32-bit float
// using 32-bit pieces. Signed selects the signed variant. Always returns
// true.
2799
2800 Register Dst = MI.getOperand(0).getReg();
2801 Register Src = MI.getOperand(1).getReg();
2802
2803 const LLT S64 = LLT::scalar(64);
2804 const LLT S32 = LLT::scalar(32);
2805
2806 assert(MRI.getType(Src) == S64);
2807
2808 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2809 auto ThirtyTwo = B.buildConstant(S32, 32);
2810
2811 if (MRI.getType(Dst) == S64) {
// 64-bit result: ldexp(convert(hi), 32) + uitofp(lo). Only the high half
// is converted with the requested signedness.
2812 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2813 : B.buildUITOFP(S64, Unmerge.getReg(1));
2814
2815 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2816 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2817
2818 // TODO: Should this propagate fast-math-flags?
2819 B.buildFAdd(Dst, LdExp, CvtLo);
2820 MI.eraseFromParent();
2821 return true;
2822 }
2823
2824 assert(MRI.getType(Dst) == S32);
2825
2826 auto One = B.buildConstant(S32, 1);
2827
// 32-bit result: shift Src left by ShAmt, convert the high word and rescale
// with ldexp by 32 - ShAmt.
2828 MachineInstrBuilder ShAmt;
2829 if (Signed) {
2830 auto ThirtyOne = B.buildConstant(S32, 31);
// OppositeSign is all ones when bit 31 of the two halves differs and zero
// otherwise, so MaxShAmt is 31 or 32.
2831 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2832 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2833 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2834 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2835 .addUse(Unmerge.getReg(1));
2836 auto LS2 = B.buildSub(S32, LS, One);
2837 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2838 } else
2839 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2840 auto Norm = B.buildShl(S64, Src, ShAmt);
2841 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
// Adjust is 1 when any bit of the shifted low word is set and 0 otherwise;
// it is OR-ed into the high word before the conversion.
2842 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2843 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2844 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2845 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2846 B.buildFLdexp(Dst, FVal, Scale);
2847 MI.eraseFromParent();
2848 return true;
2849}
2850
2851 // TODO: Copied from DAG implementation. Verify logic and document how this
2852 // actually works.
2856 bool Signed) const {
// Expands a conversion from a 32-bit or 64-bit float to a 64-bit integer by
// computing the high and low 32-bit halves separately and merging them.
// Signed selects the signed variant. Always returns true.
// NOTE(review): the start of the signature and the declaration of `Sign` are
// on lines missing from this listing.
2857
2858 Register Dst = MI.getOperand(0).getReg();
2859 Register Src = MI.getOperand(1).getReg();
2860
2861 const LLT S64 = LLT::scalar(64);
2862 const LLT S32 = LLT::scalar(32);
2863
2864 const LLT SrcLT = MRI.getType(Src);
2865 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2866
2867 unsigned Flags = MI.getFlags();
2868
2869 // The basic idea of converting a floating point number into a pair of 32-bit
2870 // integers is illustrated as follows:
2871 //
2872 // tf := trunc(val);
2873 // hif := floor(tf * 2^-32);
2874 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2875 // hi := fptoi(hif);
2876 // lo := fptoi(lof);
2877 //
2878 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2880 if (Signed && SrcLT == S32) {
2881 // However, a 32-bit floating point number has only 23 bits mantissa and
2882 // it's not enough to hold all the significant bits of `lof` if val is
2883 // negative. To avoid the loss of precision, we need to take the absolute
2884 // value after truncating and flip the result back based on the original
2885 // signedness.
2886 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2887 Trunc = B.buildFAbs(S32, Trunc, Flags);
2888 }
// K0 and K1 are 2^-32 and -2^32 in the source float type, spelled as raw bit
// patterns.
2889 MachineInstrBuilder K0, K1;
2890 if (SrcLT == S64) {
2891 K0 = B.buildFConstant(
2892 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2893 K1 = B.buildFConstant(
2894 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2895 } else {
2896 K0 = B.buildFConstant(
2897 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2898 K1 = B.buildFConstant(
2899 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2900 }
2901
// FloorMul is `hif` and Fma is `lof` from the pseudocode above.
2902 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2903 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2904 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2905
// Only the signed 64-bit-source case converts the high half as signed; the
// signed 32-bit-source case works on the absolute value and fixes the sign
// afterwards.
2906 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2907 : B.buildFPTOUI(S32, FloorMul);
2908 auto Lo = B.buildFPTOUI(S32, Fma);
2909
2910 if (Signed && SrcLT == S32) {
2911 // Flip the result based on the signedness, which is either all 0s or 1s.
2912 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2913 // r := xor({lo, hi}, sign) - sign;
2914 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2915 Sign);
2916 } else
2917 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2918 MI.eraseFromParent();
2919
2920 return true;
2921}
2922
2924 MachineInstr &MI) const {
// Custom legalization hook for the min/max-number opcodes. Only the early-out
// is visible here.
// NOTE(review): the start of the signature, the declaration of MFI and the
// final return statement are on lines missing from this listing.
2925 MachineFunction &MF = Helper.MIRBuilder.getMF();
2927
2928 // With ieee_mode disabled, the instructions have the correct behavior.
// MI is left untouched on this path and success is reported.
2929 if (!MFI->getMode().IEEE)
2930 return true;
2931
2933}
2934
2936 MachineInstr &MI) const {
// Custom lowering for G_EXTRACT at a non-zero, 32-bit aligned offset: the
// source is split into s32 pieces and the destination is formed from the
// pieces starting at Offset / 32. Other offsets are handed to the generic
// lowering, whose success is returned.
// NOTE(review): DstCount uses a truncating division, so this assumes the
// destination size is a multiple of 32 bits (the customIf rule for G_EXTRACT
// in this file checks that) - it is not re-checked here.
2937 MachineIRBuilder &B = Helper.MIRBuilder;
2938 MachineRegisterInfo &MRI = *B.getMRI();
2939 Register DstReg = MI.getOperand(0).getReg();
2940 Register SrcReg = MI.getOperand(1).getReg();
2941 uint64_t Offset = MI.getOperand(2).getImm();
2942
2943 // Fall back to generic lowering for offset 0 (trivial trunc) and
2944 // non-32-bit-aligned cases which require shift+trunc sequences
2945 // that generic code handles correctly.
2946 if (Offset == 0 || Offset % 32 != 0)
2947 return Helper.lowerExtract(MI) == LegalizerHelper::Legalized;
2948
2949 const LLT DstTy = MRI.getType(DstReg);
2950 unsigned StartIdx = Offset / 32;
2951 unsigned DstCount = DstTy.getSizeInBits() / 32;
2952 auto Unmerge = B.buildUnmerge(LLT::scalar(32), SrcReg);
2953
2954 if (DstCount == 1) {
// Single piece: a pointer destination needs an explicit inttoptr, while any
// other destination simply has its uses redirected to the unmerged piece.
2955 if (DstTy.isPointer())
2956 B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));
2957 else
2958 MRI.replaceRegWith(DstReg, Unmerge.getReg(StartIdx));
2959 } else {
// Several pieces: re-merge the consecutive s32 pieces into DstReg.
2960 SmallVector<Register, 8> MergeVec;
2961 for (unsigned I = 0; I < DstCount; ++I)
2962 MergeVec.push_back(Unmerge.getReg(StartIdx + I));
2963 B.buildMergeLikeInstr(DstReg, MergeVec);
2964 }
2965
2966 MI.eraseFromParent();
2967 return true;
2968}
2969
2971 MachineInstr &MI) const {
// Custom lowering of an insert: operand 0 is the destination, operand 1 the
// value being inserted into, operand 2 the inserted value and operand 3 the
// immediate bit offset. When the offset and both sizes are multiples of 32 the
// result is rebuilt from s32 pieces: the pieces of the source before the
// insertion point, the pieces of the inserted value, then the remaining pieces
// of the source. All other cases go to LegalizerHelper::lowerInsert.
// Returns true when the instruction was lowered.
2972 MachineIRBuilder &B = Helper.MIRBuilder;
2973 MachineRegisterInfo &MRI = *B.getMRI();
2974 Register DstReg = MI.getOperand(0).getReg();
2975 Register SrcReg = MI.getOperand(1).getReg();
2976 Register InsertSrc = MI.getOperand(2).getReg();
2977 uint64_t Offset = MI.getOperand(3).getImm();
2978
2979 unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
2980 const LLT InsertTy = MRI.getType(InsertSrc);
2981 unsigned InsertSize = InsertTy.getSizeInBits();
2982
2983 // Fall back to generic lowering for non-32-bit-aligned cases which
2984 // require shift+mask sequences that generic code handles correctly.
2985 if (Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
2986 return Helper.lowerInsert(MI) == LegalizerHelper::Legalized;
2987
// All quantities below are counted in s32 pieces.
2988 const LLT S32 = LLT::scalar(32);
2989 unsigned DstCount = DstSize / 32;
2990 unsigned InsertCount = InsertSize / 32;
2991 unsigned StartIdx = Offset / 32;
2992
2993 auto SrcUnmerge = B.buildUnmerge(S32, SrcReg);
2994
// Pieces of the original value that precede the inserted range.
2995 SmallVector<Register, 8> MergeVec;
2996 for (unsigned I = 0; I < StartIdx; ++I)
2997 MergeVec.push_back(SrcUnmerge.getReg(I));
2998
2999 if (InsertCount == 1) {
3000 // Merge-like instructions require same source types. Convert pointer
3001 // to scalar when inserting a pointer value into a scalar.
3002 if (InsertTy.isPointer())
3003 InsertSrc = B.buildPtrToInt(S32, InsertSrc).getReg(0);
3004 MergeVec.push_back(InsertSrc);
3005 } else {
// A wider inserted value is itself split into s32 pieces first.
3006 auto InsertUnmerge = B.buildUnmerge(S32, InsertSrc);
3007 for (unsigned I = 0; I < InsertCount; ++I)
3008 MergeVec.push_back(InsertUnmerge.getReg(I));
3009 }
3010
// Pieces of the original value that follow the inserted range.
3011 for (unsigned I = StartIdx + InsertCount; I < DstCount; ++I)
3012 MergeVec.push_back(SrcUnmerge.getReg(I));
3013
3014 B.buildMergeLikeInstr(DstReg, MergeVec);
3015
3016 MI.eraseFromParent();
3017 return true;
3018}
3019
3022 MachineIRBuilder &B) const {
// Custom legalization of a vector element extract: operand 0 is the result,
// operand 1 the vector and operand 2 the index. A constant index is resolved
// by unmerging the vector and copying the selected element (or producing undef
// when the index is out of range); a non-constant index leaves MI unchanged.
// Returns true in all cases shown here.
3023 // TODO: Should move some of this into LegalizerHelper.
3024
3025 // TODO: Promote dynamic indexing of s16 to s32
3026
3027 Register Dst = MI.getOperand(0).getReg();
3028 Register Vec = MI.getOperand(1).getReg();
3029
3030 LLT VecTy = MRI.getType(Vec);
3031 LLT EltTy = VecTy.getElementType();
3032 assert(EltTy == MRI.getType(Dst));
3033
3034 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
3035 // but we can't go directly to that logic because you can't bitcast a vector
3036 // of pointers to a vector of integers. Therefore, introduce an intermediate
3037 // vector of integers using ptrtoint (and inttoptr on the output) in order to
3038 // drive the legalization forward.
3039 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
3040 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
3041 LLT IntVecTy = VecTy.changeElementType(IntTy);
3042
3043 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
3044 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
3045 B.buildIntToPtr(Dst, IntElt);
3046
3047 MI.eraseFromParent();
3048 return true;
3049 }
3050
3051 // FIXME: Artifact combiner probably should have replaced the truncated
3052 // constant before this, so we shouldn't need
3053 // getIConstantVRegValWithLookThrough.
3054 std::optional<ValueAndVReg> MaybeIdxVal =
3055 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
3056 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3057 return true;
3058 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3059
// In-range constant index: split the vector and copy out the element.
// Out-of-range constant index: the result is undef.
3060 if (IdxVal < VecTy.getNumElements()) {
3061 auto Unmerge = B.buildUnmerge(EltTy, Vec);
3062 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
3063 } else {
3064 B.buildUndef(Dst);
3065 }
3066
3067 MI.eraseFromParent();
3068 return true;
3069}
3070
3073 MachineIRBuilder &B) const {
// Custom legalization of a vector element insert: operand 0 is the result,
// operand 1 the vector, operand 2 the inserted element and operand 3 the
// index. A constant index is resolved by unmerging the vector, swapping in the
// new element and merging again (or producing undef when the index is out of
// range); a non-constant index leaves MI unchanged.
// Returns true in all cases shown here.
3074 // TODO: Should move some of this into LegalizerHelper.
3075
3076 // TODO: Promote dynamic indexing of s16 to s32
3077
3078 Register Dst = MI.getOperand(0).getReg();
3079 Register Vec = MI.getOperand(1).getReg();
3080 Register Ins = MI.getOperand(2).getReg();
3081
3082 LLT VecTy = MRI.getType(Vec);
3083 LLT EltTy = VecTy.getElementType();
3084 assert(EltTy == MRI.getType(Ins));
3085
3086 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
3087 // but we can't go directly to that logic because you can't bitcast a vector
3088 // of pointers to a vector of integers. Therefore, make the pointer vector
3089 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
3090 // new value, and then inttoptr the result vector back. This will then allow
3091 // the rest of legalization to take over.
3092 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
3093 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
3094 LLT IntVecTy = VecTy.changeElementType(IntTy);
3095
3096 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
3097 auto IntIns = B.buildPtrToInt(IntTy, Ins);
3098 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
3099 MI.getOperand(3));
3100 B.buildIntToPtr(Dst, IntVecDest);
3101 MI.eraseFromParent();
3102 return true;
3103 }
3104
3105 // FIXME: Artifact combiner probably should have replaced the truncated
3106 // constant before this, so we shouldn't need
3107 // getIConstantVRegValWithLookThrough.
3108 std::optional<ValueAndVReg> MaybeIdxVal =
3109 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
3110 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3111 return true;
3112
3113 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3114
3115 unsigned NumElts = VecTy.getNumElements();
3116 if (IdxVal < NumElts) {
// NOTE(review): the declaration of SrcRegs is not visible in this listing.
// One fresh EltTy register is created per element to receive the unmerge.
3118 for (unsigned i = 0; i < NumElts; ++i)
3119 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
3120 B.buildUnmerge(SrcRegs, Vec);
3121
// Replace the selected element with the inserted value and rebuild the vector.
3122 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
3123 B.buildMergeLikeInstr(Dst, SrcRegs);
3124 } else {
// Out-of-range constant index: the result is undef.
3125 B.buildUndef(Dst);
3126 }
3127
3128 MI.eraseFromParent();
3129 return true;
3130}
3131
3134 MachineIRBuilder &B) const {
// Lowers G_FSIN (and otherwise, as coded below, the cosine case) to the
// amdgcn_sin / amdgcn_cos intrinsics. The input is first multiplied by
// 1/(2*pi); on subtargets reporting hasTrigReducedRange() the product is
// additionally passed through amdgcn_fract. MI's flags are propagated to the
// new instructions. Always returns true.
3135
3136 Register DstReg = MI.getOperand(0).getReg();
3137 Register SrcReg = MI.getOperand(1).getReg();
3138 LLT Ty = MRI.getType(DstReg);
3139 unsigned Flags = MI.getFlags();
3140
3141 Register TrigVal;
3142 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
3143 if (ST.hasTrigReducedRange()) {
3144 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
3145 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
3146 .addUse(MulVal.getReg(0))
3147 .setMIFlags(Flags)
3148 .getReg(0);
3149 } else
3150 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
3151
// Any opcode other than G_FSIN selects the cosine intrinsic.
3152 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
3153 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3154 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
3155 .addUse(TrigVal)
3156 .setMIFlags(Flags);
3157 MI.eraseFromParent();
3158 return true;
3159}
3160
// Emits a pc-relative address computation for GV + Offset into DstReg using
// SI_PC_ADD_REL_OFFSET (or SI_PC_ADD_REL_OFFSET64 when the subtarget has
// 64-bit literals). GAFlags selects the relocation flavour attached to the
// global-address operands. For a 32-bit PtrTy the 64-bit result is computed in
// a temporary register and the low 32 bits are extracted into DstReg.
// Always returns true.
// NOTE(review): the leading signature line(s) are not visible in this listing.
3163 const GlobalValue *GV,
3164 int64_t Offset,
3165 unsigned GAFlags) const {
3166 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
3167 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
3168 // to the following code sequence:
3169 //
3170 // For constant address space:
3171 // s_getpc_b64 s[0:1]
3172 // s_add_u32 s0, s0, $symbol
3173 // s_addc_u32 s1, s1, 0
3174 //
3175 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3176 // a fixup or relocation is emitted to replace $symbol with a literal
3177 // constant, which is a pc-relative offset from the encoding of the $symbol
3178 // operand to the global variable.
3179 //
3180 // For global address space:
3181 // s_getpc_b64 s[0:1]
3182 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3183 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3184 //
3185 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3186 // fixups or relocations are emitted to replace $symbol@*@lo and
3187 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3188 // which is a 64-bit pc-relative offset from the encoding of the $symbol
3189 // operand to the global variable.
3190
3192
// Write directly into DstReg unless the requested pointer is 32 bits wide, in
// which case a temporary of type ConstPtrTy holds the full result.
// NOTE(review): the definition of ConstPtrTy is not visible in this listing.
3193 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
3194 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3195
3196 if (ST.has64BitLiterals()) {
3197 assert(GAFlags != SIInstrInfo::MO_NONE);
3198
// Single 64-bit literal operand; the flag value used is GAFlags + 2.
3200 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
3201 MIB.addGlobalAddress(GV, Offset, GAFlags + 2);
3202 } else {
3204 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3205
// Two operands (low / high half). Without relocation flags the high operand
// is the immediate 0, otherwise it is the same symbol with flag GAFlags + 1.
3206 MIB.addGlobalAddress(GV, Offset, GAFlags);
3207 if (GAFlags == SIInstrInfo::MO_NONE)
3208 MIB.addImm(0);
3209 else
3210 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
3211 }
3212
// The pseudo's result register gets the SReg_64 class if it has none yet.
3213 if (!B.getMRI()->getRegClassOrNull(PCReg))
3214 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3215
3216 if (PtrTy.getSizeInBits() == 32)
3217 B.buildExtract(DstReg, PCReg, 0);
3218 return true;
3219}
3220
3221// Emit an ABS32_LO / ABS32_HI relocation stub.
// Materializes the absolute address of GV into DstReg. With 64-bit literals a
// single S_MOV_B64 carrying MO_ABS64 is used for non-32-bit pointers;
// otherwise the low half (and, if required, the high half) is produced with
// S_MOV_B32 and the halves are merged.
// NOTE(review): the first signature line is not visible in this listing.
3223 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
3224 MachineRegisterInfo &MRI) const {
3225 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
3226
3227 if (RequiresHighHalf && ST.has64BitLiterals()) {
3228 if (!MRI.getRegClassOrNull(DstReg))
3229 MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3230 B.buildInstr(AMDGPU::S_MOV_B64)
3231 .addDef(DstReg)
3232 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64);
3233 return;
3234 }
3235
3236 LLT S32 = LLT::scalar(32);
3237
3238 // Use the destination directly, if and only if we store the lower address
3239 // part only and we don't have a register class being set.
// NOTE(review): the alternative operand of this conditional is not visible in
// this listing.
3240 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
3241 ? DstReg
3243
3244 if (!MRI.getRegClassOrNull(AddrLo))
3245 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3246
3247 // Write the lower half.
3248 B.buildInstr(AMDGPU::S_MOV_B32)
3249 .addDef(AddrLo)
3250 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
3251
3252 // If required, write the upper half as well.
3253 if (RequiresHighHalf) {
3254 assert(PtrTy.getSizeInBits() == 64 &&
3255 "Must provide a 64-bit pointer type!");
3256
// NOTE(review): the definition of AddrHi is not visible in this listing.
3258 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3259
3260 B.buildInstr(AMDGPU::S_MOV_B32)
3261 .addDef(AddrHi)
3262 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
3263
3264 // Use the destination directly, if and only if we don't have a register
3265 // class being set.
3266 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
3267 ? DstReg
3269
3270 if (!MRI.getRegClassOrNull(AddrDst))
3271 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3272
3273 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3274
3275 // If we created a new register for the destination, cast the result into
3276 // the final output.
3277 if (AddrDst != DstReg)
3278 B.buildCast(DstReg, AddrDst);
3279 } else if (AddrLo != DstReg) {
3280 // If we created a new register for the destination, cast the result into
3281 // the final output.
3282 B.buildCast(DstReg, AddrLo);
3283 }
3284}
3285
3288 MachineIRBuilder &B) const {
// Custom legalization of a global-value reference: operand 0 is the result
// pointer, operand 1 the GlobalValue. The visible paths are, in order: LDS
// globals (warning + trap in non-entry functions, relocation flag, dynamic LDS
// via amdgcn_groupstaticsize, or a constant allocated offset), absolute
// addressing on AMDPAL / Mesa3D, pc-relative fixup, pc-relative relocation,
// and finally a load from a GOT entry addressed pc-relatively.
// NOTE(review): several lines of this function (including the signature start
// and the address-space condition guarding the LDS section) are not visible in
// this listing; comments on those parts are necessarily partial.
3289 Register DstReg = MI.getOperand(0).getReg();
3290 LLT Ty = MRI.getType(DstReg);
3291 unsigned AS = Ty.getAddressSpace();
3292
3293 const GlobalValue *GV = MI.getOperand(1).getGlobal();
3294 MachineFunction &MF = B.getMF();
3296
3298 if (!MFI->isModuleEntryFunction() &&
3299 GV->getName() != "llvm.amdgcn.module.lds" &&
3301 const Function &Fn = MF.getFunction();
3303 Fn, "local memory global used by non-kernel function",
3304 MI.getDebugLoc(), DS_Warning));
3305
3306 // We currently don't have a way to correctly allocate LDS objects that
3307 // aren't directly associated with a kernel. We do force inlining of
3308 // functions that use local objects. However, if these dead functions are
3309 // not eliminated, we don't want a compile time error. Just emit a warning
3310 // and a trap, since there should be no callable path here.
3311 B.buildTrap();
3312 B.buildUndef(DstReg);
3313 MI.eraseFromParent();
3314 return true;
3315 }
3316
3317 // TODO: We could emit code to handle the initialization somewhere.
3318 // We ignore the initializer for now and legalize it to allow selection.
3319 // The initializer will anyway get errored out during assembly emission.
3320 const SITargetLowering *TLI = ST.getTargetLowering();
3321 if (!TLI->shouldUseLDSConstAddress(GV)) {
3322 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3323 return true; // Leave in place;
3324 }
3325
3326 const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
3327 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3328 // HIP uses an unsized array `extern __shared__ T s[]` or similar
3329 // zero-sized type in other languages to declare the dynamic shared
3330 // memory which size is not known at the compile time. They will be
3331 // allocated by the runtime and placed directly after the static
3332 // allocated ones. They all share the same offset.
3333 if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
3334 // Adjust alignment for that dynamic shared memory array.
3335 MFI->setDynLDSAlign(MF.getFunction(), GVar);
3336 LLT S32 = LLT::scalar(32);
3337 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3338 B.buildIntToPtr(DstReg, Sz);
3339 MI.eraseFromParent();
3340 return true;
3341 }
3342 }
3343
// Statically sized LDS global: the address is the constant returned by
// allocateLDSGlobal.
3344 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), GVar));
3345 MI.eraseFromParent();
3346 return true;
3347 }
3348
// Everything below handles globals that did not take the LDS path above.
3349 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3350 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3351 MI.eraseFromParent();
3352 return true;
3353 }
3354
3355 const SITargetLowering *TLI = ST.getTargetLowering();
3356
3357 if (TLI->shouldEmitFixup(GV)) {
3358 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3359 MI.eraseFromParent();
3360 return true;
3361 }
3362
3363 if (TLI->shouldEmitPCReloc(GV)) {
3364 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3365 MI.eraseFromParent();
3366 return true;
3367 }
3368
// GOT path: compute the pc-relative address of the GOT slot, then load the
// global's address from it.
// NOTE(review): the definitions of PtrTy and GOTMMO are only partly visible in
// this listing.
3370 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3371
3372 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3377 LoadTy, Align(8));
3378
3379 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3380
3381 if (Ty.getSizeInBits() == 32) {
3382 // Truncate if this is a 32-bit constant address.
3383 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3384 B.buildExtract(DstReg, Load, 0);
3385 } else
3386 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3387
3388 MI.eraseFromParent();
3389 return true;
3390}
3391
// Returns Ty widened to a power of two: for vectors the element count is
// rounded up with PowerOf2Ceil (element type unchanged), for everything else a
// scalar whose bit width is PowerOf2Ceil of Ty's size is returned.
// NOTE(review): the signature line is not visible in this listing.
3393 if (Ty.isVector())
3394 return Ty.changeElementCount(
3395 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3396 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3397}
3398
3400 MachineInstr &MI) const {
// Custom legalization of loads. Three rewrites are performed:
//  * a pointer in CONSTANT_ADDRESS_32BIT is address-space-cast and MI is
//    updated in place;
//  * G_LOAD results that need the buffer-resource workaround are rewritten via
//    castBufferRsrcFromV4I32;
//  * G_LOADs for which shouldWidenLoad() holds are widened to the next power
//    of two and the original value is recovered from the wide result.
// Returns true when MI was changed/replaced and false otherwise.
3401 MachineIRBuilder &B = Helper.MIRBuilder;
3402 MachineRegisterInfo &MRI = *B.getMRI();
3403 GISelChangeObserver &Observer = Helper.Observer;
3404
3405 Register PtrReg = MI.getOperand(1).getReg();
3406 LLT PtrTy = MRI.getType(PtrReg);
3407 unsigned AddrSpace = PtrTy.getAddressSpace();
3408
3409 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
// NOTE(review): the definition of ConstPtr (the cast's destination type) is
// not visible in this listing.
3411 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3412 Observer.changingInstr(MI);
3413 MI.getOperand(1).setReg(Cast.getReg(0));
3414 Observer.changedInstr(MI);
3415 return true;
3416 }
3417
// The remaining rewrites only apply to plain G_LOAD.
3418 if (MI.getOpcode() != AMDGPU::G_LOAD)
3419 return false;
3420
3421 Register ValReg = MI.getOperand(0).getReg();
3422 LLT ValTy = MRI.getType(ValReg);
3423
3424 if (hasBufferRsrcWorkaround(ValTy)) {
3425 Observer.changingInstr(MI);
3426 castBufferRsrcFromV4I32(MI, B, MRI, 0);
3427 Observer.changedInstr(MI);
3428 return true;
3429 }
3430
// Only the first memory operand is consulted.
3431 MachineMemOperand *MMO = *MI.memoperands_begin();
3432 const unsigned ValSize = ValTy.getSizeInBits();
3433 const LLT MemTy = MMO->getMemoryType();
3434 const Align MemAlign = MMO->getAlign();
3435 const unsigned MemSize = MemTy.getSizeInBits();
3436 const uint64_t AlignInBits = 8 * MemAlign.value();
3437
3438 // Widen non-power-of-2 loads to the alignment if needed
3439 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3440 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3441
3442 // This was already the correct extending load result type, so just adjust
3443 // the memory type.
3444 if (WideMemSize == ValSize) {
3445 MachineFunction &MF = B.getMF();
3446
3447 MachineMemOperand *WideMMO =
3448 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3449 Observer.changingInstr(MI);
3450 MI.setMemRefs(MF, {WideMMO});
3451 Observer.changedInstr(MI);
3452 return true;
3453 }
3454
3455 // Don't bother handling edge case that should probably never be produced.
3456 if (ValSize > WideMemSize)
3457 return false;
3458
3459 LLT WideTy = widenToNextPowerOf2(ValTy);
3460
3461 Register WideLoad;
3462 if (!WideTy.isVector()) {
// Scalar: load the wide value and truncate back to the requested type.
3463 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3464 B.buildTrunc(ValReg, WideLoad).getReg(0);
3465 } else {
3466 // Extract the subvector.
3467
3468 if (isRegisterType(ST, ValTy)) {
3469 // If this a case where G_EXTRACT is legal, use it.
3470 // (e.g. <3 x s32> -> <4 x s32>)
3471 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3472 B.buildExtract(ValReg, WideLoad, 0);
3473 } else {
3474 // For cases where the widened type isn't a nice register value, unmerge
3475 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3476 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3477 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3478 }
3479 }
3480
3481 MI.eraseFromParent();
3482 return true;
3483 }
3484
3485 return false;
3486}
3487
3489 MachineInstr &MI) const {
// Custom legalization of stores: when the stored value's type needs the
// buffer-resource workaround, MI is rewritten in place (bracketed by observer
// notifications) and true is returned; otherwise nothing is done and false is
// returned.
// NOTE(review): the statement performing the rewrite is not visible in this
// listing.
3490 MachineIRBuilder &B = Helper.MIRBuilder;
3491 MachineRegisterInfo &MRI = *B.getMRI();
3492 GISelChangeObserver &Observer = Helper.Observer;
3493
3494 Register DataReg = MI.getOperand(0).getReg();
3495 LLT DataTy = MRI.getType(DataReg);
3496
3497 if (hasBufferRsrcWorkaround(DataTy)) {
3498 Observer.changingInstr(MI);
3500 Observer.changedInstr(MI);
3501 return true;
3502 }
3503 return false;
3504}
3505
3508 MachineIRBuilder &B) const {
// Custom legalization of a scalar fused multiply-add style instruction: for
// s32 and s16 results MI is kept as-is when an additional (mode-dependent)
// condition holds; otherwise it is expanded with LegalizerHelper::lowerFMad.
// NOTE(review): the second half of both conditions and the definition they
// depend on are not visible in this listing - presumably they test the
// function's denormal mode; confirm against the full source.
3509 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3510 assert(Ty.isScalar());
3511
3512 MachineFunction &MF = B.getMF();
3514
3515 // TODO: Always legal with future ftz flag.
3516 // TODO: Type is expected to be LLT::float32()/LLT::float16()
3517 // FIXME: Do we need just output?
3518 if (Ty == LLT::scalar(32) &&
3520 return true;
3521 if (Ty == LLT::scalar(16) &&
3523 return true;
3524
// Fall back to the generic expansion, using a throw-away observer and a
// builder positioned at MI.
3525 MachineIRBuilder HelperBuilder(MI);
3526 GISelObserverWrapper DummyObserver;
3527 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3528 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3529}
3530
// Rewrites a compare-and-exchange into G_AMDGPU_ATOMIC_CMPXCHG, which takes
// the new value and the compare value packed into a single two-element vector
// (new value first). Memory operands are carried over. Always returns true.
// NOTE(review): the signature lines and the start of the assertion below are
// not visible in this listing.
3533 Register DstReg = MI.getOperand(0).getReg();
3534 Register PtrReg = MI.getOperand(1).getReg();
3535 Register CmpVal = MI.getOperand(2).getReg();
3536 Register NewVal = MI.getOperand(3).getReg();
3537
3539 "this should not have been custom lowered");
3540
3541 LLT ValTy = MRI.getType(CmpVal);
3542 LLT VecTy = LLT::fixed_vector(2, ValTy);
3543
// Element 0 is the new value, element 1 the expected (compare) value.
3544 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3545
3546 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3547 .addDef(DstReg)
3548 .addUse(PtrReg)
3549 .addUse(PackedVal)
3550 .setMemRefs(MI.memoperands());
3551
3552 MI.eraseFromParent();
3553 return true;
3554}
3555
3556/// Return true if it's known that \p Src can never be an f32 denormal value.
/// The decision is based solely on the opcode (and, for intrinsics, the
/// intrinsic ID) of the instruction defining \p Src; anything not listed
/// below yields false.
3558 Register Src) {
3559 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3560 switch (DefMI->getOpcode()) {
3561 case TargetOpcode::G_INTRINSIC: {
// NOTE(review): the inner switch header selecting on the intrinsic ID is not
// visible in this listing.
3563 case Intrinsic::amdgcn_frexp_mant:
3564 case Intrinsic::amdgcn_log:
3565 case Intrinsic::amdgcn_log_clamp:
3566 case Intrinsic::amdgcn_exp2:
3567 case Intrinsic::amdgcn_sqrt:
3568 return true;
3569 default:
3570 break;
3571 }
3572
3573 break;
3574 }
3575 case TargetOpcode::G_FSQRT:
3576 return true;
3577 case TargetOpcode::G_FFREXP: {
// Only the first result (operand 0) of G_FFREXP qualifies.
3578 if (DefMI->getOperand(0).getReg() == Src)
3579 return true;
3580 break;
3581 }
3582 case TargetOpcode::G_FPEXT: {
// An extension qualifies only when its source is a 16-bit scalar.
3583 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3584 }
3585 default:
3586 return false;
3587 }
3588
3589 return false;
3590}
3591
/// Return true if \p Flags has the MachineInstr::FmAfn bit set.
/// \p MF is not consulted.
3592static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3593 return Flags & MachineInstr::FmAfn;
3594}
3595
// Decides whether an f32 operation on Src needs explicit denormal handling.
// A value known never to be an f32 denormal (valueIsKnownNeverF32Denorm)
// never needs it.
// NOTE(review): the signature start and the remaining operands of the
// conjunction are not visible in this listing.
3597 unsigned Flags) {
3598 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3601}
3602
// Prepares an f32 log input for denormal-safe evaluation. When no denormal
// handling is needed a pair of default-constructed (invalid) registers is
// returned. Otherwise returns {Src * (Src < SmallestNormal ? 2^32 : 1.0),
// s1 result of the (Src < SmallestNormal) comparison}.
3603std::pair<Register, Register>
3605 unsigned Flags) const {
3606 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3607 return {};
3608
3609 const LLT F32 = LLT::scalar(32);
// NOTE(review): the value passed to buildFConstant here is not visible in this
// listing; by its name it is the smallest normal f32 - confirm.
3610 auto SmallestNormal = B.buildFConstant(
3612 auto IsLtSmallestNormal =
3613 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3614
3615 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3616 auto One = B.buildFConstant(F32, 1.0);
3617 auto ScaleFactor =
3618 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3619 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3620
3621 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3622}
3623
3625 MachineIRBuilder &B) const {
// Lowers a base-2 logarithm (operand 1 -> operand 0) to the amdgcn_log
// intrinsic. s16 is evaluated in f32 and truncated; s32 is evaluated directly
// when no denormal handling is needed, otherwise on a scaled input with the
// result corrected afterwards. Always returns true.
3626 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3627 // If we have to handle denormals, scale up the input and adjust the result.
3628
3629 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3630 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3631
3632 Register Dst = MI.getOperand(0).getReg();
3633 Register Src = MI.getOperand(1).getReg();
3634 LLT Ty = B.getMRI()->getType(Dst);
3635 unsigned Flags = MI.getFlags();
3636
3637 if (Ty == LLT::scalar(16)) {
3638 const LLT F32 = LLT::scalar(32);
3639 // Nothing in half is a denormal when promoted to f32.
3640 auto Ext = B.buildFPExt(F32, Src, Flags);
3641 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3642 .addUse(Ext.getReg(0))
3643 .setMIFlags(Flags);
3644 B.buildFPTrunc(Dst, Log2, Flags);
3645 MI.eraseFromParent();
3646 return true;
3647 }
3648
3649 assert(Ty == LLT::scalar(32));
3650
// An invalid ScaledInput means no scaling is required: emit the intrinsic
// straight into the original destination operand.
3651 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3652 if (!ScaledInput) {
3653 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3654 .addUse(Src)
3655 .setMIFlags(Flags);
3656 MI.eraseFromParent();
3657 return true;
3658 }
3659
3660 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3661 .addUse(ScaledInput)
3662 .setMIFlags(Flags);
3663
// Undo the 2^32 input scaling: subtract 32 only where the input was scaled.
3664 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3665 auto Zero = B.buildFConstant(Ty, 0.0);
3666 auto ResultOffset =
3667 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3668 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3669
3670 MI.eraseFromParent();
3671 return true;
3672}
3673
// Builds X * Y + Z as a separate multiply followed by an add (both carrying
// Flags) and returns the register holding the sum.
// NOTE(review): the first signature line is not visible in this listing.
3675 Register Z, unsigned Flags) {
3676 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3677 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3678}
3679
3681 MachineIRBuilder &B) const {
// Lowers G_FLOG / G_FLOG10 (asserted below). f16 inputs and instructions
// carrying the afn flag take the cheap legalizeFlogUnsafe expansion. All other
// cases compute log2 with amdgcn_log (on a denormal-scaled input if required)
// and multiply by ln(2) or ln(2)/ln(10) using a split constant to keep extra
// precision, then undo the input scaling. Always returns true.
3682 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3683 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3684
3685 MachineRegisterInfo &MRI = *B.getMRI();
3686 Register Dst = MI.getOperand(0).getReg();
3687 Register X = MI.getOperand(1).getReg();
3688 unsigned Flags = MI.getFlags();
3689 const LLT Ty = MRI.getType(X);
3690
3691 const LLT F32 = LLT::scalar(32);
3692 const LLT F16 = LLT::scalar(16);
3693
3694 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) {
3695 // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
3696 // depending on !fpmath metadata.
// f16 is evaluated in f32 unless afn is set and 16-bit instructions exist.
3697 bool PromoteToF32 =
3698 Ty == F16 && (!MI.getFlag(MachineInstr::FmAfn) || !ST.has16BitInsts());
3699 if (PromoteToF32) {
// NOTE(review): the definition of LogVal is not visible in this listing.
3701 auto PromoteSrc = B.buildFPExt(F32, X);
3702 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3703 B.buildFPTrunc(Dst, LogVal);
3704 } else {
3705 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3706 }
3707
3708 MI.eraseFromParent();
3709 return true;
3710 }
3711
// Use the scaled input when denormal handling is required; IsScaled records
// (per value) whether the 2^32 scale was applied.
3712 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3713 if (ScaledInput)
3714 X = ScaledInput;
3715
// Y = log2(X)
3716 auto Y =
3717 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3718
3719 Register R;
3720 if (ST.hasFastFMAF32()) {
3721 // c+cc are ln(2)/ln(10) to more than 49 bits
3722 const float c_log10 = 0x1.344134p-2f;
3723 const float cc_log10 = 0x1.09f79ep-26f;
3724
3725 // c + cc is ln(2) to more than 49 bits
3726 const float c_log = 0x1.62e42ep-1f;
3727 const float cc_log = 0x1.efa39ep-25f;
3728
3729 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3730 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3731 // This adds correction terms for which contraction may lead to an increase
3732 // in the error of the approximation, so disable it.
3733 auto NewFlags = Flags & ~(MachineInstr::FmContract);
// R = Y*C, then add the rounding error of that product (fma(Y, C, -R)) and
// the contribution of the low constant part (Y*CC).
3734 R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
3735 auto NegR = B.buildFNeg(Ty, R, NewFlags);
3736 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
3737 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
3738 R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
3739 } else {
3740 // ch+ct is ln(2)/ln(10) to more than 36 bits
3741 const float ch_log10 = 0x1.344000p-2f;
3742 const float ct_log10 = 0x1.3509f6p-18f;
3743
3744 // ch + ct is ln(2) to more than 36 bits
3745 const float ch_log = 0x1.62e000p-1f;
3746 const float ct_log = 0x1.0bfbe8p-15f;
3747
3748 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3749 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3750
// Split Y into a high part (low 12 bits of the encoding cleared) and the
// remaining tail YT = Y - YH.
3751 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3752 auto YH = B.buildAnd(Ty, Y, MaskConst);
3753 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3754 // This adds correction terms for which contraction may lead to an increase
3755 // in the error of the approximation, so disable it.
3756 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3757 auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);
3758
// R = YH*CH + (YT*CH + (YH*CT + YT*CT)), built from separate mul/add pairs.
3759 Register Mad0 =
3760 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
3761 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, NewFlags);
3762 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);
3763 }
3764
// NOTE(review): the initializer of IsFiniteOnly is not visible in this
// listing.
3765 const bool IsFiniteOnly =
3767
3768 if (!IsFiniteOnly) {
3769 // Expand isfinite(x) => fabs(x) < inf
// For a non-finite log2 result the raw value Y is passed through unchanged.
3770 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3771 auto Fabs = B.buildFAbs(Ty, Y);
3772 auto IsFinite =
3773 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3774 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3775 }
3776
3777 if (ScaledInput) {
// Remove the contribution of the 2^32 input scale where it was applied.
3778 auto Zero = B.buildFConstant(Ty, 0.0);
3779 auto ShiftK =
3780 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3781 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3782 B.buildFSub(Dst, R, Shift, Flags);
3783 } else {
3784 B.buildCopy(Dst, R);
3785 }
3786
3787 MI.eraseFromParent();
3788 return true;
3789}
3790
// Fast (approximate) expansion of log / log10 into Dst:
//   result = log2(Src) * Log2BaseInverted
// where Log2BaseInverted converts from base 2 to the requested base (IsLog10
// selects base 10). For f32 inputs that may be denormal, the input is first
// scaled by 2^32 via getScaledLogInput and the result is corrected by adding
// -32 * Log2BaseInverted for the values that were scaled. Flags are
// propagated to every generated instruction. Always returns true.
// NOTE(review): the first signature line and the initializer of
// Log2BaseInverted are not visible in this listing.
3792 Register Src, bool IsLog10,
3793 unsigned Flags) const {
3794 const double Log2BaseInverted =
3796
3797 LLT Ty = B.getMRI()->getType(Dst);
3798
3799 if (Ty == LLT::scalar(32)) {
3800 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3801 if (ScaledInput) {
// The logarithm must be taken of the scaled input: the offset selected below
// compensates for exactly that 2^32 scale. Feeding the unscaled Src here would
// leave the scaling unused and mis-correct results for denormal inputs.
3802 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3803 .addUse(ScaledInput)
3804 .setMIFlags(Flags);
3805 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3806 auto Zero = B.buildFConstant(Ty, 0.0);
3807 auto ResultOffset =
3808 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3809 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3810
// Dst = LogSrc * Log2Inv + ResultOffset, fused when FMA is fast.
3811 if (ST.hasFastFMAF32())
3812 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3813 else {
3814 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3815 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3816 }
3817
3818 return true;
3819 }
3820 }
3821
// No scaling needed: s16 uses the generic log2 node, everything else the
// amdgcn_log intrinsic, followed by the base conversion multiply.
3822 auto Log2Operand = Ty == LLT::scalar(16)
3823 ? B.buildFLog2(Ty, Src, Flags)
3824 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3825 .addUse(Src)
3826 .setMIFlags(Flags);
3827 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3828 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3829 return true;
3830}
3831
3833 MachineIRBuilder &B) const {
// Lowers a base-2 exponential (operand 1 -> operand 0). f64 is delegated to
// legalizeFEXPF64, f16 is evaluated in f32 with amdgcn_exp2 and truncated, and
// f32 uses amdgcn_exp2 directly or, when denormal handling is required, on an
// offset input with a compensating multiply on the result.
// Returns true for the f16/f32 paths.
3834 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3835 // If we have to handle denormals, scale up the input and adjust the result.
3836
3837 Register Dst = MI.getOperand(0).getReg();
3838 Register Src = MI.getOperand(1).getReg();
3839 unsigned Flags = MI.getFlags();
3840 LLT Ty = B.getMRI()->getType(Dst);
3841 const LLT F16 = LLT::scalar(16);
3842 const LLT F32 = LLT::scalar(32);
3843 const LLT F64 = LLT::scalar(64);
3844
3845 if (Ty == F64)
3846 return legalizeFEXPF64(MI, B);
3847
3848 if (Ty == F16) {
3849 // Nothing in half is a denormal when promoted to f32.
// (The local named Log2 below holds the exp2 result.)
3850 auto Ext = B.buildFPExt(F32, Src, Flags);
3851 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3852 .addUse(Ext.getReg(0))
3853 .setMIFlags(Flags);
3854 B.buildFPTrunc(Dst, Log2, Flags);
3855 MI.eraseFromParent();
3856 return true;
3857 }
3858
3859 assert(Ty == F32);
3860
3861 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3862 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3863 .addUse(Src)
3864 .setMIFlags(Flags);
3865 MI.eraseFromParent();
3866 return true;
3867 }
3868
3869 // bool needs_scaling = x < -0x1.f80000p+6f;
3870 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3871
3872 // -nextafter(128.0, -1)
3873 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3874 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3875 RangeCheckConst, Flags);
3876
// Add 64 to inputs below the threshold ...
3877 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3878 auto Zero = B.buildFConstant(Ty, 0.0);
3879 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3880 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3881
3882 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3883 .addUse(AddInput.getReg(0))
3884 .setMIFlags(Flags);
3885
// ... and multiply their result by 2^-64 to compensate.
3886 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3887 auto One = B.buildFConstant(Ty, 1.0);
3888 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3889 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3890 MI.eraseFromParent();
3891 return true;
3892}
3893
// Builds a base-2 exponential of Src into Dst: the amdgcn_exp2 intrinsic when
// the destination type is a 32-bit scalar, the generic exp2 node otherwise.
// Flags are attached to the created instruction, which is returned.
// NOTE(review): the first signature line is not visible in this listing.
3895 const SrcOp &Src, unsigned Flags) {
3896 LLT Ty = Dst.getLLTTy(*B.getMRI());
3897
3898 if (Ty == LLT::scalar(32)) {
3899 return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
3900 .addUse(Src.getReg())
3901 .setMIFlags(Flags);
3902 }
3903 return B.buildFExp2(Dst, Src, Flags);
3904}
3905
// Approximate exp / exp10 of X into Dst as exp2(X * K), where K is log2(e)
// or, when IsExp10 is set, the literal 0x1.a934f0p+1f (log2(10) per the
// comment below). No denormal handling is performed here. Always returns
// true.
// NOTE(review): the first signature line is not visible in this listing.
3907 Register Dst, Register X,
3908 unsigned Flags,
3909 bool IsExp10) const {
3910 LLT Ty = B.getMRI()->getType(X);
3911
3912 // exp(x) -> exp2(M_LOG2E_F * x);
3913 // exp10(x) -> exp2(log2(10) * x);
3914 auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
3915 auto Mul = B.buildFMul(Ty, X, Const, Flags);
3916 buildExp(B, Dst, Mul, Flags);
3917 return true;
3918}
3919
// Approximate natural exponential of X into Dst. Non-f32 types, and f32
// inputs that need no denormal handling, use legalizeFExpUnsafeImpl directly.
// Otherwise inputs below the threshold are shifted up by 64 before the
// exp2(x * log2(e)) evaluation and the result is multiplied by a compensating
// constant afterwards. Always returns true on the scaled path.
// NOTE(review): the first signature line is not visible in this listing.
3921 Register X, unsigned Flags) const {
3922 LLT Ty = B.getMRI()->getType(Dst);
3923 LLT F32 = LLT::scalar(32);
3924
3925 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3926 return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false);
3927 }
3928
// AdjustedX = X < Threshold ? X + 64 : X
3929 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3930 auto NeedsScaling =
3931 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3932 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3933 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3934 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3935
3936 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3937 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3938
3939 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3940 .addUse(ExpInput.getReg(0))
3941 .setMIFlags(Flags);
3942
// Scaled inputs get their result multiplied by ResultScaleFactor
// (presumably e^-64, matching the +64 input shift - confirm); others use the
// exp2 result unchanged.
3943 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3944 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3945 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3946 return true;
3947}
3948
// Approximate expansion of exp10(X) into Dst. Always returns true; the caller
// is responsible for erasing the original instruction.
//
// The result is formed as the product of two base-2 exponentials,
// exp2(x * K0) * exp2(x * K1). NOTE(review): K0 + K1 appears to be log2(10)
// split into a high and a low part -- confirm. When the type is f32 and
// needsDenormHandlingF32 asks for it, small inputs are additionally offset
// by 32 and the product is rescaled afterwards.
3950 Register Dst, Register X,
3951 unsigned Flags) const {
3952 LLT Ty = B.getMRI()->getType(Dst);
3953 LLT F32 = LLT::scalar(32);
3954
3955 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3956 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3957 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3958 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3959
3960 auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
3961 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3962 auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
3963 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
3964 B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
3965 return true;
3966 }
3967
3968 // bool s = x < -0x1.2f7030p+5f;
3969 // x += s ? 0x1.0p+5f : 0.0f;
3970 // exp10 = exp2(x * 0x1.a92000p+1f) *
3971 // exp2(x * 0x1.4f0978p-11f) *
3972 // (s ? 0x1.9f623ep-107f : 1.0f);
3973
     // NOTE(review): the compare and the two selects below are built without
     // Flags, while the arithmetic around them carries Flags.
3974 auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);
3975 auto NeedsScaling =
3976 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold);
3977
3978 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
3979 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3980 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);
3981
3982 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3983 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3984
3985 auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
3986 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3987 auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
3988 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
3989
3990 auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
     // Compensates for the +32 offset applied to the input on the scaled path.
3991 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
3992 auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
3993
3994 B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
3995 return true;
3996}
3997
3998// This expansion gives a result slightly better than 1ulp.
//
// Expands a 64-bit G_FEXP, G_FEXP2 or G_FEXP10 (selected by MI's opcode; any
// opcode other than G_FEXP2 / G_FEXP10 takes the G_FEXP path). The sequence
// is: split the input into a rounded exponent Dn and a reduced argument T,
// evaluate a polynomial in T with an FMA chain, scale the polynomial by 2^Dn
// with ldexp, then patch the result for out-of-range inputs with selects.
// MI is erased and true is returned.
4000 MachineIRBuilder &B) const {
4001
4002 Register X = MI.getOperand(1).getReg();
4003 LLT S64 = LLT::scalar(64);
4004 LLT S32 = LLT::scalar(32);
4005 LLT S1 = LLT::scalar(1);
4006
4007 // TODO: Check if reassoc is safe. There is an output change in exp2 and
4008 // exp10, which slightly increases ulp.
4009 unsigned Flags = MI.getFlags() & ~MachineInstr::FmReassoc;
4010
     // Dn: rounded exponent (still a double), T: polynomial argument.
     // F is an intermediate used only by the exp2 and exp10 branches.
4011 Register Dn, F, T;
4012
4013 if (MI.getOpcode() == TargetOpcode::G_FEXP2) {
4014 // Dn = rint(X)
4015 Dn = B.buildFRint(S64, X, Flags).getReg(0);
4016 // F = X - Dn
4017 F = B.buildFSub(S64, X, Dn, Flags).getReg(0);
4018 // T = F*C1 + F*C2
     // NOTE(review): C1/C2 look like ln(2) split into high and low parts --
     // confirm.
4019 auto C1 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
4020 auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
4021 auto Mul2 = B.buildFMul(S64, F, C2, Flags).getReg(0);
4022 T = B.buildFMA(S64, F, C1, Mul2, Flags).getReg(0);
4023
4024 } else if (MI.getOpcode() == TargetOpcode::G_FEXP10) {
     // Dn = rint(X * C1)
4025 auto C1 = B.buildFConstant(S64, APFloat(0x1.a934f0979a371p+1));
4026 auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
4027 Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
4028
     // F = (X - Dn*C3) - Dn*C2, computed with two FMAs on -Dn.
4029 auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
4030 auto C2 = B.buildFConstant(S64, APFloat(-0x1.9dc1da994fd21p-59));
4031 auto C3 = B.buildFConstant(S64, APFloat(0x1.34413509f79ffp-2));
4032 auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
4033 F = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
4034
     // T = F*C4 + F*C5
4035 auto C4 = B.buildFConstant(S64, APFloat(0x1.26bb1bbb55516p+1));
4036 auto C5 = B.buildFConstant(S64, APFloat(-0x1.f48ad494ea3e9p-53));
4037 auto MulF = B.buildFMul(S64, F, C5, Flags).getReg(0);
4038 T = B.buildFMA(S64, F, C4, MulF, Flags).getReg(0);
4039
4040 } else { // G_FEXP
     // Dn = rint(X * C1)
4041 auto C1 = B.buildFConstant(S64, APFloat(0x1.71547652b82fep+0));
4042 auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
4043 Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
4044
     // T = (X - Dn*C3) - Dn*C2; F is not used on this path.
4045 auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
4046 auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
4047 auto C3 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
4048 auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
4049 T = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
4050 }
4051
4052 // Polynomial chain for P
     // Each step computes P = T * P + c, i.e. Horner evaluation in T.
4053 auto P = B.buildFConstant(S64, 0x1.ade156a5dcb37p-26);
4054 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.28af3fca7ab0cp-22),
4055 Flags);
4056 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.71dee623fde64p-19),
4057 Flags);
4058 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01997c89e6b0p-16),
4059 Flags);
4060 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01a014761f6ep-13),
4061 Flags);
4062 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.6c16c1852b7b0p-10),
4063 Flags);
4064 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.1111111122322p-7), Flags);
4065 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.55555555502a1p-5), Flags);
4066 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.5555555555511p-3), Flags);
4067 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.000000000000bp-1), Flags);
4068
     // The last two steps both add 1.0: P = 1 + T * (1 + T * P).
4069 auto One = B.buildFConstant(S64, 1.0);
4070 P = B.buildFMA(S64, T, P, One, Flags);
4071 P = B.buildFMA(S64, T, P, One, Flags);
4072
4073 // Z = FLDEXP(P, (int)Dn)
4074 auto DnInt = B.buildFPTOSI(S32, Dn);
4075 auto Z = B.buildFLdexp(S64, P, DnInt, Flags);
4076
     // The overflow select is skipped entirely when the no-infs flag is set.
     // Both guards use unordered predicates, so a NaN X keeps Z.
4077 if (!(Flags & MachineInstr::FmNoInfs)) {
4078 // Overflow guard: if X <= 1024.0 then Z else +inf
4079 auto CondHi = B.buildFCmp(CmpInst::FCMP_ULE, S1, X,
4080 B.buildFConstant(S64, APFloat(1024.0)));
4081 auto PInf = B.buildFConstant(S64, APFloat::getInf(APFloat::IEEEdouble()));
4082 Z = B.buildSelect(S64, CondHi, Z, PInf, Flags);
4083 }
4084
4085 // Underflow guard: if X >= -1075.0 then Z else 0.0
4086 auto CondLo = B.buildFCmp(CmpInst::FCMP_UGE, S1, X,
4087 B.buildFConstant(S64, APFloat(-1075.0)));
4088 auto Zero = B.buildFConstant(S64, APFloat(0.0));
4089 B.buildSelect(MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);
4090
4091 MI.eraseFromParent();
4092 return true;
4093}
4094
// Custom expansion of an exponential instruction MI (exp, or exp10 when the
// opcode is G_FEXP10). Dispatch by result type:
//  - 64-bit: handed to legalizeFEXPF64.
//  - 16-bit: approximate expansion when approximate functions are allowed,
//    otherwise extend to 32 bits, expand there and truncate back.
//  - 32-bit: approximate expansion when allowed, otherwise the
//    extended-precision sequence described in the "Algorithm" comment below.
// Every path that returns from this function erases MI and returns true.
4096 MachineIRBuilder &B) const {
4097 Register Dst = MI.getOperand(0).getReg();
4098 Register X = MI.getOperand(1).getReg();
4099 const unsigned Flags = MI.getFlags();
4100 MachineFunction &MF = B.getMF();
4101 MachineRegisterInfo &MRI = *B.getMRI();
4102 LLT Ty = MRI.getType(Dst);
4103
4104 const LLT F64 = LLT::scalar(64);
4105
4106 if (Ty == F64)
4107 return legalizeFEXPF64(MI, B);
4108
4109 const LLT F16 = LLT::scalar(16);
4110 const LLT F32 = LLT::scalar(32);
4111 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
4112
4113 if (Ty == F16) {
4114 // v_exp_f16 (fmul x, log2e)
4115 if (allowApproxFunc(MF, Flags)) {
4116 // TODO: Does this really require fast?
     // The helpers' boolean results are intentionally unused here.
4117 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
4118 : legalizeFExpUnsafe(B, Dst, X, Flags);
4119 MI.eraseFromParent();
4120 return true;
4121 }
4122
4123 // Nothing in half is a denormal when promoted to f32.
4124 //
4125 // exp(f16 x) ->
4126 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
4127 //
4128 // exp10(f16 x) ->
4129 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
4130 auto Ext = B.buildFPExt(F32, X, Flags);
     // NOTE(review): the declaration of Lowered (listing line 4131) is not
     // present in this listing.
4132 legalizeFExpUnsafeImpl(B, Lowered, Ext.getReg(0), Flags, IsExp10);
4133 B.buildFPTrunc(Dst, Lowered, Flags);
4134 MI.eraseFromParent();
4135 return true;
4136 }
4137
4138 assert(Ty == F32);
4139
4140 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
4141 // library behavior. Also, is known-not-daz source sufficient?
4142 if (allowApproxFunc(MF, Flags)) {
4143 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
4144 : legalizeFExpUnsafe(B, Dst, X, Flags);
4145 MI.eraseFromParent();
4146 return true;
4147 }
4148
4149 // Algorithm:
4150 //
4151 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
4152 //
4153 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
4154 // n = 64*m + j, 0 <= j < 64
4155 //
4156 // e^x = 2^((64*m + j + f)/64)
4157 // = (2^m) * (2^(j/64)) * 2^(f/64)
4158 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
4159 //
4160 // f = x*(64/ln(2)) - n
4161 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
4162 //
4163 // e^x = (2^m) * (2^(j/64)) * e^r
4164 //
4165 // (2^(j/64)) is precomputed
4166 //
4167 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4168 // e^r = 1 + q
4169 //
4170 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4171 //
4172 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
4173 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
     // PH / PL: high and low parts of the product of X with the base
     // conversion constant.
4174 Register PH, PL;
4175
4176 if (ST.hasFastFMAF32()) {
     // FMA path: PH = X * C, and the rounding error of that product is
     // recovered with fma(X, C, -PH) before folding in the low constant CC.
4177 const float c_exp = numbers::log2ef;
4178 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
4179 const float c_exp10 = 0x1.a934f0p+1f;
4180 const float cc_exp10 = 0x1.2f346ep-24f;
4181
4182 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
4183 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
4184 auto NegPH = B.buildFNeg(Ty, PH, Flags);
4185 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
4186
4187 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
4188 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
4189 } else {
     // Non-FMA path: X is split into XH (low 12 bits of the encoding
     // cleared by the integer mask) and XL = X - XH, and the partial
     // products with the split constant CH/CL are combined via getMad.
4190 const float ch_exp = 0x1.714000p+0f;
4191 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
4192
4193 const float ch_exp10 = 0x1.a92000p+1f;
4194 const float cl_exp10 = 0x1.4f0978p-11f;
4195
4196 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
4197 auto XH = B.buildAnd(Ty, X, MaskConst);
4198 auto XL = B.buildFSub(Ty, X, XH, Flags);
4199
4200 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
4201 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
4202
4203 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
4204 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
4205
4206 Register Mad0 =
4207 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
4208 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
4209 }
4210
     // E = roundeven(PH) is the integer part; exp2 is applied to the
     // remainder (PH - E) + PL and the result is scaled by 2^E with ldexp.
4211 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
4212
4213 // It is unsafe to contract this fsub into the PH multiply.
4214 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
4215 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
4216 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
4217
4218 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
4219 .addUse(A.getReg(0))
4220 .setMIFlags(Flags);
4221 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
4222
     // Inputs below the underflow bound produce exactly 0.0. The compare and
     // select here are built without Flags.
4223 auto UnderflowCheckConst =
4224 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
4225 auto Zero = B.buildFConstant(Ty, 0.0);
4226 auto Underflow =
4227 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
4228
4229 R = B.buildSelect(Ty, Underflow, Zero, R);
4230
     // Inputs above the overflow bound produce +inf, unless the instruction
     // carries the no-infs flag, in which case the check is omitted.
4231 if (!(Flags & MachineInstr::FmNoInfs)) {
4232 auto OverflowCheckConst =
4233 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
4234
4235 auto Overflow =
4236 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
4237 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
4238 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
4239 }
4240
4241 B.buildCopy(Dst, R);
4242 MI.eraseFromParent();
4243 return true;
4244}
4245
// Expands a two-source power-style instruction MI as
//   exp2(fmul_legacy(log2(Src0), Src1))
// for 32-bit and 16-bit scalar results. Any other result type is rejected by
// returning false with MI left in place; on success MI is erased and true is
// returned.
4247 MachineIRBuilder &B) const {
4248 Register Dst = MI.getOperand(0).getReg();
4249 Register Src0 = MI.getOperand(1).getReg();
4250 Register Src1 = MI.getOperand(2).getReg();
4251 unsigned Flags = MI.getFlags();
4252 LLT Ty = B.getMRI()->getType(Dst);
4253 const LLT F16 = LLT::scalar(16); // TODO: Expected LLT::float16()
4254 const LLT F32 = LLT::scalar(32); // TODO: Expected LLT::float32()
4255
4256 if (Ty == F32) {
4257 auto Log = B.buildFLog2(F32, Src0, Flags);
4258 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4259 .addUse(Log.getReg(0))
4260 .addUse(Src1)
4261 .setMIFlags(Flags);
4262 B.buildFExp2(Dst, Mul, Flags);
4263 } else if (Ty == F16) {
4264 // There's no f16 fmul_legacy, so we need to convert for it.
     // log2 and exp2 stay in f16; only the multiply is done in f32. The
     // truncate back to f16 is built without Flags.
4265 auto Log = B.buildFLog2(F16, Src0, Flags);
4266 auto Ext0 = B.buildFPExt(F32, Log, Flags);
4267 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
4268 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4269 .addUse(Ext0.getReg(0))
4270 .addUse(Ext1.getReg(0))
4271 .setMIFlags(Flags);
4272 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
4273 } else
4274 return false;
4275
4276 MI.eraseFromParent();
4277 return true;
4278}
4279
4280// Find a source register, ignoring any possible source modifiers.
//
// Starting from OrigSrc, looks through at most one G_FNEG and then at most
// one G_FABS (i.e. fneg(fabs(x)), fneg(x) or fabs(x)) and returns the
// innermost operand. fabs(fneg(x)) is only stripped down to fneg(x). When
// neither opcode defines OrigSrc, OrigSrc itself is returned.
4282 Register ModSrc = OrigSrc;
4283 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
4284 ModSrc = SrcFNeg->getOperand(1).getReg();
4285 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4286 ModSrc = SrcFAbs->getOperand(1).getReg();
4287 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4288 ModSrc = SrcFAbs->getOperand(1).getReg();
4289 return ModSrc;
4290}
4291
// Expands a 64-bit floor on subtargets with the V_FRACT bug as
//   floor(x) = x - fract(x)
// where fract(x) is the hardware fract clamped to the largest double below
// 1.0, and replaced by x itself when x is NaN (so NaN propagates). The NaN
// handling is skipped when MI carries the no-NaNs flag. MI is erased and true
// is returned.
4294 MachineIRBuilder &B) const {
4295
4296 const LLT S1 = LLT::scalar(1);
4297 const LLT F64 = LLT::scalar(64); // TODO: Expected float64
4298 Register Dst = MI.getOperand(0).getReg();
4299 Register OrigSrc = MI.getOperand(1).getReg();
4300 unsigned Flags = MI.getFlags();
4301 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
4302 "this should not have been custom lowered");
4303
4304 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
4305 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
4306 // efficient way to implement it is using V_FRACT_F64. The workaround for the
4307 // V_FRACT bug is:
4308 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
4309 //
4310 // Convert floor(x) to (x - fract(x))
4311
4312 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
4313 .addUse(OrigSrc)
4314 .setMIFlags(Flags);
4315
4316 // Give source modifier matching some assistance before obscuring a foldable
4317 // pattern.
4318
4319 // TODO: We can avoid the neg on the fract? The input sign to fract
4320 // shouldn't matter?
4321 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
4322
     // 0x3fefffffffffffff is the bit pattern of the largest double below 1.0.
4323 auto Const =
4324 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
4325
4327
4328 // We don't need to concern ourselves with the snan handling difference, so
4329 // use the one which will directly select.
4330 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4331 if (MFI->getMode().IEEE)
4332 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
4333 else
4334 B.buildFMinNum(Min, Fract, Const, Flags);
4335
4336 Register CorrectedFract = Min;
4337 if (!MI.getFlag(MachineInstr::FmNoNans)) {
     // isnan(x) is the unordered self-compare. An ordered compare (FCMP_ORD)
     // is true for every non-NaN input, which would select x for ordinary
     // values and make the final x - fract(x) collapse to x - x.
4338 auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
4339 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
4340 }
4341
     // floor(x) = x + (-fract(x))
4342 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
4343 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
4344
4345 MI.eraseFromParent();
4346 return true;
4347}
4348
4349// Turn an illegal packed v2s16 build vector into bit operations.
4350// TODO: This should probably be a bitcast action in LegalizerHelper.
//
// The two 16-bit elements are merged into one 32-bit scalar (operand 1 as the
// first merge source) which is then bitcast to the v2s16 destination. For
// G_BUILD_VECTOR_TRUNC the 32-bit sources are truncated to 16 bits first.
// MI is erased and true is returned.
4353 Register Dst = MI.getOperand(0).getReg();
4354 const LLT S32 = LLT::scalar(32);
4355 const LLT S16 = LLT::scalar(16);
4356 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
4357
4358 Register Src0 = MI.getOperand(1).getReg();
4359 Register Src1 = MI.getOperand(2).getReg();
4360
4361 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4362 assert(MRI.getType(Src0) == S32);
4363 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
4364 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
4365 }
4366
4367 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
4368 B.buildBitcast(Dst, Merge);
4369
4370 MI.eraseFromParent();
4371 return true;
4372}
4373
4374// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
4375//
4376// Source and accumulation registers must all be 32-bits.
4377//
4378// TODO: When the multiply is uniform, we should produce a code sequence
4379// that is better suited to instruction selection on the SALU. Instead of
4380// the outer loop going over parts of the result, the outer loop should go
4381// over parts of one of the factors. This should result in instruction
4382// selection that makes full use of S_ADDC_U32 instructions.
//
// Src0 / Src1 hold the 32-bit parts of the two factors; the loops below index
// them from the least significant part. Accum (declared on a signature line
// that is not present in this listing) receives the 32-bit result parts and
// is updated in place; entries may start out as null registers.
// UsePartialMad64_32 selects MAD_64_32 even for the most significant partial
// products, and SeparateOddAlignedProducts computes the odd-aligned partial
// products into temporaries that are added into Accum afterwards.
4385 ArrayRef<Register> Src0,
4386 ArrayRef<Register> Src1,
4387 bool UsePartialMad64_32,
4388 bool SeparateOddAlignedProducts) const {
4389 // Use (possibly empty) vectors of S1 registers to represent the set of
4390 // carries from one pair of positions to the next.
4391 using Carry = SmallVector<Register, 2>;
4392
4393 MachineIRBuilder &B = Helper.MIRBuilder;
4394 GISelValueTracking &VT = *Helper.getValueTracking();
4395
4396 const LLT S1 = LLT::scalar(1);
4397 const LLT S32 = LLT::scalar(32);
4398 const LLT S64 = LLT::scalar(64);
4399
     // Zero constants are materialized lazily, at most once each.
4400 Register Zero32;
4401 Register Zero64;
4402
4403 auto getZero32 = [&]() -> Register {
4404 if (!Zero32)
4405 Zero32 = B.buildConstant(S32, 0).getReg(0);
4406 return Zero32;
4407 };
4408 auto getZero64 = [&]() -> Register {
4409 if (!Zero64)
4410 Zero64 = B.buildConstant(S64, 0).getReg(0);
4411 return Zero64;
4412 };
4413
     // Record which source parts are known to be zero so their partial
     // products can be skipped. The loop bound is Src0.size() but Src1 is
     // indexed with the same i, so both arrays are expected to have the same
     // length.
4414 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
4415 for (unsigned i = 0; i < Src0.size(); ++i) {
4416 Src0KnownZeros.push_back(VT.getKnownBits(Src0[i]).isZero());
4417 Src1KnownZeros.push_back(VT.getKnownBits(Src1[i]).isZero());
4418 }
4419
4420 // Merge the given carries into the 32-bit LocalAccum, which is modified
4421 // in-place.
4422 //
4423 // Returns the carry-out, which is a single S1 register or null.
4424 auto mergeCarry =
4425 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
4426 if (CarryIn.empty())
4427 return Register();
4428
4429 bool HaveCarryOut = true;
4430 Register CarryAccum;
4431 if (CarryIn.size() == 1) {
     // A single carry into an empty accumulator is just its zero-extension.
4432 if (!LocalAccum) {
4433 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4434 return Register();
4435 }
4436
4437 CarryAccum = getZero32();
4438 } else {
     // Sum all carries except the last into CarryAccum; the last one is
     // consumed by the final add-with-carry below.
4439 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4440 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4441 CarryAccum =
4442 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
4443 .getReg(0);
4444 }
4445
4446 if (!LocalAccum) {
4447 LocalAccum = getZero32();
4448 HaveCarryOut = false;
4449 }
4450 }
4451
4452 auto Add =
4453 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
4454 LocalAccum = Add.getReg(0);
4455 return HaveCarryOut ? Add.getReg(1) : Register();
4456 };
4457
4458 // Build a multiply-add chain to compute
4459 //
4460 // LocalAccum + (partial products at DstIndex)
4461 // + (opportunistic subset of CarryIn)
4462 //
4463 // LocalAccum is an array of one or two 32-bit registers that are updated
4464 // in-place. The incoming registers may be null.
4465 //
4466 // In some edge cases, carry-ins can be consumed "for free". In that case,
4467 // the consumed carry bits are removed from CarryIn in-place.
4468 auto buildMadChain =
4469 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
4470 -> Carry {
4471 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
4472 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
4473
4474 Carry CarryOut;
     // j0 indexes Src0; the matching Src1 index is always DstIndex - j0.
4475 unsigned j0 = 0;
4476
4477 // Use plain 32-bit multiplication for the most significant part of the
4478 // result by default.
4479 if (LocalAccum.size() == 1 &&
4480 (!UsePartialMad64_32 || !CarryIn.empty())) {
4481 do {
4482 // Skip multiplication if one of the operands is 0
4483 unsigned j1 = DstIndex - j0;
4484 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4485 ++j0;
4486 continue;
4487 }
4488 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
4489 if (!LocalAccum[0] || VT.getKnownBits(LocalAccum[0]).isZero()) {
4490 LocalAccum[0] = Mul.getReg(0);
4491 } else {
4492 if (CarryIn.empty()) {
4493 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
4494 } else {
     // Fold one pending carry into this add; the carry-out is dropped.
4495 LocalAccum[0] =
4496 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
4497 .getReg(0);
4498 CarryIn.pop_back();
4499 }
4500 }
4501 ++j0;
4502 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4503 }
4504
4505 // Build full 64-bit multiplies.
4506 if (j0 <= DstIndex) {
     // HaveSmallAccum is set while the 64-bit accumulator was formed
     // without a caller-provided high half; the MAD that consumes such an
     // accumulator does not record a carry-out.
4507 bool HaveSmallAccum = false;
4508 Register Tmp;
4509
4510 if (LocalAccum[0]) {
4511 if (LocalAccum.size() == 1) {
4512 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4513 HaveSmallAccum = true;
4514 } else if (LocalAccum[1]) {
4515 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4516 HaveSmallAccum = false;
4517 } else {
4518 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4519 HaveSmallAccum = true;
4520 }
4521 } else {
4522 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4523 Tmp = getZero64();
4524 HaveSmallAccum = true;
4525 }
4526
4527 do {
4528 unsigned j1 = DstIndex - j0;
4529 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4530 ++j0;
4531 continue;
4532 }
4533 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4534 {Src0[j0], Src1[j1], Tmp});
4535 Tmp = Mad.getReg(0);
4536 if (!HaveSmallAccum)
4537 CarryOut.push_back(Mad.getReg(1));
4538 HaveSmallAccum = false;
4539
4540 ++j0;
4541 } while (j0 <= DstIndex);
4542
     // Split the 64-bit sum back into the one or two 32-bit accumulators.
4543 auto Unmerge = B.buildUnmerge(S32, Tmp);
4544 LocalAccum[0] = Unmerge.getReg(0);
4545 if (LocalAccum.size() > 1)
4546 LocalAccum[1] = Unmerge.getReg(1);
4547 }
4548
4549 return CarryOut;
4550 };
4551
4552 // Outer multiply loop, iterating over destination parts from least
4553 // significant to most significant parts.
4554 //
4555 // The columns of the following diagram correspond to the destination parts
4556 // affected by one iteration of the outer loop (ignoring boundary
4557 // conditions).
4558 //
4559 // Dest index relative to 2 * i: 1 0 -1
4560 // ------
4561 // Carries from previous iteration: e o
4562 // Even-aligned partial product sum: E E .
4563 // Odd-aligned partial product sum: O O
4564 //
4565 // 'o' is OddCarry, 'e' is EvenCarry.
4566 // EE and OO are computed from partial products via buildMadChain and use
4567 // accumulation where possible and appropriate.
4568 //
4569 Register SeparateOddCarry;
4570 Carry EvenCarry;
4571 Carry OddCarry;
4572
4573 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
     // Take ownership of the previous iteration's carries and reset the
     // outgoing sets for this iteration.
4574 Carry OddCarryIn = std::move(OddCarry);
4575 Carry EvenCarryIn = std::move(EvenCarry);
4576 OddCarry.clear();
4577 EvenCarry.clear();
4578
4579 // Partial products at offset 2 * i.
4580 if (2 * i < Accum.size()) {
4581 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4582 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4583 }
4584
4585 // Partial products at offset 2 * i - 1.
4586 if (i > 0) {
4587 if (!SeparateOddAlignedProducts) {
4588 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4589 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4590 } else {
     // Accumulate the odd-aligned products into fresh temporaries and add
     // them into Accum with an explicit carry chain.
4591 bool IsHighest = 2 * i >= Accum.size();
4592 Register SeparateOddOut[2];
4593 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4594 .take_front(IsHighest ? 1 : 2);
4595 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4596
     // NOTE(review): the declaration of Lo (listing line 4597) is not
     // present in this listing.
4598
4599 if (i == 1) {
4600 if (!IsHighest)
4601 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4602 else
4603 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4604 } else {
4605 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4606 SeparateOddCarry);
4607 }
4608 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4609
4610 if (!IsHighest) {
4611 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4612 Lo->getOperand(1).getReg());
4613 Accum[2 * i] = Hi.getReg(0);
4614 SeparateOddCarry = Hi.getReg(1);
4615 }
4616 }
4617 }
4618
4619 // Add in the carries from the previous iteration
4620 if (i > 0) {
4621 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4622 EvenCarryIn.push_back(CarryOut);
4623
4624 if (2 * i < Accum.size()) {
4625 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4626 OddCarry.push_back(CarryOut);
4627 }
4628 }
4629 }
4630}
4631
4632// Custom narrowing of wide multiplies using wide multiply-add instructions.
4633//
4634// TODO: If the multiply is followed by an addition, we should attempt to
4635// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
//
// Splits both sources of a scalar G_MUL into 32-bit parts, computes the
// product parts with buildMultiply and merges them back into the destination.
// A 64-bit multiply is left untouched (returning true without erasing MI)
// when the subtarget reports a native 64-bit multiply instruction.
4637 MachineInstr &MI) const {
4638 assert(ST.hasMad64_32());
4639 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4640
4641 MachineIRBuilder &B = Helper.MIRBuilder;
4642 MachineRegisterInfo &MRI = *B.getMRI();
4643
4644 Register DstReg = MI.getOperand(0).getReg();
4645 Register Src0 = MI.getOperand(1).getReg();
4646 Register Src1 = MI.getOperand(2).getReg();
4647
4648 LLT Ty = MRI.getType(DstReg);
4649 assert(Ty.isScalar());
4650
4651 unsigned Size = Ty.getSizeInBits();
4652 if (ST.hasVMulU64Inst() && Size == 64)
4653 return true;
4654
     // The width must be a multiple of 32 bits and at least 64 bits.
4655 unsigned NumParts = Size / 32;
4656 assert((Size % 32) == 0);
4657 assert(NumParts >= 2);
4658
4659 // Whether to use MAD_64_32 for partial products whose high half is
4660 // discarded. This avoids some ADD instructions but risks false dependency
4661 // stalls on some subtargets in some cases.
4662 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4663
4664 // Whether to compute odd-aligned partial products separately. This is
4665 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4666 // in an even-aligned VGPR.
4667 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4668
4669 LLT S32 = LLT::scalar(32);
4670 SmallVector<Register, 2> Src0Parts, Src1Parts;
     // NOTE(review): the body of this loop (listing lines 4672-4673) is not
     // present in this listing; presumably it fills Src0Parts / Src1Parts
     // with fresh 32-bit registers for the unmerges below -- confirm.
4671 for (unsigned i = 0; i < NumParts; ++i) {
4674 }
4675 B.buildUnmerge(Src0Parts, Src0);
4676 B.buildUnmerge(Src1Parts, Src1);
4677
     // AccumRegs starts out as NumParts null registers that buildMultiply
     // fills in.
4678 SmallVector<Register, 2> AccumRegs(NumParts);
4679 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4680 SeparateOddAlignedProducts);
4681
4682 B.buildMergeLikeInstr(DstReg, AccumRegs);
4683 MI.eraseFromParent();
4684 return true;
4685}
4686
4687// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4688// ctlz/cttz_zero_poison. This allows us to fix up the result for the zero input
4689// case with a single min instruction instead of a compare+select.
//
// G_CTLZ maps to G_AMDGPU_FFBH_U32; any other opcode reaching this function
// maps to G_AMDGPU_FFBL_B32. MI is erased and true is returned.
4692 MachineIRBuilder &B) const {
4693 Register Dst = MI.getOperand(0).getReg();
4694 Register Src = MI.getOperand(1).getReg();
4695 LLT DstTy = MRI.getType(Dst);
4696 LLT SrcTy = MRI.getType(Src);
4697
4698 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4699 ? AMDGPU::G_AMDGPU_FFBH_U32
4700 : AMDGPU::G_AMDGPU_FFBL_B32;
4701 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
     // Clamp to the source bit width with an unsigned min. NOTE(review):
     // this relies on ffbh/ffbl producing a value >= the bit width (as an
     // unsigned number) for a zero input -- confirm against the ISA docs.
4702 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4703
4704 MI.eraseFromParent();
4705 return true;
4706}
4707
// Expands a leading-zero count on a source narrower than 32 bits: the source
// is any-extended to 32 bits and shifted left by (32 - NumBits) so that its
// most significant bit lands in bit 31, G_AMDGPU_FFBH_U32 is applied, and the
// result is truncated to the destination type. No fix-up for a zero input is
// emitted here. MI is erased and true is returned.
4710 MachineIRBuilder &B) const {
4711 Register Dst = MI.getOperand(0).getReg();
4712 Register Src = MI.getOperand(1).getReg();
4713 LLT SrcTy = MRI.getType(Src);
4714 TypeSize NumBits = SrcTy.getSizeInBits();
4715
4716 assert(NumBits < 32u);
4717
4718 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
     // The undefined high bits of the any-extend are shifted out below.
4719 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4720 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4721 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4722 B.buildTrunc(Dst, Ctlz);
4723 MI.eraseFromParent();
4724 return true;
4725}
4726
// Expands a 32-bit count-leading-sign-bits style operation as
//   umin(llvm.amdgcn.sffbh(Src), 32) - 1
// Only s32 sources are supported (asserted). MI is erased and true is
// returned.
4729 MachineIRBuilder &B) const {
4730 Register Dst = MI.getOperand(0).getReg();
4731 Register Src = MI.getOperand(1).getReg();
4732 LLT SrcTy = MRI.getType(Src);
4733 const LLT S32 = LLT::scalar(32);
4734 assert(SrcTy == S32 && "legalizeCTLS only supports s32");
4735 unsigned BitWidth = SrcTy.getSizeInBits();
4736
4737 auto Sffbh = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}).addUse(Src);
     // NOTE(review): the unsigned min presumably maps the intrinsic's
     // "no differing bit found" result onto BitWidth -- confirm.
4738 auto Clamped = B.buildUMin(S32, Sffbh, B.buildConstant(S32, BitWidth));
     // Subtract one so the sign bit itself is not counted.
4739 B.buildSub(Dst, Clamped, B.buildConstant(S32, 1));
4740 MI.eraseFromParent();
4741 return true;
4742}
4743
4744// Check that this is a G_XOR x, -1
4745static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4746 if (MI.getOpcode() != TargetOpcode::G_XOR)
4747 return false;
4748 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4749 return ConstVal == -1;
4750}
4751
4752// Return the use branch instruction, otherwise null if the usage is invalid.
//
// MI defines a condition (operand 0) that must have exactly one non-debug
// use. That use may be a G_XOR with -1, in which case Negated is set to true
// and the search continues with the single user of the negated value. The
// final user must be a G_BRCOND in MI's own block.
//
// On success UncondBrTarget is set to where control goes when the conditional
// branch is not taken: the target of a directly following G_BR (also stored
// in Br), or the next block in layout order when the G_BRCOND is the last
// instruction of its block. Negated is only ever written with true, so the
// caller must initialize it.
4753static MachineInstr *
4755 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4756 Register CondDef = MI.getOperand(0).getReg();
4757 if (!MRI.hasOneNonDBGUse(CondDef))
4758 return nullptr;
4759
4760 MachineBasicBlock *Parent = MI.getParent();
4761 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4762
4763 if (isNot(MRI, *UseMI)) {
4764 Register NegatedCond = UseMI->getOperand(0).getReg();
4765 if (!MRI.hasOneNonDBGUse(NegatedCond))
4766 return nullptr;
4767
4768 // We're deleting the def of this value, so we need to remove it.
     // NOTE(review): the negation is erased before the remaining checks, so
     // it stays erased even when nullptr is returned further down.
4769 eraseInstr(*UseMI, MRI);
4770
4771 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4772 Negated = true;
4773 }
4774
4775 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4776 return nullptr;
4777
4778 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4779 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4780 if (Next == Parent->end()) {
     // Fallthrough: the implicit target is the next block in the function.
4781 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4782 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4783 return nullptr;
4784 UncondBrTarget = &*NextMBB;
4785 } else {
4786 if (Next->getOpcode() != AMDGPU::G_BR)
4787 return nullptr;
4788 Br = &*Next;
4789 UncondBrTarget = Br->getOperand(0).getMBB();
4790 }
4791
4792 return UseMI;
4793}
4794
// Materializes the preloaded argument described by Arg into the virtual
// register DstReg. The argument's physical register is obtained as a function
// live-in of class ArgRC and type ArgTy. If the descriptor is masked (the
// value occupies only some bits of the register), the field is extracted by
// shifting right by the mask's trailing-zero count and ANDing with the
// shifted mask; otherwise the live-in is copied as is.
4797 const ArgDescriptor *Arg,
4798 const TargetRegisterClass *ArgRC,
4799 LLT ArgTy) const {
4800 MCRegister SrcReg = Arg->getRegister();
4801 assert(SrcReg.isPhysical() && "Physical register expected");
4802 assert(DstReg.isVirtual() && "Virtual register expected");
4803
4804 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4805 *ArgRC, B.getDebugLoc(), ArgTy);
4806 if (Arg->isMasked()) {
4807 // TODO: Should we try to emit this once in the entry block?
4808 const LLT S32 = LLT::scalar(32);
4809 const unsigned Mask = Arg->getMask();
4810 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4811
4812 Register AndMaskSrc = LiveIn;
4813
4814 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4815 // 0.
     // No shift is emitted when the field already starts at bit 0.
4816 if (Shift != 0) {
4817 auto ShiftAmt = B.buildConstant(S32, Shift);
4818 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4819 }
4820
4821 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4822 } else {
4823 B.buildCopy(DstReg, LiveIn);
4824 }
4825}
4826
// Lowers a workgroup-id query MI into its destination register.
//
// Without cluster support the value is simply the preloaded input selected by
// WorkGroupIdPV. With cluster support that input holds the cluster id, so the
// global workgroup position is rebuilt from the cluster id, the maximum
// cluster id and the workgroup id within the cluster, and the final value is
// chosen according to the function's cluster-dimension kind.
//
// Returns false (leaving MI in place) when a required input cannot be loaded;
// otherwise MI is erased and true is returned.
4831 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
4832 Register DstReg = MI.getOperand(0).getReg();
4833 if (!ST.hasClusters()) {
4834 if (!loadInputValue(DstReg, B, WorkGroupIdPV))
4835 return false;
4836 MI.eraseFromParent();
4837 return true;
4838 }
4839
4840 // Clusters are supported. Return the global position in the grid. If clusters
4841 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
4842
4843 // WorkGroupIdXYZ = ClusterId == 0 ?
4844 // ClusterIdXYZ :
4845 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
4846 MachineRegisterInfo &MRI = *B.getMRI();
4847 const LLT S32 = LLT::scalar(32);
4848 Register ClusterIdXYZ = MRI.createGenericVirtualRegister(S32);
4849 Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
4850 Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);
4851 if (!loadInputValue(ClusterIdXYZ, B, WorkGroupIdPV) ||
4852 !loadInputValue(ClusterWorkGroupIdXYZ, B, ClusterWorkGroupIdPV) ||
4853 !loadInputValue(ClusterMaxIdXYZ, B, ClusterMaxIdPV))
4854 return false;
4855
     // GlobalIdXYZ = ClusterWorkGroupIdXYZ + ClusterIdXYZ * (ClusterMaxIdXYZ + 1)
4856 auto One = B.buildConstant(S32, 1);
4857 auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
4858 auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
4859 B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
4860
4861 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4862
     // NOTE(review): the case labels of this switch are not present in this
     // listing. The three arms respectively return the global id, return the
     // cluster id unchanged, and select between the two at run time.
4863 switch (MFI->getClusterDims().getKind()) {
4866 B.buildCopy(DstReg, GlobalIdXYZ);
4867 MI.eraseFromParent();
4868 return true;
4869 }
4871 B.buildCopy(DstReg, ClusterIdXYZ);
4872 MI.eraseFromParent();
4873 return true;
4874 }
     // Run-time decision: read a hardware register field into ClusterId and
     // use the plain cluster id input when that field is zero.
4876 using namespace AMDGPU::Hwreg;
4877 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4878 Register ClusterId = MRI.createGenericVirtualRegister(S32);
4879 MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4880 B.buildInstr(AMDGPU::S_GETREG_B32_const)
4881 .addDef(ClusterId)
4882 .addImm(ClusterIdField);
4883 auto Zero = B.buildConstant(S32, 0);
4884 auto NoClusters =
4885 B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), ClusterId, Zero);
4886 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4887 MI.eraseFromParent();
4888 return true;
4889 }
4890 }
4891
4892 llvm_unreachable("nothing should reach here");
4893}
4894
4896 Register DstReg, MachineIRBuilder &B,
4898 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4899 const ArgDescriptor *Arg = nullptr;
4900 const TargetRegisterClass *ArgRC;
4901 LLT ArgTy;
4902
 // Resolves the argument descriptor for ArgType and loads that value into
 // DstReg. Returns true when DstReg has been defined (from a register, as a
 // constant, or as undef) and false when the descriptor is of a kind that is
 // not handled here.
 // NOTE(review): the end of the signature (the ArgType parameter and the
 // opening brace) is not visible in this listing.
4903 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
 // Fixed descriptors for values read from TTMP registers: workgroup ID X is
 // all of TTMP9, Y and Z share TTMP7 (low / high 16 bits), and the cluster
 // fields are consecutive 4-bit fields of TTMP6.
4904 const ArgDescriptor WorkGroupIDX =
4905 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4906 // If GridZ is not programmed in an entry function then the hardware will set
4907 // it to all zeros, so there is no need to mask the GridY value in the low
4908 // order bits.
4909 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4910 AMDGPU::TTMP7,
4911 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4912 const ArgDescriptor WorkGroupIDZ =
4913 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4914 const ArgDescriptor ClusterWorkGroupIDX =
4915 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
4916 const ArgDescriptor ClusterWorkGroupIDY =
4917 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
4918 const ArgDescriptor ClusterWorkGroupIDZ =
4919 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
4920 const ArgDescriptor ClusterWorkGroupMaxIDX =
4921 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
4922 const ArgDescriptor ClusterWorkGroupMaxIDY =
4923 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
4924 const ArgDescriptor ClusterWorkGroupMaxIDZ =
4925 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
4926 const ArgDescriptor ClusterWorkGroupMaxFlatID =
4927 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
4928
 // Helper: define DstReg as the constant N and report success.
4929 auto LoadConstant = [&](unsigned N) {
4930 B.buildConstant(DstReg, N);
4931 return true;
4932 };
4933
 // NOTE(review): the second half of this condition and every case label of
 // the switch below are not visible in this listing. Each visible case picks
 // one of the fixed descriptors above with a 32-bit SGPR class and type.
 // When the cluster dimensions are fixed, an in-cluster ID along a dimension
 // of size 1 folds to the constant 0 and a max ID folds to (dimension - 1).
4934 if (ST.hasArchitectedSGPRs() &&
4936 AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
4937 bool HasFixedDims = ClusterDims.isFixedDims();
4938
4939 switch (ArgType) {
4941 Arg = &WorkGroupIDX;
4942 ArgRC = &AMDGPU::SReg_32RegClass;
4943 ArgTy = LLT::scalar(32);
4944 break;
4946 Arg = &WorkGroupIDY;
4947 ArgRC = &AMDGPU::SReg_32RegClass;
4948 ArgTy = LLT::scalar(32);
4949 break;
4951 Arg = &WorkGroupIDZ;
4952 ArgRC = &AMDGPU::SReg_32RegClass;
4953 ArgTy = LLT::scalar(32);
4954 break;
4956 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
4957 return LoadConstant(0);
4958 Arg = &ClusterWorkGroupIDX;
4959 ArgRC = &AMDGPU::SReg_32RegClass;
4960 ArgTy = LLT::scalar(32);
4961 break;
4963 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
4964 return LoadConstant(0);
4965 Arg = &ClusterWorkGroupIDY;
4966 ArgRC = &AMDGPU::SReg_32RegClass;
4967 ArgTy = LLT::scalar(32);
4968 break;
4970 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
4971 return LoadConstant(0);
4972 Arg = &ClusterWorkGroupIDZ;
4973 ArgRC = &AMDGPU::SReg_32RegClass;
4974 ArgTy = LLT::scalar(32);
4975 break;
4977 if (HasFixedDims)
4978 return LoadConstant(ClusterDims.getDims()[0] - 1);
4979 Arg = &ClusterWorkGroupMaxIDX;
4980 ArgRC = &AMDGPU::SReg_32RegClass;
4981 ArgTy = LLT::scalar(32);
4982 break;
4984 if (HasFixedDims)
4985 return LoadConstant(ClusterDims.getDims()[1] - 1);
4986 Arg = &ClusterWorkGroupMaxIDY;
4987 ArgRC = &AMDGPU::SReg_32RegClass;
4988 ArgTy = LLT::scalar(32);
4989 break;
4991 if (HasFixedDims)
4992 return LoadConstant(ClusterDims.getDims()[2] - 1);
4993 Arg = &ClusterWorkGroupMaxIDZ;
4994 ArgRC = &AMDGPU::SReg_32RegClass;
4995 ArgTy = LLT::scalar(32);
4996 break;
4998 Arg = &ClusterWorkGroupMaxFlatID;
4999 ArgRC = &AMDGPU::SReg_32RegClass;
5000 ArgTy = LLT::scalar(32);
5001 break;
5002 default:
5003 break;
5004 }
5005 }
5006
 // No fixed descriptor was selected above: fall back to whatever the
 // function's argument info records for ArgType.
5007 if (!Arg)
5008 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
5009
 // NOTE(review): the condition of the inner `if` (listing line 5011) is not
 // visible here; its body substitutes the constant 0 for a missing pointer.
5010 if (!Arg) {
5012 // The intrinsic may appear when we have a 0 sized kernarg segment, in
5013 // which case the pointer argument may be missing and we use null.
5014 return LoadConstant(0);
5015 }
5016
5017 // It's undefined behavior if a function marked with the amdgpu-no-*
5018 // attributes uses the corresponding intrinsic.
5019 B.buildUndef(DstReg);
5020 return true;
5021 }
5022
 // Only descriptors that name a valid register are handled; anything else is
 // reported as a failure to the caller.
5023 if (!Arg->isRegister() || !Arg->getRegister().isValid())
5024 return false; // TODO: Handle these
5025 buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
5026 return true;
5027}
5028
// Replaces MI with a load of the preloaded value ArgType into MI's result
// register (operand 0). Returns false, leaving MI in place, when the value
// cannot be loaded.
// NOTE(review): the signature of this function is not visible in this listing.
5032 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
5033 return false;
5034
 // The result register is now defined by the load, so MI is dead.
5035 MI.eraseFromParent();
5036 return true;
5037}
5038
5040 int64_t C) {
 // Defines MI's result register (operand 0) as the constant C, erases MI and
 // always returns true.
 // NOTE(review): the first line of this signature is not visible in this
 // listing.
5041 B.buildConstant(MI.getOperand(0).getReg(), C);
5042 MI.eraseFromParent();
5043 return true;
5044}
5045
5048 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
 // Lowers a workitem-ID query for dimension Dim. The result is the constant 0
 // when the maximum ID in that dimension is 0, undef when the function has no
 // descriptor for ArgType, and otherwise the preloaded value. Returns false
 // only when the preloaded value cannot be loaded; MI is erased on success.
 // NOTE(review): the first lines of this signature are not visible in this
 // listing.
5049 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
5050 if (MaxID == 0)
5051 return replaceWithConstant(B, MI, 0);
5052
5053 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5054 const ArgDescriptor *Arg;
5055 const TargetRegisterClass *ArgRC;
5056 LLT ArgTy;
5057 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
5058
5059 Register DstReg = MI.getOperand(0).getReg();
5060 if (!Arg) {
5061 // It's undefined behavior if a function marked with the amdgpu-no-*
5062 // attributes uses the corresponding intrinsic.
5063 B.buildUndef(DstReg);
5064 MI.eraseFromParent();
5065 return true;
5066 }
5067
5068 if (Arg->isMasked()) {
5069 // Don't bother inserting AssertZext for packed IDs since we're emitting the
5070 // masking operations anyway.
5071 //
5072 // TODO: We could assert the top bit is 0 for the source copy.
5073 if (!loadInputValue(DstReg, B, ArgType))
5074 return false;
5075 } else {
 // Unmasked value: load into a temporary and record, via an assert-zext of
 // bit_width(MaxID) bits, that the higher bits are zero.
 // NOTE(review): the declaration of TmpReg (listing line 5076) is not
 // visible in this listing.
5077 if (!loadInputValue(TmpReg, B, ArgType))
5078 return false;
5079 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
5080 }
5081
5082 MI.eraseFromParent();
5083 return true;
5084}
5085
// NOTE(review): the signature of this function and the lines that declare and
// initialise PtrInfo are not visible in this listing; only the final return
// of PtrInfo can be confirmed from here.
5088 // This isn't really a constant pool but close enough.
5091 return PtrInfo;
5092}
5093
5095 int64_t Offset) const {
 // Returns a new virtual register holding a pointer Offset bytes past the
 // kernarg segment pointer.
 // NOTE(review): the first line of this signature, the declaration of PtrTy
 // and the last argument of the loadInputValue call below are not visible in
 // this listing.
5097 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
5098
5099 // TODO: If we passed in the base kernel offset we could have a better
5100 // alignment than 4, but we don't really need it.
 // Failing to obtain the segment pointer is treated as unreachable, not as a
 // recoverable error.
5101 if (!loadInputValue(KernArgReg, B,
5103 llvm_unreachable("failed to find kernarg segment ptr");
5104
 // The byte offset is materialised as a 64-bit constant and added to the
 // base pointer.
5105 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
5106 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
5107}
5108
5109/// Legalize a value that's loaded from kernel arguments. This is only used by
5110/// legacy intrinsics.
5114 Align Alignment) const {
 // Replaces MI with a 32-bit load from the kernarg segment at byte Offset,
 // writing MI's result register (operand 0). Always returns true.
 // NOTE(review): the first lines of this signature, the declarations of Ptr
 // and PtrInfo, and the trailing arguments of buildLoad are not visible in
 // this listing. In the visible lines the load is built with a fixed Align(4)
 // and the Alignment parameter is not referenced.
5115 Register DstReg = MI.getOperand(0).getReg();
5116
5117 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
5118 "unexpected kernarg parameter type");
5119
5122 B.buildLoad(DstReg, Ptr, PtrInfo.getWithOffset(Offset), Align(4),
5125 MI.eraseFromParent();
5126 return true;
5127}
5128
5131 MachineIRBuilder &B) const {
 // Dispatches a floating-point divide to the 16-, 32- or 64-bit expansion
 // according to the type of MI's result register. Returns false for any other
 // result type, otherwise whatever the selected expansion returns.
 // NOTE(review): the first lines of this signature are not visible in this
 // listing.
5132 Register Dst = MI.getOperand(0).getReg();
5133 LLT DstTy = MRI.getType(Dst);
5134 LLT S16 = LLT::scalar(16);
5135 LLT S32 = LLT::scalar(32);
5136 LLT S64 = LLT::scalar(64);
5137
5138 if (DstTy == S16)
5139 return legalizeFDIV16(MI, MRI, B);
5140 if (DstTy == S32)
5141 return legalizeFDIV32(MI, MRI, B);
5142 if (DstTy == S64)
5143 return legalizeFDIV64(MI, MRI, B);
5144
5145 return false;
5146}
5147
5149 Register DstDivReg,
5150 Register DstRemReg,
5151 Register X,
5152 Register Y) const {
 // Emits the 32-bit unsigned expansion of X / Y and/or X % Y. The quotient is
 // written to DstDivReg and the remainder to DstRemReg; each result is only
 // produced when its destination register is valid.
 // NOTE(review): the first line of this signature is not visible in this
 // listing.
5153 const LLT S1 = LLT::scalar(1);
5154 const LLT S32 = LLT::scalar(32);
5155
5156 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
5157 // algorithm used here.
5158
5159 // Initial estimate of inv(y).
 // 0x4f7ffffe is the bit pattern of a float just below 2**32; multiplying
 // the reciprocal by it and converting to integer gives the estimate Z.
5160 auto FloatY = B.buildUITOFP(S32, Y);
5161 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
5162 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
5163 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
5164 auto Z = B.buildFPTOUI(S32, ScaledY);
5165
5166 // One round of UNR.
 // Z += umulh(Z, (-Y) * Z)
5167 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
5168 auto NegYZ = B.buildMul(S32, NegY, Z);
5169 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
5170
5171 // Quotient/remainder estimate.
 // Q = umulh(X, Z), R = X - Q * Y
5172 auto Q = B.buildUMulH(S32, X, Z);
5173 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
5174
5175 // First quotient/remainder refinement.
 // If R >= Y, bump the quotient by one and take Y off the remainder. R is
 // always updated because the second refinement compares against it; Q is
 // only updated when a quotient is requested.
5176 auto One = B.buildConstant(S32, 1);
5177 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5178 if (DstDivReg)
5179 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
5180 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
5181
5182 // Second quotient/remainder refinement.
 // Same correction once more, this time writing the requested destinations.
5183 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5184 if (DstDivReg)
5185 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
5186
5187 if (DstRemReg)
5188 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
5189}
5190
5191// Build integer reciprocal sequence around V_RCP_IFLAG_F32
5192//
5193// Return lo, hi of result
5194//
5195// %cvt.lo = G_UITOFP Val.lo
5196// %cvt.hi = G_UITOFP Val.hi
5197// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
5198// %rcp = G_AMDGPU_RCP_IFLAG %mad
5199// %mul1 = G_FMUL %rcp, 0x5f7ffffc
5200// %mul2 = G_FMUL %mul1, 2**(-32)
5201// %trunc = G_INTRINSIC_TRUNC %mul2
5202// %mad2 = G_FMAD %trunc, -(2**32), %mul1
5203// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
5204static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
5205 Register Val) {
5206 const LLT S32 = LLT::scalar(32);
5207 auto Unmerge = B.buildUnmerge(S32, Val);
5208
5209 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
5210 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
5211
5212 auto Mad = B.buildFMAD(
5213 S32, CvtHi, // 2**32
5214 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
5215
5216 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
5217 auto Mul1 = B.buildFMul(
5218 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
5219
5220 // 2**(-32)
5221 auto Mul2 = B.buildFMul(
5222 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
5223 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
5224
5225 // -(2**32)
5226 auto Mad2 = B.buildFMAD(
5227 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
5228 Mul1);
5229
5230 auto ResultLo = B.buildFPTOUI(S32, Mad2);
5231 auto ResultHi = B.buildFPTOUI(S32, Trunc);
5232
5233 return {ResultLo.getReg(0), ResultHi.getReg(0)};
5234}
5235
5237 Register DstDivReg,
5238 Register DstRemReg,
5239 Register Numer,
5240 Register Denom) const {
 // Emits the 64-bit unsigned expansion of Numer / Denom and/or Numer % Denom.
 // The quotient is written to DstDivReg and the remainder to DstRemReg; each
 // result is only produced when its destination register is valid. All
 // candidate values are computed unconditionally and the final results are
 // picked with selects (see the TODO below).
 // NOTE(review): the first line of this signature is not visible in this
 // listing.
5241 const LLT S32 = LLT::scalar(32);
5242 const LLT S64 = LLT::scalar(64);
5243 const LLT S1 = LLT::scalar(1);
5244 Register RcpLo, RcpHi;
5245
 // Initial reciprocal estimate of Denom as two 32-bit halves.
5246 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
5247
5248 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
5249
5250 auto Zero64 = B.buildConstant(S64, 0);
5251 auto NegDenom = B.buildSub(S64, Zero64, Denom);
5252
 // First reciprocal refinement: Add1 = Rcp + umulh(Rcp, -Denom * Rcp), with
 // the 64-bit add carried out as a 32-bit add / add-with-carry pair.
5253 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
5254 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
5255
5256 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
5257 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
5258 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
5259
5260 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
5261 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
5262 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
5263
 // Second reciprocal refinement, same shape: Add2 = Add1 + umulh(Add1,
 // -Denom * Add1).
5264 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
5265 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
5266 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
5267 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
5268 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
5269
5270 auto Zero32 = B.buildConstant(S32, 0);
5271 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
5272 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
5273 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
5274
5275 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
5276 Register NumerLo = UnmergeNumer.getReg(0);
5277 Register NumerHi = UnmergeNumer.getReg(1);
5278
 // Quotient estimate MulHi3 = umulh(Numer, Add2) and remainder estimate
 // Sub1 = Numer - Denom * MulHi3 (32-bit sub / sub-with-borrow pair).
 // Sub1_Mi is the high-half difference without the incoming borrow; it feeds
 // the borrow chains of Sub2 and Sub3 below.
5279 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
5280 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
5281 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
5282 Register Mul3_Lo = UnmergeMul3.getReg(0);
5283 Register Mul3_Hi = UnmergeMul3.getReg(1);
5284 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
5285 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
5286 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
5287 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
5288
5289 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
5290 Register DenomLo = UnmergeDenom.getReg(0);
5291 Register DenomHi = UnmergeDenom.getReg(1);
5292
 // C3 is all-ones when Sub1 >= Denom as 64-bit unsigned values: the high
 // halves decide unless they are equal, in which case the low halves decide.
5293 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
5294 auto C1 = B.buildSExt(S32, CmpHi);
5295
5296 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
5297 auto C2 = B.buildSExt(S32, CmpLo);
5298
5299 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
5300 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
5301
5302 // TODO: Here and below portions of the code can be enclosed into if/endif.
5303 // Currently control flow is unconditional and we have 4 selects after
5304 // potential endif to substitute PHIs.
5305
5306 // if C3 != 0 ...
 // First correction candidates: remainder Sub2 = Sub1 - Denom and quotient
 // Add3 = MulHi3 + 1.
5307 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
5308 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
5309 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
5310 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
5311
5312 auto One64 = B.buildConstant(S64, 1);
5313 auto Add3 = B.buildAdd(S64, MulHi3, One64);
5314
 // C6 is the same 64-bit unsigned ">= Denom" test, applied to Sub2.
5315 auto C4 =
5316 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
5317 auto C5 =
5318 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
5319 auto C6 = B.buildSelect(
5320 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
5321
5322 // if (C6 != 0)
 // Second correction candidates: remainder Sub3 = Sub2 - Denom and quotient
 // Add4 = Add3 + 1.
5323 auto Add4 = B.buildAdd(S64, Add3, One64);
5324 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
5325
5326 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
5327 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
5328 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
5329
5330 // endif C6
5331 // endif C3
5332
 // Quotient = C3 ? (C6 ? Add4 : Add3) : MulHi3
5333 if (DstDivReg) {
5334 auto Sel1 = B.buildSelect(
5335 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
5336 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5337 Sel1, MulHi3);
5338 }
5339
 // Remainder = C3 ? (C6 ? Sub3 : Sub2) : Sub1
5340 if (DstRemReg) {
5341 auto Sel2 = B.buildSelect(
5342 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
5343 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5344 Sel2, Sub1);
5345 }
5346}
5347
5350 MachineIRBuilder &B) const {
 // Expands G_UDIV, G_UREM and G_UDIVREM for 32- and 64-bit scalars. Returns
 // false, leaving MI untouched, for any other result type; otherwise emits
 // the expansion, erases MI and returns true.
 // NOTE(review): the first lines of this signature are not visible in this
 // listing.
 // A destination register left default-initialised here tells the expansion
 // helpers not to produce that result.
5351 Register DstDivReg, DstRemReg;
5352 switch (MI.getOpcode()) {
5353 default:
5354 llvm_unreachable("Unexpected opcode!");
5355 case AMDGPU::G_UDIV: {
5356 DstDivReg = MI.getOperand(0).getReg();
5357 break;
5358 }
5359 case AMDGPU::G_UREM: {
5360 DstRemReg = MI.getOperand(0).getReg();
5361 break;
5362 }
5363 case AMDGPU::G_UDIVREM: {
5364 DstDivReg = MI.getOperand(0).getReg();
5365 DstRemReg = MI.getOperand(1).getReg();
5366 break;
5367 }
5368 }
5369
5370 const LLT S64 = LLT::scalar(64);
5371 const LLT S32 = LLT::scalar(32);
 // The source operands follow the explicit defs (one def for div/rem, two
 // for divrem).
5372 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5373 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
5374 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5375 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5376
5377 if (Ty == S32)
5378 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
5379 else if (Ty == S64)
5380 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
5381 else
5382 return false;
5383
5384 MI.eraseFromParent();
5385 return true;
5386}
5387
5390 MachineIRBuilder &B) const {
 // Expands G_SDIV, G_SREM and G_SDIVREM for 32- and 64-bit scalars by taking
 // absolute values, running the unsigned expansion and restoring the signs.
 // Returns false for any other result type; otherwise erases MI and returns
 // true.
 // NOTE(review): the first lines of this signature are not visible in this
 // listing.
5391 const LLT S64 = LLT::scalar(64);
5392 const LLT S32 = LLT::scalar(32);
5393
5394 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5395 if (Ty != S32 && Ty != S64)
5396 return false;
5397
5398 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5399 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
5400 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5401
 // Sign masks: an arithmetic shift by (bits - 1) gives all-ones for a
 // negative operand and zero otherwise.
5402 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
5403 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
5404 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
5405
 // Absolute value computed as (x + sign) ^ sign.
5406 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
5407 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
5408
5409 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
5410 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
5411
 // The unsigned results go to temporaries; only the requested ones are
 // created, so the unsigned helper skips the others.
5412 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5413 switch (MI.getOpcode()) {
5414 default:
5415 llvm_unreachable("Unexpected opcode!");
5416 case AMDGPU::G_SDIV: {
5417 DstDivReg = MI.getOperand(0).getReg();
5418 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5419 break;
5420 }
5421 case AMDGPU::G_SREM: {
5422 DstRemReg = MI.getOperand(0).getReg();
5423 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5424 break;
5425 }
5426 case AMDGPU::G_SDIVREM: {
5427 DstDivReg = MI.getOperand(0).getReg();
5428 DstRemReg = MI.getOperand(1).getReg();
5429 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5430 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5431 break;
5432 }
5433 }
5434
5435 if (Ty == S32)
5436 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5437 else
5438 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5439
 // The quotient sign mask is the xor of the operand sign masks; (q ^ s) - s
 // negates q when the mask s is all-ones.
5440 if (DstDivReg) {
5441 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
5442 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5443 B.buildSub(DstDivReg, SignXor, Sign);
5444 }
5445
5446 if (DstRemReg) {
5447 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
5448 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5449 B.buildSub(DstRemReg, SignXor, Sign);
5450 }
5451
5452 MI.eraseFromParent();
5453 return true;
5454}
5455
5458 MachineIRBuilder &B) const {
 // Tries to replace a floating-point divide with a reciprocal-based sequence
 // when MI's fast-math flags allow it. Returns true, with MI erased, when the
 // replacement was emitted, and false, with MI untouched, otherwise.
 // NOTE(review): the first lines of this signature are not visible in this
 // listing.
5459 Register Res = MI.getOperand(0).getReg();
5460 Register LHS = MI.getOperand(1).getReg();
5461 Register RHS = MI.getOperand(2).getReg();
5462 uint16_t Flags = MI.getFlags();
5463 LLT ResTy = MRI.getType(Res);
5464
5465 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5466
 // Constant numerator: the +/-1.0 special cases are accepted with afn, or
 // for a 16-bit result regardless of flags.
5467 if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
5468 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
5469 return false;
5470
5471 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
5472 // the CI documentation has a worst case error of 1 ulp.
5473 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
5474 // use it as long as we aren't trying to use denormals.
5475 //
5476 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
5477
5478 // 1 / x -> RCP(x)
5479 if (CLHS->isExactlyValue(1.0)) {
5480 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5481 .addUse(RHS)
5482 .setMIFlags(Flags);
5483
5484 MI.eraseFromParent();
5485 return true;
5486 }
5487
5488 // -1 / x -> RCP( FNEG(x) )
5489 if (CLHS->isExactlyValue(-1.0)) {
5490 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
5491 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5492 .addUse(FNeg.getReg(0))
5493 .setMIFlags(Flags);
5494
5495 MI.eraseFromParent();
5496 return true;
5497 }
5498 }
5499
5500 // For f16 require afn or arcp.
5501 // For f32 require afn.
 // That is: without afn, only a 16-bit result with arcp set continues.
5502 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
5503 !MI.getFlag(MachineInstr::FmArcp)))
5504 return false;
5505
5506 // x / y -> x * (1.0 / y)
5507 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5508 .addUse(RHS)
5509 .setMIFlags(Flags);
5510 B.buildFMul(Res, LHS, RCP, Flags);
5511
5512 MI.eraseFromParent();
5513 return true;
5514}
5515
5518 MachineIRBuilder &B) const {
 // Emits an approximate divide X / Y as a hardware reciprocal refined by FMA
 // steps. This is only done when MI carries the afn flag; otherwise false is
 // returned and MI is left untouched. On success MI is erased.
 // NOTE(review): the first lines of this signature are not visible in this
 // listing.
5519 Register Res = MI.getOperand(0).getReg();
5520 Register X = MI.getOperand(1).getReg();
5521 Register Y = MI.getOperand(2).getReg();
5522 uint16_t Flags = MI.getFlags();
5523 LLT ResTy = MRI.getType(Res);
5524
5525 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5526
5527 if (!AllowInaccurateRcp)
5528 return false;
5529
5530 const ConstantFP *CLHS = getConstantFPVRegVal(X, MRI);
5531 bool IsNegRcp = CLHS && CLHS->isExactlyValue(-1.0);
5532
5533 // Pull out the negation so it folds for free into the source modifiers.
5534 if (IsNegRcp)
5535 X = B.buildFConstant(ResTy, 1.0).getReg(0);
5536
 // For -1 / Y the sign is moved onto the reciprocal R instead (see below),
 // so the multiplier used in the refinement steps is Y itself, not -Y.
5537 Register NegY = IsNegRcp ? Y : B.buildFNeg(ResTy, Y).getReg(0);
5538 auto One = B.buildFConstant(ResTy, 1.0);
5539
5540 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5541 .addUse(Y)
5542 .setMIFlags(Flags);
5543 if (IsNegRcp)
5544 R = B.buildFNeg(ResTy, R);
5545
 // Two refinement steps of the form R = fma(fma(NegY, R, 1), R, R).
5546 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
5547 R = B.buildFMA(ResTy, Tmp0, R, R);
5548
5549 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
5550 R = B.buildFMA(ResTy, Tmp1, R, R);
5551
5552 // Skip the last 2 correction terms for reciprocal.
5553 if (IsNegRcp || (CLHS && CLHS->isExactlyValue(1.0))) {
5554 B.buildCopy(Res, R);
5555 MI.eraseFromParent();
5556 return true;
5557 }
5558
 // General numerator: Ret = X * R, then one residual correction
 // Res = fma(fma(NegY, Ret, X), R, Ret).
5559 auto Ret = B.buildFMul(ResTy, X, R);
5560 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
5561
5562 B.buildFMA(Res, Tmp2, R, Ret);
5563 MI.eraseFromParent();
5564 return true;
5565}
5566
5569 MachineIRBuilder &B) const {
 // Expands a 16-bit floating-point divide. The fast reciprocal path is tried
 // first; otherwise both operands are extended to 32 bits, the quotient is
 // computed and refined there, truncated back to 16 bits and passed through
 // the div_fixup intrinsic. MI is erased and true is returned.
 // NOTE(review): the first lines of this signature are not visible in this
 // listing.
5570 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5571 return true;
5572
5573 Register Res = MI.getOperand(0).getReg();
5574 Register LHS = MI.getOperand(1).getReg();
5575 Register RHS = MI.getOperand(2).getReg();
5576
5577 uint16_t Flags = MI.getFlags();
5578
5579 LLT S16 = LLT::scalar(16);
5580 LLT S32 = LLT::scalar(32);
5581
5582 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
5583 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
5584 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
5585 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
5586 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5587 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = err * rcp + q
5588 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5589 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
5590 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
5591 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
5592 // q16.u = opx(V_CVT_F16_F32, q32.u);
5593 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
5594
5595 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
5596 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
5597 auto NegRHSExt = B.buildFNeg(S32, RHSExt);
5598 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5599 .addUse(RHSExt.getReg(0))
5600 .setMIFlags(Flags);
5601 auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
 // The error / refine / error steps use FMAD when the subtarget has f32
 // mad/mac instructions and FMA otherwise.
 // NOTE(review): the declaration of Err (listing line 5602) is not visible
 // in this listing.
5603 if (ST.hasMadMacF32Insts()) {
5604 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5605 Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
5606 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5607 } else {
5608 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5609 Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
5610 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5611 }
 // The mask 0xff800000 keeps the top 9 bits (f32 sign and exponent) of the
 // final correction term and clears its 23 mantissa bits before it is added.
5612 auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
5613 Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
5614 Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
5615 auto RDst = B.buildFPTrunc(S16, Quot, Flags);
 // div_fixup operands are (quotient, denominator, numerator).
5616 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5617 .addUse(RDst.getReg(0))
5618 .addUse(RHS)
5619 .addUse(LHS)
5620 .setMIFlags(Flags);
5621
5622 MI.eraseFromParent();
5623 return true;
5624}
5625
// NOTE(review): the initializer of SPDenormModeBitField (listing line 5627)
// and the first lines of the function signature below are not visible in this
// listing.
5626static constexpr unsigned SPDenormModeBitField =
5628
5629// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
5630// to enable denorm mode. When 'Enable' is false, disable denorm mode.
5632 const GCNSubtarget &ST,
5634 // Set SP denorm mode to this value.
 // Enabling selects FP_DENORM_FLUSH_NONE; "disabling" writes back the FP32
 // value taken from Mode rather than a fixed flush setting.
5635 unsigned SPDenormMode =
5636 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5637
5638 if (ST.hasDenormModeInst()) {
5639 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
5640 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5641
 // S_DENORM_MODE takes both fields in one immediate: the FP32 value in the
 // low bits and the FP64/FP16 value shifted left by 2.
5642 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5643 B.buildInstr(AMDGPU::S_DENORM_MODE)
5644 .addImm(NewDenormModeValue);
5645
5646 } else {
 // Without that instruction, write just the FP32 denorm field of the mode
 // register through S_SETREG_IMM32_B32.
5647 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5648 .addImm(SPDenormMode)
5649 .addImm(SPDenormModeBitField);
5650 }
5651}
5652
5655 MachineIRBuilder &B) const {
 // Expands a 32-bit floating-point divide. The fast reciprocal path is tried
 // first; otherwise the sequence div_scale -> rcp -> FMA refinement ->
 // div_fmas -> div_fixup is emitted, with FP32 denormals temporarily enabled
 // around the FMA chain when the function's mode does not already preserve
 // them. MI is erased and true is returned.
 // NOTE(review): the first lines of this signature are not visible in this
 // listing.
5656 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5657 return true;
5658
5659 Register Res = MI.getOperand(0).getReg();
5660 Register LHS = MI.getOperand(1).getReg();
5661 Register RHS = MI.getOperand(2).getReg();
5662 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5663 SIModeRegisterDefaults Mode = MFI->getMode();
5664
5665 uint16_t Flags = MI.getFlags();
5666
5667 LLT S32 = LLT::scalar(32);
5668 LLT S1 = LLT::scalar(1);
5669
5670 auto One = B.buildFConstant(S32, 1.0f);
5671
 // Both div_scale calls take (LHS, RHS); the immediate (0 or 1) selects
 // whether the scaled denominator or the scaled numerator is produced.
5672 auto DenominatorScaled =
5673 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5674 .addUse(LHS)
5675 .addUse(RHS)
5676 .addImm(0)
5677 .setMIFlags(Flags);
5678 auto NumeratorScaled =
5679 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5680 .addUse(LHS)
5681 .addUse(RHS)
5682 .addImm(1)
5683 .setMIFlags(Flags);
5684
5685 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5686 .addUse(DenominatorScaled.getReg(0))
5687 .setMIFlags(Flags);
5688 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5689
5690 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5691 const bool HasDynamicDenormals =
5692 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5693 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5694
 // With a dynamic denormal mode the current setting is not known at compile
 // time, so it is read into an SGPR here and written back after the chain.
5695 Register SavedSPDenormMode;
5696 if (!PreservesDenormals) {
5697 if (HasDynamicDenormals) {
5698 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5699 B.buildInstr(AMDGPU::S_GETREG_B32)
5700 .addDef(SavedSPDenormMode)
5701 .addImm(SPDenormModeBitField);
5702 }
5703 toggleSPDenormMode(true, B, ST, Mode);
5704 }
5705
 // Refine the reciprocal (Fma0, Fma1), form the scaled quotient (Mul),
 // refine it once (Fma2, Fma3) and compute the final residual (Fma4).
5706 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5707 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5708 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5709 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5710 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5711 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5712
 // Restore the denormal mode: from the saved register in the dynamic case,
 // otherwise back to the value recorded in Mode.
5713 if (!PreservesDenormals) {
5714 if (HasDynamicDenormals) {
5715 assert(SavedSPDenormMode);
5716 B.buildInstr(AMDGPU::S_SETREG_B32)
5717 .addReg(SavedSPDenormMode)
5718 .addImm(SPDenormModeBitField);
5719 } else
5720 toggleSPDenormMode(false, B, ST, Mode);
5721 }
5722
 // div_fmas takes the residual, the refined reciprocal, the refined quotient
 // and the condition output (result 1) of the numerator div_scale.
5723 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5724 .addUse(Fma4.getReg(0))
5725 .addUse(Fma1.getReg(0))
5726 .addUse(Fma3.getReg(0))
5727 .addUse(NumeratorScaled.getReg(1))
5728 .setMIFlags(Flags);
5729
 // div_fixup operands are (quotient, denominator, numerator).
5730 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5731 .addUse(Fmas.getReg(0))
5732 .addUse(RHS)
5733 .addUse(LHS)
5734 .setMIFlags(Flags);
5735
5736 MI.eraseFromParent();
5737 return true;
5738}
5739
5742 MachineIRBuilder &B) const {
 // Expands a 64-bit floating-point divide. The fast reciprocal path is tried
 // first; otherwise the sequence div_scale -> rcp -> FMA refinement ->
 // div_fmas -> div_fixup is emitted. MI is erased and true is returned.
 // NOTE(review): the first lines of this signature are not visible in this
 // listing.
5743 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5744 return true;
5745
5746 Register Res = MI.getOperand(0).getReg();
5747 Register LHS = MI.getOperand(1).getReg();
5748 Register RHS = MI.getOperand(2).getReg();
5749
5750 uint16_t Flags = MI.getFlags();
5751
5752 LLT S64 = LLT::scalar(64);
5753 LLT S1 = LLT::scalar(1);
5754
5755 auto One = B.buildFConstant(S64, 1.0);
5756
 // div_scale with immediate 0; its value result feeds the reciprocal.
5757 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5758 .addUse(LHS)
5759 .addUse(RHS)
5760 .addImm(0)
5761 .setMIFlags(Flags);
5762
5763 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5764
5765 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5766 .addUse(DivScale0.getReg(0))
5767 .setMIFlags(Flags);
5768
 // Two reciprocal refinement rounds: (Fma0, Fma1) and (Fma2, Fma3).
5769 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5770 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5771 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5772
 // div_scale with immediate 1; its value result is multiplied by the refined
 // reciprocal to form the quotient estimate.
5773 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5774 .addUse(LHS)
5775 .addUse(RHS)
5776 .addImm(1)
5777 .setMIFlags(Flags);
5778
5779 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5780 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5781 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5782
5783 Register Scale;
5784 if (!ST.hasUsableDivScaleConditionOutput()) {
5785 // Workaround a hardware bug on SI where the condition output from div_scale
5786 // is not usable.
5787
 // Rebuild the flag by hand: compare the high 32 bits of each original
 // operand with the high 32 bits of the matching div_scale result and xor
 // the two equality bits.
5788 LLT S32 = LLT::scalar(32);
5789
5790 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5791 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5792 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5793 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5794
5795 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5796 Scale1Unmerge.getReg(1));
5797 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5798 Scale0Unmerge.getReg(1));
5799 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5800 } else {
5801 Scale = DivScale1.getReg(1);
5802 }
5803
5804 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5805 .addUse(Fma4.getReg(0))
5806 .addUse(Fma3.getReg(0))
5807 .addUse(Mul.getReg(0))
5808 .addUse(Scale)
5809 .setMIFlags(Flags);
5810
 // div_fixup operands are (quotient, denominator, numerator).
5811 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5812 .addUse(Fmas.getReg(0))
5813 .addUse(RHS)
5814 .addUse(LHS)
5815 .setMIFlags(Flags);
5816
5817 MI.eraseFromParent();
5818 return true;
5819}
5820
5823 MachineIRBuilder &B) const {
 // Expands a frexp-style instruction with two results: the mantissa goes to
 // operand 0 and the exponent to operand 1, both computed from the value in
 // operand 2 with the frexp_mant / frexp_exp intrinsics. MI is erased and
 // true is returned.
 // NOTE(review): the first lines of this signature are not visible in this
 // listing.
5824 Register Res0 = MI.getOperand(0).getReg();
5825 Register Res1 = MI.getOperand(1).getReg();
5826 Register Val = MI.getOperand(2).getReg();
5827 uint16_t Flags = MI.getFlags();
5828
5829 LLT Ty = MRI.getType(Res0);
 // The exponent intrinsic is built with a 16-bit result for a 16-bit input
 // and a 32-bit result otherwise.
5830 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5831
5832 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5833 .addUse(Val)
5834 .setMIFlags(Flags);
5835 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5836 .addUse(Val)
5837 .setMIFlags(Flags);
5838
 // On subtargets with the fract bug, non-finite inputs are handled
 // explicitly: when |Val| < inf is false (infinity or NaN) the exponent
 // becomes 0 and the mantissa becomes Val itself.
5839 if (ST.hasFractBug()) {
5840 auto Fabs = B.buildFAbs(Ty, Val);
5841 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5842 auto IsFinite =
5843 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5844 auto Zero = B.buildConstant(InstrExpTy, 0);
5845 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5846 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5847 }
5848
 // The exponent is sign-extended or truncated to the type of Res1.
5849 B.buildCopy(Res0, Mant);
5850 B.buildSExtOrTrunc(Res1, Exp);
5851
5852 MI.eraseFromParent();
5853 return true;
5854}
5855
5858 MachineIRBuilder &B) const {
// Expands MI (result in operand 0, numerator in operand 2, denominator in
// operand 3) into
//   Res = Sel * (LHS * rcp(RHS * Sel))
// where Sel is 2^-32 when |RHS| > 2^96 and 1.0 otherwise. MI is always erased
// and true is returned.
5859 Register Res = MI.getOperand(0).getReg();
5860 Register LHS = MI.getOperand(2).getReg();
5861 Register RHS = MI.getOperand(3).getReg();
5862 uint16_t Flags = MI.getFlags();
5863
5864 LLT S32 = LLT::scalar(32);
5865 LLT S1 = LLT::scalar(1);
5866
5867 auto Abs = B.buildFAbs(S32, RHS, Flags);
// NOTE(review): C0Val is not referenced anywhere else in this function.
5868 const APFloat C0Val(1.0f);
5869
// C0 is the magnitude threshold, C1 the scale used above it, C2 the identity
// scale used otherwise.
5870 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5871 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5872 auto C2 = B.buildFConstant(S32, 1.0f);
5873
5874 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5875 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5876
// Pre-scale the denominator before taking its reciprocal.
5877 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5878
5879 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5880 .addUse(Mul0.getReg(0))
5881 .setMIFlags(Flags);
5882
5883 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5884
// Multiply by the same scale again so that it cancels out of the quotient.
5885 B.buildFMul(Res, Sel, Mul1, Flags);
5886
5887 MI.eraseFromParent();
5888 return true;
5889}
5890
5893 MachineIRBuilder &B) const {
5894 // Bypass the correct expansion a standard promotion through G_FSQRT would
5895 // get. The f32 op is accurate enough for the f16 case.
// Sequence built: extend operand 1 to 32 bits, apply amdgcn.sqrt, truncate the
// result into operand 0. Only expected on subtargets without 16-bit
// instructions (see the assert). MI is always erased and true is returned.
5896 unsigned Flags = MI.getFlags();
5897 assert(!ST.has16BitInsts());
5898 const LLT F32 = LLT::scalar(32);
5899 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
// NOTE: despite its name, Log2 holds the result of the amdgcn.sqrt intrinsic.
5900 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5901 .addUse(Ext.getReg(0))
5902 .setMIFlags(Flags);
5903 B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5904 MI.eraseFromParent();
5905 return true;
5906}
5907
5910 MachineIRBuilder &B) const {
// Expands a 32-bit square root (result in operand 0, input in operand 1).
// When approximate functions are allowed the amdgcn.sqrt intrinsic is emitted
// directly. Otherwise the input is conditionally scaled up, a square root is
// computed and refined along one of two paths, the result is scaled back
// down, and zero / +infinity inputs are passed through unchanged.
// MI is always erased and true is returned.
5911 MachineFunction &MF = B.getMF();
5912 Register Dst = MI.getOperand(0).getReg();
5913 Register X = MI.getOperand(1).getReg();
5914 const unsigned Flags = MI.getFlags();
5915 const LLT S1 = LLT::scalar(1);
5916 const LLT F32 = LLT::scalar(32);
5917 const LLT I32 = LLT::scalar(32);
5918
// Fast path: a single intrinsic writing straight into Dst.
5919 if (allowApproxFunc(MF, Flags)) {
5920 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5921 .addUse(X)
5922 .setMIFlags(Flags);
5923 MI.eraseFromParent();
5924 return true;
5925 }
5926
// Inputs below 2^-96 are multiplied by 2^32 before the square root is taken;
// NeedScale remembers the decision so the result can be rescaled afterwards.
5927 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5928 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5929 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5930 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5931 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5932
// NOTE(review): SqrtS is used from here on, but the statement that declares
// it is not visible in this listing.
5934 if (needsDenormHandlingF32(MF, X, Flags)) {
// Start from the intrinsic's result, then consider its two neighbours obtained
// by integer-adding -1 / +1 to the value. The residuals
// fma(-neighbour, SqrtS, SqrtX) decide whether to step down or up.
5935 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5936 .addUse(SqrtX.getReg(0))
5937 .setMIFlags(Flags);
5938
5939 auto NegOne = B.buildConstant(I32, -1);
5940 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5941
5942 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5943 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5944
5945 auto PosOne = B.buildConstant(I32, 1);
5946 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5947
5948 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5949 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5950
5951 auto Zero = B.buildFConstant(F32, 0.0f);
5952 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5953
// Step down when the "next down" residual is <= 0 ...
5954 SqrtS =
5955 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5956
// ... and step up when the "next up" residual is > 0.
5957 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5958 SqrtS =
5959 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5960 } else {
// Start from r = rsq(x): s = x * r and h = 0.5 * r, then refine both with FMA
// correction steps.
5961 auto SqrtR =
5962 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5963 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5964
5965 auto Half = B.buildFConstant(F32, 0.5f);
5966 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5967 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5968 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5969 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5970 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5971 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5972 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5973 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5974 }
5975
// Undo the input scaling: 2^-16 is the square root of the reciprocal of the
// 2^32 scale-up factor.
5976 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5977
5978 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5979
5980 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5981
// Zero and +infinity inputs are returned as-is instead of the computed value.
5982 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5983 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5984
5985 MI.eraseFromParent();
5986 return true;
5987}
5988
5991 MachineIRBuilder &B) const {
5992 // For double type, the SQRT and RSQ instructions don't have required
5993 // precision, we apply Goldschmidt's algorithm to improve the result:
5994 //
5995 // y0 = rsq(x)
5996 // g0 = x * y0
5997 // h0 = 0.5 * y0
5998 //
5999 // r0 = 0.5 - h0 * g0
6000 // g1 = g0 * r0 + g0
6001 // h1 = h0 * r0 + h0
6002 //
6003 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
6004 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
6005 // h2 = h1 * r1 + h1
6006 //
6007 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
6008 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
6009 //
6010 // sqrt(x) = g3
6011
// Result is written to operand 0 from the input in operand 1. MI is always
// erased and true is returned.
6012 const LLT S1 = LLT::scalar(1);
6013 const LLT S32 = LLT::scalar(32);
6014 const LLT F64 = LLT::scalar(64);
6015
6016 Register Dst = MI.getOperand(0).getReg();
6017 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
6018
6019 Register X = MI.getOperand(1).getReg();
6020 unsigned Flags = MI.getFlags();
6021
// Unless MI carries the afn flag, inputs below 2^-767 are scaled up with
// ldexp(x, 256); Scaling and ZeroInt are reused later to scale the result
// back down.
6022 Register SqrtX = X;
6023 Register Scaling, ZeroInt;
6024 if (!MI.getFlag(MachineInstr::FmAfn)) {
6025 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
6026
6027 ZeroInt = B.buildConstant(S32, 0).getReg(0);
6028 Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant).getReg(0);
6029
6030 // Scale up input if it is too small.
6031 auto ScaleUpFactor = B.buildConstant(S32, 256);
6032 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
6033 SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags).getReg(0);
6034 }
6035
// y0 = rsq(x)
6036 auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX);
6037
// h0 = 0.5 * y0, g0 = x * y0
6038 auto Half = B.buildFConstant(F64, 0.5);
6039 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
6040 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
6041
// r0 = 0.5 - h0 * g0
6042 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
6043 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
6044
// g1 = g0 * r0 + g0, h1 = h0 * r0 + h0
6045 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
6046 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
6047
// d0 = x - g1 * g1
6048 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
6049 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
6050
// g2 = d0 * h1 + g1
6051 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
6052
// With the afn flag g2 is the final value; otherwise one more correction step
// (d1, g3) is applied and the input scaling is undone with ldexp(.., -128).
6053 Register SqrtRet = SqrtS2.getReg(0);
6054 if (!MI.getFlag(MachineInstr::FmAfn)) {
6055 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
6056 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
6057 auto SqrtD2 = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
6058
6059 // Scale down the result.
6060 auto ScaleDownFactor = B.buildConstant(S32, -128);
6061 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
6062 SqrtRet = B.buildFLdexp(F64, SqrtD2, ScaleDown, Flags).getReg(0);
6063 }
6064
// With the ninf flag only a compare against 0.0 is emitted; otherwise a class
// test for zero or +infinity is used.
6065 Register IsZeroOrInf;
6066 if (MI.getFlag(MachineInstr::FmNoInfs)) {
6067 auto ZeroFP = B.buildFConstant(F64, 0.0);
6068 IsZeroOrInf = B.buildFCmp(FCmpInst::FCMP_OEQ, S1, SqrtX, ZeroFP).getReg(0);
6069 } else {
6070 IsZeroOrInf = B.buildIsFPClass(S1, SqrtX, fcZero | fcPosInf).getReg(0);
6071 }
6072
6073 // TODO: Check for DAZ and expand to subnormals
6074
6075 // If x is +INF, +0, or -0, use its original value
6076 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
6077
6078 MI.eraseFromParent();
6079 return true;
6080}
6081
6084 MachineIRBuilder &B) const {
// Dispatches on the type of operand 0 to the 32-, 64- or 16-bit square-root
// expansion. Any other type is rejected by returning false.
6085 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
6086 if (Ty == LLT::scalar(32))
6087 return legalizeFSQRTF32(MI, MRI, B);
6088 if (Ty == LLT::scalar(64))
6089 return legalizeFSQRTF64(MI, MRI, B);
6090 if (Ty == LLT::scalar(16))
6091 return legalizeFSQRTF16(MI, MRI, B);
6092 return false;
6093}
6094
6095// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
6096// FIXME: Why do we handle this one but not other removed instructions?
6097//
6098// Reciprocal square root. The clamp prevents infinite results, clamping
6099// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
6100// +-max_float.
6103 MachineIRBuilder &B) const {
// Generations before VOLCANIC_ISLANDS leave MI untouched and report success.
6104 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
6105 return true;
6106
// Otherwise MI (result in operand 0, source in operand 2) is replaced by
// amdgcn.rsq followed by a min against the largest finite value and a max
// against the negated largest finite value.
6107 Register Dst = MI.getOperand(0).getReg();
6108 Register Src = MI.getOperand(2).getReg();
6109 auto Flags = MI.getFlags();
6110
6111 LLT Ty = MRI.getType(Dst);
6112
// Only 32-bit and 64-bit scalars are handled; anything else fails.
6113 const fltSemantics *FltSemantics;
6114 if (Ty == LLT::scalar(32))
6115 FltSemantics = &APFloat::IEEEsingle();
6116 else if (Ty == LLT::scalar(64))
6117 FltSemantics = &APFloat::IEEEdouble();
6118 else
6119 return false;
6120
6121 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
6122 .addUse(Src)
6123 .setMIFlags(Flags);
6124
6125 // We don't need to concern ourselves with the snan handling difference, since
6126 // the rsq quieted (or not) so use the one which will directly select.
6127 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6128 const bool UseIEEE = MFI->getMode().IEEE;
6129
// Upper clamp: min(rsq, +largest finite).
6130 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
6131 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
6132 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
6133
// Lower clamp: max(previous, -largest finite), written directly to Dst.
6134 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
6135
6136 if (UseIEEE)
6137 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
6138 else
6139 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
6140 MI.eraseFromParent();
6141 return true;
6142}
6143
6144// TODO: Fix pointer type handling
6147 Intrinsic::ID IID) const {
6148
// Legalizes the lane intrinsic IID based on the size of its result type:
//   - size equal to the split size: nothing to do;
//   - size below 32 bits: operate on any-extended 32-bit values and truncate;
//   - size a multiple of the split size: split into pieces, apply the
//     intrinsic to each piece and merge the partial results;
//   - anything else: fail (return false).
6149 MachineIRBuilder &B = Helper.MIRBuilder;
6150 MachineRegisterInfo &MRI = *B.getMRI();
6151
// Group the intrinsics by which extra source operands they carry.
6152 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6153 IID == Intrinsic::amdgcn_permlanex16;
6154 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6155 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6156 bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast ||
6157 IID == Intrinsic::amdgcn_permlane_up ||
6158 IID == Intrinsic::amdgcn_permlane_down ||
6159 IID == Intrinsic::amdgcn_permlane_xor;
6160
// Builds one instance of the intrinsic with result type VT from the given
// sources and returns its result register. Operands that are not passed in
// (Src3 and the immediates) are re-read from the original MI.
6161 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
6162 Register Src2, LLT VT) -> Register {
6163 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
6164 switch (IID) {
6165 case Intrinsic::amdgcn_readfirstlane:
6166 case Intrinsic::amdgcn_permlane64:
6167 return LaneOp.getReg(0);
6168 case Intrinsic::amdgcn_readlane:
6169 case Intrinsic::amdgcn_set_inactive:
6170 case Intrinsic::amdgcn_set_inactive_chain_arg:
6171 return LaneOp.addUse(Src1).getReg(0);
6172 case Intrinsic::amdgcn_writelane:
6173 case Intrinsic::amdgcn_permlane_bcast:
6174 case Intrinsic::amdgcn_permlane_up:
6175 case Intrinsic::amdgcn_permlane_down:
6176 case Intrinsic::amdgcn_permlane_xor:
6177 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
6178 case Intrinsic::amdgcn_permlane16:
6179 case Intrinsic::amdgcn_permlanex16: {
6180 Register Src3 = MI.getOperand(5).getReg();
6181 int64_t Src4 = MI.getOperand(6).getImm();
6182 int64_t Src5 = MI.getOperand(7).getImm();
6183 return LaneOp.addUse(Src1)
6184 .addUse(Src2)
6185 .addUse(Src3)
6186 .addImm(Src4)
6187 .addImm(Src5)
6188 .getReg(0);
6189 }
6190 case Intrinsic::amdgcn_mov_dpp8:
6191 return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
6192 case Intrinsic::amdgcn_update_dpp:
6193 return LaneOp.addUse(Src1)
6194 .addImm(MI.getOperand(4).getImm())
6195 .addImm(MI.getOperand(5).getImm())
6196 .addImm(MI.getOperand(6).getImm())
6197 .addImm(MI.getOperand(7).getImm())
6198 .getReg(0);
6199 default:
6200 llvm_unreachable("unhandled lane op");
6201 }
6202 };
6203
// Operand 0 is the result and operand 2 the first source. Src1 (operand 3)
// and Src2 (operand 4) are only read for the intrinsics that have them.
6204 Register DstReg = MI.getOperand(0).getReg();
6205 Register Src0 = MI.getOperand(2).getReg();
6206 Register Src1, Src2;
6207 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6208 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 ||
6209 IsPermlaneShuffle) {
6210 Src1 = MI.getOperand(3).getReg();
6211 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16 ||
6212 IsPermlaneShuffle) {
6213 Src2 = MI.getOperand(4).getReg();
6214 }
6215 }
6216
6217 LLT Ty = MRI.getType(DstReg);
6218 unsigned Size = Ty.getSizeInBits();
6219
// Pieces are 32 bits wide, except for update_dpp on a size divisible by 64
// when the subtarget has DPALU DPP and the DPP control in operand 4 is legal
// for it; then 64-bit pieces are used.
6220 unsigned SplitSize = 32;
6221 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
6222 ST.hasDPALU_DPP() &&
6223 AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm()))
6224 SplitSize = 64;
6225
6226 if (Size == SplitSize) {
6227 // Already legal
6228 return true;
6229 }
6230
// Narrow types: widen the data sources with any-extend, run the operation at
// 32 bits and truncate the result back to the original type.
6231 if (Size < 32) {
6232 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
6233
6234 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6235 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
6236
6237 if (IID == Intrinsic::amdgcn_writelane)
6238 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
6239
6240 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
6241 B.buildTrunc(DstReg, LaneOpDst);
6242 MI.eraseFromParent();
6243 return true;
6244 }
6245
6246 if (Size % SplitSize != 0)
6247 return false;
6248
// Choose the type of each piece. For vectors the element type is kept when
// elements are SplitSize wide, a sub-vector is used for 16- or 32-bit
// elements, and everything else goes through scalar pieces plus a final
// bitcast.
6249 LLT PartialResTy = LLT::scalar(SplitSize);
6250 bool NeedsBitcast = false;
6251 if (Ty.isVector()) {
6252 LLT EltTy = Ty.getElementType();
6253 unsigned EltSize = EltTy.getSizeInBits();
6254 if (EltSize == SplitSize) {
6255 PartialResTy = EltTy;
6256 } else if (EltSize == 16 || EltSize == 32) {
6257 unsigned NElem = SplitSize / EltSize;
6258 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
6259 } else {
6260 // Handle all other cases via S32/S64 pieces
6261 NeedsBitcast = true;
6262 }
6263 }
6264
// Split every data source that has to be processed piecewise.
6265 SmallVector<Register, 4> PartialRes;
6266 unsigned NumParts = Size / SplitSize;
6267 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
6268 MachineInstrBuilder Src1Parts, Src2Parts;
6269
6270 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6271 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
6272
6273 if (IID == Intrinsic::amdgcn_writelane)
6274 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
6275
// Apply the intrinsic to each piece; sources that were not split keep the
// value read from MI for every piece.
6276 for (unsigned i = 0; i < NumParts; ++i) {
6277 Src0 = Src0Parts.getReg(i);
6278
6279 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6280 Src1 = Src1Parts.getReg(i);
6281
6282 if (IID == Intrinsic::amdgcn_writelane)
6283 Src2 = Src2Parts.getReg(i);
6284
6285 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
6286 }
6287
// Reassemble the result, via a same-sized scalar and a bitcast when the pieces
// were plain scalars of a vector type.
6288 if (NeedsBitcast)
6289 B.buildBitcast(DstReg, B.buildMergeLikeInstr(
6290 LLT::scalar(Ty.getSizeInBits()), PartialRes));
6291 else
6292 B.buildMergeLikeInstr(DstReg, PartialRes);
6293
6294 MI.eraseFromParent();
6295 return true;
6296}
6297
6300 MachineIRBuilder &B) const {
// NOTE(review): several lines of this definition are absent from this listing
// (the start of the statement that calls getImplicitParameterOffset and part
// of the loadInputValue call), so only the visible steps are described.
//
// Visible behaviour: a fresh register of DstReg's type is filled through
// loadInputValue; on failure the function returns false. Otherwise DstReg is
// built as that pointer plus the constant Offset and true is returned.
6302 ST.getTargetLowering()->getImplicitParameterOffset(
6304 LLT DstTy = MRI.getType(DstReg);
// The offset constant uses a scalar type as wide as the destination type.
6305 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
6306
6307 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
6308 if (!loadInputValue(KernargPtrReg, B,
6310 return false;
6311
6312 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6313 B.buildConstant(IdxTy, Offset).getReg(0));
6314 return true;
6315}
6316
6317/// To create a buffer resource from a 64-bit pointer, mask off the upper 16
6318/// bits of the pointer and replace them with the stride argument, then
6319/// merge_values everything together. In the common case of a raw buffer (the
6320/// stride component is 0), we can just AND off the upper half.
// Operands read from MI: 0 = result, 2 = pointer, 3 = stride,
// 4 = number of records, 5 = flags. MI is always erased and true is returned.
6323 Register Result = MI.getOperand(0).getReg();
6324 Register Pointer = MI.getOperand(2).getReg();
6325 Register Stride = MI.getOperand(3).getReg();
6326 Register NumRecords = MI.getOperand(4).getReg();
6327 Register Flags = MI.getOperand(5).getReg();
6328
6329 LLT S32 = LLT::scalar(32);
6330 LLT S64 = LLT::scalar(64);
6331
// Advance the builder's insertion point by one position before emitting.
6332 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6333
6334 auto ExtStride = B.buildAnyExt(S32, Stride);
6335
6336 if (ST.has45BitNumRecordsBufferResource()) {
6337 Register Zero = B.buildConstant(S32, 0).getReg(0);
6338 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
6339 // num_records.
6340 LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
6341 auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
6342 auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
6343 auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
6344 Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);
6345
6346 // Build the higher 64-bit value, which has the higher 38-bit num_records,
6347 // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
6348 auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
6349 auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
// The 32-bit stride and flag fields are placed in the upper 32 bits of a
// 64-bit value by merging them above a zero low word.
6350 auto ExtShiftedStride =
6351 B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
6352 auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
6353 auto ExtShiftedFlags =
6354 B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
6355 auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
6356 Register HighHalf =
6357 B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
6358 B.buildMergeValues(Result, {LowHalf, HighHalf});
6359 } else {
// Four 32-bit words: pointer low word, pointer high word with its upper 16
// bits replaced by the stride, the truncated record count, and the flags.
6360 NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
6361 auto Unmerge = B.buildUnmerge(S32, Pointer);
6362 auto LowHalf = Unmerge.getReg(0);
6363 auto HighHalf = Unmerge.getReg(1);
6364
6365 auto AndMask = B.buildConstant(S32, 0x0000ffff);
6366 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
6367 auto ShiftConst = B.buildConstant(S32, 16);
6368 auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
6369 auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
6370 Register NewHighHalfReg = NewHighHalf.getReg(0);
6371 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6372 }
6373
6374 MI.eraseFromParent();
6375 return true;
6376}
6377
6380 MachineIRBuilder &B) const {
// Non-entry functions are forwarded to legalizePreloadedArgIntrin. Entry
// functions compute the pointer into operand 0 with getImplicitArgPtr and
// erase MI on success; a failure there is propagated as false.
// NOTE(review): the last argument line of the legalizePreloadedArgIntrin call
// is not visible in this listing.
6381 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6382 if (!MFI->isEntryFunction()) {
6383 return legalizePreloadedArgIntrin(MI, MRI, B,
6385 }
6386
6387 Register DstReg = MI.getOperand(0).getReg();
6388 if (!getImplicitArgPtr(DstReg, MRI, B))
6389 return false;
6390
6391 MI.eraseFromParent();
6392 return true;
6393}
6394
6397 MachineIRBuilder &B) const {
// Materializes a constant into DstReg when KnownSize holds a value.
// NOTE(review): the initializer of KnownSize is not visible in this listing.
// NOTE(review): false is returned unconditionally, including after the
// constant has been built - confirm that this is intended.
6398 Function &F = B.getMF().getFunction();
6399 std::optional<uint32_t> KnownSize =
6401 if (KnownSize.has_value())
6402 B.buildConstant(DstReg, *KnownSize);
6403 return false;
6404}
6405
6408 MachineIRBuilder &B) const {
6409
// Non-entry functions are forwarded to legalizePreloadedArgIntrin. Entry
// functions ask getLDSKernelId to fill operand 0 and erase MI only if that
// call reports success; otherwise false is returned and MI is kept.
// NOTE(review): the last argument line of the legalizePreloadedArgIntrin call
// is not visible in this listing.
6410 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6411 if (!MFI->isEntryFunction()) {
6412 return legalizePreloadedArgIntrin(MI, MRI, B,
6414 }
6415
6416 Register DstReg = MI.getOperand(0).getReg();
6417 if (!getLDSKernelId(DstReg, MRI, B))
6418 return false;
6419
6420 MI.eraseFromParent();
6421 return true;
6422}
6423
6427 unsigned AddrSpace) const {
// Replaces MI with a compare on the upper 32 bits of the value in operand 2,
// writing the result to operand 0. MI is always erased and true is returned.
6428 const LLT S32 = LLT::scalar(32);
6429 auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
6430 Register Hi32 = Unmerge.getReg(1);
6431
6432 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
6433 ST.hasGloballyAddressableScratch()) {
// Copy SRC_FLAT_SCRATCH_BASE_HI into a 32-bit scalar register so that it can
// be compared against the high word.
6434 Register FlatScratchBaseHi =
6435 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
6436 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6437 .getReg(0);
6438 MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6439 // Test bits 63..58 against the aperture address.
// The XOR is below 2^26 exactly when the top six bits of both words agree.
6440 Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
6441 B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
6442 B.buildConstant(S32, 1u << 26));
6443 } else {
// Otherwise the high word must equal the aperture of the address space.
6444 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
6445 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
6446 }
6447 MI.eraseFromParent();
6448 return true;
6449}
6450
6451// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6452// offset (the offset that is included in bounds checking and swizzling, to be
6453// split between the instruction's voffset and immoffset fields) and soffset
6454// (the offset that is excluded from bounds checking and swizzling, to go in
6455// the instruction's soffset field). This function takes the first kind of
6456// offset and figures out how to split it between voffset and immoffset.
6457std::pair<Register, unsigned>
6459 Register OrigOffset) const {
// Returns {register part, immediate part} for OrigOffset. The immediate part
// never exceeds MaxImm; a 32-bit zero constant is created for the register
// part when nothing else is left for it.
6460 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
6461 Register BaseReg;
6462 unsigned ImmOffset;
6463 const LLT S32 = LLT::scalar(32);
6464 MachineRegisterInfo &MRI = *B.getMRI();
6465
6466 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
6467 // being added, so we can only safely match a 32-bit addition with no unsigned
6468 // overflow.
6469 bool CheckNUW = ST.hasGFX1250Insts();
6470 std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
6471 MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);
6472
6473 // If BaseReg is a pointer, convert it to int.
6474 if (MRI.getType(BaseReg).isPointer())
6475 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
6476
6477 // If the immediate value is too big for the immoffset field, put only bits
6478 // that would normally fit in the immoffset field. The remaining value that
6479 // is copied/added for the voffset field is a large power of 2, and it
6480 // stands more chance of being CSEd with the copy/add for another similar
6481 // load/store.
6482 // However, do not do that rounding down if that is a negative
6483 // number, as it appears to be illegal to have a negative offset in the
6484 // vgpr, even if adding the immediate offset makes it positive.
6485 unsigned Overflow = ImmOffset & ~MaxImm;
6486 ImmOffset -= Overflow;
// When the overflow part is negative as a signed 32-bit value, the whole
// constant is moved to the register side and the immediate becomes 0.
6487 if ((int32_t)Overflow < 0) {
6488 Overflow += ImmOffset;
6489 ImmOffset = 0;
6490 }
6491
// Fold the overflow into the register part, creating it if there was none.
6492 if (Overflow != 0) {
6493 if (!BaseReg) {
6494 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
6495 } else {
6496 auto OverflowVal = B.buildConstant(S32, Overflow);
6497 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
6498 }
6499 }
6500
6501 if (!BaseReg)
6502 BaseReg = B.buildConstant(S32, 0).getReg(0);
6503
6504 return std::pair(BaseReg, ImmOffset);
6505}
6506
6507/// Handle register layout difference for f16 images for some subtargets.
6510 Register Reg,
6511 bool ImageStore) const {
// Returns a register holding the data of Reg (a vector of 16-bit elements, see
// the assert) in the layout selected by the subtarget checks below; Reg
// itself is returned when no repacking applies.
6512 const LLT S16 = LLT::scalar(16);
6513 const LLT S32 = LLT::scalar(32);
6514 LLT StoreVT = MRI.getType(Reg);
6515 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
6516
// Unpacked layout: every 16-bit element is any-extended to its own 32-bit
// element. The loops in this function visit every operand of the unmerge
// except the last one.
6517 if (ST.hasUnpackedD16VMem()) {
6518 auto Unmerge = B.buildUnmerge(S16, Reg);
6519
6520 SmallVector<Register, 4> WideRegs;
6521 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6522 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
6523
6524 int NumElts = StoreVT.getNumElements();
6525
6526 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
6527 .getReg(0);
6528 }
6529
// Image stores on subtargets with hasImageStoreD16Bug(): the packed data is
// padded with undef up to the 32-bit vector sizes used below.
6530 if (ImageStore && ST.hasImageStoreD16Bug()) {
// 2 x s16 -> one packed s32 plus one undef s32.
6531 if (StoreVT.getNumElements() == 2) {
6532 SmallVector<Register, 4> PackedRegs;
6533 Reg = B.buildBitcast(S32, Reg).getReg(0);
6534 PackedRegs.push_back(Reg);
6535 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
6536 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
6537 .getReg(0);
6538 }
6539
// 3 x s16 -> padded to 6 x s16 with undef, then viewed as 3 x s32.
6540 if (StoreVT.getNumElements() == 3) {
6541 SmallVector<Register, 4> PackedRegs;
6542 auto Unmerge = B.buildUnmerge(S16, Reg);
6543 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6544 PackedRegs.push_back(Unmerge.getReg(I));
6545 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
6546 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
6547 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
6548 }
6549
// 4 x s16 -> two packed s32 plus two undef s32.
6550 if (StoreVT.getNumElements() == 4) {
6551 SmallVector<Register, 4> PackedRegs;
6552 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
6553 auto Unmerge = B.buildUnmerge(S32, Reg);
6554 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6555 PackedRegs.push_back(Unmerge.getReg(I));
6556 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
6557 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
6558 .getReg(0);
6559 }
6560
6561 llvm_unreachable("invalid data type");
6562 }
6563
// Packed layout without the bug: only 3 x s16 needs padding, to 4 x s16.
6564 if (StoreVT == LLT::fixed_vector(3, S16)) {
6565 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
6566 .getReg(0);
6567 }
6568 return Reg;
6569}
6570
6572 Register VData, LLT MemTy,
6573 bool IsFormat) const {
// Returns a register holding the store data VData in a register type the
// buffer store can use; VData is returned unchanged when no fix-up applies.
6574 MachineRegisterInfo *MRI = B.getMRI();
6575 LLT Ty = MRI->getType(VData);
6576
6577 const LLT S16 = LLT::scalar(16);
6578
6579 // Fixup buffer resources themselves by casting them to a v4i32 value.
// NOTE(review): the condition guarding this return is not visible in this
// listing.
6581 return castBufferRsrcToV4I32(VData, B);
6582
6583 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6584 Ty = getBitcastRegisterType(Ty);
6585 VData = B.buildBitcast(Ty, VData).getReg(0);
6586 }
6587 // Fixup illegal register types for i8 and i16 stores.
6588 if (Ty == LLT::scalar(8) || Ty == S16) {
6589 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
6590 return AnyExt;
6591 }
6592
// Vectors of up to four 16-bit elements get the D16 repacking, but only for
// format stores.
6593 if (Ty.isVector()) {
6594 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
6595 if (IsFormat)
6596 return handleD16VData(B, *MRI, VData);
6597 }
6598 }
6599
6600 return VData;
6601}
6602
6604 LegalizerHelper &Helper,
6605 bool IsTyped,
6606 bool IsFormat) const {
// Rewrites a buffer store intrinsic into one of the G_AMDGPU_*BUFFER_STORE*
// instructions. The opcode is chosen from IsTyped, IsFormat, whether the data
// is 16-bit (D16) and, for plain stores, the memory size. MI is always
// erased and true is returned.
6607 MachineIRBuilder &B = Helper.MIRBuilder;
6608 MachineRegisterInfo &MRI = *B.getMRI();
6609
6610 Register VData = MI.getOperand(1).getReg();
6611 LLT Ty = MRI.getType(VData);
6612 LLT EltTy = Ty.getScalarType();
6613 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6614 const LLT S32 = LLT::scalar(32);
6615
// Only the first memory operand of MI is consulted.
6616 MachineMemOperand *MMO = *MI.memoperands_begin();
6617 const int MemSize = MMO->getSize().getValue();
6618 LLT MemTy = MMO->getMemoryType();
6619
6620 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
6621
6623 Register RSrc = MI.getOperand(2).getReg();
6624
6625 unsigned ImmOffset;
6626
6627 // The typed intrinsics add an immediate after the registers.
6628 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6629
6630 // The struct intrinsic variants add one additional operand over raw.
// The presence of vindex is inferred from the operand count; when absent, a
// zero constant is used and the later operands are not shifted.
6631 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6632 Register VIndex;
6633 int OpOffset = 0;
6634 if (HasVIndex) {
6635 VIndex = MI.getOperand(3).getReg();
6636 OpOffset = 1;
6637 } else {
6638 VIndex = B.buildConstant(S32, 0).getReg(0);
6639 }
6640
6641 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6642 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6643
6644 unsigned Format = 0;
6645 if (IsTyped) {
6646 Format = MI.getOperand(5 + OpOffset).getImm();
6647 ++OpOffset;
6648 }
6649
6650 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6651
// Split the offset operand into a register part and an immediate part.
6652 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6653
6654 unsigned Opc;
6655 if (IsTyped) {
6656 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6657 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6658 } else if (IsFormat) {
6659 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6660 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6661 } else {
// Plain stores: a memory size of 1 or 2 selects the BYTE / SHORT variant.
6662 switch (MemSize) {
6663 case 1:
6664 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6665 break;
6666 case 2:
6667 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6668 break;
6669 default:
6670 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6671 break;
6672 }
6673 }
6674
6675 auto MIB = B.buildInstr(Opc)
6676 .addUse(VData) // vdata
6677 .addUse(RSrc) // rsrc
6678 .addUse(VIndex) // vindex
6679 .addUse(VOffset) // voffset
6680 .addUse(SOffset) // soffset
6681 .addImm(ImmOffset); // offset(imm)
6682
// The format immediate is only present on the typed variants.
6683 if (IsTyped)
6684 MIB.addImm(Format);
6685
6686 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6687 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6688 .addMemOperand(MMO);
6689
6690 MI.eraseFromParent();
6691 return true;
6692}
6693
6694static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6695 Register VIndex, Register VOffset, Register SOffset,
6696 unsigned ImmOffset, unsigned Format,
6697 unsigned AuxiliaryData, MachineMemOperand *MMO,
6698 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6699 auto MIB = B.buildInstr(Opc)
6700 .addDef(LoadDstReg) // vdata
6701 .addUse(RSrc) // rsrc
6702 .addUse(VIndex) // vindex
6703 .addUse(VOffset) // voffset
6704 .addUse(SOffset) // soffset
6705 .addImm(ImmOffset); // offset(imm)
6706
6707 if (IsTyped)
6708 MIB.addImm(Format);
6709
6710 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6711 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6712 .addMemOperand(MMO);
6713}
6714
6716 LegalizerHelper &Helper,
6717 bool IsFormat,
6718 bool IsTyped) const {
6719 MachineIRBuilder &B = Helper.MIRBuilder;
6720 MachineRegisterInfo &MRI = *B.getMRI();
6721 GISelChangeObserver &Observer = Helper.Observer;
6722
6723 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
6724 MachineMemOperand *MMO = *MI.memoperands_begin();
6725 const LLT MemTy = MMO->getMemoryType();
6726 const LLT S32 = LLT::scalar(32);
6727
6728 Register Dst = MI.getOperand(0).getReg();
6729
6730 Register StatusDst;
6731 int OpOffset = 0;
6732 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
6733 bool IsTFE = MI.getNumExplicitDefs() == 2;
6734 if (IsTFE) {
6735 StatusDst = MI.getOperand(1).getReg();
6736 ++OpOffset;
6737 }
6738
6739 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
6740 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6741
6742 // The typed intrinsics add an immediate after the registers.
6743 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6744
6745 // The struct intrinsic variants add one additional operand over raw.
6746 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6747 Register VIndex;
6748 if (HasVIndex) {
6749 VIndex = MI.getOperand(3 + OpOffset).getReg();
6750 ++OpOffset;
6751 } else {
6752 VIndex = B.buildConstant(S32, 0).getReg(0);
6753 }
6754
6755 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6756 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6757
6758 unsigned Format = 0;
6759 if (IsTyped) {
6760 Format = MI.getOperand(5 + OpOffset).getImm();
6761 ++OpOffset;
6762 }
6763
6764 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6765 unsigned ImmOffset;
6766
6767 LLT Ty = MRI.getType(Dst);
6768 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
6769 // logic doesn't have to handle that case.
6770 if (hasBufferRsrcWorkaround(Ty)) {
6771 Observer.changingInstr(MI);
6772 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
6773 Observer.changedInstr(MI);
6774 Dst = MI.getOperand(0).getReg();
6775 B.setInsertPt(B.getMBB(), MI);
6776 }
6777 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6778 Ty = getBitcastRegisterType(Ty);
6779 Observer.changingInstr(MI);
6780 Helper.bitcastDst(MI, Ty, 0);
6781 Observer.changedInstr(MI);
6782 Dst = MI.getOperand(0).getReg();
6783 B.setInsertPt(B.getMBB(), MI);
6784 }
6785
6786 LLT EltTy = Ty.getScalarType();
6787 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6788 const bool Unpacked = ST.hasUnpackedD16VMem();
6789
6790 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6791
6792 unsigned Opc;
6793
6794 // TODO: Support TFE for typed and narrow loads.
6795 if (IsTyped) {
6796 if (IsTFE)
6797 return false;
6798 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6799 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6800 } else if (IsFormat) {
6801 if (IsD16) {
6802 if (IsTFE)
6803 return false;
6804 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6805 } else {
6806 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6807 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6808 }
6809 } else {
6810 switch (MemTy.getSizeInBits()) {
6811 case 8:
6812 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6813 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6814 break;
6815 case 16:
6816 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6817 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6818 break;
6819 default:
6820 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6821 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6822 break;
6823 }
6824 }
6825
6826 if (IsTFE) {
6827 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6828 unsigned NumLoadDWords = NumValueDWords + 1;
6829 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6830 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6831 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6832 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6833 if (MemTy.getSizeInBits() < 32) {
6834 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6835 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6836 B.buildTrunc(Dst, ExtDst);
6837 } else if (NumValueDWords == 1) {
6838 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6839 } else {
6840 SmallVector<Register, 5> LoadElts;
6841 for (unsigned I = 0; I != NumValueDWords; ++I)
6842 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6843 LoadElts.push_back(StatusDst);
6844 B.buildUnmerge(LoadElts, LoadDstReg);
6845 LoadElts.truncate(NumValueDWords);
6846 B.buildMergeLikeInstr(Dst, LoadElts);
6847 }
6848 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6849 (IsD16 && !Ty.isVector())) {
6850 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6851 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6852 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6853 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6854 B.buildTrunc(Dst, LoadDstReg);
6855 } else if (Unpacked && IsD16 && Ty.isVector()) {
6856 LLT UnpackedTy = Ty.changeElementSize(32);
6857 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6858 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6859 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6860 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6861 // FIXME: G_TRUNC should work, but legalization currently fails
6862 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6864 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6865 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6866 B.buildMergeLikeInstr(Dst, Repack);
6867 } else {
6868 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6869 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6870 }
6871
6872 MI.eraseFromParent();
6873 return true;
6874}
6875
6876static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6877 switch (IntrID) {
6878 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6879 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6880 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6881 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6882 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6883 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6884 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6885 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6886 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6887 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6888 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6889 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6890 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6891 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6892 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6893 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6894 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6895 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6896 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6897 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6898 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6899 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6900 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6901 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6902 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6903 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6904 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6905 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6906 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6907 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6908 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6909 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6910 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6911 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6912 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6913 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6914 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6915 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6916 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6917 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6918 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6919 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6920 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6921 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6922 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6923 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6924 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6925 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6926 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6927 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6928 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6929 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6930 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6931 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6932 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6933 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6934 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6935 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6936 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6937 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6938 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6939 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6940 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6941 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6942 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6943 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6944 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6945 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6946 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6947 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6948 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6949 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6950 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6951 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6952 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6953 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6954 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6955 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6956 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6957 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6958 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6959 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6960 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6961 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6962 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
6963 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6964 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
6965 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6966 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
6967 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6968 default:
6969 llvm_unreachable("unhandled atomic opcode");
6970 }
6971}
6972
6975 Intrinsic::ID IID) const {
// NOTE(review): the opening of this signature (return type, function name and
// the parameters preceding IID) is absent from this listing. The body uses MI
// and B, which presumably are those parameters -- confirm against the full
// file.
//
// Replaces the buffer atomic intrinsic instruction MI with the pseudo opcode
// returned by getBufferAtomicPseudo(IID). The new instruction gets, in order:
// dst, vdata, [cmp value for cmpswap], rsrc, vindex, voffset, soffset,
// immediate offset, auxiliary data, an idxen immediate and MI's first memory
// operand. MI is erased and true is returned.
6976 const bool IsCmpSwap =
6977 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6978 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6979 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6980 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6981
6982 Register Dst = MI.getOperand(0).getReg();
6983 // Since we don't have 128-bit atomics, we don't need to handle the case of
6984 // p8 arguments to the atomic itself
6985 Register VData = MI.getOperand(2).getReg();
6986
6987 Register CmpVal;
// OpOffset counts the optional operands seen so far, so the fixed indices
// below can be written relative to the plain (non-cmpswap, raw) layout.
6988 int OpOffset = 0;
6989
6990 if (IsCmpSwap) {
6991 CmpVal = MI.getOperand(3).getReg();
6992 ++OpOffset;
6993 }
6994
6995 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6996 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
// Operand count of the form that carries a vindex; cmpswap has one more
// operand (the compare value) than the other atomics.
6997 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6998
6999 // The struct intrinsic variants add one additional operand over raw.
7000 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
7001 Register VIndex;
7002 if (HasVIndex) {
7003 VIndex = MI.getOperand(4 + OpOffset).getReg();
7004 ++OpOffset;
7005 } else {
// No vindex operand present: use a constant zero s32 in its place.
7006 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
7007 }
7008
7009 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
7010 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
7011 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
7012
// Only the first memory operand of MI is carried over to the new instruction.
7013 MachineMemOperand *MMO = *MI.memoperands_begin();
7014
// splitBufferOffsets yields a (register, immediate) pair that replaces the
// original VOffset; presumably it peels a constant part off the offset --
// confirm against its definition.
7015 unsigned ImmOffset;
7016 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
7017
7018 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
7019 .addDef(Dst)
7020 .addUse(VData); // vdata
7021
7022 if (IsCmpSwap)
7023 MIB.addReg(CmpVal);
7024
7025 MIB.addUse(RSrc) // rsrc
7026 .addUse(VIndex) // vindex
7027 .addUse(VOffset) // voffset
7028 .addUse(SOffset) // soffset
7029 .addImm(ImmOffset) // offset(imm)
7030 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
7031 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
7032 .addMemOperand(MMO);
7033
7034 MI.eraseFromParent();
7035 return true;
7036}
7037
7038/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
7039/// vector with s16 typed elements.
///
/// Walks the address operands of \p MI in the index range
/// [Intr->VAddrStart, Intr->VAddrEnd), offset by \p ArgOffset, and appends one
/// <2 x s16> register per produced dword to \p PackedAddrs.
///
/// NOTE(review): the first signature line (listing line 7040) and the
/// parameter on listing line 7043 are absent from this listing. The body uses
/// B, MI and Intr, which presumably are declared there -- confirm against the
/// full file.
7041 SmallVectorImpl<Register> &PackedAddrs,
7042 unsigned ArgOffset,
7044 bool IsA16, bool IsG16) {
7045 const LLT S16 = LLT::scalar(16);
7046 const LLT V2S16 = LLT::fixed_vector(2, 16);
7047 auto EndIdx = Intr->VAddrEnd;
7048
7049 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
7050 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
7051 if (!SrcOp.isReg())
7052 continue; // _L to _LZ may have eliminated this.
7053
7054 Register AddrReg = SrcOp.getReg();
7055
// Operands that are not packed pairwise: everything before the gradients,
// gradients when G16 is off, and coordinates when A16 is off.
7056 if ((I < Intr->GradientStart) ||
7057 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
7058 (I >= Intr->CoordStart && !IsA16)) {
7059 if ((I < Intr->GradientStart) && IsA16 &&
7060 (B.getMRI()->getType(AddrReg) == S16)) {
7061 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
7062 // Special handling of bias when A16 is on. Bias is of type half but
7063 // occupies full 32-bit.
7064 PackedAddrs.push_back(
7065 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7066 .getReg(0));
7067 } else {
7068 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
7069 "Bias needs to be converted to 16 bit in A16 mode");
7070 // Handle any gradient or coordinate operands that should not be packed
// The operand is reinterpreted as <2 x s16> so every entry of PackedAddrs has
// the same type.
7071 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
7072 PackedAddrs.push_back(AddrReg);
7073 }
7074 } else {
7075 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
7076 // derivatives dx/dh and dx/dv are packed with undef.
// Pad with undef when this is the last address operand, when it is the final
// element of either half of an odd-sized gradient set, or when the following
// operand is no longer a register.
7077 if (((I + 1) >= EndIdx) ||
7078 ((Intr->NumGradients / 2) % 2 == 1 &&
7079 (I == static_cast<unsigned>(Intr->GradientStart +
7080 (Intr->NumGradients / 2) - 1) ||
7081 I == static_cast<unsigned>(Intr->GradientStart +
7082 Intr->NumGradients - 1))) ||
7083 // Check for _L to _LZ optimization
7084 !MI.getOperand(ArgOffset + I + 1).isReg()) {
7085 PackedAddrs.push_back(
7086 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7087 .getReg(0));
7088 } else {
// Pair this operand with the next one; the extra increment skips the operand
// that was just consumed as the high half.
7089 PackedAddrs.push_back(
7090 B.buildBuildVector(
7091 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
7092 .getReg(0));
7093 ++I;
7094 }
7095 }
7096 }
7097}
7098
7099/// Convert from separate vaddr components to a single vector address register,
7100/// and replace the remaining operands with $noreg.
///
/// The address operands are the \p NumVAddrs operands of MI starting at
/// operand index \p DimIdx.
///
/// NOTE(review): the first signature line (listing line 7101) is absent from
/// this listing. The body uses B and MI, which presumably are declared there
/// -- confirm against the full file.
7102 int DimIdx, int NumVAddrs) {
7103 const LLT S32 = LLT::scalar(32);
// S32 is only referenced inside the assert below; the cast keeps it "used"
// when asserts are compiled out.
7104 (void)S32;
7105 SmallVector<Register, 8> AddrRegs;
// Collect every address operand that is still a register.
7106 for (int I = 0; I != NumVAddrs; ++I) {
7107 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
7108 if (SrcOp.isReg()) {
7109 AddrRegs.push_back(SrcOp.getReg());
7110 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
7111 }
7112 }
7113
// With exactly one register there is nothing to merge; otherwise the first
// address operand is replaced by a build_vector of all collected registers.
7114 int NumAddrRegs = AddrRegs.size();
7115 if (NumAddrRegs != 1) {
7116 auto VAddr =
7117 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
7118 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
7119 }
7120
// Every register operand after the first is cleared to NoRegister.
7121 for (int I = 1; I != NumVAddrs; ++I) {
7122 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
7123 if (SrcOp.isReg())
7124 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
7125 }
7126}
7127
7128/// Rewrite image intrinsics to use register layouts expected by the subtarget.
7129///
7130/// Depending on the subtarget, load/store with 16-bit element data need to be
7131/// rewritten to use the low half of 32-bit registers, or directly use a packed
7132/// layout. 16-bit addresses should also sometimes be packed into 32-bit
7133/// registers.
7134///
7135/// We don't want to directly select image instructions just yet, but also want
7136/// to expose all register repacking to the legalizer/combiners. We also don't
7137/// want a selected instruction entering RegBankSelect. In order to avoid
7138/// defining a multitude of intermediate image instructions, directly hack on
7139/// the intrinsic's arguments. In cases like a16 addresses, this requires
7140/// padding now unnecessary arguments with $noreg.
///
/// Returns false when the instruction uses a combination that is rejected
/// below (see the individual `return false` sites), true otherwise.
///
/// NOTE(review): the first signature lines (listing lines 7141-7142) are
/// absent from this listing. The body uses MI, B and Observer, which
/// presumably are declared there -- confirm against the full file.
7143 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
7144
7145 const MachineFunction &MF = *MI.getMF();
7146 const unsigned NumDefs = MI.getNumExplicitDefs();
// Intr's operand indices are relative to ArgOffset: the defs plus one further
// operand are skipped (presumably the intrinsic ID -- confirm).
7147 const unsigned ArgOffset = NumDefs + 1;
// A second explicit def is the TFE status result.
7148 bool IsTFE = NumDefs == 2;
7149 // We are only processing the operands of d16 image operations on subtargets
7150 // that use the unpacked register layout, or need to repack the TFE result.
7151
7152 // TODO: Do we need to guard against already legalized intrinsics?
// NOTE(review): the initializer of BaseOpcode (listing line 7154) is missing
// from this listing.
7153 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7155
7156 MachineRegisterInfo *MRI = B.getMRI();
7157 const LLT S32 = LLT::scalar(32);
7158 const LLT S16 = LLT::scalar(16);
7159 const LLT V2S16 = LLT::fixed_vector(2, 16);
7160
7161 unsigned DMask = 0;
7162 Register VData;
7163 LLT Ty;
7164
// The data register is operand 0 when the instruction has a def, otherwise
// operand 1. Ty stays default-constructed when this branch is not taken.
7165 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
7166 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
7167 Ty = MRI->getType(VData);
7168 }
7169
7170 const bool IsAtomicPacked16Bit =
7171 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7172 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7173
7174 // Check for 16 bit addresses and pack if true.
7175 LLT GradTy =
7176 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
7177 LLT AddrTy =
7178 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
// With G16 support, G16 additionally requires the opcode to have gradients;
// without it, only the type of the operand at GradientStart is checked.
7179 const bool IsG16 =
7180 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
7181 const bool IsA16 = AddrTy == S16;
7182 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
7183
// DMaskLanes is the number of data components selected by the dmask; it stays
// 0 for atomics.
7184 int DMaskLanes = 0;
7185 if (!BaseOpcode->Atomic) {
7186 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
7187 if (BaseOpcode->Gather4) {
7188 DMaskLanes = 4;
7189 } else if (DMask != 0) {
7190 DMaskLanes = llvm::popcount(DMask);
7191 } else if (!IsTFE && !BaseOpcode->Store) {
7192 // If dmask is 0, this is a no-op load. This can be eliminated.
7193 B.buildUndef(MI.getOperand(0));
7194 MI.eraseFromParent();
7195 return true;
7196 }
7197 }
7198
// From here on MI is modified in place. The scope_exit object issues the
// matching changedInstr notification on every return path below.
7199 Observer.changingInstr(MI);
7200 scope_exit ChangedInstr([&] { Observer.changedInstr(MI); });
7201
7202 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7203 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7204 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7205 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
// Opcode choice: store variant for stores, the NORET load for other opcodes
// without a result, the load variant otherwise.
7206 unsigned NewOpcode = LoadOpcode;
7207 if (BaseOpcode->Store)
7208 NewOpcode = StoreOpcode;
7209 else if (BaseOpcode->NoReturn)
7210 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7211
7212 // Track that we legalized this
7213 MI.setDesc(B.getTII().get(NewOpcode));
7214
7215 // Expecting to get an error flag since TFC is on - and dmask is 0. Force
7216 // dmask to be at least 1 otherwise the instruction will fail
7217 if (IsTFE && DMask == 0) {
7218 DMask = 0x1;
7219 DMaskLanes = 1;
7220 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
7221 }
7222
7223 if (BaseOpcode->Atomic) {
7224 Register VData0 = MI.getOperand(2).getReg();
// Note: this Ty shadows the outer Ty for the duration of the atomic block.
7225 LLT Ty = MRI->getType(VData0);
7226
7227 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
7228 if (Ty.isVector() && !IsAtomicPacked16Bit)
7229 return false;
7230
7231 if (BaseOpcode->AtomicX2) {
7232 Register VData1 = MI.getOperand(3).getReg();
7233 // The two values are packed in one register.
7234 LLT PackedTy = LLT::fixed_vector(2, Ty);
7235 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
7236 MI.getOperand(2).setReg(Concat.getReg(0));
7237 MI.getOperand(3).setReg(AMDGPU::NoRegister);
7238 }
7239 }
7240
7241 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
7242
7243 // Rewrite the addressing register layout before doing anything else.
7244 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7245 // 16 bit gradients are supported, but are tied to the A16 control
7246 // so both gradients and addresses must be 16 bit
7247 return false;
7248 }
7249
7250 if (IsA16 && !ST.hasA16()) {
7251 // A16 not supported
7252 return false;
7253 }
7254
7255 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
7256 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7257
7258 if (IsA16 || IsG16) {
7259 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
7260 // instructions expect VGPR_32
7261 SmallVector<Register, 4> PackedRegs;
7262
7263 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
7264
7265 // See also below in the non-a16 branch
7266 const bool UseNSA = ST.hasNSAEncoding() &&
7267 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
7268 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
7269 const bool UsePartialNSA =
7270 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
7271
7272 if (UsePartialNSA) {
7273 // Pack registers that would go over NSAMaxSize into last VAddr register
7274 LLT PackedAddrTy =
7275 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
7276 auto Concat = B.buildConcatVectors(
7277 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7278 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
7279 PackedRegs.resize(NSAMaxSize);
7280 } else if (!UseNSA && PackedRegs.size() > 1) {
// Without NSA all packed dwords are concatenated into a single register.
7281 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
7282 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
7283 PackedRegs[0] = Concat.getReg(0);
7284 PackedRegs.resize(1);
7285 }
7286
// Write the packed registers back over the leading address operands and clear
// the remaining register operands to NoRegister.
7287 const unsigned NumPacked = PackedRegs.size();
7288 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
7289 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
7290 if (!SrcOp.isReg()) {
7291 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
7292 continue;
7293 }
7294
7295 assert(SrcOp.getReg() != AMDGPU::NoRegister);
7296
7297 if (I - Intr->VAddrStart < NumPacked)
7298 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
7299 else
7300 SrcOp.setReg(AMDGPU::NoRegister);
7301 }
7302 } else {
7303 // If the register allocator cannot place the address registers contiguously
7304 // without introducing moves, then using the non-sequential address encoding
7305 // is always preferable, since it saves VALU instructions and is usually a
7306 // wash in terms of code size or even better.
7307 //
7308 // However, we currently have no way of hinting to the register allocator
7309 // that MIMG addresses should be placed contiguously when it is possible to
7310 // do so, so force non-NSA for the common 2-address case as a heuristic.
7311 //
7312 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7313 // allocation when possible.
7314 //
7315 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7316 // set of the remaining addresses.
7317 const bool UseNSA = ST.hasNSAEncoding() &&
7318 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7319 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7320 const bool UsePartialNSA =
7321 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7322
// NOTE(review): the start of the call in the UsePartialNSA branch (listing
// line 7324) is missing from this listing; only its trailing arguments are
// visible.
7323 if (UsePartialNSA) {
7325 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
7326 Intr->NumVAddrs - NSAMaxSize + 1);
7327 } else if (!UseNSA && Intr->NumVAddrs > 1) {
7328 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
7329 Intr->NumVAddrs);
7330 }
7331 }
7332
// Append a trailing immediate recording the 16-bit modes: bit 0 is set for
// A16, bit 1 for G16.
7333 int Flags = 0;
7334 if (IsA16)
7335 Flags |= 1;
7336 if (IsG16)
7337 Flags |= 2;
7338 MI.addOperand(MachineOperand::CreateImm(Flags));
7339
7340 if (BaseOpcode->NoReturn) { // No TFE for stores?
7341 // TODO: Handle dmask trim
// Only vector D16 data needs repacking; everything else is already done.
7342 if (!Ty.isVector() || !IsD16)
7343 return true;
7344
7345 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
7346 if (RepackedReg != VData) {
7347 MI.getOperand(1).setReg(RepackedReg);
7348 }
7349
7350 return true;
7351 }
7352
// Everything below deals with the result register(s) of a returning
// instruction.
7353 Register DstReg = MI.getOperand(0).getReg();
7354 const LLT EltTy = Ty.getScalarType();
7355 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7356
7357 // Confirm that the return type is large enough for the dmask specified
7358 if (NumElts < DMaskLanes)
7359 return false;
7360
7361 if (NumElts > 4 || DMaskLanes > 4)
7362 return false;
7363
7364 // Image atomic instructions are using DMask to specify how many bits
7365 // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16).
7366 // DMaskLanes for image atomic has default value '0'.
7367 // We must be sure that atomic variants (especially packed) will not be
7368 // truncated from v2s16 or v4s16 to s16 type.
7369 //
7370 // ChangeElementCount will be needed for image load where Ty is always scalar.
7371 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7372 const LLT AdjustedTy =
7373 DMaskLanes == 0
7374 ? Ty
7375 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
7376
7377 // The raw dword aligned data component of the load. The only legal cases
7378 // where this matters should be when using the packed D16 format, for
7379 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
7380 LLT RoundedTy;
7381
7382 // S32 vector to cover all data, plus TFE result element.
7383 LLT TFETy;
7384
7385 // Register type to use for each loaded component. Will be S32 or V2S16.
7386 LLT RegTy;
7387
7388 if (IsD16 && ST.hasUnpackedD16VMem()) {
// Unpacked D16: each 16-bit component occupies its own 32-bit element.
7389 RoundedTy =
7390 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
7391 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
7392 RegTy = S32;
7393 } else {
// Otherwise round the data size up to a whole number of dwords.
7394 unsigned EltSize = EltTy.getSizeInBits();
7395 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
7396 unsigned RoundedSize = 32 * RoundedElts;
7397 RoundedTy = LLT::scalarOrVector(
7398 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
7399 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
7400 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
7401 }
7402
7403 // The return type does not need adjustment.
7404 // TODO: Should we change s16 case to s32 or <2 x s16>?
7405 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
7406 return true;
7407
7408 Register Dst1Reg;
7409
7410 // Insert after the instruction.
7411 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
7412
7413 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
7414 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
7415 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7416 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
7417
7418 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
7419
7420 MI.getOperand(0).setReg(NewResultReg);
7421
7422 // In the IR, TFE is supposed to be used with a 2 element struct return
7423 // type. The instruction really returns these two values in one contiguous
7424 // register, with one additional dword beyond the loaded data. Rewrite the
7425 // return type to use a single register result.
7426
7427 if (IsTFE) {
7428 Dst1Reg = MI.getOperand(1).getReg();
7429 if (MRI->getType(Dst1Reg) != S32)
7430 return false;
7431
7432 // TODO: Make sure the TFE operand bit is set.
7433 MI.removeOperand(1);
7434
7435 // Handle the easy case that requires no repack instructions.
7436 if (Ty == S32) {
7437 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7438 return true;
7439 }
7440 }
7441
7442 // Now figure out how to copy the new result register back into the old
7443 // result.
// Every slot starts out as Dst1Reg; the data slots are overwritten below, so
// for TFE the last unmerge result lands directly in the status register.
7444 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
7445
7446 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7447
7448 if (ResultNumRegs == 1) {
7449 assert(!IsTFE);
7450 ResultRegs[0] = NewResultReg;
7451 } else {
7452 // We have to repack into a new vector of some kind.
7453 for (int I = 0; I != NumDataRegs; ++I)
7454 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
7455 B.buildUnmerge(ResultRegs, NewResultReg);
7456
7457 // Drop the final TFE element to get the data part. The TFE result is
7458 // directly written to the right place already.
7459 if (IsTFE)
7460 ResultRegs.resize(NumDataRegs);
7461 }
7462
7463 // For an s16 scalar result, we form an s32 result with a truncate regardless
7464 // of packed vs. unpacked.
7465 if (IsD16 && !Ty.isVector()) {
7466 B.buildTrunc(DstReg, ResultRegs[0]);
7467 return true;
7468 }
7469
7470 // Avoid a build/concat_vector of 1 entry.
7471 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7472 B.buildBitcast(DstReg, ResultRegs[0]);
7473 return true;
7474 }
7475
7476 assert(Ty.isVector());
7477
7478 if (IsD16) {
7479 // For packed D16 results with TFE enabled, all the data components are
7480 // S32. Cast back to the expected type.
7481 //
7482 // TODO: We don't really need to use load s32 elements. We would only need one
7483 // cast for the TFE result if a multiple of v2s16 was used.
7484 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
7485 for (Register &Reg : ResultRegs)
7486 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
7487 } else if (ST.hasUnpackedD16VMem()) {
7488 for (Register &Reg : ResultRegs)
7489 Reg = B.buildTrunc(S16, Reg).getReg(0);
7490 }
7491 }
7492
// Appends NumElts copies of one undef register of type Ty to ResultRegs.
// Callers pass the number of missing elements; a negative count is not
// guarded against here.
7493 auto padWithUndef = [&](LLT Ty, int NumElts) {
7494 if (NumElts == 0)
7495 return;
7496 Register Undef = B.buildUndef(Ty).getReg(0);
7497 for (int I = 0; I != NumElts; ++I)
7498 ResultRegs.push_back(Undef);
7499 };
7500
7501 // Pad out any elements eliminated due to the dmask.
7502 LLT ResTy = MRI->getType(ResultRegs[0]);
7503 if (!ResTy.isVector()) {
7504 padWithUndef(ResTy, NumElts - ResultRegs.size());
7505 B.buildBuildVector(DstReg, ResultRegs);
7506 return true;
7507 }
7508
// Only the packed D16 layout reaches this point, with <2 x s16> pieces.
7509 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
7510 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7511
7512 // Deal with the one annoying legal case.
7513 const LLT V3S16 = LLT::fixed_vector(3, 16);
7514 if (Ty == V3S16) {
7515 if (IsTFE) {
7516 if (ResultRegs.size() == 1) {
7517 NewResultReg = ResultRegs[0];
7518 } else if (ResultRegs.size() == 2) {
7519 LLT V4S16 = LLT::fixed_vector(4, 16);
7520 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
7521 } else {
7522 return false;
7523 }
7524 }
7525
// Trim or pad NewResultReg to the three elements of the destination.
7526 if (MRI->getType(DstReg).getNumElements() <
7527 MRI->getType(NewResultReg).getNumElements()) {
7528 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7529 } else {
7530 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7531 }
7532 return true;
7533 }
7534
7535 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
7536 B.buildConcatVectors(DstReg, ResultRegs);
7537 return true;
7538}
7539
7541 MachineInstr &MI) const {
// NOTE(review): the first signature line (listing line 7540) is absent from
// this listing. The body uses Helper, which presumably is declared there --
// confirm against the full file.
//
// Converts MI in place into one of the G_AMDGPU_S_BUFFER_LOAD* pseudos: picks
// the opcode from the result size, drops operand 1, attaches a memory operand
// and adjusts the result type where needed. Always returns true.
7542 MachineIRBuilder &B = Helper.MIRBuilder;
7543 GISelChangeObserver &Observer = Helper.Observer;
7544
7545 Register OrigDst = MI.getOperand(0).getReg();
7546 Register Dst;
7547 LLT Ty = B.getMRI()->getType(OrigDst);
7548 unsigned Size = Ty.getSizeInBits();
7549 MachineFunction &MF = B.getMF();
7550 unsigned Opc = 0;
7551 if (Size < 32 && ST.hasScalarSubwordLoads()) {
7552 assert(Size == 8 || Size == 16);
7553 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7554 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7555 // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
7556 // destination register.
7557 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
7558 } else {
7559 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7560 Dst = OrigDst;
7561 }
7562
7563 Observer.changingInstr(MI);
7564
7565 // Handle needing to s.buffer.load() a p8 value.
7566 if (hasBufferRsrcWorkaround(Ty)) {
7567 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
7568 B.setInsertPt(B.getMBB(), MI);
7569 }
// NOTE(review): the line opening the block below (listing line 7570,
// presumably an `if (...) {` guarding the bitcast) is missing from this
// listing.
7571 Ty = getBitcastRegisterType(Ty);
7572 Helper.bitcastDst(MI, Ty, 0);
7573 B.setInsertPt(B.getMBB(), MI);
7574 }
7575
7576 // FIXME: We don't really need this intermediate instruction. The intrinsic
7577 // should be fixed to have a memory operand. Since it's readnone, we're not
7578 // allowed to add one.
7579 MI.setDesc(B.getTII().get(Opc));
7580 MI.removeOperand(1); // Remove intrinsic ID
7581
7582 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
// Memory size in bytes, rounded up from the result size in bits.
7583 const unsigned MemSize = (Size + 7) / 8;
// NOTE(review): listing lines 7585-7589 (the argument of getABITypeAlign and
// the start of the statement that creates MMO) are missing from this listing.
7584 const Align MemAlign = B.getDataLayout().getABITypeAlign(
7590 MemSize, MemAlign);
7591 MI.addMemOperand(MF, MMO);
// Sub-dword case: the load now defines the fresh 32-bit register and a
// truncate inserted after it produces the original destination.
7592 if (Dst != OrigDst) {
7593 MI.getOperand(0).setReg(Dst);
7594 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
7595 B.buildTrunc(OrigDst, Dst);
7596 }
7597
7598 // If we don't have 96-bit result scalar loads, widening to 128-bit should
7599 // always be legal. We may need to restore this to a 96-bit result if it turns
7600 // out this needs to be converted to a vector load during RegBankSelect.
7601 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
// NOTE(review): the statement of the vector branch (listing line 7603) is
// missing from this listing.
7602 if (Ty.isVector())
7604 else
7605 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
7606 }
7607
7608 Observer.changedInstr(MI);
7609 return true;
7610}
7611
7613 MachineInstr &MI) const {
// NOTE(review): the first signature line (listing line 7612) is absent from
// this listing. The body uses Helper, which presumably is declared there --
// confirm against the full file.
//
// Retargets MI in place to G_AMDGPU_S_BUFFER_PREFETCH and removes its
// operand 0, wrapped in the observer change notifications. Always returns
// true.
7614 MachineIRBuilder &B = Helper.MIRBuilder;
7615 GISelChangeObserver &Observer = Helper.Observer;
7616 Observer.changingInstr(MI);
7617 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7618 MI.removeOperand(0); // Remove intrinsic ID
// NOTE(review): listing line 7619 is missing from this listing; whatever
// statement it held runs before the changedInstr notification.
7620 Observer.changedInstr(MI);
7621 return true;
7622}
7623
7624// TODO: Move to selection
// NOTE(review): the first signature lines (listing lines 7625-7626) are
// absent from this listing. The body uses MI and MRI, which presumably are
// declared there -- confirm against the full file.
7627 MachineIRBuilder &B) const {
// Without a trap handler, or with a trap handler ABI other than AMDHSA, the
// trap is lowered through legalizeTrapEndpgm.
7628 if (!ST.hasTrapHandler() ||
7629 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7630 return legalizeTrapEndpgm(MI, MRI, B);
7631
// NOTE(review): both arms of this conditional expression (listing line 7633)
// are missing from this listing; only the supportsGetDoorbellID() test is
// visible.
7632 return ST.supportsGetDoorbellID() ?
7634}
7635
// NOTE(review): the signature of this definition (listing lines 7636-7637) is
// absent from this listing. The body uses MI and B, which presumably are
// parameters -- confirm against the full file.
//
// Lowers the trap instruction MI to an S_ENDPGM with immediate 0, either in
// place at the end of the block or in a separate block, and erases MI.
// Always returns true.
7638 const DebugLoc &DL = MI.getDebugLoc();
7639 MachineBasicBlock &BB = B.getMBB();
7640 MachineFunction *MF = BB.getParent();
7641
// Simple case: MI is the last instruction of a block without successors, so
// S_ENDPGM can be appended directly at the end of that block.
7642 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
7643 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7644 .addImm(0);
7645 MI.eraseFromParent();
7646 return true;
7647 }
7648
7649 // We need a block split to make the real endpgm a terminator. We also don't
7650 // want to break phis in successor blocks, so we can't just delete to the
7651 // end of the block.
7652 BB.splitAt(MI, false /*UpdateLiveIns*/);
// NOTE(review): the statement that creates TrapBB (listing line 7653) is
// missing from this listing.
7654 MF->push_back(TrapBB);
// TrapBB holds only the S_ENDPGM; BB gets an S_CBRANCH_EXECNZ to it, inserted
// before MI, and TrapBB is recorded as a successor of BB.
7655 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7656 .addImm(0);
7657 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7658 .addMBB(TrapBB);
7659
7660 BB.addSuccessor(TrapBB);
7661 MI.eraseFromParent();
7662 return true;
7663}
7664
// Emits an S_TRAP with the LLVMAMDHSATrap ID that implicitly reads
// SGPR0_SGPR1, after copying a 64-bit value into that register pair, then
// erases MI. Two paths are visible:
//  - a path that loads the value from the kernarg segment at an implicit
//    parameter offset (per the existing comment, used for code object
//    version 5, where queue_ptr is passed through an implicit kernarg);
//  - a fallback that copies a live-in value (the queue pointer, per the
//    existing comment) into SGPR0_SGPR1.
// Returns false if loadInputValue() fails on the first path, and on a
// condition of the second path that is not visible here; otherwise true.
// NOTE(review): the signature and many statements (the condition guarding the
// first path, the Param/Offset/PtrInfo/MMO/LoadAddr declarations and the
// initializer of LiveIn) are absent from this listing - consult the real file
// before relying on this description.
7667 MachineFunction &MF = B.getMF();
7668 const LLT S64 = LLT::scalar(64);
7669
7670 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7671 // For code object version 5, queue_ptr is passed through implicit kernarg.
7677 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
7678
7679 Register KernargPtrReg = MRI.createGenericVirtualRegister(
7681
7682 if (!loadInputValue(KernargPtrReg, B,
7684 return false;
7685
7686 // TODO: can we be smarter about machine pointer info?
7689 PtrInfo.getWithOffset(Offset),
7693
7694 // Pointer address
7697 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7698 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
7699 // Load address
7700 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
7701 B.buildCopy(SGPR01, Temp);
7702 B.buildInstr(AMDGPU::S_TRAP)
7703 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7704 .addReg(SGPR01, RegState::Implicit);
7705 MI.eraseFromParent();
7706 return true;
7707 }
7708
7709 // Pass queue pointer to trap handler as input, and insert trap instruction
7710 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
7711 Register LiveIn =
7714 return false;
7715
7716 B.buildCopy(SGPR01, LiveIn);
7717 B.buildInstr(AMDGPU::S_TRAP)
7718 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7719 .addReg(SGPR01, RegState::Implicit);
7720
7721 MI.eraseFromParent();
7722 return true;
7723 }
7724
// Replaces MI with a trap that takes no register input. Always returns true.
//  - On subtargets where ST.hasPrivEnabledTrap2NopBug() holds, the trap is
//    produced by SIInstrInfo's insertSimulatedTrap() instead of S_TRAP.
//  - Otherwise a plain S_TRAP with the LLVMAMDHSATrap ID is built.
// NOTE(review): the first lines of the signature are absent from this
// listing, so the function name is not visible here.
7727 MachineIRBuilder &B) const {
7728 // We need to simulate the 's_trap 2' instruction on targets that run in
7729 // PRIV=1 (where it is treated as a nop).
7730 if (ST.hasPrivEnabledTrap2NopBug()) {
7731 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
7732 MI.getDebugLoc());
7733 MI.eraseFromParent();
7734 return true;
7735 }
7736
7737 B.buildInstr(AMDGPU::S_TRAP)
7738 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7739 MI.eraseFromParent();
7740 return true;
7741 }
7742
// Lowers a debug trap. MI is erased and true is returned on both paths.
//  - Without a trap handler, or with a non-AMDHSA trap handler ABI, no
//    instruction is emitted; a DS_Warning diagnostic with the text
//    "debugtrap handler not supported" is constructed instead.
//  - Otherwise an S_TRAP with the LLVMAMDHSADebugTrap ID is built.
// NOTE(review): the first lines of the signature and the line that opens the
// diagnostic call are absent from this listing.
7745 MachineIRBuilder &B) const {
7746 // Is non-HSA path or trap-handler disabled? Then, report a warning
7747 // accordingly
7748 if (!ST.hasTrapHandler() ||
7749 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7750 Function &Fn = B.getMF().getFunction();
7752 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7753 } else {
7754 // Insert debug-trap instruction
7755 B.buildInstr(AMDGPU::S_TRAP)
7756 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7757 }
7758
7759 MI.eraseFromParent();
7760 return true;
7761 }
7762
// Replaces MI with a G_AMDGPU_BVH_INTERSECT_RAY pseudo whose operands are:
// the result register, the selected MIMG opcode as an immediate, the prepared
// address registers, the descriptor register (TDescr), an A16 flag immediate,
// and MI's memory operands. MI is erased and true is returned.
// Returns false (after constructing an "intrinsic not supported on subtarget"
// diagnostic) when the subtarget lacks the GFX10 A encoding.
// NOTE(review): the opening line of the signature, the line opening the
// diagnostic call, and the declarations of Ops and R1/R2/R3 are absent from
// this listing.
7764 MachineInstr &MI, MachineIRBuilder &B) const {
7765 MachineRegisterInfo &MRI = *B.getMRI();
7766 const LLT S16 = LLT::scalar(16);
7767 const LLT S32 = LLT::scalar(32);
7768 const LLT V2S16 = LLT::fixed_vector(2, 16);
7769 const LLT V3S32 = LLT::fixed_vector(3, 32);
7770
// Operand 1 is not read here; the named inputs start at operand 2.
7771 Register DstReg = MI.getOperand(0).getReg();
7772 Register NodePtr = MI.getOperand(2).getReg();
7773 Register RayExtent = MI.getOperand(3).getReg();
7774 Register RayOrigin = MI.getOperand(4).getReg();
7775 Register RayDir = MI.getOperand(5).getReg();
7776 Register RayInvDir = MI.getOperand(6).getReg();
7777 Register TDescr = MI.getOperand(7).getReg();
7778
7779 if (!ST.hasGFX10_AEncoding()) {
7780 Function &Fn = B.getMF().getFunction();
7782 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7783 return false;
7784 }
7785
// IsA16: the ray direction has 16-bit elements. Is64: the node pointer is
// 64 bits wide. Both feed the dword counts and the opcode table below.
7786 const bool IsGFX11 = AMDGPU::isGFX11(ST);
7787 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7788 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
7789 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7790 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7791 const unsigned NumVDataDwords = 4;
7792 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7793 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7794 const bool UseNSA =
7795 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7796
// Base opcode table indexed as [Is64][IsA16].
7797 const unsigned BaseOpcodes[2][2] = {
7798 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7799 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7800 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7801 int Opcode;
7802 if (UseNSA) {
7803 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7804 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7805 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7806 : AMDGPU::MIMGEncGfx10NSA,
7807 NumVDataDwords, NumVAddrDwords);
7808 } else {
7809 assert(!IsGFX12Plus);
7810 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7811 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7812 : AMDGPU::MIMGEncGfx10Default,
7813 NumVDataDwords, NumVAddrDwords);
7814 }
7815 assert(Opcode != -1);
7816
// NSA on GFX11+: NodePtr and RayExtent are passed as-is, and each 3-component
// input is rebuilt as one V3S32 register.
7818 if (UseNSA && IsGFX11Plus) {
7819 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7820 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7821 auto Merged = B.buildMergeLikeInstr(
7822 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7823 Ops.push_back(Merged.getReg(0));
7824 };
7825
7826 Ops.push_back(NodePtr);
7827 Ops.push_back(RayExtent);
7828 packLanes(RayOrigin);
7829
7830 if (IsA16) {
// With 16-bit components, element i of the single V3S32 operand is the pair
// {RayInvDir[i], RayDir[i]} bitcast to S32.
7831 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7832 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7833 auto MergedDir = B.buildMergeLikeInstr(
7834 V3S32,
7835 {B.buildBitcast(
7836 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7837 UnmergeRayDir.getReg(0)}))
7838 .getReg(0),
7839 B.buildBitcast(
7840 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7841 UnmergeRayDir.getReg(1)}))
7842 .getReg(0),
7843 B.buildBitcast(
7844 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7845 UnmergeRayDir.getReg(2)}))
7846 .getReg(0)});
7847 Ops.push_back(MergedDir.getReg(0));
7848 } else {
7849 packLanes(RayDir);
7850 packLanes(RayInvDir);
7851 }
7852 } else {
// All other configurations: every address component is pushed as its own
// 32-bit register (a 64-bit node pointer is split into two).
7853 if (Is64) {
7854 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7855 Ops.push_back(Unmerge.getReg(0));
7856 Ops.push_back(Unmerge.getReg(1));
7857 } else {
7858 Ops.push_back(NodePtr);
7859 }
7860 Ops.push_back(RayExtent);
7861
7862 auto packLanes = [&Ops, &S32, &B](Register Src) {
7863 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7864 Ops.push_back(Unmerge.getReg(0));
7865 Ops.push_back(Unmerge.getReg(1));
7866 Ops.push_back(Unmerge.getReg(2));
7867 };
7868
7869 packLanes(RayOrigin);
7870 if (IsA16) {
// The six 16-bit components are packed in order into three registers:
// R1 = {Dir[0], Dir[1]}, R2 = {Dir[2], InvDir[0]}, R3 = {InvDir[1], InvDir[2]}.
7871 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7872 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7876 B.buildMergeLikeInstr(R1,
7877 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7878 B.buildMergeLikeInstr(
7879 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7880 B.buildMergeLikeInstr(
7881 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7882 Ops.push_back(R1);
7883 Ops.push_back(R2);
7884 Ops.push_back(R3);
7885 } else {
7886 packLanes(RayDir);
7887 packLanes(RayInvDir);
7888 }
7889 }
7890
7891 if (!UseNSA) {
7892 // Build a single vector containing all the operands so far prepared.
7893 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7894 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7895 Ops.clear();
7896 Ops.push_back(MergedOps);
7897 }
7898
7899 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7900 .addDef(DstReg)
7901 .addImm(Opcode);
7902
7903 for (Register R : Ops) {
7904 MIB.addUse(R);
7905 }
7906
7907 MIB.addUse(TDescr)
7908 .addImm(IsA16 ? 1 : 0)
7909 .cloneMemRefs(MI);
7910
7911 MI.eraseFromParent();
7912 return true;
7913 }
7914
// Replaces MI with G_AMDGPU_BVH8_INTERSECT_RAY or
// G_AMDGPU_BVH_DUAL_INTERSECT_RAY, chosen by whether MI's intrinsic ID is
// amdgcn_image_bvh8_intersect_ray. The new instruction has three defs
// (operands 0-2 of MI), the selected MIMG opcode as an immediate, and the
// address/descriptor registers, with MI's memory operands cloned onto it.
// MI is erased and true is returned.
// Returns false (after constructing an "intrinsic not supported on subtarget"
// diagnostic) when !ST.hasBVHDualAndBVH8Insts().
// NOTE(review): the opening line of the signature and the line opening the
// diagnostic call are absent from this listing.
7916 MachineInstr &MI, MachineIRBuilder &B) const {
7917 const LLT S32 = LLT::scalar(32);
7918 const LLT V2S32 = LLT::fixed_vector(2, 32);
7919
// Operand 3 is not read here; the named inputs start at operand 4.
7920 Register DstReg = MI.getOperand(0).getReg();
7921 Register DstOrigin = MI.getOperand(1).getReg();
7922 Register DstDir = MI.getOperand(2).getReg();
7923 Register NodePtr = MI.getOperand(4).getReg();
7924 Register RayExtent = MI.getOperand(5).getReg();
7925 Register InstanceMask = MI.getOperand(6).getReg();
7926 Register RayOrigin = MI.getOperand(7).getReg();
7927 Register RayDir = MI.getOperand(8).getReg();
7928 Register Offsets = MI.getOperand(9).getReg();
7929 Register TDescr = MI.getOperand(10).getReg();
7930
7931 if (!ST.hasBVHDualAndBVH8Insts()) {
7932 Function &Fn = B.getMF().getFunction();
7934 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7935 return false;
7936 }
7937
7938 bool IsBVH8 = cast<GIntrinsic>(MI).getIntrinsicID() ==
7939 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7940 const unsigned NumVDataDwords = 10;
7941 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7942 int Opcode = AMDGPU::getMIMGOpcode(
7943 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7944 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7945 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7946 assert(Opcode != -1);
7947
// RayExtent and the any-extended InstanceMask travel together as one V2S32
// operand.
7948 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7949 V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});
7950
7951 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7952 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7953 .addDef(DstReg)
7954 .addDef(DstOrigin)
7955 .addDef(DstDir)
7956 .addImm(Opcode)
7957 .addUse(NodePtr)
7958 .addUse(RayExtentInstanceMaskVec.getReg(0))
7959 .addUse(RayOrigin)
7960 .addUse(RayDir)
7961 .addUse(Offsets)
7962 .addUse(TDescr)
7963 .cloneMemRefs(MI);
7964
7965 MI.eraseFromParent();
7966 return true;
7967 }
7968
// Replaces MI with G_AMDGPU_WAVE_ADDRESS, defining MI's operand 0 from
// StackPtr. MI is erased and true is returned.
// NOTE(review): the opening line of the signature and the declaration of
// StackPtr are absent from this listing; StackPtr is presumably obtained via
// TLI - confirm against the real file.
7970 MachineIRBuilder &B) const {
7971 const SITargetLowering *TLI = ST.getTargetLowering();
7973 Register DstReg = MI.getOperand(0).getReg();
7974 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7975 MI.eraseFromParent();
7976 return true;
7977 }
7978
// Lowers MI to an unsigned bitfield extract of TTMP8: a 5-bit field starting
// at bit 25 is written to MI's operand 0, then MI is erased and true is
// returned. Returns false, leaving MI untouched, when the subtarget has no
// architected SGPRs.
// NOTE(review): the opening line of the signature is absent from this
// listing; presumably this is the helper invoked as legalizeWaveID(MI, B) for
// Intrinsic::amdgcn_wave_id - confirm.
7980 MachineIRBuilder &B) const {
7981 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7982 if (!ST.hasArchitectedSGPRs())
7983 return false;
7984 LLT S32 = LLT::scalar(32);
7985 Register DstReg = MI.getOperand(0).getReg();
7986 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
// LSB = 25 and Width = 5 select bits [29:25].
7987 auto LSB = B.buildConstant(S32, 25);
7988 auto Width = B.buildConstant(S32, 5);
7989 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7990 MI.eraseFromParent();
7991 return true;
7992 }
7993
// Replaces MI with S_GETREG_B32_const, defining MI's operand 0 and taking as
// its immediate the HwregEncoding of (HwReg, LowBit, Width). If the
// destination register has no register class yet, it is given SReg_32.
// MI is erased and true is returned.
// NOTE(review): the first lines of the signature (which introduce MI and B)
// are absent from this listing, so the function name is not visible here.
7996 AMDGPU::Hwreg::Id HwReg,
7997 unsigned LowBit,
7998 unsigned Width) const {
7999 MachineRegisterInfo &MRI = *B.getMRI();
8000 Register DstReg = MI.getOperand(0).getReg();
8001 if (!MRI.getRegClassOrNull(DstReg))
8002 MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
8003 B.buildInstr(AMDGPU::S_GETREG_B32_const)
8004 .addDef(DstReg)
8005 .addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width));
8006 MI.eraseFromParent();
8007 return true;
8008 }
8009
// Immediates passed to the amdgcn_s_getreg / amdgcn_s_setreg intrinsics built
// by the FP environment lowering that follows: FPEnvModeBitField for the
// first 32-bit piece, FPEnvTrapBitField for the second.
// NOTE(review): the initializer of each constant is absent from this listing.
8010 static constexpr unsigned FPEnvModeBitField =
8012
8013 static constexpr unsigned FPEnvTrapBitField =
8015
// Replaces MI with two amdgcn_s_getreg intrinsic calls (immediates
// FPEnvModeBitField and FPEnvTrapBitField), each producing an S32, and merges
// {ModeReg, TrapReg} into MI's operand 0. MI is erased and true is returned.
// Returns false, leaving MI untouched, unless operand 0 has type S64.
// Note that the local named Src is used as the destination of the merge.
// NOTE(review): the first lines of the signature are absent from this
// listing, so the function name is not visible here.
8018 MachineIRBuilder &B) const {
8019 Register Src = MI.getOperand(0).getReg();
8020 if (MRI.getType(Src) != S64)
8021 return false;
8022
8023 auto ModeReg =
8024 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
8025 /*HasSideEffects=*/true, /*isConvergent=*/false)
8026 .addImm(FPEnvModeBitField);
8027 auto TrapReg =
8028 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
8029 /*HasSideEffects=*/true, /*isConvergent=*/false)
8030 .addImm(FPEnvTrapBitField);
8031 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
8032 MI.eraseFromParent();
8033 return true;
8034 }
8035
// Replaces MI with two amdgcn_s_setreg intrinsic calls: MI's operand 0 is
// unmerged into two S32 pieces, the first written with the FPEnvModeBitField
// immediate and the second with the FPEnvTrapBitField immediate. MI is erased
// and true is returned.
// Returns false, leaving MI untouched, unless operand 0 has type S64.
// NOTE(review): the first lines of the signature are absent from this
// listing, so the function name is not visible here.
8038 MachineIRBuilder &B) const {
8039 Register Src = MI.getOperand(0).getReg();
8040 if (MRI.getType(Src) != S64)
8041 return false;
8042
8043 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
// The bit-field immediates are narrowed to int16_t before being attached.
8044 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
8045 /*HasSideEffects=*/true, /*isConvergent=*/false)
8046 .addImm(static_cast<int16_t>(FPEnvModeBitField))
8047 .addReg(Unmerge.getReg(0));
8048 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
8049 /*HasSideEffects=*/true, /*isConvergent=*/false)
8050 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
8051 .addReg(Unmerge.getReg(1));
8052 MI.eraseFromParent();
8053 return true;
8054 }
8055
// Central dispatch for intrinsic legalization. Switches on the intrinsic ID
// of MI (read through GIntrinsic) and, per case, either rewrites MI in place,
// replaces it with newly built instructions, or forwards to a dedicated
// legalize* helper and returns that helper's result.
// Returns true when the instruction was handled or needs no change, false
// when a case rejects it (for example a control-flow intrinsic for which
// verifyCFIntrinsic() finds no branch, or an unsupported subtarget).
// Intrinsics with no explicit case fall to `default`, which forwards image
// intrinsics to legalizeImageIntrinsic() and otherwise returns true.
// NOTE(review): the opening line of the signature and many call lines inside
// the switch (most argument-intrinsic and cluster cases) are absent from this
// listing; cases whose bodies are not visible are left undescribed.
8057 MachineInstr &MI) const {
8058 MachineIRBuilder &B = Helper.MIRBuilder;
8059 MachineRegisterInfo &MRI = *B.getMRI();
8060
8061 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
8062 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
8063 switch (IntrID) {
// sponentry: at the bottom of the stack the value comes from
// G_AMDGPU_SPONENTRY via an int-to-pointer cast; otherwise a fixed frame
// object is created and its frame index is used.
8064 case Intrinsic::sponentry:
8065 if (B.getMF().getInfo<SIMachineFunctionInfo>()->isBottomOfStack()) {
8066 // FIXME: The imported pattern checks for i32 instead of p5; if we fix
8067 // that we can remove this cast.
8068 const LLT S32 = LLT::scalar(32);
8070 B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
8071
8072 Register DstReg = MI.getOperand(0).getReg();
8073 B.buildIntToPtr(DstReg, TmpReg);
8074 MI.eraseFromParent();
8075 } else {
8076 int FI = B.getMF().getFrameInfo().CreateFixedObject(
8077 1, 0, /*IsImmutable=*/false);
8078 B.buildFrameIndex(MI.getOperand(0), FI);
8079 MI.eraseFromParent();
8080 }
8081 return true;
// amdgcn_if / amdgcn_else: the conditional branch returned by
// verifyCFIntrinsic() is replaced by SI_IF / SI_ELSE inserted just before it.
// Both the intrinsic and the original conditional branch are erased.
8082 case Intrinsic::amdgcn_if:
8083 case Intrinsic::amdgcn_else: {
8084 MachineInstr *Br = nullptr;
8085 MachineBasicBlock *UncondBrTarget = nullptr;
8086 bool Negated = false;
8087 if (MachineInstr *BrCond =
8088 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8089 const SIRegisterInfo *TRI
8090 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8091
8092 Register Def = MI.getOperand(1).getReg();
8093 Register Use = MI.getOperand(3).getReg();
8094
8095 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
8096
8097 if (Negated)
8098 std::swap(CondBrTarget, UncondBrTarget);
8099
8100 B.setInsertPt(B.getMBB(), BrCond->getIterator());
8101 if (IntrID == Intrinsic::amdgcn_if) {
8102 B.buildInstr(AMDGPU::SI_IF)
8103 .addDef(Def)
8104 .addUse(Use)
8105 .addMBB(UncondBrTarget);
8106 } else {
8107 B.buildInstr(AMDGPU::SI_ELSE)
8108 .addDef(Def)
8109 .addUse(Use)
8110 .addMBB(UncondBrTarget);
8111 }
8112
8113 if (Br) {
8114 Br->getOperand(0).setMBB(CondBrTarget);
8115 } else {
8116 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
8117 // since we're swapping branch targets it needs to be reinserted.
8118 // FIXME: IRTranslator should probably not do this
8119 B.buildBr(*CondBrTarget);
8120 }
8121
// Both mask registers are constrained to the wave mask register class.
8122 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
8123 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
8124 MI.eraseFromParent();
8125 BrCond->eraseFromParent();
8126 return true;
8127 }
8128
8129 return false;
8130 }
// amdgcn_loop: same shape as the if/else case, but emits SI_LOOP with a
// single mask register (operand 2).
8131 case Intrinsic::amdgcn_loop: {
8132 MachineInstr *Br = nullptr;
8133 MachineBasicBlock *UncondBrTarget = nullptr;
8134 bool Negated = false;
8135 if (MachineInstr *BrCond =
8136 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8137 const SIRegisterInfo *TRI
8138 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8139
8140 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
8141 Register Reg = MI.getOperand(2).getReg();
8142
8143 if (Negated)
8144 std::swap(CondBrTarget, UncondBrTarget);
8145
8146 B.setInsertPt(B.getMBB(), BrCond->getIterator());
8147 B.buildInstr(AMDGPU::SI_LOOP)
8148 .addUse(Reg)
8149 .addMBB(UncondBrTarget);
8150
8151 if (Br)
8152 Br->getOperand(0).setMBB(CondBrTarget);
8153 else
8154 B.buildBr(*CondBrTarget);
8155
8156 MI.eraseFromParent();
8157 BrCond->eraseFromParent();
8158 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
8159 return true;
8160 }
8161
8162 return false;
8163 }
8164 case Intrinsic::amdgcn_addrspacecast_nonnull:
8165 return legalizeAddrSpaceCast(MI, MRI, B);
8166 case Intrinsic::amdgcn_make_buffer_rsrc:
8167 return legalizePointerAsRsrcIntrin(MI, MRI, B);
8168 case Intrinsic::amdgcn_kernarg_segment_ptr:
8169 if (!AMDGPU::isKernel(B.getMF().getFunction())) {
8170 // This only makes sense to call in a kernel, so just lower to null.
8171 B.buildConstant(MI.getOperand(0).getReg(), 0);
8172 MI.eraseFromParent();
8173 return true;
8174 }
8175
8178 case Intrinsic::amdgcn_implicitarg_ptr:
8179 return legalizeImplicitArgPtr(MI, MRI, B);
8180 case Intrinsic::amdgcn_workitem_id_x:
8181 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
8183 case Intrinsic::amdgcn_workitem_id_y:
8184 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
8186 case Intrinsic::amdgcn_workitem_id_z:
8187 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
8189 case Intrinsic::amdgcn_workgroup_id_x:
8190 return legalizeWorkGroupId(
8194 case Intrinsic::amdgcn_workgroup_id_y:
8195 return legalizeWorkGroupId(
8199 case Intrinsic::amdgcn_workgroup_id_z:
8200 return legalizeWorkGroupId(
// Every cluster intrinsic below is guarded by ST.hasClusters(); without
// cluster support the case evaluates to false.
8204 case Intrinsic::amdgcn_cluster_id_x:
8205 return ST.hasClusters() &&
8208 case Intrinsic::amdgcn_cluster_id_y:
8209 return ST.hasClusters() &&
8212 case Intrinsic::amdgcn_cluster_id_z:
8213 return ST.hasClusters() &&
8216 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8217 return ST.hasClusters() &&
8220 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8221 return ST.hasClusters() &&
8224 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8225 return ST.hasClusters() &&
8228 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8229 return ST.hasClusters() &&
8231 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8232 return ST.hasClusters() &&
8235 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8236 return ST.hasClusters() &&
8239 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8240 return ST.hasClusters() &&
8243 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8244 return ST.hasClusters() &&
8246 MI, MRI, B,
8248 case Intrinsic::amdgcn_wave_id:
8249 return legalizeWaveID(MI, B);
8250 case Intrinsic::amdgcn_lds_kernel_id:
8251 return legalizePreloadedArgIntrin(MI, MRI, B,
8253 case Intrinsic::amdgcn_dispatch_ptr:
8254 return legalizePreloadedArgIntrin(MI, MRI, B,
8256 case Intrinsic::amdgcn_queue_ptr:
8257 return legalizePreloadedArgIntrin(MI, MRI, B,
8259 case Intrinsic::amdgcn_implicit_buffer_ptr:
8262 case Intrinsic::amdgcn_dispatch_id:
8263 return legalizePreloadedArgIntrin(MI, MRI, B,
8265 case Intrinsic::r600_read_ngroups_x:
8266 // TODO: Emit error for hsa
8269 case Intrinsic::r600_read_ngroups_y:
8272 case Intrinsic::r600_read_ngroups_z:
8275 case Intrinsic::r600_read_local_size_x:
8276 // TODO: Could insert G_ASSERT_ZEXT from s16
8278 case Intrinsic::r600_read_local_size_y:
8279 // TODO: Could insert G_ASSERT_ZEXT from s16
8281 // TODO: Could insert G_ASSERT_ZEXT from s16
8282 case Intrinsic::r600_read_local_size_z:
8285 case Intrinsic::amdgcn_fdiv_fast:
8286 return legalizeFDIVFastIntrin(MI, MRI, B);
8287 case Intrinsic::amdgcn_is_shared:
8289 case Intrinsic::amdgcn_is_private:
// wavefrontsize folds to the subtarget's wavefront size as a constant.
8291 case Intrinsic::amdgcn_wavefrontsize: {
8292 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
8293 MI.eraseFromParent();
8294 return true;
8295 }
8296 case Intrinsic::amdgcn_s_buffer_load:
8297 return legalizeSBufferLoad(Helper, MI);
// Buffer stores and loads: the raw/struct and ptr/non-ptr variants share one
// helper and differ only in the two boolean arguments passed to it.
8298 case Intrinsic::amdgcn_raw_buffer_store:
8299 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8300 case Intrinsic::amdgcn_struct_buffer_store:
8301 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8302 return legalizeBufferStore(MI, Helper, false, false);
8303 case Intrinsic::amdgcn_raw_buffer_store_format:
8304 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8305 case Intrinsic::amdgcn_struct_buffer_store_format:
8306 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8307 return legalizeBufferStore(MI, Helper, false, true);
8308 case Intrinsic::amdgcn_raw_tbuffer_store:
8309 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8310 case Intrinsic::amdgcn_struct_tbuffer_store:
8311 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8312 return legalizeBufferStore(MI, Helper, true, true);
8313 case Intrinsic::amdgcn_raw_buffer_load:
8314 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8315 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8316 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8317 case Intrinsic::amdgcn_struct_buffer_load:
8318 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8319 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8320 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8321 return legalizeBufferLoad(MI, Helper, false, false);
8322 case Intrinsic::amdgcn_raw_buffer_load_format:
8323 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8324 case Intrinsic::amdgcn_struct_buffer_load_format:
8325 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8326 return legalizeBufferLoad(MI, Helper, true, false);
8327 case Intrinsic::amdgcn_raw_tbuffer_load:
8328 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8329 case Intrinsic::amdgcn_struct_tbuffer_load:
8330 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8331 return legalizeBufferLoad(MI, Helper, true, true);
// All buffer atomics funnel into legalizeBufferAtomic(), which receives the
// intrinsic ID.
8332 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8333 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8334 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8335 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8336 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8337 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8338 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8339 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8340 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8341 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8342 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8343 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8344 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8345 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8346 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8347 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8348 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8349 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8350 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8351 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8352 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8353 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8354 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8355 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8356 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8357 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8358 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8359 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8360 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8361 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8362 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8363 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8364 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8365 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8366 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8367 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8368 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8369 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8370 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8371 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8372 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8373 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8374 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8375 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8376 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8377 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8378 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8379 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8380 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8381 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8382 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8383 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8384 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8385 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8386 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8387 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8388 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8389 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8390 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8391 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8392 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8393 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8394 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8395 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8396 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8397 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8398 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8399 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8400 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8401 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8402 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8403 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8404 return legalizeBufferAtomic(MI, B, IntrID);
8405 case Intrinsic::amdgcn_rsq_clamp:
8406 return legalizeRsqClampIntrinsic(MI, MRI, B);
8407 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8409 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8410 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
// swmmac 16x16x128 fp8/bf8 variants: the index (operand 5) is normalized to
// S64, by bitcast when it is a vector and by any-extend otherwise.
8412 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8413 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8414 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8415 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8416 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8417 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8418 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8419 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8420 Register Index = MI.getOperand(5).getReg();
8421 LLT S64 = LLT::scalar(64);
8422 LLT IndexArgTy = MRI.getType(Index);
8423 if (IndexArgTy != S64) {
8424 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(S64, Index)
8425 : B.buildAnyExt(S64, Index);
8426 MI.getOperand(5).setReg(NewIndex.getReg(0));
8427 }
8428 return true;
8429 }
// swmmac 16x16x32 variants: the index (operand 5) is any-extended to S32 when
// it does not already have that type.
8430 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8431 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8432 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8433 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8434 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8435 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8436 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8437 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8438 Register Index = MI.getOperand(5).getReg();
8439 LLT S32 = LLT::scalar(32);
8440 if (MRI.getType(Index) != S32)
8441 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
8442 return true;
8443 }
// Remaining swmmac variants keep the index in operand 7; it is normalized to
// S64 for the i32 16x16x128 iu8 form and to S32 for the others.
8444 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8445 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8446 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8447 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8448 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8449 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8450 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8451 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8452 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8453 Register Index = MI.getOperand(7).getReg();
8454 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8455 ? LLT::scalar(64)
8456 : LLT::scalar(32);
8457 LLT IndexArgTy = MRI.getType(Index);
8458 if (IndexArgTy != IdxTy) {
8459 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(IdxTy, Index)
8460 : B.buildAnyExt(IdxTy, Index);
8461 MI.getOperand(7).setReg(NewIndex.getReg(0));
8462 }
8463 return true;
8464 }
8465
8466 case Intrinsic::amdgcn_fmed3: {
8467 GISelChangeObserver &Observer = Helper.Observer;
8468
8469 // FIXME: This is to workaround the inability of tablegen match combiners to
8470 // match intrinsics in patterns.
8471 Observer.changingInstr(MI);
8472 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8473 MI.removeOperand(1);
8474 Observer.changedInstr(MI);
8475 return true;
8476 }
8477 case Intrinsic::amdgcn_readlane:
8478 case Intrinsic::amdgcn_writelane:
8479 case Intrinsic::amdgcn_readfirstlane:
8480 case Intrinsic::amdgcn_permlane16:
8481 case Intrinsic::amdgcn_permlanex16:
8482 case Intrinsic::amdgcn_permlane64:
8483 case Intrinsic::amdgcn_set_inactive:
8484 case Intrinsic::amdgcn_set_inactive_chain_arg:
8485 case Intrinsic::amdgcn_mov_dpp8:
8486 case Intrinsic::amdgcn_update_dpp:
8487 case Intrinsic::amdgcn_permlane_bcast:
8488 case Intrinsic::amdgcn_permlane_up:
8489 case Intrinsic::amdgcn_permlane_down:
8490 case Intrinsic::amdgcn_permlane_xor:
8491 return legalizeLaneOp(Helper, MI, IntrID);
8492 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8493 return legalizeSBufferPrefetch(Helper, MI);
8494 case Intrinsic::amdgcn_dead: {
8495 // TODO: Use poison instead of undef
8496 for (const MachineOperand &Def : MI.defs())
8497 B.buildUndef(Def);
8498 MI.eraseFromParent();
8499 return true;
8500 }
// Cooperative atomic loads and stores become a plain load/store that reuses
// the single memory operand already attached to MI.
8501 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8502 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8503 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8504 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8505 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8506 MI.eraseFromParent();
8507 return true;
8508 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8509 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8510 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8511 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8512 B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
8513 MI.eraseFromParent();
8514 return true;
// av load/store b128: rejected (returns false after building a diagnostic)
// when the subtarget has no flat global instructions; otherwise lowered to a
// plain load or store like the cooperative atomics above.
8515 case Intrinsic::amdgcn_av_load_b128:
8516 case Intrinsic::amdgcn_av_store_b128: {
8517 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
8518 if (!ST.hasFlatGlobalInsts()) {
8519 const char *Name = IntrID == Intrinsic::amdgcn_av_load_b128
8520 ? "llvm.amdgcn.av.load.b128"
8521 : "llvm.amdgcn.av.store.b128";
8522 Function &Fn = B.getMF().getFunction();
8524 Fn, Twine(Name) + " not supported on subtarget", MI.getDebugLoc()));
8525 return false;
8526 }
8527 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8528 if (IntrID == Intrinsic::amdgcn_av_load_b128)
8529 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8530 else
8531 B.buildStore(MI.getOperand(2), MI.getOperand(1),
8532 **MI.memoperands_begin());
8533 MI.eraseFromParent();
8534 return true;
8535 }
// Load-monitor intrinsics map to a target pseudo that keeps the result
// (operand 0), the address (operand 2) and MI's memory operand.
8536 case Intrinsic::amdgcn_flat_load_monitor_b32:
8537 case Intrinsic::amdgcn_flat_load_monitor_b64:
8538 case Intrinsic::amdgcn_flat_load_monitor_b128:
8539 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8540 B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8541 .add(MI.getOperand(0))
8542 .add(MI.getOperand(2))
8543 .addMemOperand(*MI.memoperands_begin());
8544 MI.eraseFromParent();
8545 return true;
8546 case Intrinsic::amdgcn_global_load_monitor_b32:
8547 case Intrinsic::amdgcn_global_load_monitor_b64:
8548 case Intrinsic::amdgcn_global_load_monitor_b128:
8549 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8550 B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8551 .add(MI.getOperand(0))
8552 .add(MI.getOperand(2))
8553 .addMemOperand(*MI.memoperands_begin());
8554 MI.eraseFromParent();
8555 return true;
// Anything else: image intrinsics go to legalizeImageIntrinsic(); all other
// intrinsics are accepted unchanged.
8556 default: {
8557 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8559 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
8560 return true;
8561 }
8562 }
8563
8564 return true;
8565 }
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst, const SrcOp &Src, unsigned Flags)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
constexpr LLT F64
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
constexpr LLT V2S8
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
constexpr LLT V4S128
constexpr LLT S16
constexpr LLT S1
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
constexpr LLT S1024
static constexpr unsigned FPEnvModeBitField
constexpr LLT V7S64
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr LLT V2S16
constexpr LLT V8S16
constexpr LLT V9S32
constexpr std::initializer_list< LLT > AllS32Vectors
constexpr LLT S224
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
constexpr LLT S512
constexpr LLT MaxScalar
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
constexpr LLT V11S32
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
constexpr LLT V6S64
constexpr LLT V2S64
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
constexpr LLT S32
constexpr LLT V2F16
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
constexpr LLT V8S32
constexpr LLT V2BF16
constexpr LLT S192
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typically a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
constexpr LLT F32
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
constexpr LLT V6S32
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
constexpr LLT S160
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
constexpr LLT V4S16
constexpr LLT V2S128
constexpr LLT V10S16
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr LLT V4S32
constexpr LLT V3S32
constexpr LLT V6S16
constexpr std::initializer_list< LLT > AllS64Vectors
constexpr LLT S256
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
constexpr LLT V4S64
static constexpr unsigned FPEnvTrapBitField
constexpr LLT V10S32
constexpr LLT V16S32
static constexpr unsigned MaxRegisterSize
constexpr LLT V7S32
constexpr LLT S96
constexpr LLT V12S16
constexpr LLT V16S64
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
constexpr LLT V32S32
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr LLT S64
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
constexpr LLT V16S16
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
constexpr LLT V5S32
constexpr LLT V5S64
constexpr LLT V3S64
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
constexpr LLT V8S64
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
constexpr LLT V2S32
static bool isRegisterVectorType(LLT Ty)
constexpr LLT V12S32
constexpr LLT S128
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
constexpr LLT S8
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the MachineLegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static Error unsupported(const char *Str, const Triple &T)
Definition MachO.cpp:77
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
@ Enable
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Interface for Targets to specify which operations they can successfully select and how the others sho...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
#define R2(n)
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
#define P(N)
ppc ctr loops verify
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define CH(x, y, z)
Definition SHA256.cpp:34
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1273
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsert(LegalizerHelper &Helper, MachineInstr &MI) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLZ_ZERO_POISON(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags, bool IsExp10) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFEXPF64(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtract(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLS(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1217
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1197
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1157
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:743
@ ICMP_SLT
signed less than
Definition InstrTypes.h:769
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:746
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:755
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:744
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:764
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:767
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:748
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:747
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:749
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:753
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:420
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:141
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:569
LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr bool isAnyScalar() const
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & minScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty if condition is met.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
LLVM_ABI LegalizeResult lowerInsert(MachineInstr &MI)
LLVM_ABI LegalizeResult lowerExtract(MachineInstr &MI)
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition MCRegister.h:72
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register createGenericVirtualRegister(LLT Ty, StringRef Name="")
Create and return a new generic virtual register with low-level type Ty.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
Represent a mutable reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:294
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:383
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
unsigned getPointerSizeInBits(unsigned AS) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:858
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:558
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
Definition Utils.cpp:1984
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition Utils.cpp:653
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:461
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Undef
Value of the register doesn't matter.
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:325
void * PointerTy
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT whose value fits in int64_t, returns it.
Definition Utils.cpp:314
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:204
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:149
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition Utils.cpp:1682
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:347
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
#define N
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:78
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.