1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUTargetMachine.h"
21#include "SIInstrInfo.h"
23#include "SIRegisterInfo.h"
25#include "llvm/ADT/ScopeExit.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/IR/IntrinsicsR600.h"
36
37#define DEBUG_TYPE "amdgpu-legalinfo"
38
39using namespace llvm;
40using namespace LegalizeActions;
41using namespace LegalizeMutations;
42using namespace LegalityPredicates;
43using namespace MIPatternMatch;
44
45// Hack until load/store selection patterns support any tuple of legal types.
47 "amdgpu-global-isel-new-legality",
48 cl::desc("Use GlobalISel desired legality, rather than try to use"
49 "rules compatible with selection patterns"),
50 cl::init(false),
52
53static constexpr unsigned MaxRegisterSize = 1024;
54
55// Round the number of elements to the next power of two elements
56static LLT getPow2VectorType(LLT Ty) {
57 unsigned NElts = Ty.getNumElements();
58 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
59 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
60}
61
62// Round the number of bits to the next power of two bits
63static LLT getPow2ScalarType(LLT Ty) {
64 unsigned Bits = Ty.getSizeInBits();
65 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
66 return LLT::scalar(Pow2Bits);
67}
68
69/// \returns true if this is an odd-sized vector which should be widened by
70/// adding an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
71/// This excludes s1 vectors, which should always be scalarized.
72static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73 return [=](const LegalityQuery &Query) {
74 const LLT Ty = Query.Types[TypeIdx];
75 if (!Ty.isVector())
76 return false;
77
78 const LLT EltTy = Ty.getElementType();
79 const unsigned EltSize = EltTy.getSizeInBits();
80 return Ty.getNumElements() % 2 != 0 &&
81 EltSize > 1 && EltSize < 32 &&
82 Ty.getSizeInBits() % 32 != 0;
83 };
84}
85
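// True if the total size of the type at TypeIdx is a multiple of 32 bits.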
86static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 return Ty.getSizeInBits() % 32 == 0;
90 };
91}
92
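// True if the type at TypeIdx is a vector of 16-bit elements with more than two elements.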
93static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94 return [=](const LegalityQuery &Query) {
95 const LLT Ty = Query.Types[TypeIdx];
96 const LLT EltTy = Ty.getScalarType();
97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98 };
99}
100
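// Mutation that appends one more element of the same element type to the vector.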
101static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102 return [=](const LegalityQuery &Query) {
103 const LLT Ty = Query.Types[TypeIdx];
104 const LLT EltTy = Ty.getElementType();
105 return std::pair(TypeIdx,
106 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
107 };
108}
109
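// Mutation that reduces the element count so each resulting piece is at most 64 bits wide.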
110static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111 return [=](const LegalityQuery &Query) {
112 const LLT Ty = Query.Types[TypeIdx];
113 const LLT EltTy = Ty.getElementType();
114 unsigned Size = Ty.getSizeInBits();
115 unsigned Pieces = (Size + 63) / 64;
116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117 return std::pair(TypeIdx, LLT::scalarOrVector(
118 ElementCount::getFixed(NewNumElts), EltTy));
119 };
120}
121
122// Increase the number of vector elements to reach the next multiple of 32-bit
123// type.
124static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125 return [=](const LegalityQuery &Query) {
126 const LLT Ty = Query.Types[TypeIdx];
127
128 const LLT EltTy = Ty.getElementType();
129 const int Size = Ty.getSizeInBits();
130 const int EltSize = EltTy.getSizeInBits();
131 const int NextMul32 = (Size + 31) / 32;
132
133 assert(EltSize < 32);
134
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
137 };
138}
139
140// Increase the number of vector elements to reach the next legal RegClass.
141static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142 return [=](const LegalityQuery &Query) {
143 const LLT Ty = Query.Types[TypeIdx];
144 const unsigned NumElts = Ty.getNumElements();
145 const unsigned EltSize = Ty.getElementType().getSizeInBits();
146 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147
148 assert(EltSize == 32 || EltSize == 64);
149 assert(Ty.getSizeInBits() < MaxRegisterSize);
150
151 unsigned NewNumElts;
152 // Find the nearest legal RegClass that is larger than the current type.
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
155 break;
156 }
157
158 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
159 };
160}
161
162static LLT getBufferRsrcScalarType(const LLT Ty) {
163 if (!Ty.isVector())
164 return LLT::scalar(128);
165 const ElementCount NumElems = Ty.getElementCount();
166 return LLT::vector(NumElems, LLT::scalar(128));
167}
168
169static LLT getBufferRsrcRegisterType(const LLT Ty) {
170 if (!Ty.isVector())
171 return LLT::fixed_vector(4, LLT::scalar(32));
172 const unsigned NumElems = Ty.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
174}
175
176static LLT getBitcastRegisterType(const LLT Ty) {
177 const unsigned Size = Ty.getSizeInBits();
178
179 if (Size <= 32) {
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
182 return LLT::scalar(Size);
183 }
184
185 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
186}
187
188static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189 return [=](const LegalityQuery &Query) {
190 const LLT Ty = Query.Types[TypeIdx];
191 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192 };
193}
194
195static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196 return [=](const LegalityQuery &Query) {
197 const LLT Ty = Query.Types[TypeIdx];
198 unsigned Size = Ty.getSizeInBits();
199 assert(Size % 32 == 0);
200 return std::pair(
201 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
202 };
203}
204
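// True if the type at TypeIdx is a vector whose total size is less than Size bits.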
205static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206 return [=](const LegalityQuery &Query) {
207 const LLT QueryTy = Query.Types[TypeIdx];
208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209 };
210}
211
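// True if the type at TypeIdx is a vector whose total size is greater than Size bits.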
212static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213 return [=](const LegalityQuery &Query) {
214 const LLT QueryTy = Query.Types[TypeIdx];
215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216 };
217}
218
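// True if the type at TypeIdx is a vector with an odd number of elements.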
219static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223 };
224}
225
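// True if Size is a multiple of 32 bits no larger than the maximum register size.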
226static bool isRegisterSize(unsigned Size) {
227 return Size % 32 == 0 && Size <= MaxRegisterSize;
228}
229
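// True if EltTy is a valid element type for a register vector: 16 bits or a multiple of 32 bits.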
230static bool isRegisterVectorElementType(LLT EltTy) {
231 const int EltSize = EltTy.getSizeInBits();
232 return EltSize == 16 || EltSize % 32 == 0;
233}
234
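// True if this element size and element count combination can be placed directly in registers.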
235static bool isRegisterVectorType(LLT Ty) {
236 const int EltSize = Ty.getElementType().getSizeInBits();
237 return EltSize == 32 || EltSize == 64 ||
238 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239 EltSize == 128 || EltSize == 256;
240}
241
242// TODO: replace all uses of isRegisterType with isRegisterClassType
243static bool isRegisterType(LLT Ty) {
244 if (!isRegisterSize(Ty.getSizeInBits()))
245 return false;
246
247 if (Ty.isVector())
248 return isRegisterVectorType(Ty);
249
250 return true;
251}
252
253// Any combination of 32 or 64-bit elements up to the maximum register size, and
254// multiples of v2s16.
255static LegalityPredicate isRegisterType(unsigned TypeIdx) {
256 return [=](const LegalityQuery &Query) {
257 return isRegisterType(Query.Types[TypeIdx]);
258 };
259}
260
261// RegisterType that doesn't have a corresponding RegClass.
262// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263// should be removed.
264static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
265 return [=](const LegalityQuery &Query) {
266 LLT Ty = Query.Types[TypeIdx];
267 return isRegisterType(Ty) &&
268 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
269 };
270}
271
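// True if the type at TypeIdx is a vector whose elements are individually legal (s16 or at least 32 bits wide).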
272static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
273 return [=](const LegalityQuery &Query) {
274 const LLT QueryTy = Query.Types[TypeIdx];
275 if (!QueryTy.isVector())
276 return false;
277 const LLT EltTy = QueryTy.getElementType();
278 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
279 };
280}
281
282static const LLT S1 = LLT::scalar(1);
283static const LLT S8 = LLT::scalar(8);
284static const LLT S16 = LLT::scalar(16);
285static const LLT S32 = LLT::scalar(32);
286static const LLT S64 = LLT::scalar(64);
287static const LLT S96 = LLT::scalar(96);
288static const LLT S128 = LLT::scalar(128);
289static const LLT S160 = LLT::scalar(160);
290static const LLT S224 = LLT::scalar(224);
291static const LLT S256 = LLT::scalar(256);
292static const LLT S512 = LLT::scalar(512);
294
295static const LLT V2S8 = LLT::fixed_vector(2, 8);
296static const LLT V2S16 = LLT::fixed_vector(2, 16);
297static const LLT V4S16 = LLT::fixed_vector(4, 16);
298static const LLT V6S16 = LLT::fixed_vector(6, 16);
299static const LLT V8S16 = LLT::fixed_vector(8, 16);
300static const LLT V10S16 = LLT::fixed_vector(10, 16);
301static const LLT V12S16 = LLT::fixed_vector(12, 16);
302static const LLT V16S16 = LLT::fixed_vector(16, 16);
303
304static const LLT V2S32 = LLT::fixed_vector(2, 32);
305static const LLT V3S32 = LLT::fixed_vector(3, 32);
306static const LLT V4S32 = LLT::fixed_vector(4, 32);
307static const LLT V5S32 = LLT::fixed_vector(5, 32);
308static const LLT V6S32 = LLT::fixed_vector(6, 32);
309static const LLT V7S32 = LLT::fixed_vector(7, 32);
310static const LLT V8S32 = LLT::fixed_vector(8, 32);
311static const LLT V9S32 = LLT::fixed_vector(9, 32);
312static const LLT V10S32 = LLT::fixed_vector(10, 32);
313static const LLT V11S32 = LLT::fixed_vector(11, 32);
314static const LLT V12S32 = LLT::fixed_vector(12, 32);
315static const LLT V16S32 = LLT::fixed_vector(16, 32);
316static const LLT V32S32 = LLT::fixed_vector(32, 32);
317
318static const LLT V2S64 = LLT::fixed_vector(2, 64);
319static const LLT V3S64 = LLT::fixed_vector(3, 64);
320static const LLT V4S64 = LLT::fixed_vector(4, 64);
321static const LLT V5S64 = LLT::fixed_vector(5, 64);
322static const LLT V6S64 = LLT::fixed_vector(6, 64);
323static const LLT V7S64 = LLT::fixed_vector(7, 64);
324static const LLT V8S64 = LLT::fixed_vector(8, 64);
325static const LLT V16S64 = LLT::fixed_vector(16, 64);
326
327static const LLT V2S128 = LLT::fixed_vector(2, 128);
328static const LLT V4S128 = LLT::fixed_vector(4, 128);
329
330static std::initializer_list<LLT> AllScalarTypes = {S32, S64, S96, S128,
331 S160, S224, S256, S512};
332
333static std::initializer_list<LLT> AllS16Vectors{
334 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
335
336static std::initializer_list<LLT> AllS32Vectors = {
337 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, V9S32,
338 V10S32, V11S32, V12S32, V16S32, V32S32};
339
340static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
341 V6S64, V7S64, V8S64, V16S64};
342
343// Checks whether a type is in the list of legal register types.
344static bool isRegisterClassType(LLT Ty) {
345 if (Ty.isPointerOrPointerVector())
346 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
347
348 return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
349 is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
350}
351
352static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
353 return [TypeIdx](const LegalityQuery &Query) {
354 return isRegisterClassType(Query.Types[TypeIdx]);
355 };
356}
357
358// If we have a truncating store or an extending load with a data size larger
359// than 32-bits, we need to reduce to a 32-bit type.
360static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
361 return [=](const LegalityQuery &Query) {
362 const LLT Ty = Query.Types[TypeIdx];
363 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
364 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
365 };
366}
367
368// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
369// handle some operations by just promoting the register during
370// selection. There are also d16 loads on GFX9+ which preserve the high bits.
371static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
372 bool IsLoad, bool IsAtomic) {
373 switch (AS) {
374 case AMDGPUAS::PRIVATE_ADDRESS:
375 // FIXME: Private element size.
376 return ST.enableFlatScratch() ? 128 : 32;
377 case AMDGPUAS::LOCAL_ADDRESS:
378 return ST.useDS128() ? 128 : 64;
379 case AMDGPUAS::GLOBAL_ADDRESS:
380 case AMDGPUAS::CONSTANT_ADDRESS:
381 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
382 case AMDGPUAS::BUFFER_RESOURCE:
383 // Treat constant and global as identical. SMRD loads are sometimes usable for
384 // global loads (ideally constant address space should be eliminated)
385 // depending on the context. Legality cannot be context dependent, but
386 // RegBankSelect can split the load as necessary depending on the pointer
387 // register bank/uniformity and if the memory is invariant or not written in a
388 // kernel.
389 return IsLoad ? 512 : 128;
390 default:
391 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
392 // if they may alias scratch depending on the subtarget. This needs to be
393 // moved to custom handling to use addressMayBeAccessedAsPrivate
394 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
395 }
396}
397
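// Check whether a load or store with this register type, memory size, alignment and address space can be selected as-is.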
398static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
399 const LegalityQuery &Query) {
400 const LLT Ty = Query.Types[0];
401
402 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
403 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
404
405 unsigned RegSize = Ty.getSizeInBits();
406 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
407 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
408 unsigned AS = Query.Types[1].getAddressSpace();
409
410 // All of these need to be custom lowered to cast the pointer operand.
411 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
412 return false;
413
414 // Do not handle extending vector loads.
415 if (Ty.isVector() && MemSize != RegSize)
416 return false;
417
418 // TODO: We should be able to widen loads if the alignment is high enough, but
419 // we also need to modify the memory access size.
420#if 0
421 // Accept widening loads based on alignment.
422 if (IsLoad && MemSize < Size)
423 MemSize = std::max(MemSize, Align);
424#endif
425
426 // Only 1-byte and 2-byte to 32-bit extloads are valid.
427 if (MemSize != RegSize && RegSize != 32)
428 return false;
429
430 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
431 Query.MMODescrs[0].Ordering !=
432 AtomicOrdering::NotAtomic))
433 return false;
434
435 switch (MemSize) {
436 case 8:
437 case 16:
438 case 32:
439 case 64:
440 case 128:
441 break;
442 case 96:
443 if (!ST.hasDwordx3LoadStores())
444 return false;
445 break;
446 case 256:
447 case 512:
448 // These may contextually need to be broken down.
449 break;
450 default:
451 return false;
452 }
453
454 assert(RegSize >= MemSize);
455
456 if (AlignBits < MemSize) {
457 const SITargetLowering *TLI = ST.getTargetLowering();
458 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
459 Align(AlignBits / 8)))
460 return false;
461 }
462
463 return true;
464}
465
466// The newer buffer intrinsic forms take their resource arguments as
467// pointers in address space 8, aka s128 values. However, in order to not break
468// SelectionDAG, the underlying operations have to continue to take v4i32
469// arguments. Therefore, we convert resource pointers - or vectors of them -
470// to integer values here.
471static bool hasBufferRsrcWorkaround(const LLT Ty) {
472 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
473 return true;
474 if (Ty.isVector()) {
475 const LLT ElemTy = Ty.getElementType();
476 return hasBufferRsrcWorkaround(ElemTy);
477 }
478 return false;
479}
480
481// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
482// work around this. Eventually it should ignore the type for loads and only
483// care about the size. Return true in cases where we will work around this for
484// now by bitcasting.
485static bool loadStoreBitcastWorkaround(const LLT Ty) {
486 if (EnableNewLegality)
487 return false;
488
489 const unsigned Size = Ty.getSizeInBits();
490 if (Size <= 64)
491 return false;
492 // Address space 8 pointers get their own workaround.
493 if (hasBufferRsrcWorkaround(Ty))
494 return false;
495 if (!Ty.isVector())
496 return true;
497
498 if (Ty.isPointerVector())
499 return true;
500
501 unsigned EltSize = Ty.getScalarSizeInBits();
502 return EltSize != 32 && EltSize != 64;
503}
504
505static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
506 const LLT Ty = Query.Types[0];
507 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
508 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
509}
510
511/// Return true if a load or store of the type should be lowered with a bitcast
512/// to a different type.
513static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
514 const LLT MemTy) {
515 const unsigned MemSizeInBits = MemTy.getSizeInBits();
516 const unsigned Size = Ty.getSizeInBits();
517 if (Size != MemSizeInBits)
518 return Size <= 32 && Ty.isVector();
519
520 if (isRegisterSize(Size))
521 return true;
522
523 // Don't try to handle bitcasting vector ext loads for now.
524 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
525 (Size <= 32 || isRegisterSize(Size)) &&
526 !isRegisterVectorElementType(Ty.getElementType());
527}
528
529/// Return true if we should legalize a load by widening an odd-sized memory
530/// access up to the alignment. Note this refers to cases where the memory access
531/// itself changes, not the size of the result register.
532static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
533 uint64_t AlignInBits, unsigned AddrSpace,
534 unsigned Opcode) {
535 unsigned SizeInBits = MemoryTy.getSizeInBits();
536 // We don't want to widen cases that are naturally legal.
537 if (isPowerOf2_32(SizeInBits))
538 return false;
539
540 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
541 // end up widening these for a scalar load during RegBankSelect, if we don't
542 // have 96-bit scalar loads.
543 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
544 return false;
545
546 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
547 return false;
548
549 // A load is known dereferenceable up to the alignment, so it's legal to widen
550 // to it.
551 //
552 // TODO: Could check dereferenceable for less aligned cases.
553 unsigned RoundedSize = NextPowerOf2(SizeInBits);
554 if (AlignInBits < RoundedSize)
555 return false;
556
557 // Do not widen if it would introduce a slow unaligned load.
558 const SITargetLowering *TLI = ST.getTargetLowering();
559 unsigned Fast = 0;
560 return TLI->allowsMisalignedMemoryAccessesImpl(
561 RoundedSize, AddrSpace, Align(AlignInBits / 8),
562 MachineMemOperand::MOLoad, &Fast) &&
563 Fast;
564}
565
566static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
567 unsigned Opcode) {
568 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
569 return false;
570
571 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
572 Query.MMODescrs[0].AlignInBits,
573 Query.Types[1].getAddressSpace(), Opcode);
574}
575
576/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
577/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
578/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
579static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
580 MachineRegisterInfo &MRI, unsigned Idx) {
581 MachineOperand &MO = MI.getOperand(Idx);
582
583 const LLT PointerTy = MRI.getType(MO.getReg());
584
585 // Paranoidly prevent us from doing this multiple times.
586 if (!hasBufferRsrcWorkaround(PointerTy))
587 return PointerTy;
588
589 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
590 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
591 if (!PointerTy.isVector()) {
592 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
593 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
594 const LLT S32 = LLT::scalar(32);
595
596 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
597 std::array<Register, 4> VectorElems;
598 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
599 for (unsigned I = 0; I < NumParts; ++I)
600 VectorElems[I] =
601 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
602 B.buildMergeValues(MO, VectorElems);
603 MO.setReg(VectorReg);
604 return VectorTy;
605 }
606 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
607 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
608 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
609 B.buildIntToPtr(MO, Scalar);
610 MO.setReg(BitcastReg);
611
612 return VectorTy;
613}
614
615/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
616/// the form in which the value must be in order to be passed to the low-level
617/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
618/// needed in order to account for the fact that we can't define a register
619/// class for s128 without breaking SelectionDAG.
620static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
621 MachineRegisterInfo &MRI = *B.getMRI();
622 const LLT PointerTy = MRI.getType(Pointer);
623 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
624 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
625
626 if (!PointerTy.isVector()) {
627 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
628 SmallVector<Register, 4> PointerParts;
629 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
630 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
631 for (unsigned I = 0; I < NumParts; ++I)
632 PointerParts.push_back(Unmerged.getReg(I));
633 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
634 }
635 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
636 return B.buildBitcast(VectorTy, Scalar).getReg(0);
637}
638
639static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
640 unsigned Idx) {
641 MachineOperand &MO = MI.getOperand(Idx);
642
643 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
644 // Paranoidly prevent us from doing this multiple times.
645 if (!hasBufferRsrcWorkaround(PointerTy))
646 return;
647 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
648}
649
650AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
651 const GCNTargetMachine &TM)
652 : ST(ST_) {
653 using namespace TargetOpcode;
654
655 auto GetAddrSpacePtr = [&TM](unsigned AS) {
656 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
657 };
658
659 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
660 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
661 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
662 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
663 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
664 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
665 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
666 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
667 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
668 const LLT BufferStridedPtr =
669 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
670
671 const LLT CodePtr = FlatPtr;
672
673 const std::initializer_list<LLT> AddrSpaces64 = {
674 GlobalPtr, ConstantPtr, FlatPtr
675 };
676
677 const std::initializer_list<LLT> AddrSpaces32 = {
678 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
679 };
680
681 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
682
683 const std::initializer_list<LLT> FPTypesBase = {
684 S32, S64
685 };
686
687 const std::initializer_list<LLT> FPTypes16 = {
688 S32, S64, S16
689 };
690
691 const std::initializer_list<LLT> FPTypesPK16 = {
692 S32, S64, S16, V2S16
693 };
694
695 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
696
697 // s1 for VCC branches, s32 for SCC branches.
698 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
699
700 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
701 // elements for v3s16
702 getActionDefinitionsBuilder(G_PHI)
703 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
704 .legalFor(AllS32Vectors)
705 .legalFor(AllS64Vectors)
706 .legalFor(AddrSpaces64)
707 .legalFor(AddrSpaces32)
708 .legalFor(AddrSpaces128)
709 .legalIf(isPointer(0))
710 .clampScalar(0, S16, S256)
711 .widenScalarToNextPow2(0, 32)
712 .clampMaxNumElements(0, S32, 16)
713 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
714 .scalarize(0);
715
716 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
717 // Full set of gfx9 features.
718 if (ST.hasScalarAddSub64()) {
719 getActionDefinitionsBuilder({G_ADD, G_SUB})
720 .legalFor({S64, S32, S16, V2S16})
721 .clampMaxNumElementsStrict(0, S16, 2)
722 .scalarize(0)
723 .minScalar(0, S16)
724 .widenScalarToNextMultipleOf(0, 32)
725 .maxScalar(0, S32);
726 } else {
727 getActionDefinitionsBuilder({G_ADD, G_SUB})
728 .legalFor({S32, S16, V2S16})
729 .clampMaxNumElementsStrict(0, S16, 2)
730 .scalarize(0)
731 .minScalar(0, S16)
732 .widenScalarToNextMultipleOf(0, 32)
733 .maxScalar(0, S32);
734 }
735
736 if (ST.hasScalarSMulU64()) {
737 getActionDefinitionsBuilder(G_MUL)
738 .legalFor({S64, S32, S16, V2S16})
739 .clampMaxNumElementsStrict(0, S16, 2)
740 .scalarize(0)
741 .minScalar(0, S16)
742 .widenScalarToNextMultipleOf(0, 32)
743 .custom();
744 } else {
745 getActionDefinitionsBuilder(G_MUL)
746 .legalFor({S32, S16, V2S16})
747 .clampMaxNumElementsStrict(0, S16, 2)
748 .scalarize(0)
749 .minScalar(0, S16)
750 .widenScalarToNextMultipleOf(0, 32)
751 .custom();
752 }
753 assert(ST.hasMad64_32());
754
755 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
756 .legalFor({S32, S16, V2S16}) // Clamp modifier
757 .minScalarOrElt(0, S16)
758 .clampMaxNumElementsStrict(0, S16, 2)
759 .scalarize(0)
760 .widenScalarToNextPow2(0, 32)
761 .lower();
762 } else if (ST.has16BitInsts()) {
763 getActionDefinitionsBuilder({G_ADD, G_SUB})
764 .legalFor({S32, S16})
765 .minScalar(0, S16)
767 .maxScalar(0, S32)
768 .scalarize(0);
769
771 .legalFor({S32, S16})
772 .scalarize(0)
773 .minScalar(0, S16)
774 .widenScalarToNextMultipleOf(0, 32)
775 .custom();
776 assert(ST.hasMad64_32());
777
778 // Technically the saturating operations require clamp bit support, but this
779 // was introduced at the same time as 16-bit operations.
780 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
781 .legalFor({S32, S16}) // Clamp modifier
782 .minScalar(0, S16)
783 .scalarize(0)
785 .lower();
786
787 // We're just lowering this, but it helps get a better result to try to
788 // coerce to the desired type first.
789 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
790 .minScalar(0, S16)
791 .scalarize(0)
792 .lower();
793 } else {
794 getActionDefinitionsBuilder({G_ADD, G_SUB})
795 .legalFor({S32})
796 .widenScalarToNextMultipleOf(0, 32)
797 .clampScalar(0, S32, S32)
798 .scalarize(0);
799
800 auto &Mul = getActionDefinitionsBuilder(G_MUL)
801 .legalFor({S32})
802 .scalarize(0)
803 .minScalar(0, S32)
804 .widenScalarToNextMultipleOf(0, 32);
805
806 if (ST.hasMad64_32())
807 Mul.custom();
808 else
809 Mul.maxScalar(0, S32);
810
811 if (ST.hasIntClamp()) {
812 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
813 .legalFor({S32}) // Clamp modifier.
814 .scalarize(0)
815 .minScalarOrElt(0, S32)
816 .lower();
817 } else {
818 // Clamp bit support was added in VI, along with 16-bit operations.
819 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
820 .minScalar(0, S32)
821 .scalarize(0)
822 .lower();
823 }
824
825 // FIXME: DAG expansion gets better results. The widening uses the smaller
826 // range values and goes for the min/max lowering directly.
827 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
828 .minScalar(0, S32)
829 .scalarize(0)
830 .lower();
831 }
832
833 getActionDefinitionsBuilder(
834 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
835 .customFor({S32, S64})
836 .clampScalar(0, S32, S64)
838 .scalarize(0);
839
840 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
841 .legalFor({S32})
842 .maxScalar(0, S32);
843
844 if (ST.hasVOP3PInsts()) {
845 Mulh
846 .clampMaxNumElements(0, S8, 2)
847 .lowerFor({V2S8});
848 }
849
850 Mulh
851 .scalarize(0)
852 .lower();
853
854 // Report legal for any types we can handle anywhere. For the cases only legal
855 // on the SALU, RegBankSelect will be able to re-legalize.
856 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
857 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
858 .clampScalar(0, S32, S64)
862 .scalarize(0);
863
864 getActionDefinitionsBuilder(
865 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
866 .legalFor({{S32, S1}, {S32, S32}})
867 .clampScalar(0, S32, S32)
868 .scalarize(0);
869
870 getActionDefinitionsBuilder(G_BITCAST)
871 // Don't worry about the size constraint.
872 .legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
873 .lower();
874
875 getActionDefinitionsBuilder(G_CONSTANT)
876 .legalFor({S1, S32, S64, S16, GlobalPtr,
877 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
878 .legalIf(isPointer(0))
879 .clampScalar(0, S32, S64)
880 .widenScalarToNextPow2(0);
881
882 getActionDefinitionsBuilder(G_FCONSTANT)
883 .legalFor({S32, S64, S16})
884 .clampScalar(0, S16, S64);
885
886 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
887 .legalIf(isRegisterType(0))
888 // s1 and s16 are special cases because they have legal operations on
889 // them, but don't really occupy registers in the normal way.
890 .legalFor({S1, S16})
891 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
894 .clampMaxNumElements(0, S32, 16);
895
896 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
897
898 // If the amount is divergent, we have to do a wave reduction to get the
899 // maximum value, so this is expanded during RegBankSelect.
900 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
901 .legalFor({{PrivatePtr, S32}});
902
903 getActionDefinitionsBuilder(G_STACKSAVE)
904 .customFor({PrivatePtr});
905 getActionDefinitionsBuilder(G_STACKRESTORE)
906 .legalFor({PrivatePtr});
907
908 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
909
910 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
911 .customIf(typeIsNot(0, PrivatePtr));
912
913 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
914
915 auto &FPOpActions = getActionDefinitionsBuilder(
916 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
917 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
918 .legalFor({S32, S64});
919 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
920 .customFor({S32, S64});
921 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
922 .customFor({S32, S64});
923
924 if (ST.has16BitInsts()) {
925 if (ST.hasVOP3PInsts())
926 FPOpActions.legalFor({S16, V2S16});
927 else
928 FPOpActions.legalFor({S16});
929
930 TrigActions.customFor({S16});
931 FDIVActions.customFor({S16});
932 }
933
934 if (ST.hasPackedFP32Ops()) {
935 FPOpActions.legalFor({V2S32});
936 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
937 }
938
939 auto &MinNumMaxNum = getActionDefinitionsBuilder({
940 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
941
942 if (ST.hasVOP3PInsts()) {
943 MinNumMaxNum.customFor(FPTypesPK16)
946 .clampScalar(0, S16, S64)
947 .scalarize(0);
948 } else if (ST.has16BitInsts()) {
949 MinNumMaxNum.customFor(FPTypes16)
950 .clampScalar(0, S16, S64)
951 .scalarize(0);
952 } else {
953 MinNumMaxNum.customFor(FPTypesBase)
954 .clampScalar(0, S32, S64)
955 .scalarize(0);
956 }
957
958 if (ST.hasVOP3PInsts())
959 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
960
961 FPOpActions
962 .scalarize(0)
963 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
964
965 TrigActions
966 .scalarize(0)
967 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
968
969 FDIVActions
970 .scalarize(0)
971 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
972
973 getActionDefinitionsBuilder({G_FNEG, G_FABS})
974 .legalFor(FPTypesPK16)
975 .clampMaxNumElementsStrict(0, S16, 2)
976 .scalarize(0)
977 .clampScalar(0, S16, S64);
978
979 if (ST.has16BitInsts()) {
980 getActionDefinitionsBuilder(G_FSQRT)
981 .legalFor({S16})
982 .customFor({S32, S64})
983 .scalarize(0)
984 .unsupported();
985 getActionDefinitionsBuilder(G_FFLOOR)
986 .legalFor({S32, S64, S16})
987 .scalarize(0)
988 .clampScalar(0, S16, S64);
989
990 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
991 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
992 .scalarize(0)
993 .maxScalarIf(typeIs(0, S16), 1, S16)
994 .clampScalar(1, S32, S32)
995 .lower();
996
997 getActionDefinitionsBuilder(G_FFREXP)
998 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
999 .scalarize(0)
1000 .lower();
1001 } else {
1002 getActionDefinitionsBuilder(G_FSQRT)
1003 .customFor({S32, S64, S16})
1004 .scalarize(0)
1005 .unsupported();
1006
1007
1008 if (ST.hasFractBug()) {
1009 getActionDefinitionsBuilder(G_FFLOOR)
1010 .customFor({S64})
1011 .legalFor({S32, S64})
1012 .scalarize(0)
1013 .clampScalar(0, S32, S64);
1014 } else {
1015 getActionDefinitionsBuilder(G_FFLOOR)
1016 .legalFor({S32, S64})
1017 .scalarize(0)
1018 .clampScalar(0, S32, S64);
1019 }
1020
1021 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1022 .legalFor({{S32, S32}, {S64, S32}})
1023 .scalarize(0)
1024 .clampScalar(0, S32, S64)
1025 .clampScalar(1, S32, S32)
1026 .lower();
1027
1028 getActionDefinitionsBuilder(G_FFREXP)
1029 .customFor({{S32, S32}, {S64, S32}})
1030 .scalarize(0)
1031 .minScalar(0, S32)
1032 .clampScalar(1, S32, S32)
1033 .lower();
1034 }
1035
1036 getActionDefinitionsBuilder(G_FPTRUNC)
1037 .legalFor({{S32, S64}, {S16, S32}})
1038 .scalarize(0)
1039 .lower();
1040
1041 getActionDefinitionsBuilder(G_FPEXT)
1042 .legalFor({{S64, S32}, {S32, S16}})
1043 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1044 .scalarize(0);
1045
1046 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1047 if (ST.has16BitInsts()) {
1048 FSubActions
1049 // Use actual fsub instruction
1050 .legalFor({S32, S16})
1051 // Must use fadd + fneg
1052 .lowerFor({S64, V2S16});
1053 } else {
1054 FSubActions
1055 // Use actual fsub instruction
1056 .legalFor({S32})
1057 // Must use fadd + fneg
1058 .lowerFor({S64, S16, V2S16});
1059 }
1060
1061 FSubActions
1062 .scalarize(0)
1063 .clampScalar(0, S32, S64);
1064
1065 // Whether this is legal depends on the floating point mode for the function.
1066 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1067 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1068 FMad.customFor({S32, S16});
1069 else if (ST.hasMadMacF32Insts())
1070 FMad.customFor({S32});
1071 else if (ST.hasMadF16())
1072 FMad.customFor({S16});
1073 FMad.scalarize(0)
1074 .lower();
1075
1076 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1077 if (ST.has16BitInsts()) {
1078 FRem.customFor({S16, S32, S64});
1079 } else {
1080 FRem.minScalar(0, S32)
1081 .customFor({S32, S64});
1082 }
1083 FRem.scalarize(0);
1084
1085 // TODO: Do we need to clamp maximum bitwidth?
1086 getActionDefinitionsBuilder(G_TRUNC)
1087 .legalIf(isScalar(0))
1088 .legalFor({{V2S16, V2S32}})
1089 .clampMaxNumElements(0, S16, 2)
1090 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1091 // situations (like an invalid implicit use), we don't want to infinite loop
1092 // in the legalizer.
1093 .fewerElementsIf(elementTypeIsLegal(0), scalarize(0))
1094 .alwaysLegal();
1095
1096 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1097 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1098 {S32, S1}, {S64, S1}, {S16, S1}})
1099 .scalarize(0)
1100 .clampScalar(0, S32, S64)
1101 .widenScalarToNextPow2(1, 32);
1102
1103 // TODO: Split s1->s64 during regbankselect for VALU.
1104 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1105 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1106 .lowerIf(typeIs(1, S1))
1107 .customFor({{S32, S64}, {S64, S64}});
1108 if (ST.has16BitInsts())
1109 IToFP.legalFor({{S16, S16}});
1110 IToFP.clampScalar(1, S32, S64)
1111 .minScalar(0, S32)
1112 .scalarize(0)
1113 .widenScalarToNextPow2(1);
1114
1115 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1116 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1117 .customFor({{S64, S32}, {S64, S64}})
1118 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1119 if (ST.has16BitInsts())
1120 FPToI.legalFor({{S16, S16}});
1121 else
1122 FPToI.minScalar(1, S32);
1123
1124 FPToI.minScalar(0, S32)
1125 .widenScalarToNextPow2(0, 32)
1126 .scalarize(0)
1127 .lower();
1128
1129 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1130 .customFor({S16, S32})
1131 .scalarize(0)
1132 .lower();
1133
1134 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1135 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1136 .scalarize(0)
1137 .lower();
1138
1139 if (ST.has16BitInsts()) {
1140 getActionDefinitionsBuilder(
1141 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1142 .legalFor({S16, S32, S64})
1143 .clampScalar(0, S16, S64)
1144 .scalarize(0);
1145 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1146 getActionDefinitionsBuilder(
1147 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1148 .legalFor({S32, S64})
1149 .clampScalar(0, S32, S64)
1150 .scalarize(0);
1151 } else {
1152 getActionDefinitionsBuilder(
1153 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1154 .legalFor({S32})
1155 .customFor({S64})
1156 .clampScalar(0, S32, S64)
1157 .scalarize(0);
1158 }
1159
1160 getActionDefinitionsBuilder(G_PTR_ADD)
1161 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1162 .legalIf(all(isPointer(0), sameSize(0, 1)))
1163 .scalarize(0)
1164 .scalarSameSizeAs(1, 0);
1165
1166 getActionDefinitionsBuilder(G_PTRMASK)
1167 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1168 .scalarSameSizeAs(1, 0)
1169 .scalarize(0);
1170
1171 auto &CmpBuilder =
1172 getActionDefinitionsBuilder(G_ICMP)
1173 // The compare output type differs based on the register bank of the output,
1174 // so make both s1 and s32 legal.
1175 //
1176 // Scalar compares producing output in scc will be promoted to s32, as that
1177 // is the allocatable register type that will be needed for the copy from
1178 // scc. This will be promoted during RegBankSelect, and we assume something
1179 // before that won't try to use s32 result types.
1180 //
1181 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1182 // bank.
1183 .legalForCartesianProduct(
1184 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1185 .legalForCartesianProduct(
1186 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1187 if (ST.has16BitInsts()) {
1188 CmpBuilder.legalFor({{S1, S16}});
1189 }
1190
1191 CmpBuilder
1192 .widenScalarToNextPow2(1)
1193 .clampScalar(1, S32, S64)
1194 .scalarize(0)
1195 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1196
1197 auto &FCmpBuilder =
1198 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1199 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1200
1201 if (ST.hasSALUFloatInsts())
1202 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1203
1204 FCmpBuilder
1205 .widenScalarToNextPow2(1)
1206 .clampScalar(1, S32, S64)
1207 .scalarize(0);
1208
1209 // FIXME: fpow has a selection pattern that should move to custom lowering.
1210 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1211 if (ST.has16BitInsts())
1212 ExpOps.customFor({{S32}, {S16}});
1213 else
1214 ExpOps.customFor({S32});
1215 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1216 .scalarize(0);
1217
1218 getActionDefinitionsBuilder(G_FPOWI)
1219 .clampScalar(0, MinScalarFPTy, S32)
1220 .lower();
1221
1222 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1223 Log2Ops.customFor({S32});
1224 if (ST.has16BitInsts())
1225 Log2Ops.legalFor({S16});
1226 else
1227 Log2Ops.customFor({S16});
1228 Log2Ops.scalarize(0)
1229 .lower();
1230
1231 auto &LogOps =
1232 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1233 LogOps.customFor({S32, S16});
1234 LogOps.clampScalar(0, MinScalarFPTy, S32)
1235 .scalarize(0);
1236
1237 // The 64-bit versions produce 32-bit results, but only on the SALU.
1238 getActionDefinitionsBuilder(G_CTPOP)
1239 .legalFor({{S32, S32}, {S32, S64}})
1240 .clampScalar(0, S32, S32)
1241 .widenScalarToNextPow2(1, 32)
1242 .clampScalar(1, S32, S64)
1243 .scalarize(0)
1244 .widenScalarToNextPow2(0, 32);
1245
1246 // If no 16 bit instr is available, lower into different instructions.
1247 if (ST.has16BitInsts())
1248 getActionDefinitionsBuilder(G_IS_FPCLASS)
1249 .legalForCartesianProduct({S1}, FPTypes16)
1250 .widenScalarToNextPow2(1)
1251 .scalarize(0)
1252 .lower();
1253 else
1254 getActionDefinitionsBuilder(G_IS_FPCLASS)
1255 .legalForCartesianProduct({S1}, FPTypesBase)
1256 .lowerFor({S1, S16})
1257 .widenScalarToNextPow2(1)
1258 .scalarize(0)
1259 .lower();
1260
1261 // The hardware instructions return a different result on 0 than the generic
1262 // instructions expect. The hardware produces -1, but these produce the
1263 // bitwidth.
1264 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1265 .scalarize(0)
1266 .clampScalar(0, S32, S32)
1267 .clampScalar(1, S32, S64)
1268 .widenScalarToNextPow2(0, 32)
1269 .widenScalarToNextPow2(1, 32)
1270 .custom();
1271
1272 // The 64-bit versions produce 32-bit results, but only on the SALU.
1273 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1274 .legalFor({{S32, S32}, {S32, S64}})
1275 .clampScalar(0, S32, S32)
1276 .clampScalar(1, S32, S64)
1277 .scalarize(0)
1278 .widenScalarToNextPow2(0, 32)
1279 .widenScalarToNextPow2(1, 32);
1280
1281 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1282 // RegBankSelect.
1283 getActionDefinitionsBuilder(G_BITREVERSE)
1284 .legalFor({S32, S64})
1285 .clampScalar(0, S32, S64)
1286 .scalarize(0)
1287 .widenScalarToNextPow2(0);
1288
1289 if (ST.has16BitInsts()) {
1290 getActionDefinitionsBuilder(G_BSWAP)
1291 .legalFor({S16, S32, V2S16})
1292 .clampMaxNumElementsStrict(0, S16, 2)
1293 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1294 // narrowScalar limitation.
1295 .widenScalarToNextPow2(0)
1296 .clampScalar(0, S16, S32)
1297 .scalarize(0);
1298
1299 if (ST.hasVOP3PInsts()) {
1300 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1301 .legalFor({S32, S16, V2S16})
1302 .clampMaxNumElements(0, S16, 2)
1303 .minScalar(0, S16)
1304 .widenScalarToNextPow2(0)
1305 .scalarize(0)
1306 .lower();
1307 } else {
1308 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1309 .legalFor({S32, S16})
1310 .widenScalarToNextPow2(0)
1311 .minScalar(0, S16)
1312 .scalarize(0)
1313 .lower();
1314 }
1315 } else {
1316 // TODO: Should have same legality without v_perm_b32
1317 getActionDefinitionsBuilder(G_BSWAP)
1318 .legalFor({S32})
1319 .lowerIf(scalarNarrowerThan(0, 32))
1320 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1321 // narrowScalar limitation.
1322 .widenScalarToNextPow2(0)
1323 .maxScalar(0, S32)
1324 .scalarize(0)
1325 .lower();
1326
1327 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1328 .legalFor({S32})
1329 .minScalar(0, S32)
1330 .widenScalarToNextPow2(0)
1331 .scalarize(0)
1332 .lower();
1333 }
1334
1335 getActionDefinitionsBuilder(G_INTTOPTR)
1336 // List the common cases
1337 .legalForCartesianProduct(AddrSpaces64, {S64})
1338 .legalForCartesianProduct(AddrSpaces32, {S32})
1339 .scalarize(0)
1340 // Accept any address space as long as the size matches
1341 .legalIf(sameSize(0, 1))
1342 .widenScalarIf(smallerThan(1, 0),
1343 [](const LegalityQuery &Query) {
1344 return std::pair(
1345 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1346 })
1347 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1348 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1349 });
1350
1351 getActionDefinitionsBuilder(G_PTRTOINT)
1352 // List the common cases
1353 .legalForCartesianProduct(AddrSpaces64, {S64})
1354 .legalForCartesianProduct(AddrSpaces32, {S32})
1355 .scalarize(0)
1356 // Accept any address space as long as the size matches
1357 .legalIf(sameSize(0, 1))
1358 .widenScalarIf(smallerThan(0, 1),
1359 [](const LegalityQuery &Query) {
1360 return std::pair(
1361 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1362 })
1363 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1364 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1365 });
1366
1367 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1368 .scalarize(0)
1369 .custom();
1370
1371 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1372 bool IsLoad) -> bool {
1373 const LLT DstTy = Query.Types[0];
1374
1375 // Split vector extloads.
1376 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1377
1378 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1379 return true;
1380
1381 const LLT PtrTy = Query.Types[1];
1382 unsigned AS = PtrTy.getAddressSpace();
1383 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1384 Query.MMODescrs[0].Ordering !=
1385 AtomicOrdering::NotAtomic))
1386 return true;
1387
1388 // Catch weird sized loads that don't evenly divide into the access sizes
1389 // TODO: May be able to widen depending on alignment etc.
1390 unsigned NumRegs = (MemSize + 31) / 32;
1391 if (NumRegs == 3) {
1392 if (!ST.hasDwordx3LoadStores())
1393 return true;
1394 } else {
1395 // If the alignment allows, these should have been widened.
1396 if (!isPowerOf2_32(NumRegs))
1397 return true;
1398 }
1399
1400 return false;
1401 };
1402
1403 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1404 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1405 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1406
1407 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1408 // LDS
1409 // TODO: Unsupported flat for SI.
1410
1411 for (unsigned Op : {G_LOAD, G_STORE}) {
1412 const bool IsStore = Op == G_STORE;
1413
1414 auto &Actions = getActionDefinitionsBuilder(Op);
1415 // Explicitly list some common cases.
1416 // TODO: Does this help compile time at all?
1417 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1418 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1419 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1420 {S64, GlobalPtr, S64, GlobalAlign32},
1421 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1422 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1423 {S32, GlobalPtr, S8, GlobalAlign8},
1424 {S32, GlobalPtr, S16, GlobalAlign16},
1425
1426 {S32, LocalPtr, S32, 32},
1427 {S64, LocalPtr, S64, 32},
1428 {V2S32, LocalPtr, V2S32, 32},
1429 {S32, LocalPtr, S8, 8},
1430 {S32, LocalPtr, S16, 16},
1431 {V2S16, LocalPtr, S32, 32},
1432
1433 {S32, PrivatePtr, S32, 32},
1434 {S32, PrivatePtr, S8, 8},
1435 {S32, PrivatePtr, S16, 16},
1436 {V2S16, PrivatePtr, S32, 32},
1437
1438 {S32, ConstantPtr, S32, GlobalAlign32},
1439 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1440 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1441 {S64, ConstantPtr, S64, GlobalAlign32},
1442 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1443 Actions.legalIf(
1444 [=](const LegalityQuery &Query) -> bool {
1445 return isLoadStoreLegal(ST, Query);
1446 });
1447
1448 // The custom pointers (fat pointers, buffer resources) don't work with load
1449 // and store at this level. Fat pointers should have been lowered to
1450 // intrinsics before the translation to MIR.
1451 Actions.unsupportedIf(
1452 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1453
1454 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1455 // ptrtoint. This is needed to account for the fact that we can't have i128
1456 // as a register class for SelectionDAG reasons.
1457 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1458 return hasBufferRsrcWorkaround(Query.Types[0]);
1459 });
1460
1461 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1462 // 64-bits.
1463 //
1464 // TODO: Should generalize bitcast action into coerce, which will also cover
1465 // inserting addrspacecasts.
1466 Actions.customIf(typeIs(1, Constant32Ptr));
1467
1468 // Turn any illegal element vectors into something easier to deal
1469 // with. These will ultimately produce 32-bit scalar shifts to extract the
1470 // parts anyway.
1471 //
1472 // For odd 16-bit element vectors, prefer to split those into pieces with
1473 // 16-bit vector parts.
1474 Actions.bitcastIf(
1475 [=](const LegalityQuery &Query) -> bool {
1476 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1477 Query.MMODescrs[0].MemoryTy);
1478 }, bitcastToRegisterType(0));
1479
1480 if (!IsStore) {
1481 // Widen suitably aligned loads by loading extra bytes. The standard
1482 // legalization actions can't properly express widening memory operands.
1483 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1484 return shouldWidenLoad(ST, Query, G_LOAD);
1485 });
1486 }
1487
1488 // FIXME: load/store narrowing should be moved to lower action
1489 Actions
1490 .narrowScalarIf(
1491 [=](const LegalityQuery &Query) -> bool {
1492 return !Query.Types[0].isVector() &&
1493 needToSplitMemOp(Query, Op == G_LOAD);
1494 },
1495 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1496 const LLT DstTy = Query.Types[0];
1497 const LLT PtrTy = Query.Types[1];
1498
1499 const unsigned DstSize = DstTy.getSizeInBits();
1500 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1501
1502 // Split extloads.
1503 if (DstSize > MemSize)
1504 return std::pair(0, LLT::scalar(MemSize));
1505
1506 unsigned MaxSize = maxSizeForAddrSpace(
1507 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1508 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1509 if (MemSize > MaxSize)
1510 return std::pair(0, LLT::scalar(MaxSize));
1511
1512 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1513 return std::pair(0, LLT::scalar(Align));
1514 })
1515 .fewerElementsIf(
1516 [=](const LegalityQuery &Query) -> bool {
1517 return Query.Types[0].isVector() &&
1518 needToSplitMemOp(Query, Op == G_LOAD);
1519 },
1520 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1521 const LLT DstTy = Query.Types[0];
1522 const LLT PtrTy = Query.Types[1];
1523
1524 LLT EltTy = DstTy.getElementType();
1525 unsigned MaxSize = maxSizeForAddrSpace(
1526 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1527 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1528
1529 // FIXME: Handle widened to power of 2 results better. This ends
1530 // up scalarizing.
1531 // FIXME: 3 element stores scalarized on SI
1532
1533 // Split if it's too large for the address space.
1534 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1535 if (MemSize > MaxSize) {
1536 unsigned NumElts = DstTy.getNumElements();
1537 unsigned EltSize = EltTy.getSizeInBits();
1538
1539 if (MaxSize % EltSize == 0) {
1540 return std::pair(
1541 0, LLT::scalarOrVector(
1542 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1543 }
1544
1545 unsigned NumPieces = MemSize / MaxSize;
1546
1547 // FIXME: Refine when odd breakdowns handled
1548 // The scalars will need to be re-legalized.
1549 if (NumPieces == 1 || NumPieces >= NumElts ||
1550 NumElts % NumPieces != 0)
1551 return std::pair(0, EltTy);
1552
1553 return std::pair(0,
1554 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1555 }
1556
1557 // FIXME: We could probably handle weird extending loads better.
1558 if (DstTy.getSizeInBits() > MemSize)
1559 return std::pair(0, EltTy);
1560
1561 unsigned EltSize = EltTy.getSizeInBits();
1562 unsigned DstSize = DstTy.getSizeInBits();
1563 if (!isPowerOf2_32(DstSize)) {
1564 // We're probably decomposing an odd sized store. Try to split
1565 // to the widest type. TODO: Account for alignment. As-is it
1566 // should be OK, since the new parts will be further legalized.
1567 unsigned FloorSize = llvm::bit_floor(DstSize);
1568 return std::pair(
1569 0, LLT::scalarOrVector(
1570 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1571 }
1572
1573 // May need relegalization for the scalars.
1574 return std::pair(0, EltTy);
1575 })
1576 .minScalar(0, S32)
1577 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1578 .widenScalarToNextPow2(0)
1579 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1580 .lower();
1581 }
1582
1583 // FIXME: Unaligned accesses not lowered.
1584 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1585 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1586 {S32, GlobalPtr, S16, 2 * 8},
1587 {S32, LocalPtr, S8, 8},
1588 {S32, LocalPtr, S16, 16},
1589 {S32, PrivatePtr, S8, 8},
1590 {S32, PrivatePtr, S16, 16},
1591 {S32, ConstantPtr, S8, 8},
1592 {S32, ConstantPtr, S16, 2 * 8}})
1593 .legalIf(
1594 [=](const LegalityQuery &Query) -> bool {
1595 return isLoadStoreLegal(ST, Query);
1596 });
1597
1598 if (ST.hasFlatAddressSpace()) {
1599 ExtLoads.legalForTypesWithMemDesc(
1600 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1601 }
1602
1603 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1604 // 64-bits.
1605 //
1606 // TODO: Should generalize bitcast action into coerce, which will also cover
1607 // inserting addrspacecasts.
1608 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1609
1610 ExtLoads.clampScalar(0, S32, S32)
1611 .widenScalarToNextPow2(0)
1612 .lower();
1613
1614 auto &Atomics = getActionDefinitionsBuilder(
1615 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1616 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1617 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1618 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1619 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1620 {S64, GlobalPtr}, {S64, LocalPtr},
1621 {S32, RegionPtr}, {S64, RegionPtr}});
1622 if (ST.hasFlatAddressSpace()) {
1623 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1624 }
1625
1626 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1627 if (ST.hasLDSFPAtomicAddF32()) {
1628 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1629 if (ST.hasLdsAtomicAddF64())
1630 Atomic.legalFor({{S64, LocalPtr}});
1631 if (ST.hasAtomicDsPkAdd16Insts())
1632 Atomic.legalFor({{V2S16, LocalPtr}});
1633 }
1634 if (ST.hasAtomicFaddInsts())
1635 Atomic.legalFor({{S32, GlobalPtr}});
1636 if (ST.hasFlatAtomicFaddF32Inst())
1637 Atomic.legalFor({{S32, FlatPtr}});
1638
1639 if (ST.hasGFX90AInsts()) {
1640 // These are legal with some caveats, and should have undergone expansion in
1641 // the IR in most situations
1642 // TODO: Move atomic expansion into legalizer
1643 Atomic.legalFor({
1644 {S32, GlobalPtr},
1645 {S64, GlobalPtr},
1646 {S64, FlatPtr}
1647 });
1648 }
1649
1650 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1651 // demarshalling
1652 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1653 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1654 {S32, FlatPtr}, {S64, FlatPtr}})
1655 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1656 {S32, RegionPtr}, {S64, RegionPtr}});
1657 // TODO: Pointer types, any 32-bit or 64-bit vector
1658
1659 // Condition should be s32 for scalar, s1 for vector.
1660 getActionDefinitionsBuilder(G_SELECT)
1661 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1662 LocalPtr, FlatPtr, PrivatePtr,
1663 LLT::fixed_vector(2, LocalPtr),
1664 LLT::fixed_vector(2, PrivatePtr)},
1665 {S1, S32})
1666 .clampScalar(0, S16, S64)
1667 .scalarize(1)
1668 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1669 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1670 .clampMaxNumElements(0, S32, 2)
1671 .clampMaxNumElements(0, LocalPtr, 2)
1672 .clampMaxNumElements(0, PrivatePtr, 2)
1673 .scalarize(0)
1674 .widenScalarToNextPow2(0)
1675 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1676
1677 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1678 // be more flexible with the shift amount type.
1679 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1680 .legalFor({{S32, S32}, {S64, S32}});
1681 if (ST.has16BitInsts()) {
1682 if (ST.hasVOP3PInsts()) {
1683 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1684 .clampMaxNumElements(0, S16, 2);
1685 } else
1686 Shifts.legalFor({{S16, S16}});
1687
1688 // TODO: Support 16-bit shift amounts for all types
1689 Shifts.widenScalarIf(
1690 [=](const LegalityQuery &Query) {
1691 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1692 // 32-bit amount.
1693 const LLT ValTy = Query.Types[0];
1694 const LLT AmountTy = Query.Types[1];
1695 return ValTy.getSizeInBits() <= 16 &&
1696 AmountTy.getSizeInBits() < 16;
1697 }, changeTo(1, S16));
1698 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1699 Shifts.clampScalar(1, S32, S32);
1700 Shifts.widenScalarToNextPow2(0, 16);
1701 Shifts.clampScalar(0, S16, S64);
1702
1703 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1704 .minScalar(0, S16)
1705 .scalarize(0)
1706 .lower();
1707 } else {
1708 // Make sure we legalize the shift amount type first, as the general
1709 // expansion for the shifted type will produce much worse code if it hasn't
1710 // been truncated already.
1711 Shifts.clampScalar(1, S32, S32);
1712 Shifts.widenScalarToNextPow2(0, 32);
1713 Shifts.clampScalar(0, S32, S64);
1714
1715 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1716 .minScalar(0, S32)
1717 .scalarize(0)
1718 .lower();
1719 }
1720 Shifts.scalarize(0);
1721
1722 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1723 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1724 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1725 unsigned IdxTypeIdx = 2;
1726
1727 getActionDefinitionsBuilder(Op)
1728 .customIf([=](const LegalityQuery &Query) {
1729 const LLT EltTy = Query.Types[EltTypeIdx];
1730 const LLT VecTy = Query.Types[VecTypeIdx];
1731 const LLT IdxTy = Query.Types[IdxTypeIdx];
1732 const unsigned EltSize = EltTy.getSizeInBits();
1733 const bool isLegalVecType =
1734 !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
1735 // Address space 8 pointers are 128-bit wide values, but the logic
1736 // below will try to bitcast them to 2N x s64, which will fail.
1737 // Therefore, as an intermediate step, wrap extracts/insertions from a
1738 // ptrtoint-ing the vector and scalar arguments (or inttoptring the
1739 // extraction result) in order to produce a vector operation that can
1740 // be handled by the logic below.
1741 if (EltTy.isPointer() && EltSize > 64)
1742 return true;
1743 return (EltSize == 32 || EltSize == 64) &&
1744 VecTy.getSizeInBits() % 32 == 0 &&
1745 VecTy.getSizeInBits() <= MaxRegisterSize &&
1746 IdxTy.getSizeInBits() == 32 &&
1747 isLegalVecType;
1748 })
1749 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1750 bitcastToVectorElement32(VecTypeIdx))
1751 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1752 .bitcastIf(
1753 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1754 [=](const LegalityQuery &Query) {
1755 // For > 64-bit element types, try to turn this into a 64-bit
1756 // element vector since we may be able to do better indexing
1757 // if this is scalar. If not, fall back to 32.
1758 const LLT EltTy = Query.Types[EltTypeIdx];
1759 const LLT VecTy = Query.Types[VecTypeIdx];
1760 const unsigned DstEltSize = EltTy.getSizeInBits();
1761 const unsigned VecSize = VecTy.getSizeInBits();
1762
1763 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1764 return std::pair(
1765 VecTypeIdx,
1766 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1767 })
1768 .clampScalar(EltTypeIdx, S32, S64)
1769 .clampScalar(VecTypeIdx, S32, S64)
1770 .clampScalar(IdxTypeIdx, S32, S32)
1771 .clampMaxNumElements(VecTypeIdx, S32, 32)
1772 // TODO: Clamp elements for 64-bit vectors?
1773 .moreElementsIf(
1774 isIllegalRegisterType(VecTypeIdx),
1776 // It should only be necessary with variable indexes.
1777 // As a last resort, lower to the stack
1778 .lower();
1779 }
1780
1781 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1782 .unsupportedIf([=](const LegalityQuery &Query) {
1783 const LLT &EltTy = Query.Types[1].getElementType();
1784 return Query.Types[0] != EltTy;
1785 });
1786
1787 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1788 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1789 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1790
1791 // FIXME: Doesn't handle extract of illegal sizes.
1792 getActionDefinitionsBuilder(Op)
1793 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1794 .lowerIf([=](const LegalityQuery &Query) {
1795 // Sub-vector (or single-element) insert and extract.
1796 // TODO: verify immediate offset here since lower only works with
1797 // whole elements.
1798 const LLT BigTy = Query.Types[BigTyIdx];
1799 return BigTy.isVector();
1800 })
1801 // FIXME: Multiples of 16 should not be legal.
1802 .legalIf([=](const LegalityQuery &Query) {
1803 const LLT BigTy = Query.Types[BigTyIdx];
1804 const LLT LitTy = Query.Types[LitTyIdx];
1805 return (BigTy.getSizeInBits() % 32 == 0) &&
1806 (LitTy.getSizeInBits() % 16 == 0);
1807 })
1808 .widenScalarIf(
1809 [=](const LegalityQuery &Query) {
1810 const LLT BigTy = Query.Types[BigTyIdx];
1811 return (BigTy.getScalarSizeInBits() < 16);
1812 },
1814 .widenScalarIf(
1815 [=](const LegalityQuery &Query) {
1816 const LLT LitTy = Query.Types[LitTyIdx];
1817 return (LitTy.getScalarSizeInBits() < 16);
1818 },
1820 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1821 .widenScalarToNextPow2(BigTyIdx, 32);
1822
1823 }
1824
1825 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1826 .legalForCartesianProduct(AllS32Vectors, {S32})
1827 .legalForCartesianProduct(AllS64Vectors, {S64})
1828 .clampNumElements(0, V16S32, V32S32)
1829 .clampNumElements(0, V2S64, V16S64)
1830 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1831 .moreElementsIf(
1834
1835 if (ST.hasScalarPackInsts()) {
1836 BuildVector
1837 // FIXME: Should probably widen s1 vectors straight to s32
1838 .minScalarOrElt(0, S16)
1839 .minScalar(1, S16);
1840
1841 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1842 .legalFor({V2S16, S32})
1843 .lower();
1844 } else {
1845 BuildVector.customFor({V2S16, S16});
1846 BuildVector.minScalarOrElt(0, S32);
1847
1848 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1849 .customFor({V2S16, S32})
1850 .lower();
1851 }
1852
1853 BuildVector.legalIf(isRegisterType(0));
1854
1855 // FIXME: Clamp maximum size
1856 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1857 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1858 .clampMaxNumElements(0, S32, 32)
1859 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1860 .clampMaxNumElements(0, S16, 64);
1861
1862 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1863
1864 // Merge/Unmerge
1865 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1866 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1867 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1868
1869 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1870 const LLT Ty = Query.Types[TypeIdx];
1871 if (Ty.isVector()) {
1872 const LLT &EltTy = Ty.getElementType();
1873 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1874 return true;
1875 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1876 return true;
1877 }
1878 return false;
1879 };
1880
1881 auto &Builder = getActionDefinitionsBuilder(Op)
1882 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1883 .lowerFor({{S16, V2S16}})
1884 .lowerIf([=](const LegalityQuery &Query) {
1885 const LLT BigTy = Query.Types[BigTyIdx];
1886 return BigTy.getSizeInBits() == 32;
1887 })
1888 // Try to widen to s16 first for small types.
1889 // TODO: Only do this on targets with legal s16 shifts
1890 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1891 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1892 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1893 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1894 elementTypeIs(1, S16)),
1895 changeTo(1, V2S16))
1896 // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1897 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1898 // valid.
1899 .clampScalar(LitTyIdx, S32, S512)
1900 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1901 // Break up vectors with weird elements into scalars
1902 .fewerElementsIf(
1903 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1904 scalarize(0))
1905 .fewerElementsIf(
1906 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1907 scalarize(1))
1908 .clampScalar(BigTyIdx, S32, MaxScalar);
1909
1910 if (Op == G_MERGE_VALUES) {
1911 Builder.widenScalarIf(
1912 // TODO: Use 16-bit shifts if legal for 8-bit values?
1913 [=](const LegalityQuery &Query) {
1914 const LLT Ty = Query.Types[LitTyIdx];
1915 return Ty.getSizeInBits() < 32;
1916 },
1917 changeTo(LitTyIdx, S32));
1918 }
1919
1920 Builder.widenScalarIf(
1921 [=](const LegalityQuery &Query) {
1922 const LLT Ty = Query.Types[BigTyIdx];
1923 return Ty.getSizeInBits() % 16 != 0;
1924 },
1925 [=](const LegalityQuery &Query) {
1926 // Pick the next power of 2, or a multiple of 64 over 128,
1927 // whichever is smaller.
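// For example: a 96-bit type rounds up to 128 bits (the next power of 2),
// while a 160-bit type rounds to 192 and a 272-bit type to 320, since once
// the power of 2 reaches 256 the next multiple of 64 is smaller.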
1928 const LLT &Ty = Query.Types[BigTyIdx];
1929 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1930 if (NewSizeInBits >= 256) {
1931 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1932 if (RoundedTo < NewSizeInBits)
1933 NewSizeInBits = RoundedTo;
1934 }
1935 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1936 })
1937 // Any vectors left are the wrong size. Scalarize them.
1938 .scalarize(0)
1939 .scalarize(1);
1940 }
1941
1942 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1943 // RegBankSelect.
1944 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1945 .legalFor({{S32}, {S64}});
1946
1947 if (ST.hasVOP3PInsts()) {
1948 SextInReg.lowerFor({{V2S16}})
1949 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1950 // get more vector shift opportunities, since we'll get those when
1951 // expanded.
1952 .clampMaxNumElementsStrict(0, S16, 2);
1953 } else if (ST.has16BitInsts()) {
1954 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1955 } else {
1956 // Prefer to promote to s32 before lowering if we don't have 16-bit
1957 // shifts. This avoids a lot of intermediate truncate and extend operations.
1958 SextInReg.lowerFor({{S32}, {S64}});
1959 }
1960
1961 SextInReg
1962 .scalarize(0)
1963 .clampScalar(0, S32, S64)
1964 .lower();
1965
1966 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1967 .scalarize(0)
1968 .lower();
1969
1970 // TODO: Only try to form v2s16 with legal packed instructions.
1971 getActionDefinitionsBuilder(G_FSHR)
1972 .legalFor({{S32, S32}})
1973 .lowerFor({{V2S16, V2S16}})
1974 .clampMaxNumElementsStrict(0, S16, 2)
1975 .scalarize(0)
1976 .lower();
1977
1978 if (ST.hasVOP3PInsts()) {
1979 getActionDefinitionsBuilder(G_FSHL)
1980 .lowerFor({{V2S16, V2S16}})
1981 .clampMaxNumElementsStrict(0, S16, 2)
1982 .scalarize(0)
1983 .lower();
1984 } else {
1985 getActionDefinitionsBuilder(G_FSHL)
1986 .scalarize(0)
1987 .lower();
1988 }
1989
1990 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1991 .legalFor({S64});
1992
1993 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
1994
1995 getActionDefinitionsBuilder(G_FENCE)
1996 .alwaysLegal();
1997
1998 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1999 .scalarize(0)
2000 .minScalar(0, S32)
2001 .lower();
2002
2003 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2004 .legalFor({{S32, S32}, {S64, S32}})
2005 .clampScalar(1, S32, S32)
2006 .clampScalar(0, S32, S64)
2007 .widenScalarToNextPow2(0)
2008 .scalarize(0);
2009
2010 getActionDefinitionsBuilder(
2011 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2012 G_FCOPYSIGN,
2013
2014 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2015 G_READ_REGISTER, G_WRITE_REGISTER,
2016
2017 G_SADDO, G_SSUBO})
2018 .lower();
2019
2020 if (ST.hasIEEEMinMax()) {
2021 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2022 .legalFor(FPTypesPK16)
2023 .clampMaxNumElements(0, S16, 2)
2024 .scalarize(0);
2025 } else {
2026 // TODO: Implement
2027 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
2028 }
2029
2030 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2031 .lower();
2032
2033 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2034
2035 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2036 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2037 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2038 .unsupported();
2039
2040 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2041
2042 getLegacyLegalizerInfo().computeTables();
2043 verify(*ST.getInstrInfo());
2044}
2045
2048 LostDebugLocObserver &LocObserver) const {
2049 MachineIRBuilder &B = Helper.MIRBuilder;
2050 MachineRegisterInfo &MRI = *B.getMRI();
2051
2052 switch (MI.getOpcode()) {
2053 case TargetOpcode::G_ADDRSPACE_CAST:
2054 return legalizeAddrSpaceCast(MI, MRI, B);
2055 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2056 return legalizeFroundeven(MI, MRI, B);
2057 case TargetOpcode::G_FCEIL:
2058 return legalizeFceil(MI, MRI, B);
2059 case TargetOpcode::G_FREM:
2060 return legalizeFrem(MI, MRI, B);
2061 case TargetOpcode::G_INTRINSIC_TRUNC:
2062 return legalizeIntrinsicTrunc(MI, MRI, B);
2063 case TargetOpcode::G_SITOFP:
2064 return legalizeITOFP(MI, MRI, B, true);
2065 case TargetOpcode::G_UITOFP:
2066 return legalizeITOFP(MI, MRI, B, false);
2067 case TargetOpcode::G_FPTOSI:
2068 return legalizeFPTOI(MI, MRI, B, true);
2069 case TargetOpcode::G_FPTOUI:
2070 return legalizeFPTOI(MI, MRI, B, false);
2071 case TargetOpcode::G_FMINNUM:
2072 case TargetOpcode::G_FMAXNUM:
2073 case TargetOpcode::G_FMINNUM_IEEE:
2074 case TargetOpcode::G_FMAXNUM_IEEE:
2075 return legalizeMinNumMaxNum(Helper, MI);
2076 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2077 return legalizeExtractVectorElt(MI, MRI, B);
2078 case TargetOpcode::G_INSERT_VECTOR_ELT:
2079 return legalizeInsertVectorElt(MI, MRI, B);
2080 case TargetOpcode::G_FSIN:
2081 case TargetOpcode::G_FCOS:
2082 return legalizeSinCos(MI, MRI, B);
2083 case TargetOpcode::G_GLOBAL_VALUE:
2084 return legalizeGlobalValue(MI, MRI, B);
2085 case TargetOpcode::G_LOAD:
2086 case TargetOpcode::G_SEXTLOAD:
2087 case TargetOpcode::G_ZEXTLOAD:
2088 return legalizeLoad(Helper, MI);
2089 case TargetOpcode::G_STORE:
2090 return legalizeStore(Helper, MI);
2091 case TargetOpcode::G_FMAD:
2092 return legalizeFMad(MI, MRI, B);
2093 case TargetOpcode::G_FDIV:
2094 return legalizeFDIV(MI, MRI, B);
2095 case TargetOpcode::G_FFREXP:
2096 return legalizeFFREXP(MI, MRI, B);
2097 case TargetOpcode::G_FSQRT:
2098 return legalizeFSQRT(MI, MRI, B);
2099 case TargetOpcode::G_UDIV:
2100 case TargetOpcode::G_UREM:
2101 case TargetOpcode::G_UDIVREM:
2102 return legalizeUnsignedDIV_REM(MI, MRI, B);
2103 case TargetOpcode::G_SDIV:
2104 case TargetOpcode::G_SREM:
2105 case TargetOpcode::G_SDIVREM:
2106 return legalizeSignedDIV_REM(MI, MRI, B);
2107 case TargetOpcode::G_ATOMIC_CMPXCHG:
2108 return legalizeAtomicCmpXChg(MI, MRI, B);
2109 case TargetOpcode::G_FLOG2:
2110 return legalizeFlog2(MI, B);
2111 case TargetOpcode::G_FLOG:
2112 case TargetOpcode::G_FLOG10:
2113 return legalizeFlogCommon(MI, B);
2114 case TargetOpcode::G_FEXP2:
2115 return legalizeFExp2(MI, B);
2116 case TargetOpcode::G_FEXP:
2117 case TargetOpcode::G_FEXP10:
2118 return legalizeFExp(MI, B);
2119 case TargetOpcode::G_FPOW:
2120 return legalizeFPow(MI, B);
2121 case TargetOpcode::G_FFLOOR:
2122 return legalizeFFloor(MI, MRI, B);
2123 case TargetOpcode::G_BUILD_VECTOR:
2124 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2125 return legalizeBuildVector(MI, MRI, B);
2126 case TargetOpcode::G_MUL:
2127 return legalizeMul(Helper, MI);
2128 case TargetOpcode::G_CTLZ:
2129 case TargetOpcode::G_CTTZ:
2130 return legalizeCTLZ_CTTZ(MI, MRI, B);
2131 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2132 return legalizeFPTruncRound(MI, B);
2133 case TargetOpcode::G_STACKSAVE:
2134 return legalizeStackSave(MI, B);
2135 case TargetOpcode::G_GET_FPENV:
2136 return legalizeGetFPEnv(MI, MRI, B);
2137 case TargetOpcode::G_SET_FPENV:
2138 return legalizeSetFPEnv(MI, MRI, B);
2139 case TargetOpcode::G_TRAP:
2140 return legalizeTrap(MI, MRI, B);
2141 case TargetOpcode::G_DEBUGTRAP:
2142 return legalizeDebugTrap(MI, MRI, B);
2143 default:
2144 return false;
2145 }
2146
2147 llvm_unreachable("expected switch to return");
2148}
2149
2151 unsigned AS,
2153 MachineIRBuilder &B) const {
2154 MachineFunction &MF = B.getMF();
2155 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2156 const LLT S32 = LLT::scalar(32);
2157 const LLT S64 = LLT::scalar(64);
2158
2160
2161 if (ST.hasApertureRegs()) {
2162 // Note: this register is somewhat broken. When used as a 32-bit operand,
2163 // it only returns zeroes. The real value is in the upper 32 bits.
2164 // Thus, we must extract the high 32 bits.
2165 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2166 ? AMDGPU::SRC_SHARED_BASE
2167 : AMDGPU::SRC_PRIVATE_BASE;
2168 // FIXME: It would be more natural to emit a COPY here, but then copy
2169 // coalescing would kick in and it would think it's okay to use the "HI"
2170 // subregister (instead of extracting the HI 32 bits) which is an artificial
2171 // (unusable) register.
2172 // Register TableGen definitions would need an overhaul to get rid of the
2173 // artificial "HI" aperture registers and prevent this kind of issue from
2174 // happening.
2175 Register Dst = MRI.createGenericVirtualRegister(S64);
2176 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2177 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2178 return B.buildUnmerge(S32, Dst).getReg(1);
2179 }
2180
2181 // TODO: can we be smarter about machine pointer info?
2183 Register LoadAddr = MRI.createGenericVirtualRegister(
2185 // For code object version 5, private_base and shared_base are passed through
2186 // implicit kernargs.
2193 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2194
2195 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2197
2198 if (!loadInputValue(KernargPtrReg, B,
2200 return Register();
2201
2203 PtrInfo,
2207
2208 // Pointer address
2209 B.buildPtrAdd(LoadAddr, KernargPtrReg,
2210 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2211 // Load address
2212 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2213 }
2214
2215 Register QueuePtr = MRI.createGenericVirtualRegister(
2217
2219 return Register();
2220
2221 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2222 // private_segment_aperture_base_hi.
2223 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2224
2226 PtrInfo,
2229 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2230
2231 B.buildPtrAdd(LoadAddr, QueuePtr,
2232 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2233 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2234}
2235
2236/// Return true if the value is a known valid address, such that a null check is
2237/// not necessary.
2239 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2240 MachineInstr *Def = MRI.getVRegDef(Val);
2241 switch (Def->getOpcode()) {
2242 case AMDGPU::G_FRAME_INDEX:
2243 case AMDGPU::G_GLOBAL_VALUE:
2244 case AMDGPU::G_BLOCK_ADDR:
2245 return true;
2246 case AMDGPU::G_CONSTANT: {
2247 const ConstantInt *CI = Def->getOperand(1).getCImm();
2248 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2249 }
2250 default:
2251 return false;
2252 }
2253
2254 return false;
2255}
2256
2259 MachineIRBuilder &B) const {
2260 MachineFunction &MF = B.getMF();
2261
2262 // MI can either be a G_ADDRSPACE_CAST or a
2263 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2264 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2265 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2266 Intrinsic::amdgcn_addrspacecast_nonnull));
2267
2268 const LLT S32 = LLT::scalar(32);
2269 Register Dst = MI.getOperand(0).getReg();
2270 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2271 : MI.getOperand(1).getReg();
2272 LLT DstTy = MRI.getType(Dst);
2273 LLT SrcTy = MRI.getType(Src);
2274 unsigned DestAS = DstTy.getAddressSpace();
2275 unsigned SrcAS = SrcTy.getAddressSpace();
2276
2277 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2278 // vector element.
2279 assert(!DstTy.isVector());
2280
2281 const AMDGPUTargetMachine &TM
2282 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2283
2284 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2285 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2286 return true;
2287 }
2288
2289 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2290 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2291 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2292 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2293 // G_ADDRSPACE_CAST we need to guess.
2294 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2295 // Extract low 32-bits of the pointer.
2296 B.buildExtract(Dst, Src, 0);
2297 MI.eraseFromParent();
2298 return true;
2299 }
2300
2301 unsigned NullVal = TM.getNullPointerValue(DestAS);
2302
2303 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2304 auto FlatNull = B.buildConstant(SrcTy, 0);
2305
2306 // Extract low 32-bits of the pointer.
2307 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2308
2309 auto CmpRes =
2310 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2311 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2312
2313 MI.eraseFromParent();
2314 return true;
2315 }
2316
2317 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2318 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2319 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2320 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2321 if (!ApertureReg.isValid())
2322 return false;
2323
2324 // Coerce the type of the low half of the result so we can use merge_values.
2325 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2326
2327 // TODO: Should we allow mismatched types but matching sizes in merges to
2328 // avoid the ptrtoint?
2329 auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
2330
2331 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2332 // G_ADDRSPACE_CAST we need to guess.
2333 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2334 B.buildCopy(Dst, BuildPtr);
2335 MI.eraseFromParent();
2336 return true;
2337 }
2338
2339 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2340 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2341
2342 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2343 SegmentNull.getReg(0));
2344
2345 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2346
2347 MI.eraseFromParent();
2348 return true;
2349 }
2350
2351 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2352 SrcTy.getSizeInBits() == 64) {
2353 // Truncate.
2354 B.buildExtract(Dst, Src, 0);
2355 MI.eraseFromParent();
2356 return true;
2357 }
2358
2359 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2360 DstTy.getSizeInBits() == 64) {
2362 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2363 auto PtrLo = B.buildPtrToInt(S32, Src);
2364 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2365 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2366 MI.eraseFromParent();
2367 return true;
2368 }
2369
2370 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2371 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2372
2373 LLVMContext &Ctx = MF.getFunction().getContext();
2374 Ctx.diagnose(InvalidAddrSpaceCast);
2375 B.buildUndef(Dst);
2376 MI.eraseFromParent();
2377 return true;
2378}
2379
2382 MachineIRBuilder &B) const {
2383 Register Src = MI.getOperand(1).getReg();
2384 LLT Ty = MRI.getType(Src);
2385 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2386
2387 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2388 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2389
2390 auto C1 = B.buildFConstant(Ty, C1Val);
2391 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2392
2393 // TODO: Should this propagate fast-math-flags?
2394 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2395 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2396
2397 auto C2 = B.buildFConstant(Ty, C2Val);
2398 auto Fabs = B.buildFAbs(Ty, Src);
2399
2400 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2401 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2402 MI.eraseFromParent();
2403 return true;
2404}
2405
2408 MachineIRBuilder &B) const {
2409
2410 const LLT S1 = LLT::scalar(1);
2411 const LLT S64 = LLT::scalar(64);
2412
2413 Register Src = MI.getOperand(1).getReg();
2414 assert(MRI.getType(Src) == S64);
2415
2416 // result = trunc(src)
2417 // if (src > 0.0 && src != result)
2418 // result += 1.0
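// For example: src = 2.25 truncates to 2.0; since src > 0.0 and
// src != result, 1.0 is added and ceil(2.25) = 3.0. For src = -2.25 the
// truncation -2.0 already equals ceil(-2.25), and the condition is false.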
2419
2420 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2421
2422 const auto Zero = B.buildFConstant(S64, 0.0);
2423 const auto One = B.buildFConstant(S64, 1.0);
2424 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2425 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2426 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2427 auto Add = B.buildSelect(S64, And, One, Zero);
2428
2429 // TODO: Should this propagate fast-math-flags?
2430 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2431 MI.eraseFromParent();
2432 return true;
2433}
2434
2437 MachineIRBuilder &B) const {
2438 Register DstReg = MI.getOperand(0).getReg();
2439 Register Src0Reg = MI.getOperand(1).getReg();
2440 Register Src1Reg = MI.getOperand(2).getReg();
2441 auto Flags = MI.getFlags();
2442 LLT Ty = MRI.getType(DstReg);
2443
2444 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2445 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2446 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2447 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2448 MI.eraseFromParent();
2449 return true;
2450}
2451
2454 const unsigned FractBits = 52;
2455 const unsigned ExpBits = 11;
2456 LLT S32 = LLT::scalar(32);
2457
2458 auto Const0 = B.buildConstant(S32, FractBits - 32);
2459 auto Const1 = B.buildConstant(S32, ExpBits);
2460
2461 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2462 .addUse(Hi)
2463 .addUse(Const0.getReg(0))
2464 .addUse(Const1.getReg(0));
2465
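// The ubfe extracts the 11 exponent bits starting at bit 20 of the high
// word (bit 52 of the f64); subtracting 1023 then removes the IEEE-754
// exponent bias.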
2466 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2467}
2468
2471 MachineIRBuilder &B) const {
2472 const LLT S1 = LLT::scalar(1);
2473 const LLT S32 = LLT::scalar(32);
2474 const LLT S64 = LLT::scalar(64);
2475
2476 Register Src = MI.getOperand(1).getReg();
2477 assert(MRI.getType(Src) == S64);
2478
2479 // TODO: Should this use extract since the low half is unused?
2480 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2481 Register Hi = Unmerge.getReg(1);
2482
2483 // Extract the upper half, since this is where we will find the sign and
2484 // exponent.
2485 auto Exp = extractF64Exponent(Hi, B);
2486
2487 const unsigned FractBits = 52;
2488
2489 // Extract the sign bit.
2490 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2491 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2492
2493 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2494
2495 const auto Zero32 = B.buildConstant(S32, 0);
2496
2497 // Extend back to 64-bits.
2498 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2499
2500 auto Shr = B.buildAShr(S64, FractMask, Exp);
2501 auto Not = B.buildNot(S64, Shr);
2502 auto Tmp0 = B.buildAnd(S64, Src, Not);
2503 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2504
2505 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2506 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2507
2508 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2509 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2510 MI.eraseFromParent();
2511 return true;
2512}
2513
2516 MachineIRBuilder &B, bool Signed) const {
2517
2518 Register Dst = MI.getOperand(0).getReg();
2519 Register Src = MI.getOperand(1).getReg();
2520
2521 const LLT S64 = LLT::scalar(64);
2522 const LLT S32 = LLT::scalar(32);
2523
2524 assert(MRI.getType(Src) == S64);
2525
2526 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2527 auto ThirtyTwo = B.buildConstant(S32, 32);
2528
2529 if (MRI.getType(Dst) == S64) {
2530 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2531 : B.buildUITOFP(S64, Unmerge.getReg(1));
2532
2533 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2534 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2535
2536 // TODO: Should this propagate fast-math-flags?
2537 B.buildFAdd(Dst, LdExp, CvtLo);
2538 MI.eraseFromParent();
2539 return true;
2540 }
2541
2542 assert(MRI.getType(Dst) == S32);
2543
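// For the 32-bit result: shift the 64-bit input left (based on its leading
// zero / redundant sign bit count) so the significant bits land in the high
// half, OR a sticky bit (umin(1, low half)) into that word so the
// conversion's rounding still sees any bits left in the low half, convert
// it, and rescale with ldexp by (32 - shift amount).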
2544 auto One = B.buildConstant(S32, 1);
2545
2546 MachineInstrBuilder ShAmt;
2547 if (Signed) {
2548 auto ThirtyOne = B.buildConstant(S32, 31);
2549 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2550 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2551 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2552 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2553 .addUse(Unmerge.getReg(1));
2554 auto LS2 = B.buildSub(S32, LS, One);
2555 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2556 } else
2557 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2558 auto Norm = B.buildShl(S64, Src, ShAmt);
2559 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2560 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2561 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2562 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2563 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2564 B.buildFLdexp(Dst, FVal, Scale);
2565 MI.eraseFromParent();
2566 return true;
2567}
2568
2569// TODO: Copied from DAG implementation. Verify logic and document how this
2570// actually works.
2574 bool Signed) const {
2575
2576 Register Dst = MI.getOperand(0).getReg();
2577 Register Src = MI.getOperand(1).getReg();
2578
2579 const LLT S64 = LLT::scalar(64);
2580 const LLT S32 = LLT::scalar(32);
2581
2582 const LLT SrcLT = MRI.getType(Src);
2583 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2584
2585 unsigned Flags = MI.getFlags();
2586
2587 // The basic idea of converting a floating point number into a pair of 32-bit
2588 // integers is illustrated as follows:
2589 //
2590 // tf := trunc(val);
2591 // hif := floor(tf * 2^-32);
2592 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2593 // hi := fptoi(hif);
2594 // lo := fptoi(lof);
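// For example, val = 2^33 + 4: tf = 2^33 + 4, hif = floor(tf * 2^-32) = 2,
// lof = 4, so hi = 2 and lo = 4, and {lo, hi} reassembles the 64-bit
// integer 2 * 2^32 + 4.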
2595 //
2596 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2598 if (Signed && SrcLT == S32) {
2599 // However, a 32-bit floating point number has only a 23-bit mantissa,
2600 // which is not enough to hold all the significant bits of `lof` if val is
2601 // negative. To avoid the loss of precision, we need to take the absolute
2602 // value after truncating and flip the result back based on the original
2603 // signedness.
2604 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2605 Trunc = B.buildFAbs(S32, Trunc, Flags);
2606 }
2607 MachineInstrBuilder K0, K1;
2608 if (SrcLT == S64) {
2609 K0 = B.buildFConstant(
2610 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2611 K1 = B.buildFConstant(
2612 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2613 } else {
2614 K0 = B.buildFConstant(
2615 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2616 K1 = B.buildFConstant(
2617 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2618 }
2619
2620 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2621 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2622 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2623
2624 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2625 : B.buildFPTOUI(S32, FloorMul);
2626 auto Lo = B.buildFPTOUI(S32, Fma);
2627
2628 if (Signed && SrcLT == S32) {
2629 // Flip the result based on the signedness, which is either all 0s or 1s.
2630 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2631 // r := xor({lo, hi}, sign) - sign;
2632 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2633 Sign);
2634 } else
2635 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2636 MI.eraseFromParent();
2637
2638 return true;
2639}
2640
2642 MachineInstr &MI) const {
2643 MachineFunction &MF = Helper.MIRBuilder.getMF();
2645
2646 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2647 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2648
2649 // With ieee_mode disabled, the instructions have the correct behavior
2650 // already for G_FMINNUM/G_FMAXNUM
2651 if (!MFI->getMode().IEEE)
2652 return !IsIEEEOp;
2653
2654 if (IsIEEEOp)
2655 return true;
2656
2658}
2659
2662 MachineIRBuilder &B) const {
2663 // TODO: Should move some of this into LegalizerHelper.
2664
2665 // TODO: Promote dynamic indexing of s16 to s32
2666
2667 Register Dst = MI.getOperand(0).getReg();
2668 Register Vec = MI.getOperand(1).getReg();
2669
2670 LLT VecTy = MRI.getType(Vec);
2671 LLT EltTy = VecTy.getElementType();
2672 assert(EltTy == MRI.getType(Dst));
2673
2674 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2675 // but we can't go directly to that logic because you can't bitcast a vector
2676 // of pointers to a vector of integers. Therefore, introduce an intermediate
2677 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2678 // drive the legalization forward.
2679 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2680 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2681 LLT IntVecTy = VecTy.changeElementType(IntTy);
2682
2683 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2684 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2685 B.buildIntToPtr(Dst, IntElt);
2686
2687 MI.eraseFromParent();
2688 return true;
2689 }
2690
2691 // FIXME: Artifact combiner probably should have replaced the truncated
2692 // constant before this, so we shouldn't need
2693 // getIConstantVRegValWithLookThrough.
2694 std::optional<ValueAndVReg> MaybeIdxVal =
2695 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2696 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2697 return true;
2698 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2699
2700 if (IdxVal < VecTy.getNumElements()) {
2701 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2702 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2703 } else {
2704 B.buildUndef(Dst);
2705 }
2706
2707 MI.eraseFromParent();
2708 return true;
2709}
2710
2713 MachineIRBuilder &B) const {
2714 // TODO: Should move some of this into LegalizerHelper.
2715
2716 // TODO: Promote dynamic indexing of s16 to s32
2717
2718 Register Dst = MI.getOperand(0).getReg();
2719 Register Vec = MI.getOperand(1).getReg();
2720 Register Ins = MI.getOperand(2).getReg();
2721
2722 LLT VecTy = MRI.getType(Vec);
2723 LLT EltTy = VecTy.getElementType();
2724 assert(EltTy == MRI.getType(Ins));
2725
2726 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2727 // but we can't go directly to that logic because you can't bitcast a vector
2728 // of pointers to a vector of integers. Therefore, make the pointer vector
2729 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2730 // new value, and then inttoptr the result vector back. This will then allow
2731 // the rest of legalization to take over.
2732 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2733 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2734 LLT IntVecTy = VecTy.changeElementType(IntTy);
2735
2736 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2737 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2738 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2739 MI.getOperand(3));
2740 B.buildIntToPtr(Dst, IntVecDest);
2741 MI.eraseFromParent();
2742 return true;
2743 }
2744
2745 // FIXME: Artifact combiner probably should have replaced the truncated
2746 // constant before this, so we shouldn't need
2747 // getIConstantVRegValWithLookThrough.
2748 std::optional<ValueAndVReg> MaybeIdxVal =
2749 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2750 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2751 return true;
2752
2753 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2754
2755 unsigned NumElts = VecTy.getNumElements();
2756 if (IdxVal < NumElts) {
2758 for (unsigned i = 0; i < NumElts; ++i)
2759 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2760 B.buildUnmerge(SrcRegs, Vec);
2761
2762 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2763 B.buildMergeLikeInstr(Dst, SrcRegs);
2764 } else {
2765 B.buildUndef(Dst);
2766 }
2767
2768 MI.eraseFromParent();
2769 return true;
2770}
2771
2774 MachineIRBuilder &B) const {
2775
2776 Register DstReg = MI.getOperand(0).getReg();
2777 Register SrcReg = MI.getOperand(1).getReg();
2778 LLT Ty = MRI.getType(DstReg);
2779 unsigned Flags = MI.getFlags();
2780
2781 Register TrigVal;
2782 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2783 if (ST.hasTrigReducedRange()) {
2784 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2785 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2786 .addUse(MulVal.getReg(0))
2787 .setMIFlags(Flags)
2788 .getReg(0);
2789 } else
2790 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2791
2792 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2793 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2794 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2795 .addUse(TrigVal)
2796 .setMIFlags(Flags);
2797 MI.eraseFromParent();
2798 return true;
2799}
2800
2803 const GlobalValue *GV,
2804 int64_t Offset,
2805 unsigned GAFlags) const {
2806 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2807 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2808 // to the following code sequence:
2809 //
2810 // For constant address space:
2811 // s_getpc_b64 s[0:1]
2812 // s_add_u32 s0, s0, $symbol
2813 // s_addc_u32 s1, s1, 0
2814 //
2815 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2816 // a fixup or relocation is emitted to replace $symbol with a literal
2817 // constant, which is a pc-relative offset from the encoding of the $symbol
2818 // operand to the global variable.
2819 //
2820 // For global address space:
2821 // s_getpc_b64 s[0:1]
2822 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2823 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2824 //
2825 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2826 // fixups or relocations are emitted to replace $symbol@*@lo and
2827 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2828 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2829 // operand to the global variable.
2830
2832
2833 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2834 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2835
2836 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2837 .addDef(PCReg);
2838
2839 MIB.addGlobalAddress(GV, Offset, GAFlags);
2840 if (GAFlags == SIInstrInfo::MO_NONE)
2841 MIB.addImm(0);
2842 else
2843 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
2844
2845 if (!B.getMRI()->getRegClassOrNull(PCReg))
2846 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2847
2848 if (PtrTy.getSizeInBits() == 32)
2849 B.buildExtract(DstReg, PCReg, 0);
2850 return true;
2851}
2852
2853// Emit a ABS32_LO / ABS32_HI relocation stub.
2855 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2856 MachineRegisterInfo &MRI) const {
2857 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2858
2859 LLT S32 = LLT::scalar(32);
2860
2861 // Use the destination directly, if and only if we store the lower address
2862 // part only and we don't have a register class being set.
2863 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2864 ? DstReg
2865 : MRI.createGenericVirtualRegister(S32);
2866
2867 if (!MRI.getRegClassOrNull(AddrLo))
2868 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2869
2870 // Write the lower half.
2871 B.buildInstr(AMDGPU::S_MOV_B32)
2872 .addDef(AddrLo)
2873 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2874
2875 // If required, write the upper half as well.
2876 if (RequiresHighHalf) {
2877 assert(PtrTy.getSizeInBits() == 64 &&
2878 "Must provide a 64-bit pointer type!");
2879
2880 Register AddrHi = MRI.createGenericVirtualRegister(S32);
2881 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2882
2883 B.buildInstr(AMDGPU::S_MOV_B32)
2884 .addDef(AddrHi)
2885 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2886
2887 // Use the destination directly, if and only if we don't have a register
2888 // class being set.
2889 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2890 ? DstReg
2891 : MRI.createGenericVirtualRegister(LLT::scalar(64));
2892
2893 if (!MRI.getRegClassOrNull(AddrDst))
2894 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2895
2896 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2897
2898 // If we created a new register for the destination, cast the result into
2899 // the final output.
2900 if (AddrDst != DstReg)
2901 B.buildCast(DstReg, AddrDst);
2902 } else if (AddrLo != DstReg) {
2903 // If we created a new register for the destination, cast the result into
2904 // the final output.
2905 B.buildCast(DstReg, AddrLo);
2906 }
2907}
2908
2911 MachineIRBuilder &B) const {
2912 Register DstReg = MI.getOperand(0).getReg();
2913 LLT Ty = MRI.getType(DstReg);
2914 unsigned AS = Ty.getAddressSpace();
2915
2916 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2917 MachineFunction &MF = B.getMF();
2919
2921 if (!MFI->isModuleEntryFunction() &&
2922 GV->getName() != "llvm.amdgcn.module.lds") {
2923 const Function &Fn = MF.getFunction();
2924 DiagnosticInfoUnsupported BadLDSDecl(
2925 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2926 DS_Warning);
2927 Fn.getContext().diagnose(BadLDSDecl);
2928
2929 // We currently don't have a way to correctly allocate LDS objects that
2930 // aren't directly associated with a kernel. We do force inlining of
2931 // functions that use local objects. However, if these dead functions are
2932 // not eliminated, we don't want a compile time error. Just emit a warning
2933 // and a trap, since there should be no callable path here.
2934 B.buildTrap();
2935 B.buildUndef(DstReg);
2936 MI.eraseFromParent();
2937 return true;
2938 }
2939
2940 // TODO: We could emit code to handle the initialization somewhere.
2941 // We ignore the initializer for now and legalize it to allow selection.
2942 // The initializer will be diagnosed as an error during assembly emission anyway.
2943 const SITargetLowering *TLI = ST.getTargetLowering();
2944 if (!TLI->shouldUseLDSConstAddress(GV)) {
2945 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2946 return true; // Leave in place;
2947 }
2948
2949 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2950 Type *Ty = GV->getValueType();
2951 // HIP uses an unsized array `extern __shared__ T s[]` or similar
2952 // zero-sized type in other languages to declare the dynamic shared
2953 // memory whose size is not known at compile time. They will be
2954 // allocated by the runtime and placed directly after the statically
2955 // allocated ones. They all share the same offset.
2956 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2957 // Adjust alignment for that dynamic shared memory array.
2958 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
2959 LLT S32 = LLT::scalar(32);
2960 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
2961 B.buildIntToPtr(DstReg, Sz);
2962 MI.eraseFromParent();
2963 return true;
2964 }
2965 }
2966
2967 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2968 *cast<GlobalVariable>(GV)));
2969 MI.eraseFromParent();
2970 return true;
2971 }
2972
2973 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
2974 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
2975 MI.eraseFromParent();
2976 return true;
2977 }
2978
2979 const SITargetLowering *TLI = ST.getTargetLowering();
2980
2981 if (TLI->shouldEmitFixup(GV)) {
2982 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2983 MI.eraseFromParent();
2984 return true;
2985 }
2986
2987 if (TLI->shouldEmitPCReloc(GV)) {
2988 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2989 MI.eraseFromParent();
2990 return true;
2991 }
2992
2994 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2995
2996 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3001 LoadTy, Align(8));
3002
3003 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3004
3005 if (Ty.getSizeInBits() == 32) {
3006 // Truncate if this is a 32-bit constant address.
3007 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3008 B.buildExtract(DstReg, Load, 0);
3009 } else
3010 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3011
3012 MI.eraseFromParent();
3013 return true;
3014}
3015
3017 if (Ty.isVector())
3018 return Ty.changeElementCount(
3021}
3022
3024 MachineInstr &MI) const {
3025 MachineIRBuilder &B = Helper.MIRBuilder;
3026 MachineRegisterInfo &MRI = *B.getMRI();
3027 GISelChangeObserver &Observer = Helper.Observer;
3028
3029 Register PtrReg = MI.getOperand(1).getReg();
3030 LLT PtrTy = MRI.getType(PtrReg);
3031 unsigned AddrSpace = PtrTy.getAddressSpace();
3032
3033 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3035 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3036 Observer.changingInstr(MI);
3037 MI.getOperand(1).setReg(Cast.getReg(0));
3038 Observer.changedInstr(MI);
3039 return true;
3040 }
3041
3042 if (MI.getOpcode() != AMDGPU::G_LOAD)
3043 return false;
3044
3045 Register ValReg = MI.getOperand(0).getReg();
3046 LLT ValTy = MRI.getType(ValReg);
3047
3048 if (hasBufferRsrcWorkaround(ValTy)) {
3049 Observer.changingInstr(MI);
3051 Observer.changedInstr(MI);
3052 return true;
3053 }
3054
3055 MachineMemOperand *MMO = *MI.memoperands_begin();
3056 const unsigned ValSize = ValTy.getSizeInBits();
3057 const LLT MemTy = MMO->getMemoryType();
3058 const Align MemAlign = MMO->getAlign();
3059 const unsigned MemSize = MemTy.getSizeInBits();
3060 const uint64_t AlignInBits = 8 * MemAlign.value();
3061
3062 // Widen non-power-of-2 loads to the alignment if needed
3063 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3064 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
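// e.g. a 96-bit (<3 x s32>) memory access widens to 128 bits; the extra
// lane is dropped again below via G_EXTRACT or by trimming trailing
// vector elements.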
3065
3066 // This was already the correct extending load result type, so just adjust
3067 // the memory type.
3068 if (WideMemSize == ValSize) {
3069 MachineFunction &MF = B.getMF();
3070
3071 MachineMemOperand *WideMMO =
3072 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3073 Observer.changingInstr(MI);
3074 MI.setMemRefs(MF, {WideMMO});
3075 Observer.changedInstr(MI);
3076 return true;
3077 }
3078
3079 // Don't bother handling this edge case, which should probably never be produced.
3080 if (ValSize > WideMemSize)
3081 return false;
3082
3083 LLT WideTy = widenToNextPowerOf2(ValTy);
3084
3085 Register WideLoad;
3086 if (!WideTy.isVector()) {
3087 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3088 B.buildTrunc(ValReg, WideLoad).getReg(0);
3089 } else {
3090 // Extract the subvector.
3091
3092 if (isRegisterType(ValTy)) {
3093 // If this a case where G_EXTRACT is legal, use it.
3094 // (e.g. <3 x s32> -> <4 x s32>)
3095 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3096 B.buildExtract(ValReg, WideLoad, 0);
3097 } else {
3098 // For cases where the widened type isn't a nice register value, unmerge
3099 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3100 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3101 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3102 }
3103 }
3104
3105 MI.eraseFromParent();
3106 return true;
3107 }
3108
3109 return false;
3110}
3111
3113 MachineInstr &MI) const {
3114 MachineIRBuilder &B = Helper.MIRBuilder;
3115 MachineRegisterInfo &MRI = *B.getMRI();
3116 GISelChangeObserver &Observer = Helper.Observer;
3117
3118 Register DataReg = MI.getOperand(0).getReg();
3119 LLT DataTy = MRI.getType(DataReg);
3120
3121 if (hasBufferRsrcWorkaround(DataTy)) {
3122 Observer.changingInstr(MI);
3124 Observer.changedInstr(MI);
3125 return true;
3126 }
3127 return false;
3128}
3129
3132 MachineIRBuilder &B) const {
3133 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3134 assert(Ty.isScalar());
3135
3136 MachineFunction &MF = B.getMF();
3138
3139 // TODO: Always legal with future ftz flag.
3140 // FIXME: Do we need just output?
3141 if (Ty == LLT::float32() &&
3143 return true;
3144 if (Ty == LLT::float16() &&
3146 return true;
3147
3148 MachineIRBuilder HelperBuilder(MI);
3149 GISelObserverWrapper DummyObserver;
3150 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3151 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3152}
3153
3156 Register DstReg = MI.getOperand(0).getReg();
3157 Register PtrReg = MI.getOperand(1).getReg();
3158 Register CmpVal = MI.getOperand(2).getReg();
3159 Register NewVal = MI.getOperand(3).getReg();
3160
3161 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3162 "this should not have been custom lowered");
3163
3164 LLT ValTy = MRI.getType(CmpVal);
3165 LLT VecTy = LLT::fixed_vector(2, ValTy);
3166
3167 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3168
3169 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3170 .addDef(DstReg)
3171 .addUse(PtrReg)
3172 .addUse(PackedVal)
3173 .setMemRefs(MI.memoperands());
3174
3175 MI.eraseFromParent();
3176 return true;
3177}
3178
3179/// Return true if it's known that \p Src can never be an f32 denormal value.
3181 Register Src) {
3182 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3183 switch (DefMI->getOpcode()) {
3184 case TargetOpcode::G_INTRINSIC: {
3185 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3186 case Intrinsic::amdgcn_frexp_mant:
3187 return true;
3188 default:
3189 break;
3190 }
3191
3192 break;
3193 }
3194 case TargetOpcode::G_FFREXP: {
3195 if (DefMI->getOperand(0).getReg() == Src)
3196 return true;
3197 break;
3198 }
3199 case TargetOpcode::G_FPEXT: {
3200 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3201 }
3202 default:
3203 return false;
3204 }
3205
3206 return false;
3207}
3208
3209static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3210 if (Flags & MachineInstr::FmAfn)
3211 return true;
3212 const auto &Options = MF.getTarget().Options;
3213 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3214}
3215
3217 unsigned Flags) {
3218 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3221}
3222
3223std::pair<Register, Register>
3225 unsigned Flags) const {
3226 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3227 return {};
3228
3229 const LLT F32 = LLT::scalar(32);
3230 auto SmallestNormal = B.buildFConstant(
3232 auto IsLtSmallestNormal =
3233 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3234
3235 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3236 auto One = B.buildFConstant(F32, 1.0);
3237 auto ScaleFactor =
3238 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3239 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3240
3241 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3242}
3243
3245 MachineIRBuilder &B) const {
3246 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3247 // If we have to handle denormals, scale up the input and adjust the result.
3248
3249 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3250 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
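// This works because log2(x * 2^32) = log2(x) + 32, so subtracting 32 from
// the result of the scaled input recovers log2 of the original value.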
3251
3252 Register Dst = MI.getOperand(0).getReg();
3253 Register Src = MI.getOperand(1).getReg();
3254 LLT Ty = B.getMRI()->getType(Dst);
3255 unsigned Flags = MI.getFlags();
3256
3257 if (Ty == LLT::scalar(16)) {
3258 const LLT F32 = LLT::scalar(32);
3259 // Nothing in half is a denormal when promoted to f32.
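// (The smallest f16 denormal, 2^-24, is still well above the smallest f32
// normal, 2^-126, so the promoted value is always a normal f32.)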
3260 auto Ext = B.buildFPExt(F32, Src, Flags);
3261 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3262 .addUse(Ext.getReg(0))
3263 .setMIFlags(Flags);
3264 B.buildFPTrunc(Dst, Log2, Flags);
3265 MI.eraseFromParent();
3266 return true;
3267 }
3268
3269 assert(Ty == LLT::scalar(32));
3270
3271 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3272 if (!ScaledInput) {
3273 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3274 .addUse(Src)
3275 .setMIFlags(Flags);
3276 MI.eraseFromParent();
3277 return true;
3278 }
3279
3280 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3281 .addUse(ScaledInput)
3282 .setMIFlags(Flags);
3283
3284 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3285 auto Zero = B.buildFConstant(Ty, 0.0);
3286 auto ResultOffset =
3287 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3288 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3289
3290 MI.eraseFromParent();
3291 return true;
3292}
3293
3295 Register Z, unsigned Flags) {
3296 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3297 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3298}
3299
3301 MachineIRBuilder &B) const {
3302 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3303 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3304
3305 MachineRegisterInfo &MRI = *B.getMRI();
3306 Register Dst = MI.getOperand(0).getReg();
3307 Register X = MI.getOperand(1).getReg();
3308 unsigned Flags = MI.getFlags();
3309 const LLT Ty = MRI.getType(X);
3310 MachineFunction &MF = B.getMF();
3311
3312 const LLT F32 = LLT::scalar(32);
3313 const LLT F16 = LLT::scalar(16);
3314
3315 const AMDGPUTargetMachine &TM =
3316 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3317
3318 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3319 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3320 if (Ty == F16 && !ST.has16BitInsts()) {
3321 Register LogVal = MRI.createGenericVirtualRegister(F32);
3322 auto PromoteSrc = B.buildFPExt(F32, X);
3323 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3324 B.buildFPTrunc(Dst, LogVal);
3325 } else {
3326 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3327 }
3328
3329 MI.eraseFromParent();
3330 return true;
3331 }
3332
3333 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3334 if (ScaledInput)
3335 X = ScaledInput;
3336
3337 auto Y =
3338 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3339
3340 Register R;
3341 if (ST.hasFastFMAF32()) {
3343 // c + cc is ln(2)/ln(10) to more than 49 bits
3343 const float c_log10 = 0x1.344134p-2f;
3344 const float cc_log10 = 0x1.09f79ep-26f;
3345
3346 // c + cc is ln(2) to more than 49 bits
3347 const float c_log = 0x1.62e42ep-1f;
3348 const float cc_log = 0x1.efa39ep-25f;
3349
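// Compute y*(c + cc) in extended precision: r = y*c, then FMAs recover the
// rounding error of that product (fma(y, c, -r)) and fold in the low-order
// term y*cc before adding everything back into r.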
3350 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3351 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3352
3353 R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3354 auto NegR = B.buildFNeg(Ty, R, Flags);
3355 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3356 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3357 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3358 } else {
3359 // ch+ct is ln(2)/ln(10) to more than 36 bits
3360 const float ch_log10 = 0x1.344000p-2f;
3361 const float ct_log10 = 0x1.3509f6p-18f;
3362
3363 // ch + ct is ln(2) to more than 36 bits
3364 const float ch_log = 0x1.62e000p-1f;
3365 const float ct_log = 0x1.0bfbe8p-15f;
3366
3367 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3368 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3369
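// Without fast FMA, split y with a mask that clears the low 12 mantissa
// bits: yh = y & 0xfffff000 and yt = y - yh. Products of these short parts
// with the similarly short constants ch/ct lose little precision when
// evaluated as separate mul+add steps.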
3370 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3371 auto YH = B.buildAnd(Ty, Y, MaskConst);
3372 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3373 auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3374
3375 Register Mad0 =
3376 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3377 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3378 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3379 }
3380
3381 const bool IsFiniteOnly =
3382 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3383 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3384
3385 if (!IsFiniteOnly) {
3386 // Expand isfinite(x) => fabs(x) < inf
3387 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3388 auto Fabs = B.buildFAbs(Ty, Y);
3389 auto IsFinite =
3390 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3391 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3392 }
3393
3394 if (ScaledInput) {
3395 auto Zero = B.buildFConstant(Ty, 0.0);
3396 auto ShiftK =
3397 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3398 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3399 B.buildFSub(Dst, R, Shift, Flags);
3400 } else {
3401 B.buildCopy(Dst, R);
3402 }
3403
3404 MI.eraseFromParent();
3405 return true;
3406}
3407
3409 Register Src, bool IsLog10,
3410 unsigned Flags) const {
3411 const double Log2BaseInverted =
3413
3414 LLT Ty = B.getMRI()->getType(Dst);
3415
3416 if (Ty == LLT::scalar(32)) {
3417 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3418 if (ScaledInput) {
3419 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3420 .addUse(Src)
3421 .setMIFlags(Flags);
3422 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3423 auto Zero = B.buildFConstant(Ty, 0.0);
3424 auto ResultOffset =
3425 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3426 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3427
3428 if (ST.hasFastFMAF32())
3429 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3430 else {
3431 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3432 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3433 }
3434
3435 return true;
3436 }
3437 }
3438
3439 auto Log2Operand = Ty == LLT::scalar(16)
3440 ? B.buildFLog2(Ty, Src, Flags)
3441 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3442 .addUse(Src)
3443 .setMIFlags(Flags);
3444 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3445 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3446 return true;
3447}
3448
3450 MachineIRBuilder &B) const {
3451 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3452 // If we have to handle denormals, scale up the input and adjust the result.
3453
3454 Register Dst = MI.getOperand(0).getReg();
3455 Register Src = MI.getOperand(1).getReg();
3456 unsigned Flags = MI.getFlags();
3457 LLT Ty = B.getMRI()->getType(Dst);
3458 const LLT F16 = LLT::scalar(16);
3459 const LLT F32 = LLT::scalar(32);
3460
3461 if (Ty == F16) {
3462 // Nothing in half is a denormal when promoted to f32.
3463 auto Ext = B.buildFPExt(F32, Src, Flags);
3464 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3465 .addUse(Ext.getReg(0))
3466 .setMIFlags(Flags);
3467 B.buildFPTrunc(Dst, Log2, Flags);
3468 MI.eraseFromParent();
3469 return true;
3470 }
3471
3472 assert(Ty == F32);
3473
3474 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3475 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3476 .addUse(Src)
3477 .setMIFlags(Flags);
3478 MI.eraseFromParent();
3479 return true;
3480 }
3481
3482 // bool needs_scaling = x < -0x1.f80000p+6f;
3483 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
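// This relies on 2^x == 2^(x + 64) * 2^-64: inputs small enough that the
// plain result would be an f32 denormal are biased up by 64 before the
// hardware exp2, and the result is scaled back down by 2^-64.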
3484
3485 // -nextafter(128.0, -1)
3486 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3487 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3488 RangeCheckConst, Flags);
3489
3490 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3491 auto Zero = B.buildFConstant(Ty, 0.0);
3492 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3493 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3494
3495 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3496 .addUse(AddInput.getReg(0))
3497 .setMIFlags(Flags);
3498
3499 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3500 auto One = B.buildFConstant(Ty, 1.0);
3501 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3502 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3503 MI.eraseFromParent();
3504 return true;
3505}
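// A minimal sketch of the denormal workaround above (illustration only):
//   exp2(x) = exp2(x + 64) * 2^-64   whenever x < -126.0 (-0x1.f80000p+6f)
// i.e. inputs whose result would be denormal are shifted into the range where
// v_exp_f32 is usable, and the result is rescaled afterwards.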
3506
3507 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3508 Register X, unsigned Flags) const {
3509 LLT Ty = B.getMRI()->getType(Dst);
3510 LLT F32 = LLT::scalar(32);
3511
3512 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3513 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3514 auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3515
3516 if (Ty == F32) {
3517 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3518 .addUse(Mul.getReg(0))
3519 .setMIFlags(Flags);
3520 } else {
3521 B.buildFExp2(Dst, Mul.getReg(0), Flags);
3522 }
3523
3524 return true;
3525 }
3526
3527 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3528 auto NeedsScaling =
3529 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3530 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3531 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3532 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3533
3534 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3535 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3536
3537 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3538 .addUse(ExpInput.getReg(0))
3539 .setMIFlags(Flags);
3540
3541 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3542 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3543 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3544 return true;
3545}
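// Sketch of the constants used above (assumed rationale, for illustration):
//   e^x = exp2(x * log2(e))
// The threshold -0x1.5d58a0p+6f is approximately ln(2^-126), below which the
// result would be denormal; in that case x is offset by +64 and the result is
// multiplied by 0x1.969d48p-93f (approximately e^-64) to undo the offset.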
3546
3547 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3548 MachineIRBuilder &B) const {
3549 Register Dst = MI.getOperand(0).getReg();
3550 Register X = MI.getOperand(1).getReg();
3551 const unsigned Flags = MI.getFlags();
3552 MachineFunction &MF = B.getMF();
3553 MachineRegisterInfo &MRI = *B.getMRI();
3554 LLT Ty = MRI.getType(Dst);
3555 const LLT F16 = LLT::scalar(16);
3556 const LLT F32 = LLT::scalar(32);
3557 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3558
3559 if (Ty == F16) {
3560 // v_exp_f16 (fmul x, log2e)
3561 if (allowApproxFunc(MF, Flags)) {
3562 // TODO: Does this really require fast?
3563 legalizeFExpUnsafe(B, Dst, X, Flags);
3564 MI.eraseFromParent();
3565 return true;
3566 }
3567
3568 // exp(f16 x) ->
3569 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3570
3571 // Nothing in half is a denormal when promoted to f32.
3572 auto Ext = B.buildFPExt(F32, X, Flags);
3573 Register Lowered = MRI.createGenericVirtualRegister(F32);
3574 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3575 B.buildFPTrunc(Dst, Lowered, Flags);
3576 MI.eraseFromParent();
3577 return true;
3578 }
3579
3580 assert(Ty == F32);
3581
3582 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3583 // library behavior. Also, is known-not-daz source sufficient?
3584 if (allowApproxFunc(MF, Flags)) {
3585 legalizeFExpUnsafe(B, Dst, X, Flags);
3586 MI.eraseFromParent();
3587 return true;
3588 }
3589
3590 // Algorithm:
3591 //
3592 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3593 //
3594 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3595 // n = 64*m + j, 0 <= j < 64
3596 //
3597 // e^x = 2^((64*m + j + f)/64)
3598 // = (2^m) * (2^(j/64)) * 2^(f/64)
3599 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3600 //
3601 // f = x*(64/ln(2)) - n
3602 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3603 //
3604 // e^x = (2^m) * (2^(j/64)) * e^r
3605 //
3606 // (2^(j/64)) is precomputed
3607 //
3608 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3609 // e^r = 1 + q
3610 //
3611 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3612 //
3613 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3614 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3615 Register PH, PL;
3616
3617 if (ST.hasFastFMAF32()) {
3618 const float c_exp = numbers::log2ef;
3619 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3620 const float c_exp10 = 0x1.a934f0p+1f;
3621 const float cc_exp10 = 0x1.2f346ep-24f;
3622
3623 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3624 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3625 auto NegPH = B.buildFNeg(Ty, PH, Flags);
3626 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3627
3628 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3629 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3630 } else {
3631 const float ch_exp = 0x1.714000p+0f;
3632 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3633
3634 const float ch_exp10 = 0x1.a92000p+1f;
3635 const float cl_exp10 = 0x1.4f0978p-11f;
3636
3637 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3638 auto XH = B.buildAnd(Ty, X, MaskConst);
3639 auto XL = B.buildFSub(Ty, X, XH, Flags);
3640
3641 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3642 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3643
3644 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3645 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3646
3647 Register Mad0 =
3648 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3649 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3650 }
3651
3652 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3653
3654 // It is unsafe to contract this fsub into the PH multiply.
3655 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3656 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3657 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3658
3659 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3660 .addUse(A.getReg(0))
3661 .setMIFlags(Flags);
3662 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3663
3664 auto UnderflowCheckConst =
3665 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3666 auto Zero = B.buildFConstant(Ty, 0.0);
3667 auto Underflow =
3668 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3669
3670 R = B.buildSelect(Ty, Underflow, Zero, R);
3671
3672 const auto &Options = MF.getTarget().Options;
3673
3674 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3675 auto OverflowCheckConst =
3676 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3677
3678 auto Overflow =
3679 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3680 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3681 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3682 }
3683
3684 B.buildCopy(Dst, R);
3685 MI.eraseFromParent();
3686 return true;
3687}
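// A minimal host-side sketch of the f32 G_FEXP range reduction above, assuming
// a hypothetical exp2_hw() in place of v_exp_f32 and using the non-FMA
// constant split (illustration only, not part of this lowering):
//   float exp_sketch(float x) {
//     const float ch = 0x1.714000p+0f, cl = 0x1.47652ap-12f; // log2(e) hi/lo
//     float xh = llvm::bit_cast<float>(llvm::bit_cast<uint32_t>(x) & 0xfffff000);
//     float xl = x - xh;
//     float ph = xh * ch;
//     float pl = xh * cl + (xl * ch + xl * cl);
//     float e = roundevenf(ph);
//     float r = ldexpf(exp2_hw((ph - e) + pl), (int)e);
//     if (x < -0x1.9d1da0p+6f) r = 0.0f;      // underflow clamp, as above
//     if (x > 0x1.62e430p+6f) r = INFINITY;   // overflow clamp (skipped if ninf)
//     return r;
//   }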
3688
3689 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3690 MachineIRBuilder &B) const {
3691 Register Dst = MI.getOperand(0).getReg();
3692 Register Src0 = MI.getOperand(1).getReg();
3693 Register Src1 = MI.getOperand(2).getReg();
3694 unsigned Flags = MI.getFlags();
3695 LLT Ty = B.getMRI()->getType(Dst);
3696 const LLT F16 = LLT::float16();
3697 const LLT F32 = LLT::float32();
3698
3699 if (Ty == F32) {
3700 auto Log = B.buildFLog2(F32, Src0, Flags);
3701 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3702 .addUse(Log.getReg(0))
3703 .addUse(Src1)
3704 .setMIFlags(Flags);
3705 B.buildFExp2(Dst, Mul, Flags);
3706 } else if (Ty == F16) {
3707 // There's no f16 fmul_legacy, so we need to convert for it.
3708 auto Log = B.buildFLog2(F16, Src0, Flags);
3709 auto Ext0 = B.buildFPExt(F32, Log, Flags);
3710 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3711 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3712 .addUse(Ext0.getReg(0))
3713 .addUse(Ext1.getReg(0))
3714 .setMIFlags(Flags);
3715 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3716 } else
3717 return false;
3718
3719 MI.eraseFromParent();
3720 return true;
3721}
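// Illustrative note: the lowering above uses the identity
//   pow(x, y) = exp2(y * log2(x)),
// presumably with amdgcn_fmul_legacy so that 0 * inf multiplies to 0 (e.g. so
// pow(1.0, inf) stays 1.0 rather than becoming NaN); the f16 form round-trips
// through f32 only because fmul_legacy has no f16 variant.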
3722
3723// Find a source register, ignoring any possible source modifiers.
3724 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3725 Register ModSrc = OrigSrc;
3726 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3727 ModSrc = SrcFNeg->getOperand(1).getReg();
3728 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3729 ModSrc = SrcFAbs->getOperand(1).getReg();
3730 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3731 ModSrc = SrcFAbs->getOperand(1).getReg();
3732 return ModSrc;
3733}
3734
3735 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3736 MachineRegisterInfo &MRI,
3737 MachineIRBuilder &B) const {
3738
3739 const LLT S1 = LLT::scalar(1);
3740 const LLT F64 = LLT::float64();
3741 Register Dst = MI.getOperand(0).getReg();
3742 Register OrigSrc = MI.getOperand(1).getReg();
3743 unsigned Flags = MI.getFlags();
3744 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3745 "this should not have been custom lowered");
3746
3747 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3748 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3749 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3750 // V_FRACT bug is:
3751 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3752 //
3753 // Convert floor(x) to (x - fract(x))
3754
3755 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3756 .addUse(OrigSrc)
3757 .setMIFlags(Flags);
3758
3759 // Give source modifier matching some assistance before obscuring a foldable
3760 // pattern.
3761
3762 // TODO: We can avoid the neg on the fract? The input sign to fract
3763 // shouldn't matter?
3764 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3765
3766 auto Const =
3767 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3768
3769 Register Min = MRI.createGenericVirtualRegister(F64);
3770
3771 // We don't need to concern ourselves with the snan handling difference, so
3772 // use the one which will directly select.
3773 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3774 if (MFI->getMode().IEEE)
3775 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3776 else
3777 B.buildFMinNum(Min, Fract, Const, Flags);
3778
3779 Register CorrectedFract = Min;
3780 if (!MI.getFlag(MachineInstr::FmNoNans)) {
3781 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3782 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3783 }
3784
3785 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3786 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3787
3788 MI.eraseFromParent();
3789 return true;
3790}
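// A short sketch of the rewrite above (illustration only):
//   floor(x) = x - fract(x)
//   fract(x) = min(v_fract_f64(x), 0x1.fffffffffffffp-1)  // largest double < 1.0
// with a NaN-propagating select when nnan is not set, working around the SI
// V_FRACT_F64 bug described in the comment above.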
3791
3792// Turn an illegal packed v2s16 build vector into bit operations.
3793// TODO: This should probably be a bitcast action in LegalizerHelper.
3794 bool AMDGPULegalizerInfo::legalizeBuildVector(
3795 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3796 Register Dst = MI.getOperand(0).getReg();
3797 const LLT S32 = LLT::scalar(32);
3798 const LLT S16 = LLT::scalar(16);
3799 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3800
3801 Register Src0 = MI.getOperand(1).getReg();
3802 Register Src1 = MI.getOperand(2).getReg();
3803
3804 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3805 assert(MRI.getType(Src0) == S32);
3806 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3807 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3808 }
3809
3810 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3811 B.buildBitcast(Dst, Merge);
3812
3813 MI.eraseFromParent();
3814 return true;
3815}
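// Illustrative sketch: a packed <2 x s16> build_vector (lo, hi) becomes a
// 32-bit merge that is then reinterpreted, roughly
//   bits32 = zext(lo) | (zext(hi) << 16);  v2s16 = bitcast(bits32)
// and G_BUILD_VECTOR_TRUNC first truncates its s32 inputs to s16.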
3816
3817// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3818//
3819// Source and accumulation registers must all be 32-bits.
3820//
3821// TODO: When the multiply is uniform, we should produce a code sequence
3822// that is better suited to instruction selection on the SALU. Instead of
3823// the outer loop going over parts of the result, the outer loop should go
3824// over parts of one of the factors. This should result in instruction
3825// selection that makes full use of S_ADDC_U32 instructions.
3826 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3827 MutableArrayRef<Register> Accum,
3828 ArrayRef<Register> Src0,
3829 ArrayRef<Register> Src1,
3830 bool UsePartialMad64_32,
3831 bool SeparateOddAlignedProducts) const {
3832 // Use (possibly empty) vectors of S1 registers to represent the set of
3833 // carries from one pair of positions to the next.
3834 using Carry = SmallVector<Register, 2>;
3835
3836 MachineIRBuilder &B = Helper.MIRBuilder;
3837 GISelKnownBits &KB = *Helper.getKnownBits();
3838
3839 const LLT S1 = LLT::scalar(1);
3840 const LLT S32 = LLT::scalar(32);
3841 const LLT S64 = LLT::scalar(64);
3842
3843 Register Zero32;
3844 Register Zero64;
3845
3846 auto getZero32 = [&]() -> Register {
3847 if (!Zero32)
3848 Zero32 = B.buildConstant(S32, 0).getReg(0);
3849 return Zero32;
3850 };
3851 auto getZero64 = [&]() -> Register {
3852 if (!Zero64)
3853 Zero64 = B.buildConstant(S64, 0).getReg(0);
3854 return Zero64;
3855 };
3856
3857 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3858 for (unsigned i = 0; i < Src0.size(); ++i) {
3859 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3860 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3861 }
3862
3863 // Merge the given carries into the 32-bit LocalAccum, which is modified
3864 // in-place.
3865 //
3866 // Returns the carry-out, which is a single S1 register or null.
3867 auto mergeCarry =
3868 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3869 if (CarryIn.empty())
3870 return Register();
3871
3872 bool HaveCarryOut = true;
3873 Register CarryAccum;
3874 if (CarryIn.size() == 1) {
3875 if (!LocalAccum) {
3876 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3877 return Register();
3878 }
3879
3880 CarryAccum = getZero32();
3881 } else {
3882 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3883 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3884 CarryAccum =
3885 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3886 .getReg(0);
3887 }
3888
3889 if (!LocalAccum) {
3890 LocalAccum = getZero32();
3891 HaveCarryOut = false;
3892 }
3893 }
3894
3895 auto Add =
3896 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3897 LocalAccum = Add.getReg(0);
3898 return HaveCarryOut ? Add.getReg(1) : Register();
3899 };
3900
3901 // Build a multiply-add chain to compute
3902 //
3903 // LocalAccum + (partial products at DstIndex)
3904 // + (opportunistic subset of CarryIn)
3905 //
3906 // LocalAccum is an array of one or two 32-bit registers that are updated
3907 // in-place. The incoming registers may be null.
3908 //
3909 // In some edge cases, carry-ins can be consumed "for free". In that case,
3910 // the consumed carry bits are removed from CarryIn in-place.
3911 auto buildMadChain =
3912 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3913 -> Carry {
3914 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3915 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3916
3917 Carry CarryOut;
3918 unsigned j0 = 0;
3919
3920 // Use plain 32-bit multiplication for the most significant part of the
3921 // result by default.
3922 if (LocalAccum.size() == 1 &&
3923 (!UsePartialMad64_32 || !CarryIn.empty())) {
3924 do {
3925 // Skip multiplication if one of the operands is 0
3926 unsigned j1 = DstIndex - j0;
3927 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3928 ++j0;
3929 continue;
3930 }
3931 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3932 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3933 LocalAccum[0] = Mul.getReg(0);
3934 } else {
3935 if (CarryIn.empty()) {
3936 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3937 } else {
3938 LocalAccum[0] =
3939 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3940 .getReg(0);
3941 CarryIn.pop_back();
3942 }
3943 }
3944 ++j0;
3945 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3946 }
3947
3948 // Build full 64-bit multiplies.
3949 if (j0 <= DstIndex) {
3950 bool HaveSmallAccum = false;
3951 Register Tmp;
3952
3953 if (LocalAccum[0]) {
3954 if (LocalAccum.size() == 1) {
3955 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3956 HaveSmallAccum = true;
3957 } else if (LocalAccum[1]) {
3958 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
3959 HaveSmallAccum = false;
3960 } else {
3961 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3962 HaveSmallAccum = true;
3963 }
3964 } else {
3965 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3966 Tmp = getZero64();
3967 HaveSmallAccum = true;
3968 }
3969
3970 do {
3971 unsigned j1 = DstIndex - j0;
3972 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3973 ++j0;
3974 continue;
3975 }
3976 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3977 {Src0[j0], Src1[j1], Tmp});
3978 Tmp = Mad.getReg(0);
3979 if (!HaveSmallAccum)
3980 CarryOut.push_back(Mad.getReg(1));
3981 HaveSmallAccum = false;
3982
3983 ++j0;
3984 } while (j0 <= DstIndex);
3985
3986 auto Unmerge = B.buildUnmerge(S32, Tmp);
3987 LocalAccum[0] = Unmerge.getReg(0);
3988 if (LocalAccum.size() > 1)
3989 LocalAccum[1] = Unmerge.getReg(1);
3990 }
3991
3992 return CarryOut;
3993 };
3994
3995 // Outer multiply loop, iterating over destination parts from least
3996 // significant to most significant parts.
3997 //
3998 // The columns of the following diagram correspond to the destination parts
3999 // affected by one iteration of the outer loop (ignoring boundary
4000 // conditions).
4001 //
4002 // Dest index relative to 2 * i: 1 0 -1
4003 // ------
4004 // Carries from previous iteration: e o
4005 // Even-aligned partial product sum: E E .
4006 // Odd-aligned partial product sum: O O
4007 //
4008 // 'o' is OddCarry, 'e' is EvenCarry.
4009 // EE and OO are computed from partial products via buildMadChain and use
4010 // accumulation where possible and appropriate.
4011 //
4012 Register SeparateOddCarry;
4013 Carry EvenCarry;
4014 Carry OddCarry;
4015
4016 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4017 Carry OddCarryIn = std::move(OddCarry);
4018 Carry EvenCarryIn = std::move(EvenCarry);
4019 OddCarry.clear();
4020 EvenCarry.clear();
4021
4022 // Partial products at offset 2 * i.
4023 if (2 * i < Accum.size()) {
4024 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4025 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4026 }
4027
4028 // Partial products at offset 2 * i - 1.
4029 if (i > 0) {
4030 if (!SeparateOddAlignedProducts) {
4031 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4032 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4033 } else {
4034 bool IsHighest = 2 * i >= Accum.size();
4035 Register SeparateOddOut[2];
4036 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4037 .take_front(IsHighest ? 1 : 2);
4038 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4039
4040 MachineInstr *Lo;
4041
4042 if (i == 1) {
4043 if (!IsHighest)
4044 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4045 else
4046 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4047 } else {
4048 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4049 SeparateOddCarry);
4050 }
4051 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4052
4053 if (!IsHighest) {
4054 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4055 Lo->getOperand(1).getReg());
4056 Accum[2 * i] = Hi.getReg(0);
4057 SeparateOddCarry = Hi.getReg(1);
4058 }
4059 }
4060 }
4061
4062 // Add in the carries from the previous iteration
4063 if (i > 0) {
4064 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4065 EvenCarryIn.push_back(CarryOut);
4066
4067 if (2 * i < Accum.size()) {
4068 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4069 OddCarry.push_back(CarryOut);
4070 }
4071 }
4072 }
4073}
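// Worked example (assumed operand sizes, for illustration): for a 64 x 64 ->
// 64 bit multiply the limbs are Src0 = {a0, a1}, Src1 = {b0, b1} and Accum has
// two parts, so the loops above materialize
//   Accum[0] = lo32(a0 * b0)
//   Accum[1] = hi32(a0 * b0) + lo32(a0 * b1) + lo32(a1 * b0)
// with the a0 * b0 column emitted as a single MAD_U64_U32 and the discarded
// higher columns never computed.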
4074
4075// Custom narrowing of wide multiplies using wide multiply-add instructions.
4076//
4077// TODO: If the multiply is followed by an addition, we should attempt to
4078// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4079 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4080 MachineInstr &MI) const {
4081 assert(ST.hasMad64_32());
4082 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4083
4084 MachineIRBuilder &B = Helper.MIRBuilder;
4085 MachineRegisterInfo &MRI = *B.getMRI();
4086
4087 Register DstReg = MI.getOperand(0).getReg();
4088 Register Src0 = MI.getOperand(1).getReg();
4089 Register Src1 = MI.getOperand(2).getReg();
4090
4091 LLT Ty = MRI.getType(DstReg);
4092 assert(Ty.isScalar());
4093
4094 unsigned Size = Ty.getSizeInBits();
4095 unsigned NumParts = Size / 32;
4096 assert((Size % 32) == 0);
4097 assert(NumParts >= 2);
4098
4099 // Whether to use MAD_64_32 for partial products whose high half is
4100 // discarded. This avoids some ADD instructions but risks false dependency
4101 // stalls on some subtargets in some cases.
4102 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4103
4104 // Whether to compute odd-aligned partial products separately. This is
4105 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4106 // in an even-aligned VGPR.
4107 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4108
4109 LLT S32 = LLT::scalar(32);
4110 SmallVector<Register, 2> Src0Parts, Src1Parts;
4111 for (unsigned i = 0; i < NumParts; ++i) {
4112 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4113 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4114 }
4115 B.buildUnmerge(Src0Parts, Src0);
4116 B.buildUnmerge(Src1Parts, Src1);
4117
4118 SmallVector<Register, 2> AccumRegs(NumParts);
4119 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4120 SeparateOddAlignedProducts);
4121
4122 B.buildMergeLikeInstr(DstReg, AccumRegs);
4123 MI.eraseFromParent();
4124 return true;
4125}
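// Usage note (illustrative): an s64 G_MUL is unmerged into two s32 parts per
// operand, multiplied with the schoolbook scheme above, and the two 32-bit
// results are re-merged into the s64 destination; s96/s128 multiplies follow
// the same pattern with three or four parts.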
4126
4127// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4128// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4129// case with a single min instruction instead of a compare+select.
4130 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4131 MachineRegisterInfo &MRI,
4132 MachineIRBuilder &B) const {
4133 Register Dst = MI.getOperand(0).getReg();
4134 Register Src = MI.getOperand(1).getReg();
4135 LLT DstTy = MRI.getType(Dst);
4136 LLT SrcTy = MRI.getType(Src);
4137
4138 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4139 ? AMDGPU::G_AMDGPU_FFBH_U32
4140 : AMDGPU::G_AMDGPU_FFBL_B32;
4141 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4142 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4143
4144 MI.eraseFromParent();
4145 return true;
4146}
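// Sketch of why the min suffices (illustration only): ffbh/ffbl return
// 0xffffffff (an unsigned value >= the bit width) for a zero input, so
//   ctlz(x) = umin(ffbh_u32(x), bitwidth(x))
// already yields the defined bit-width result for x == 0.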
4147
4148// Check that this is a G_XOR x, -1
4149static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4150 if (MI.getOpcode() != TargetOpcode::G_XOR)
4151 return false;
4152 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4153 return ConstVal && *ConstVal == -1;
4154}
4155
4156// Return the use branch instruction, otherwise null if the usage is invalid.
4157 static MachineInstr *
4158 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4159 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4160 Register CondDef = MI.getOperand(0).getReg();
4161 if (!MRI.hasOneNonDBGUse(CondDef))
4162 return nullptr;
4163
4164 MachineBasicBlock *Parent = MI.getParent();
4165 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4166
4167 if (isNot(MRI, *UseMI)) {
4168 Register NegatedCond = UseMI->getOperand(0).getReg();
4169 if (!MRI.hasOneNonDBGUse(NegatedCond))
4170 return nullptr;
4171
4172 // We're deleting the def of this value, so we need to remove it.
4173 eraseInstr(*UseMI, MRI);
4174
4175 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4176 Negated = true;
4177 }
4178
4179 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4180 return nullptr;
4181
4182 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4183 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4184 if (Next == Parent->end()) {
4185 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4186 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4187 return nullptr;
4188 UncondBrTarget = &*NextMBB;
4189 } else {
4190 if (Next->getOpcode() != AMDGPU::G_BR)
4191 return nullptr;
4192 Br = &*Next;
4193 UncondBrTarget = Br->getOperand(0).getMBB();
4194 }
4195
4196 return UseMI;
4197}
4198
4199 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4200 const ArgDescriptor *Arg,
4201 const TargetRegisterClass *ArgRC,
4202 LLT ArgTy) const {
4203 MCRegister SrcReg = Arg->getRegister();
4204 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4205 assert(DstReg.isVirtual() && "Virtual register expected");
4206
4207 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4208 *ArgRC, B.getDebugLoc(), ArgTy);
4209 if (Arg->isMasked()) {
4210 // TODO: Should we try to emit this once in the entry block?
4211 const LLT S32 = LLT::scalar(32);
4212 const unsigned Mask = Arg->getMask();
4213 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4214
4215 Register AndMaskSrc = LiveIn;
4216
4217 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4218 // 0.
4219 if (Shift != 0) {
4220 auto ShiftAmt = B.buildConstant(S32, Shift);
4221 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4222 }
4223
4224 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4225 } else {
4226 B.buildCopy(DstReg, LiveIn);
4227 }
4228
4229 return true;
4230}
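// Example of the masked case above (assumed packing, for illustration): a
// workitem ID packed at bits [19:10] of the incoming register is described by
// Mask = 0xffc00, so the code emits
//   %id = and (lshr %livein, 10), 0x3ff
// and a plain copy when the argument occupies the whole register.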
4231
4232 bool AMDGPULegalizerInfo::loadInputValue(
4233 Register DstReg, MachineIRBuilder &B,
4234 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4235 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4236 const ArgDescriptor *Arg = nullptr;
4237 const TargetRegisterClass *ArgRC;
4238 LLT ArgTy;
4239
4240 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4241 const ArgDescriptor WorkGroupIDX =
4242 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4243 // If GridZ is not programmed in an entry function then the hardware will set
4244 // it to all zeros, so there is no need to mask the GridY value in the low
4245 // order bits.
4246 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4247 AMDGPU::TTMP7,
4248 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4249 const ArgDescriptor WorkGroupIDZ =
4250 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4251 if (ST.hasArchitectedSGPRs() &&
4253 switch (ArgType) {
4254 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4255 Arg = &WorkGroupIDX;
4256 ArgRC = &AMDGPU::SReg_32RegClass;
4257 ArgTy = LLT::scalar(32);
4258 break;
4259 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4260 Arg = &WorkGroupIDY;
4261 ArgRC = &AMDGPU::SReg_32RegClass;
4262 ArgTy = LLT::scalar(32);
4263 break;
4264 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4265 Arg = &WorkGroupIDZ;
4266 ArgRC = &AMDGPU::SReg_32RegClass;
4267 ArgTy = LLT::scalar(32);
4268 break;
4269 default:
4270 break;
4271 }
4272 }
4273
4274 if (!Arg)
4275 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4276
4277 if (!Arg) {
4278 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4279 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4280 // case the pointer argument may be missing and we use null.
4281 B.buildConstant(DstReg, 0);
4282 return true;
4283 }
4284
4285 // It's undefined behavior if a function marked with the amdgpu-no-*
4286 // attributes uses the corresponding intrinsic.
4287 B.buildUndef(DstReg);
4288 return true;
4289 }
4290
4291 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4292 return false; // TODO: Handle these
4293 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4294}
4295
4296 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4297 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4298 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4299 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4300 return false;
4301
4302 MI.eraseFromParent();
4303 return true;
4304}
4305
4306 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4307 int64_t C) {
4308 B.buildConstant(MI.getOperand(0).getReg(), C);
4309 MI.eraseFromParent();
4310 return true;
4311}
4312
4313 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4314 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4315 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4316 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4317 if (MaxID == 0)
4318 return replaceWithConstant(B, MI, 0);
4319
4320 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4321 const ArgDescriptor *Arg;
4322 const TargetRegisterClass *ArgRC;
4323 LLT ArgTy;
4324 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4325
4326 Register DstReg = MI.getOperand(0).getReg();
4327 if (!Arg) {
4328 // It's undefined behavior if a function marked with the amdgpu-no-*
4329 // attributes uses the corresponding intrinsic.
4330 B.buildUndef(DstReg);
4331 MI.eraseFromParent();
4332 return true;
4333 }
4334
4335 if (Arg->isMasked()) {
4336 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4337 // masking operations anyway.
4338 //
4339 // TODO: We could assert the top bit is 0 for the source copy.
4340 if (!loadInputValue(DstReg, B, ArgType))
4341 return false;
4342 } else {
4343 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4344 if (!loadInputValue(TmpReg, B, ArgType))
4345 return false;
4346 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4347 }
4348
4349 MI.eraseFromParent();
4350 return true;
4351}
4352
4353 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4354 int64_t Offset) const {
4355 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4356 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4357
4358 // TODO: If we passed in the base kernel offset we could have a better
4359 // alignment than 4, but we don't really need it.
4360 if (!loadInputValue(KernArgReg, B,
4361 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4362 llvm_unreachable("failed to find kernarg segment ptr");
4363
4364 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4365 // TODO: Should get nuw
4366 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4367}
4368
4369/// Legalize a value that's loaded from kernel arguments. This is only used by
4370/// legacy intrinsics.
4371 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4372 MachineIRBuilder &B,
4373 uint64_t Offset,
4374 Align Alignment) const {
4375 Register DstReg = MI.getOperand(0).getReg();
4376
4377 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4378 "unexpected kernarg parameter type");
4379
4380 Register Ptr = getKernargParameterPtr(B, Offset);
4381 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4382 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4383 MachineMemOperand::MODereferenceable |
4384 MachineMemOperand::MOInvariant);
4385 MI.eraseFromParent();
4386 return true;
4387}
4388
4389 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4390 MachineRegisterInfo &MRI,
4391 MachineIRBuilder &B) const {
4392 Register Dst = MI.getOperand(0).getReg();
4393 LLT DstTy = MRI.getType(Dst);
4394 LLT S16 = LLT::scalar(16);
4395 LLT S32 = LLT::scalar(32);
4396 LLT S64 = LLT::scalar(64);
4397
4398 if (DstTy == S16)
4399 return legalizeFDIV16(MI, MRI, B);
4400 if (DstTy == S32)
4401 return legalizeFDIV32(MI, MRI, B);
4402 if (DstTy == S64)
4403 return legalizeFDIV64(MI, MRI, B);
4404
4405 return false;
4406}
4407
4408 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4409 Register DstDivReg,
4410 Register DstRemReg,
4411 Register X,
4412 Register Y) const {
4413 const LLT S1 = LLT::scalar(1);
4414 const LLT S32 = LLT::scalar(32);
4415
4416 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4417 // algorithm used here.
4418
4419 // Initial estimate of inv(y).
4420 auto FloatY = B.buildUITOFP(S32, Y);
4421 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4422 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4423 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4424 auto Z = B.buildFPTOUI(S32, ScaledY);
4425
4426 // One round of UNR.
4427 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4428 auto NegYZ = B.buildMul(S32, NegY, Z);
4429 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4430
4431 // Quotient/remainder estimate.
4432 auto Q = B.buildUMulH(S32, X, Z);
4433 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4434
4435 // First quotient/remainder refinement.
4436 auto One = B.buildConstant(S32, 1);
4437 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4438 if (DstDivReg)
4439 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4440 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4441
4442 // Second quotient/remainder refinement.
4443 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4444 if (DstDivReg)
4445 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4446
4447 if (DstRemReg)
4448 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4449}
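// A host-side sketch of the expansion above (illustration only; plain C++ in
// place of MIR, and 1.0f / y standing in for G_AMDGPU_RCP_IFLAG):
//   uint32_t udiv32(uint32_t x, uint32_t y, uint32_t *rem) {
//     uint32_t z = (uint32_t)((1.0f / (float)y) * llvm::bit_cast<float>(0x4f7ffffeu));
//     uint32_t neg_yz = (0u - y) * z;                   // wraps mod 2^32
//     z += (uint32_t)(((uint64_t)z * neg_yz) >> 32);    // one Newton-Raphson step
//     uint32_t q = (uint32_t)(((uint64_t)x * z) >> 32); // quotient estimate
//     uint32_t r = x - q * y;                           // remainder estimate
//     if (r >= y) { ++q; r -= y; }                      // first refinement
//     if (r >= y) { ++q; r -= y; }                      // second refinement
//     if (rem) *rem = r;
//     return q;
//   }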
4450
4451// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4452//
4453// Return lo, hi of result
4454//
4455// %cvt.lo = G_UITOFP Val.lo
4456// %cvt.hi = G_UITOFP Val.hi
4457// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4458// %rcp = G_AMDGPU_RCP_IFLAG %mad
4459// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4460// %mul2 = G_FMUL %mul1, 2**(-32)
4461// %trunc = G_INTRINSIC_TRUNC %mul2
4462// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4463// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
4464static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4465 Register Val) {
4466 const LLT S32 = LLT::scalar(32);
4467 auto Unmerge = B.buildUnmerge(S32, Val);
4468
4469 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4470 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));