1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "GCNSubtarget.h"
21 #include "SIInstrInfo.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "SIRegisterInfo.h"
24 #include "Utils/AMDGPUBaseInfo.h"
25 #include "llvm/ADT/ScopeExit.h"
34#include "llvm/IR/IntrinsicsAMDGPU.h"
35#include "llvm/IR/IntrinsicsR600.h"
36
37#define DEBUG_TYPE "amdgpu-legalinfo"
38
39using namespace llvm;
40using namespace LegalizeActions;
41using namespace LegalizeMutations;
42using namespace LegalityPredicates;
43using namespace MIPatternMatch;
44
45// Hack until load/store selection patterns support any tuple of legal types.
47 "amdgpu-global-isel-new-legality",
48 cl::desc("Use GlobalISel desired legality, rather than try to use "
49 "rules compatible with selection patterns"),
50 cl::init(false),
51 cl::ReallyHidden);
52
53static constexpr unsigned MaxRegisterSize = 1024;
54
55// Round the number of elements to the next power of two elements
56static LLT getPow2VectorType(const LLT Ty) {
57 unsigned NElts = Ty.getNumElements();
58 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
59 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
60}
61
62// Round the number of bits to the next power of two bits
63static LLT getPow2ScalarType(const LLT Ty) {
64 unsigned Bits = Ty.getSizeInBits();
65 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
66 return LLT::scalar(Pow2Bits);
67}
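// For example, Log2_32_Ceil(3) == 2, so getPow2VectorType turns <3 x s16>
// into <4 x s16>, and getPow2ScalarType turns s48 into s64. Power-of-two
// inputs are returned unchanged.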
68
69/// \returns true if this is an odd sized vector which should widen by adding an
70/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
71/// excludes s1 vectors, which should always be scalarized.
72static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73 return [=](const LegalityQuery &Query) {
74 const LLT Ty = Query.Types[TypeIdx];
75 if (!Ty.isVector())
76 return false;
77
78 const LLT EltTy = Ty.getElementType();
79 const unsigned EltSize = EltTy.getSizeInBits();
80 return Ty.getNumElements() % 2 != 0 &&
81 EltSize > 1 && EltSize < 32 &&
82 Ty.getSizeInBits() % 32 != 0;
83 };
84}
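// For example, <3 x s16> (odd count, 16-bit elements, 48 bits total) matches
// and is typically widened to <4 x s16> via oneMoreElement below, while
// <3 x s32> does not match (32-bit elements are excluded) and <3 x s1> is
// rejected by the EltSize > 1 check so it can be scalarized instead.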
85
86static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 return Ty.getSizeInBits() % 32 == 0;
90 };
91}
92
93static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94 return [=](const LegalityQuery &Query) {
95 const LLT Ty = Query.Types[TypeIdx];
96 const LLT EltTy = Ty.getScalarType();
97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98 };
99}
100
101static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102 return [=](const LegalityQuery &Query) {
103 const LLT Ty = Query.Types[TypeIdx];
104 const LLT EltTy = Ty.getElementType();
105 return std::pair(TypeIdx,
106 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
107 };
108}
109
109
110static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111 return [=](const LegalityQuery &Query) {
112 const LLT Ty = Query.Types[TypeIdx];
113 const LLT EltTy = Ty.getElementType();
114 unsigned Size = Ty.getSizeInBits();
115 unsigned Pieces = (Size + 63) / 64;
116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117 return std::pair(TypeIdx, LLT::scalarOrVector(
118 ElementCount::getFixed(NewNumElts), EltTy));
119 };
120}
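// For example, a <4 x s32> value (128 bits) is split into (128 + 63) / 64 = 2
// pieces, giving <2 x s32> parts that each fit in a 64-bit register tuple.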
121
122// Increase the number of vector elements to reach the next multiple of 32-bit
123// type.
124static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125 return [=](const LegalityQuery &Query) {
126 const LLT Ty = Query.Types[TypeIdx];
127
128 const LLT EltTy = Ty.getElementType();
129 const int Size = Ty.getSizeInBits();
130 const int EltSize = EltTy.getSizeInBits();
131 const int NextMul32 = (Size + 31) / 32;
132
133 assert(EltSize < 32);
134
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
137 };
138}
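// For example, <3 x s8> is 24 bits, so NextMul32 is 1 and the type is widened
// to <4 x s8> (32 bits); <5 x s16> (80 bits) is widened to <6 x s16> (96 bits).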
139
140// Increase the number of vector elements to reach the next legal RegClass.
141static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142 return [=](const LegalityQuery &Query) {
143 const LLT Ty = Query.Types[TypeIdx];
144 const unsigned NumElts = Ty.getNumElements();
145 const unsigned EltSize = Ty.getElementType().getSizeInBits();
146 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147
148 assert(EltSize == 32 || EltSize == 64);
149 assert(Ty.getSizeInBits() < MaxRegisterSize);
150
151 unsigned NewNumElts;
152 // Find the nearest legal RegClass that is larger than the current type.
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
155 break;
156 }
157
158 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
159 };
160}
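// For example, <13 x s32> is 416 bits, a width with no matching SGPR register
// class, so it is widened to <16 x s32> (512 bits), the next width for which
// getSGPRClassForBitWidth succeeds.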
161
162static LLT getBufferRsrcScalarType(const LLT Ty) {
163 if (!Ty.isVector())
164 return LLT::scalar(128);
165 const ElementCount NumElems = Ty.getElementCount();
166 return LLT::vector(NumElems, LLT::scalar(128));
167}
168
169static LLT getBufferRsrcRegisterType(const LLT Ty) {
170 if (!Ty.isVector())
171 return LLT::fixed_vector(4, LLT::scalar(32));
172 const unsigned NumElems = Ty.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
174}
175
176static LLT getBitcastRegisterType(const LLT Ty) {
177 const unsigned Size = Ty.getSizeInBits();
178
179 if (Size <= 32) {
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
182 return LLT::scalar(Size);
183 }
184
185 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
186}
187
188static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189 return [=](const LegalityQuery &Query) {
190 const LLT Ty = Query.Types[TypeIdx];
191 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192 };
193}
194
195static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196 return [=](const LegalityQuery &Query) {
197 const LLT Ty = Query.Types[TypeIdx];
198 unsigned Size = Ty.getSizeInBits();
199 assert(Size % 32 == 0);
200 return std::pair(
201 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
202 };
203}
204
205static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206 return [=](const LegalityQuery &Query) {
207 const LLT QueryTy = Query.Types[TypeIdx];
208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209 };
210}
211
212static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213 return [=](const LegalityQuery &Query) {
214 const LLT QueryTy = Query.Types[TypeIdx];
215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216 };
217}
218
219static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223 };
224}
225
226static bool isRegisterSize(unsigned Size) {
227 return Size % 32 == 0 && Size <= MaxRegisterSize;
228}
229
230static bool isRegisterVectorElementType(LLT EltTy) {
231 const int EltSize = EltTy.getSizeInBits();
232 return EltSize == 16 || EltSize % 32 == 0;
233}
234
235static bool isRegisterVectorType(LLT Ty) {
236 const int EltSize = Ty.getElementType().getSizeInBits();
237 return EltSize == 32 || EltSize == 64 ||
238 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239 EltSize == 128 || EltSize == 256;
240}
241
242static bool isRegisterType(LLT Ty) {
243 if (!isRegisterSize(Ty.getSizeInBits()))
244 return false;
245
246 if (Ty.isVector())
247 return isRegisterVectorType(Ty);
248
249 return true;
250}
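// For example, <2 x s16> and <4 x s32> are register types, while <3 x s16>
// fails isRegisterSize (48 bits is not a multiple of 32) and <4 x s8> fails
// isRegisterVectorType (8-bit elements are not allowed).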
251
252// Any combination of 32 or 64-bit elements up to the maximum register size, and
253// multiples of v2s16.
254static LegalityPredicate isRegisterType(unsigned TypeIdx) {
255 return [=](const LegalityQuery &Query) {
256 return isRegisterType(Query.Types[TypeIdx]);
257 };
258}
259
260// RegisterType that doesn't have a corresponding RegClass.
261static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
262 return [=](const LegalityQuery &Query) {
263 LLT Ty = Query.Types[TypeIdx];
264 return isRegisterType(Ty) &&
265 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
266 };
267}
268
269static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
270 return [=](const LegalityQuery &Query) {
271 const LLT QueryTy = Query.Types[TypeIdx];
272 if (!QueryTy.isVector())
273 return false;
274 const LLT EltTy = QueryTy.getElementType();
275 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
276 };
277}
278
279// If we have a truncating store or an extending load with a data size larger
280// than 32-bits, we need to reduce to a 32-bit type.
281static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
282 return [=](const LegalityQuery &Query) {
283 const LLT Ty = Query.Types[TypeIdx];
284 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
285 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
286 };
287}
288
289// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
290// handle some operations by just promoting the register during
291// selection. There are also d16 loads on GFX9+ which preserve the high bits.
292static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
293 bool IsLoad, bool IsAtomic) {
294 switch (AS) {
295 case AMDGPUAS::PRIVATE_ADDRESS:
296 // FIXME: Private element size.
297 return ST.enableFlatScratch() ? 128 : 32;
298 case AMDGPUAS::LOCAL_ADDRESS:
299 return ST.useDS128() ? 128 : 64;
300 case AMDGPUAS::GLOBAL_ADDRESS:
301 case AMDGPUAS::CONSTANT_ADDRESS:
302 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
303 case AMDGPUAS::BUFFER_RESOURCE:
304 // Treat constant and global as identical. SMRD loads are sometimes usable for
305 // global loads (ideally constant address space should be eliminated)
306 // depending on the context. Legality cannot be context dependent, but
307 // RegBankSelect can split the load as necessary depending on the pointer
308 // register bank/uniformity and if the memory is invariant or not written in a
309 // kernel.
310 return IsLoad ? 512 : 128;
311 default:
312 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
313 // if they may alias scratch depending on the subtarget. This needs to be
314 // moved to custom handling to use addressMayBeAccessedAsPrivate
315 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
316 }
317}
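// In other words: private accesses are limited to 32 bits unless flat-scratch
// is enabled, LDS allows 64 or 128 bits depending on useDS128, global/constant
// loads may be up to 512 bits (stores 128), and the default case allows 128
// bits when the subtarget has multi-dword flat scratch addressing or the
// access is atomic, otherwise 32.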
318
319static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
320 const LegalityQuery &Query) {
321 const LLT Ty = Query.Types[0];
322
323 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
324 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
325
326 unsigned RegSize = Ty.getSizeInBits();
327 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
328 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
329 unsigned AS = Query.Types[1].getAddressSpace();
330
331 // All of these need to be custom lowered to cast the pointer operand.
332 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
333 return false;
334
335 // Do not handle extending vector loads.
336 if (Ty.isVector() && MemSize != RegSize)
337 return false;
338
339 // TODO: We should be able to widen loads if the alignment is high enough, but
340 // we also need to modify the memory access size.
341#if 0
342 // Accept widening loads based on alignment.
343 if (IsLoad && MemSize < Size)
344 MemSize = std::max(MemSize, Align);
345#endif
346
347 // Only 1-byte and 2-byte to 32-bit extloads are valid.
348 if (MemSize != RegSize && RegSize != 32)
349 return false;
350
351 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
352 Query.MMODescrs[0].Ordering !=
353 AtomicOrdering::NotAtomic))
354 return false;
355
356 switch (MemSize) {
357 case 8:
358 case 16:
359 case 32:
360 case 64:
361 case 128:
362 break;
363 case 96:
364 if (!ST.hasDwordx3LoadStores())
365 return false;
366 break;
367 case 256:
368 case 512:
369 // These may contextually need to be broken down.
370 break;
371 default:
372 return false;
373 }
374
375 assert(RegSize >= MemSize);
376
377 if (AlignBits < MemSize) {
378 const SITargetLowering *TLI = ST.getTargetLowering();
379 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
380 Align(AlignBits / 8)))
381 return false;
382 }
383
384 return true;
385}
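// For example, an extending load of s8 or s16 into s32 is accepted, but an
// extending load of s16 into s64 is rejected (extloads must produce a 32-bit
// register), and a 96-bit access is only accepted on subtargets with dwordx3
// load/store support.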
386
387// The newer buffer intrinsic forms take their resource arguments as
388// pointers in address space 8, aka s128 values. However, in order to not break
389// SelectionDAG, the underlying operations have to continue to take v4i32
390// arguments. Therefore, we convert resource pointers - or vectors of them -
391// to integer values here.
392static bool hasBufferRsrcWorkaround(const LLT Ty) {
393 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
394 return true;
395 if (Ty.isVector()) {
396 const LLT ElemTy = Ty.getElementType();
397 return hasBufferRsrcWorkaround(ElemTy);
398 }
399 return false;
400}
401
402// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
403// workaround this. Eventually it should ignore the type for loads and only care
404// about the size. Return true in cases where we will workaround this for now by
405// bitcasting.
406static bool loadStoreBitcastWorkaround(const LLT Ty) {
407 if (EnableNewLegality)
408 return false;
409
410 const unsigned Size = Ty.getSizeInBits();
411 if (Size <= 64)
412 return false;
413 // Address space 8 pointers get their own workaround.
414 if (hasBufferRsrcWorkaround(Ty))
415 return false;
416 if (!Ty.isVector())
417 return true;
418
419 LLT EltTy = Ty.getElementType();
420 if (EltTy.isPointer())
421 return true;
422
423 unsigned EltSize = EltTy.getSizeInBits();
424 return EltSize != 32 && EltSize != 64;
425}
426
427static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
428 const LLT Ty = Query.Types[0];
429 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
430 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
431}
432
433/// Return true if a load or store of the type should be lowered with a bitcast
434/// to a different type.
435static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
436 const LLT MemTy) {
437 const unsigned MemSizeInBits = MemTy.getSizeInBits();
438 const unsigned Size = Ty.getSizeInBits();
439 if (Size != MemSizeInBits)
440 return Size <= 32 && Ty.isVector();
441
442 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
443 return true;
444
445 // Don't try to handle bitcasting vector ext loads for now.
446 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
447 (Size <= 32 || isRegisterSize(Size)) &&
448 !isRegisterVectorElementType(Ty.getScalarType());
449}
450
451/// Return true if we should legalize a load by widening an odd sized memory
452/// access up to the alignment. Note that in this case the memory access itself
453/// changes, not the size of the result register.
454static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
455 uint64_t AlignInBits, unsigned AddrSpace,
456 unsigned Opcode) {
457 unsigned SizeInBits = MemoryTy.getSizeInBits();
458 // We don't want to widen cases that are naturally legal.
459 if (isPowerOf2_32(SizeInBits))
460 return false;
461
462 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
463 // end up widening these for a scalar load during RegBankSelect, since there
464 // aren't 96-bit scalar loads.
465 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
466 return false;
467
468 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
469 return false;
470
471 // A load is known dereferenceable up to the alignment, so it's legal to widen
472 // to it.
473 //
474 // TODO: Could check dereferenceable for less aligned cases.
475 unsigned RoundedSize = NextPowerOf2(SizeInBits);
476 if (AlignInBits < RoundedSize)
477 return false;
478
479 // Do not widen if it would introduce a slow unaligned load.
480 const SITargetLowering *TLI = ST.getTargetLowering();
481 unsigned Fast = 0;
482 return TLI->allowsMisalignedMemoryAccessesImpl(
483 RoundedSize, AddrSpace, Align(AlignInBits / 8),
484 MachineMemOperand::MOLoad, &Fast) &&
485 Fast;
486}
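// For example, a 96-bit global load that is 128-bit aligned on a subtarget
// without dwordx3 load/store support can be widened to a 128-bit load, since
// the memory is known dereferenceable up to the alignment, provided the target
// lowering reports the wider access as fast.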
487
488static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
489 unsigned Opcode) {
490 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
491 return false;
492
493 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
494 Query.MMODescrs[0].AlignInBits,
495 Query.Types[1].getAddressSpace(), Opcode);
496}
497
498/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
499/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
500/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
501static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
502 MachineRegisterInfo &MRI, unsigned Idx) {
503 MachineOperand &MO = MI.getOperand(Idx);
504
505 const LLT PointerTy = MRI.getType(MO.getReg());
506
507 // Paranoidly prevent us from doing this multiple times.
508 if (!hasBufferRsrcWorkaround(PointerTy))
509 return PointerTy;
510
511 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
512 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
513 if (!PointerTy.isVector()) {
514 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
515 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
516 const LLT S32 = LLT::scalar(32);
517
518 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
519 std::array<Register, 4> VectorElems;
520 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
521 for (unsigned I = 0; I < NumParts; ++I)
522 VectorElems[I] =
523 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
524 B.buildMergeValues(MO, VectorElems);
525 MO.setReg(VectorReg);
526 return VectorTy;
527 }
528 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
529 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
530 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
531 B.buildIntToPtr(MO, Scalar);
532 MO.setReg(BitcastReg);
533
534 return VectorTy;
535}
536
537/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
538/// the form in which the value must be in order to be passed to the low-level
539/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
540/// needed in order to account for the fact that we can't define a register
541/// class for s128 without breaking SelectionDAG.
542static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
543 MachineRegisterInfo &MRI = *B.getMRI();
544 const LLT PointerTy = MRI.getType(Pointer);
545 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
546 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
547
548 if (!PointerTy.isVector()) {
549 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
550 SmallVector<Register, 4> PointerParts;
551 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
552 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
553 for (unsigned I = 0; I < NumParts; ++I)
554 PointerParts.push_back(Unmerged.getReg(I));
555 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
556 }
557 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
558 return B.buildBitcast(VectorTy, Scalar).getReg(0);
559}
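// This is the inverse of castBufferRsrcFromV4I32 above: a scalar p8 is
// unmerged into four s32 pieces and rebuilt as a <4 x s32>, while vectors of
// p8 go through a ptrtoint to s128 elements followed by a bitcast.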
560
561static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
562 unsigned Idx) {
563 MachineOperand &MO = MI.getOperand(Idx);
564
565 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
566 // Paranoidly prevent us from doing this multiple times.
567 if (!hasBufferRsrcWorkaround(PointerTy))
568 return;
569 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
570}
571
572AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
573 const GCNTargetMachine &TM)
574 : ST(ST_) {
575 using namespace TargetOpcode;
576
577 auto GetAddrSpacePtr = [&TM](unsigned AS) {
578 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
579 };
580
581 const LLT S1 = LLT::scalar(1);
582 const LLT S8 = LLT::scalar(8);
583 const LLT S16 = LLT::scalar(16);
584 const LLT S32 = LLT::scalar(32);
585 const LLT S64 = LLT::scalar(64);
586 const LLT S128 = LLT::scalar(128);
587 const LLT S256 = LLT::scalar(256);
588 const LLT S512 = LLT::scalar(512);
589 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
590
591 const LLT V2S8 = LLT::fixed_vector(2, 8);
592 const LLT V2S16 = LLT::fixed_vector(2, 16);
593 const LLT V4S16 = LLT::fixed_vector(4, 16);
594
595 const LLT V2S32 = LLT::fixed_vector(2, 32);
596 const LLT V3S32 = LLT::fixed_vector(3, 32);
597 const LLT V4S32 = LLT::fixed_vector(4, 32);
598 const LLT V5S32 = LLT::fixed_vector(5, 32);
599 const LLT V6S32 = LLT::fixed_vector(6, 32);
600 const LLT V7S32 = LLT::fixed_vector(7, 32);
601 const LLT V8S32 = LLT::fixed_vector(8, 32);
602 const LLT V9S32 = LLT::fixed_vector(9, 32);
603 const LLT V10S32 = LLT::fixed_vector(10, 32);
604 const LLT V11S32 = LLT::fixed_vector(11, 32);
605 const LLT V12S32 = LLT::fixed_vector(12, 32);
606 const LLT V13S32 = LLT::fixed_vector(13, 32);
607 const LLT V14S32 = LLT::fixed_vector(14, 32);
608 const LLT V15S32 = LLT::fixed_vector(15, 32);
609 const LLT V16S32 = LLT::fixed_vector(16, 32);
610 const LLT V32S32 = LLT::fixed_vector(32, 32);
611
612 const LLT V2S64 = LLT::fixed_vector(2, 64);
613 const LLT V3S64 = LLT::fixed_vector(3, 64);
614 const LLT V4S64 = LLT::fixed_vector(4, 64);
615 const LLT V5S64 = LLT::fixed_vector(5, 64);
616 const LLT V6S64 = LLT::fixed_vector(6, 64);
617 const LLT V7S64 = LLT::fixed_vector(7, 64);
618 const LLT V8S64 = LLT::fixed_vector(8, 64);
619 const LLT V16S64 = LLT::fixed_vector(16, 64);
620
621 std::initializer_list<LLT> AllS32Vectors =
622 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
623 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
624 std::initializer_list<LLT> AllS64Vectors =
625 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
626
627 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
628 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
629 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
630 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
631 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
632 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
633 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
634 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
635 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
636
637 const LLT CodePtr = FlatPtr;
638
639 const std::initializer_list<LLT> AddrSpaces64 = {
640 GlobalPtr, ConstantPtr, FlatPtr
641 };
642
643 const std::initializer_list<LLT> AddrSpaces32 = {
644 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
645 };
646
647 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
648
649 const std::initializer_list<LLT> FPTypesBase = {
650 S32, S64
651 };
652
653 const std::initializer_list<LLT> FPTypes16 = {
654 S32, S64, S16
655 };
656
657 const std::initializer_list<LLT> FPTypesPK16 = {
658 S32, S64, S16, V2S16
659 };
660
661 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
662
663 // s1 for VCC branches, s32 for SCC branches.
664 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
665
666 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
667 // elements for v3s16
668 getActionDefinitionsBuilder(G_PHI)
669 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
670 .legalFor(AllS32Vectors)
671 .legalFor(AllS64Vectors)
672 .legalFor(AddrSpaces64)
673 .legalFor(AddrSpaces32)
674 .legalFor(AddrSpaces128)
675 .legalIf(isPointer(0))
676 .clampScalar(0, S16, S256)
677 .widenScalarToNextPow2(0, 32)
678 .clampMaxNumElements(0, S32, 16)
679 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
680 .scalarize(0);
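// The rules in a LegalizeRuleSet are tried in the order they were added, and
// the first matching rule decides the action. For a G_PHI of s8, for example,
// none of the legalFor entries match, so clampScalar(0, S16, S256) widens the
// value to s16.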
681
682 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
683 // Full set of gfx9 features.
684 getActionDefinitionsBuilder({G_ADD, G_SUB})
685 .legalFor({S32, S16, V2S16})
686 .clampMaxNumElementsStrict(0, S16, 2)
687 .scalarize(0)
688 .minScalar(0, S16)
689 .widenScalarToNextMultipleOf(0, 32)
690 .maxScalar(0, S32);
691
692 getActionDefinitionsBuilder(G_MUL)
693 .legalFor({S32, S16, V2S16})
694 .clampMaxNumElementsStrict(0, S16, 2)
695 .scalarize(0)
696 .minScalar(0, S16)
697 .widenScalarToNextMultipleOf(0, 32)
698 .custom();
699 assert(ST.hasMad64_32());
700
701 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
702 .legalFor({S32, S16, V2S16}) // Clamp modifier
703 .minScalarOrElt(0, S16)
704 .clampMaxNumElementsStrict(0, S16, 2)
705 .scalarize(0)
706 .widenScalarToNextPow2(0, 32)
707 .lower();
708 } else if (ST.has16BitInsts()) {
709 getActionDefinitionsBuilder({G_ADD, G_SUB})
710 .legalFor({S32, S16})
711 .minScalar(0, S16)
712 .widenScalarToNextMultipleOf(0, 32)
713 .maxScalar(0, S32)
714 .scalarize(0);
715
716 getActionDefinitionsBuilder(G_MUL)
717 .legalFor({S32, S16})
718 .scalarize(0)
719 .minScalar(0, S16)
720 .widenScalarToNextMultipleOf(0, 32)
721 .custom();
722 assert(ST.hasMad64_32());
723
724 // Technically the saturating operations require clamp bit support, but this
725 // was introduced at the same time as 16-bit operations.
726 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
727 .legalFor({S32, S16}) // Clamp modifier
728 .minScalar(0, S16)
729 .scalarize(0)
730 .widenScalarToNextPow2(0, 16)
731 .lower();
732
733 // We're just lowering this, but it helps get a better result to try to
734 // coerce to the desired type first.
735 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
736 .minScalar(0, S16)
737 .scalarize(0)
738 .lower();
739 } else {
740 getActionDefinitionsBuilder({G_ADD, G_SUB})
741 .legalFor({S32})
742 .widenScalarToNextMultipleOf(0, 32)
743 .clampScalar(0, S32, S32)
744 .scalarize(0);
745
746 auto &Mul = getActionDefinitionsBuilder(G_MUL)
747 .legalFor({S32})
748 .scalarize(0)
749 .minScalar(0, S32)
750 .widenScalarToNextMultipleOf(0, 32);
751
752 if (ST.hasMad64_32())
753 Mul.custom();
754 else
755 Mul.maxScalar(0, S32);
756
757 if (ST.hasIntClamp()) {
758 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
759 .legalFor({S32}) // Clamp modifier.
760 .scalarize(0)
761 .minScalarOrElt(0, S32)
762 .lower();
763 } else {
764 // Clamp bit support was added in VI, along with 16-bit operations.
765 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
766 .minScalar(0, S32)
767 .scalarize(0)
768 .lower();
769 }
770
771 // FIXME: DAG expansion gets better results. The widening uses the smaller
772 // range values and goes for the min/max lowering directly.
773 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
774 .minScalar(0, S32)
775 .scalarize(0)
776 .lower();
777 }
778
779 getActionDefinitionsBuilder(
780 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
781 .customFor({S32, S64})
782 .clampScalar(0, S32, S64)
783 .widenScalarToNextPow2(0, 32)
784 .scalarize(0);
785
786 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
787 .legalFor({S32})
788 .maxScalar(0, S32);
789
790 if (ST.hasVOP3PInsts()) {
791 Mulh
792 .clampMaxNumElements(0, S8, 2)
793 .lowerFor({V2S8});
794 }
795
796 Mulh
797 .scalarize(0)
798 .lower();
799
800 // Report legal for any types we can handle anywhere. For the cases only legal
801 // on the SALU, RegBankSelect will be able to re-legalize.
802 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
803 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
804 .clampScalar(0, S32, S64)
805 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
806 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
807 .widenScalarToNextPow2(0)
808 .scalarize(0);
809
810 getActionDefinitionsBuilder(
811 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
812 .legalFor({{S32, S1}, {S32, S32}})
813 .clampScalar(0, S32, S32)
814 .scalarize(0);
815
816 getActionDefinitionsBuilder(G_BITCAST)
817 // Don't worry about the size constraint.
818 .legalIf(all(isRegisterType(0), isRegisterType(1)))
819 .lower();
820
821
822 getActionDefinitionsBuilder(G_CONSTANT)
823 .legalFor({S1, S32, S64, S16, GlobalPtr,
824 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
825 .legalIf(isPointer(0))
826 .clampScalar(0, S32, S64)
827 .widenScalarToNextPow2(0);
828
829 getActionDefinitionsBuilder(G_FCONSTANT)
830 .legalFor({S32, S64, S16})
831 .clampScalar(0, S16, S64);
832
833 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
834 .legalIf(isRegisterType(0))
835 // s1 and s16 are special cases because they have legal operations on
836 // them, but don't really occupy registers in the normal way.
837 .legalFor({S1, S16})
838 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
839 .clampScalarOrElt(0, S32, MaxScalar)
840 .widenScalarToNextPow2(0, 32)
841 .clampMaxNumElements(0, S32, 16);
842
843 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
844
845 // If the amount is divergent, we have to do a wave reduction to get the
846 // maximum value, so this is expanded during RegBankSelect.
847 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
848 .legalFor({{PrivatePtr, S32}});
849
850 getActionDefinitionsBuilder(G_STACKSAVE)
851 .customFor({PrivatePtr});
852 getActionDefinitionsBuilder(G_STACKRESTORE)
853 .legalFor({PrivatePtr});
854
855 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
856 .customIf(typeIsNot(0, PrivatePtr));
857
858 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
859
860 auto &FPOpActions = getActionDefinitionsBuilder(
861 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
862 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
863 .legalFor({S32, S64});
864 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
865 .customFor({S32, S64});
866 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
867 .customFor({S32, S64});
868
869 if (ST.has16BitInsts()) {
870 if (ST.hasVOP3PInsts())
871 FPOpActions.legalFor({S16, V2S16});
872 else
873 FPOpActions.legalFor({S16});
874
875 TrigActions.customFor({S16});
876 FDIVActions.customFor({S16});
877 }
878
879 if (ST.hasPackedFP32Ops()) {
880 FPOpActions.legalFor({V2S32});
881 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
882 }
883
884 auto &MinNumMaxNum = getActionDefinitionsBuilder({
885 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
886
887 if (ST.hasVOP3PInsts()) {
888 MinNumMaxNum.customFor(FPTypesPK16)
889 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
890 .clampMaxNumElements(0, S16, 2)
891 .clampScalar(0, S16, S64)
892 .scalarize(0);
893 } else if (ST.has16BitInsts()) {
894 MinNumMaxNum.customFor(FPTypes16)
895 .clampScalar(0, S16, S64)
896 .scalarize(0);
897 } else {
898 MinNumMaxNum.customFor(FPTypesBase)
899 .clampScalar(0, S32, S64)
900 .scalarize(0);
901 }
902
903 if (ST.hasVOP3PInsts())
904 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
905
906 FPOpActions
907 .scalarize(0)
908 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
909
910 TrigActions
911 .scalarize(0)
912 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
913
914 FDIVActions
915 .scalarize(0)
916 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
917
918 getActionDefinitionsBuilder({G_FNEG, G_FABS})
919 .legalFor(FPTypesPK16)
920 .clampMaxNumElementsStrict(0, S16, 2)
921 .scalarize(0)
922 .clampScalar(0, S16, S64);
923
924 if (ST.has16BitInsts()) {
925 getActionDefinitionsBuilder(G_FSQRT)
926 .legalFor({S16})
927 .customFor({S32, S64})
928 .scalarize(0)
929 .unsupported();
930 getActionDefinitionsBuilder(G_FFLOOR)
931 .legalFor({S32, S64, S16})
932 .scalarize(0)
933 .clampScalar(0, S16, S64);
934
935 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
936 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
937 .scalarize(0)
938 .maxScalarIf(typeIs(0, S16), 1, S16)
939 .clampScalar(1, S32, S32)
940 .lower();
941
942 getActionDefinitionsBuilder(G_FFREXP)
943 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
944 .scalarize(0)
945 .lower();
946 } else {
947 getActionDefinitionsBuilder(G_FSQRT)
948 .customFor({S32, S64, S16})
949 .scalarize(0)
950 .unsupported();
951
952
953 if (ST.hasFractBug()) {
954 getActionDefinitionsBuilder(G_FFLOOR)
955 .customFor({S64})
956 .legalFor({S32, S64})
957 .scalarize(0)
958 .clampScalar(0, S32, S64);
959 } else {
960 getActionDefinitionsBuilder(G_FFLOOR)
961 .legalFor({S32, S64})
962 .scalarize(0)
963 .clampScalar(0, S32, S64);
964 }
965
966 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
967 .legalFor({{S32, S32}, {S64, S32}})
968 .scalarize(0)
969 .clampScalar(0, S32, S64)
970 .clampScalar(1, S32, S32)
971 .lower();
972
973 getActionDefinitionsBuilder(G_FFREXP)
974 .customFor({{S32, S32}, {S64, S32}})
975 .scalarize(0)
976 .minScalar(0, S32)
977 .clampScalar(1, S32, S32)
978 .lower();
979 }
980
981 getActionDefinitionsBuilder(G_FPTRUNC)
982 .legalFor({{S32, S64}, {S16, S32}})
983 .scalarize(0)
984 .lower();
985
986 getActionDefinitionsBuilder(G_FPEXT)
987 .legalFor({{S64, S32}, {S32, S16}})
988 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
989 .scalarize(0);
990
991 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
992 if (ST.has16BitInsts()) {
993 FSubActions
994 // Use actual fsub instruction
995 .legalFor({S32, S16})
996 // Must use fadd + fneg
997 .lowerFor({S64, V2S16});
998 } else {
999 FSubActions
1000 // Use actual fsub instruction
1001 .legalFor({S32})
1002 // Must use fadd + fneg
1003 .lowerFor({S64, S16, V2S16});
1004 }
1005
1006 FSubActions
1007 .scalarize(0)
1008 .clampScalar(0, S32, S64);
1009
1010 // Whether this is legal depends on the floating point mode for the function.
1011 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1012 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1013 FMad.customFor({S32, S16});
1014 else if (ST.hasMadMacF32Insts())
1015 FMad.customFor({S32});
1016 else if (ST.hasMadF16())
1017 FMad.customFor({S16});
1018 FMad.scalarize(0)
1019 .lower();
1020
1021 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1022 if (ST.has16BitInsts()) {
1023 FRem.customFor({S16, S32, S64});
1024 } else {
1025 FRem.minScalar(0, S32)
1026 .customFor({S32, S64});
1027 }
1028 FRem.scalarize(0);
1029
1030 // TODO: Do we need to clamp maximum bitwidth?
1031 getActionDefinitionsBuilder(G_TRUNC)
1032 .legalIf(isScalar(0))
1033 .legalFor({{V2S16, V2S32}})
1034 .clampMaxNumElements(0, S16, 2)
1035 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1036 // situations (like an invalid implicit use), we don't want to infinite loop
1037 // in the legalizer.
1038 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
1039 .alwaysLegal();
1040
1041 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1042 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1043 {S32, S1}, {S64, S1}, {S16, S1}})
1044 .scalarize(0)
1045 .clampScalar(0, S32, S64)
1046 .widenScalarToNextPow2(1, 32);
1047
1048 // TODO: Split s1->s64 during regbankselect for VALU.
1049 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1050 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1051 .lowerIf(typeIs(1, S1))
1052 .customFor({{S32, S64}, {S64, S64}});
1053 if (ST.has16BitInsts())
1054 IToFP.legalFor({{S16, S16}});
1055 IToFP.clampScalar(1, S32, S64)
1056 .minScalar(0, S32)
1057 .scalarize(0)
1058 .widenScalarToNextPow2(1);
1059
1060 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1061 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1062 .customFor({{S64, S32}, {S64, S64}})
1063 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1064 if (ST.has16BitInsts())
1065 FPToI.legalFor({{S16, S16}});
1066 else
1067 FPToI.minScalar(1, S32);
1068
1069 FPToI.minScalar(0, S32)
1070 .widenScalarToNextPow2(0, 32)
1071 .scalarize(0)
1072 .lower();
1073
1074 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1075 .customFor({S16, S32})
1076 .scalarize(0)
1077 .lower();
1078
1079 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1080 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1081 .scalarize(0)
1082 .lower();
1083
1084 if (ST.has16BitInsts()) {
1085 getActionDefinitionsBuilder(
1086 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1087 .legalFor({S16, S32, S64})
1088 .clampScalar(0, S16, S64)
1089 .scalarize(0);
1090 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1091 getActionDefinitionsBuilder(
1092 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1093 .legalFor({S32, S64})
1094 .clampScalar(0, S32, S64)
1095 .scalarize(0);
1096 } else {
1097 getActionDefinitionsBuilder(
1098 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1099 .legalFor({S32})
1100 .customFor({S64})
1101 .clampScalar(0, S32, S64)
1102 .scalarize(0);
1103 }
1104
1105 getActionDefinitionsBuilder(G_PTR_ADD)
1106 .unsupportedFor({BufferFatPtr, RsrcPtr})
1107 .legalIf(all(isPointer(0), sameSize(0, 1)))
1108 .scalarize(0)
1109 .scalarSameSizeAs(1, 0);
1110
1111 getActionDefinitionsBuilder(G_PTRMASK)
1112 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1113 .scalarSameSizeAs(1, 0)
1114 .scalarize(0);
1115
1116 auto &CmpBuilder =
1117 getActionDefinitionsBuilder(G_ICMP)
1118 // The compare output type differs based on the register bank of the output,
1119 // so make both s1 and s32 legal.
1120 //
1121 // Scalar compares producing output in scc will be promoted to s32, as that
1122 // is the allocatable register type that will be needed for the copy from
1123 // scc. This will be promoted during RegBankSelect, and we assume something
1124 // before that won't try to use s32 result types.
1125 //
1126 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1127 // bank.
1128 .legalForCartesianProduct(
1129 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1130 .legalForCartesianProduct(
1131 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1132 if (ST.has16BitInsts()) {
1133 CmpBuilder.legalFor({{S1, S16}});
1134 }
1135
1136 CmpBuilder
1137 .widenScalarToNextPow2(1)
1138 .clampScalar(1, S32, S64)
1139 .scalarize(0)
1140 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1141
1142 auto &FCmpBuilder =
1143 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1144 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1145
1146 if (ST.hasSALUFloatInsts())
1147 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1148
1149 FCmpBuilder
1150 .widenScalarToNextPow2(1)
1151 .clampScalar(1, S32, S64)
1152 .scalarize(0);
1153
1154 // FIXME: fpow has a selection pattern that should move to custom lowering.
1155 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1156 if (ST.has16BitInsts())
1157 ExpOps.customFor({{S32}, {S16}});
1158 else
1159 ExpOps.customFor({S32});
1160 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1161 .scalarize(0);
1162
1163 getActionDefinitionsBuilder(G_FPOWI)
1164 .clampScalar(0, MinScalarFPTy, S32)
1165 .lower();
1166
1167 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1168 Log2Ops.customFor({S32});
1169 if (ST.has16BitInsts())
1170 Log2Ops.legalFor({S16});
1171 else
1172 Log2Ops.customFor({S16});
1173 Log2Ops.scalarize(0)
1174 .lower();
1175
1176 auto &LogOps = getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP});
1177 LogOps.customFor({S32, S16});
1178 LogOps.clampScalar(0, MinScalarFPTy, S32)
1179 .scalarize(0);
1180
1181 // The 64-bit versions produce 32-bit results, but only on the SALU.
1182 getActionDefinitionsBuilder(G_CTPOP)
1183 .legalFor({{S32, S32}, {S32, S64}})
1184 .clampScalar(0, S32, S32)
1185 .widenScalarToNextPow2(1, 32)
1186 .clampScalar(1, S32, S64)
1187 .scalarize(0)
1188 .widenScalarToNextPow2(0, 32);
1189
1190 // If no 16 bit instr is available, lower into different instructions.
1191 if (ST.has16BitInsts())
1192 getActionDefinitionsBuilder(G_IS_FPCLASS)
1193 .legalForCartesianProduct({S1}, FPTypes16)
1194 .widenScalarToNextPow2(1)
1195 .scalarize(0)
1196 .lower();
1197 else
1198 getActionDefinitionsBuilder(G_IS_FPCLASS)
1199 .legalForCartesianProduct({S1}, FPTypesBase)
1200 .lowerFor({S1, S16})
1201 .widenScalarToNextPow2(1)
1202 .scalarize(0)
1203 .lower();
1204
1205 // The hardware instructions return a different result on 0 than the generic
1206 // instructions expect. The hardware produces -1, but these produce the
1207 // bitwidth.
1208 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1209 .scalarize(0)
1210 .clampScalar(0, S32, S32)
1211 .clampScalar(1, S32, S64)
1212 .widenScalarToNextPow2(0, 32)
1213 .widenScalarToNextPow2(1, 32)
1214 .custom();
1215
1216 // The 64-bit versions produce 32-bit results, but only on the SALU.
1217 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1218 .legalFor({{S32, S32}, {S32, S64}})
1219 .clampScalar(0, S32, S32)
1220 .clampScalar(1, S32, S64)
1221 .scalarize(0)
1222 .widenScalarToNextPow2(0, 32)
1223 .widenScalarToNextPow2(1, 32);
1224
1225 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1226 // RegBankSelect.
1227 getActionDefinitionsBuilder(G_BITREVERSE)
1228 .legalFor({S32, S64})
1229 .clampScalar(0, S32, S64)
1230 .scalarize(0)
1231 .widenScalarToNextPow2(0);
1232
1233 if (ST.has16BitInsts()) {
1234 getActionDefinitionsBuilder(G_BSWAP)
1235 .legalFor({S16, S32, V2S16})
1236 .clampMaxNumElementsStrict(0, S16, 2)
1237 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1238 // narrowScalar limitation.
1239 .widenScalarToNextPow2(0)
1240 .clampScalar(0, S16, S32)
1241 .scalarize(0);
1242
1243 if (ST.hasVOP3PInsts()) {
1244 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1245 .legalFor({S32, S16, V2S16})
1246 .clampMaxNumElements(0, S16, 2)
1247 .minScalar(0, S16)
1248 .widenScalarToNextPow2(0)
1249 .scalarize(0)
1250 .lower();
1251 } else {
1252 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1253 .legalFor({S32, S16})
1254 .widenScalarToNextPow2(0)
1255 .minScalar(0, S16)
1256 .scalarize(0)
1257 .lower();
1258 }
1259 } else {
1260 // TODO: Should have same legality without v_perm_b32
1261 getActionDefinitionsBuilder(G_BSWAP)
1262 .legalFor({S32})
1263 .lowerIf(scalarNarrowerThan(0, 32))
1264 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1265 // narrowScalar limitation.
1266 .widenScalarToNextPow2(0)
1267 .maxScalar(0, S32)
1268 .scalarize(0)
1269 .lower();
1270
1271 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1272 .legalFor({S32})
1273 .minScalar(0, S32)
1274 .widenScalarToNextPow2(0)
1275 .scalarize(0)
1276 .lower();
1277 }
1278
1279 getActionDefinitionsBuilder(G_INTTOPTR)
1280 // List the common cases
1281 .legalForCartesianProduct(AddrSpaces64, {S64})
1282 .legalForCartesianProduct(AddrSpaces32, {S32})
1283 .scalarize(0)
1284 // Accept any address space as long as the size matches
1285 .legalIf(sameSize(0, 1))
1286 .widenScalarIf(smallerThan(1, 0),
1287 [](const LegalityQuery &Query) {
1288 return std::pair(
1289 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1290 })
1291 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1292 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1293 });
1294
1295 getActionDefinitionsBuilder(G_PTRTOINT)
1296 // List the common cases
1297 .legalForCartesianProduct(AddrSpaces64, {S64})
1298 .legalForCartesianProduct(AddrSpaces32, {S32})
1299 .scalarize(0)
1300 // Accept any address space as long as the size matches
1301 .legalIf(sameSize(0, 1))
1302 .widenScalarIf(smallerThan(0, 1),
1303 [](const LegalityQuery &Query) {
1304 return std::pair(
1305 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1306 })
1307 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1308 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1309 });
1310
1311 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1312 .scalarize(0)
1313 .custom();
1314
1315 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1316 bool IsLoad) -> bool {
1317 const LLT DstTy = Query.Types[0];
1318
1319 // Split vector extloads.
1320 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1321
1322 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1323 return true;
1324
1325 const LLT PtrTy = Query.Types[1];
1326 unsigned AS = PtrTy.getAddressSpace();
1327 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1328 Query.MMODescrs[0].Ordering !=
1329 AtomicOrdering::NotAtomic))
1330 return true;
1331
1332 // Catch weird sized loads that don't evenly divide into the access sizes
1333 // TODO: May be able to widen depending on alignment etc.
1334 unsigned NumRegs = (MemSize + 31) / 32;
1335 if (NumRegs == 3) {
1336 if (!ST.hasDwordx3LoadStores())
1337 return true;
1338 } else {
1339 // If the alignment allows, these should have been widened.
1340 if (!isPowerOf2_32(NumRegs))
1341 return true;
1342 }
1343
1344 return false;
1345 };
1346
1347 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1348 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1349 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1350
1351 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1352 // LDS
1353 // TODO: Unsupported flat for SI.
1354
1355 for (unsigned Op : {G_LOAD, G_STORE}) {
1356 const bool IsStore = Op == G_STORE;
1357
1358 auto &Actions = getActionDefinitionsBuilder(Op);
1359 // Explicitly list some common cases.
1360 // TODO: Does this help compile time at all?
1361 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1362 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1363 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1364 {S64, GlobalPtr, S64, GlobalAlign32},
1365 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1366 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1367 {S32, GlobalPtr, S8, GlobalAlign8},
1368 {S32, GlobalPtr, S16, GlobalAlign16},
1369
1370 {S32, LocalPtr, S32, 32},
1371 {S64, LocalPtr, S64, 32},
1372 {V2S32, LocalPtr, V2S32, 32},
1373 {S32, LocalPtr, S8, 8},
1374 {S32, LocalPtr, S16, 16},
1375 {V2S16, LocalPtr, S32, 32},
1376
1377 {S32, PrivatePtr, S32, 32},
1378 {S32, PrivatePtr, S8, 8},
1379 {S32, PrivatePtr, S16, 16},
1380 {V2S16, PrivatePtr, S32, 32},
1381
1382 {S32, ConstantPtr, S32, GlobalAlign32},
1383 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1384 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1385 {S64, ConstantPtr, S64, GlobalAlign32},
1386 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1387 Actions.legalIf(
1388 [=](const LegalityQuery &Query) -> bool {
1389 return isLoadStoreLegal(ST, Query);
1390 });
1391
1392 // The custom pointers (fat pointers, buffer resources) don't work with load
1393 // and store at this level. Fat pointers should have been lowered to
1394 // intrinsics before the translation to MIR.
1395 Actions.unsupportedIf(typeInSet(1, {BufferFatPtr, RsrcPtr}));
1396
1397 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1398 // ptrtoint. This is needed to account for the fact that we can't have i128
1399 // as a register class for SelectionDAG reasons.
1400 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1401 return hasBufferRsrcWorkaround(Query.Types[0]);
1402 });
1403
1404 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1405 // 64-bits.
1406 //
1407 // TODO: Should generalize bitcast action into coerce, which will also cover
1408 // inserting addrspacecasts.
1409 Actions.customIf(typeIs(1, Constant32Ptr));
1410
1411 // Turn any illegal element vectors into something easier to deal
1412 // with. These will ultimately produce 32-bit scalar shifts to extract the
1413 // parts anyway.
1414 //
1415 // For odd 16-bit element vectors, prefer to split those into pieces with
1416 // 16-bit vector parts.
1417 Actions.bitcastIf(
1418 [=](const LegalityQuery &Query) -> bool {
1419 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1420 Query.MMODescrs[0].MemoryTy);
1421 }, bitcastToRegisterType(0));
1422
1423 if (!IsStore) {
1424 // Widen suitably aligned loads by loading extra bytes. The standard
1425 // legalization actions can't properly express widening memory operands.
1426 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1427 return shouldWidenLoad(ST, Query, G_LOAD);
1428 });
1429 }
1430
1431 // FIXME: load/store narrowing should be moved to lower action
1432 Actions
1433 .narrowScalarIf(
1434 [=](const LegalityQuery &Query) -> bool {
1435 return !Query.Types[0].isVector() &&
1436 needToSplitMemOp(Query, Op == G_LOAD);
1437 },
1438 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1439 const LLT DstTy = Query.Types[0];
1440 const LLT PtrTy = Query.Types[1];
1441
1442 const unsigned DstSize = DstTy.getSizeInBits();
1443 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1444
1445 // Split extloads.
1446 if (DstSize > MemSize)
1447 return std::pair(0, LLT::scalar(MemSize));
1448
1449 unsigned MaxSize = maxSizeForAddrSpace(
1450 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1451 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1452 if (MemSize > MaxSize)
1453 return std::pair(0, LLT::scalar(MaxSize));
1454
1455 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1456 return std::pair(0, LLT::scalar(Align));
1457 })
1458 .fewerElementsIf(
1459 [=](const LegalityQuery &Query) -> bool {
1460 return Query.Types[0].isVector() &&
1461 needToSplitMemOp(Query, Op == G_LOAD);
1462 },
1463 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1464 const LLT DstTy = Query.Types[0];
1465 const LLT PtrTy = Query.Types[1];
1466
1467 LLT EltTy = DstTy.getElementType();
1468 unsigned MaxSize = maxSizeForAddrSpace(
1469 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1470 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1471
1472 // FIXME: Handle widened to power of 2 results better. This ends
1473 // up scalarizing.
1474 // FIXME: 3 element stores scalarized on SI
1475
1476 // Split if it's too large for the address space.
1477 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1478 if (MemSize > MaxSize) {
1479 unsigned NumElts = DstTy.getNumElements();
1480 unsigned EltSize = EltTy.getSizeInBits();
1481
1482 if (MaxSize % EltSize == 0) {
1483 return std::pair(
1484 0, LLT::scalarOrVector(
1485 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1486 }
1487
1488 unsigned NumPieces = MemSize / MaxSize;
1489
1490 // FIXME: Refine when odd breakdowns handled
1491 // The scalars will need to be re-legalized.
1492 if (NumPieces == 1 || NumPieces >= NumElts ||
1493 NumElts % NumPieces != 0)
1494 return std::pair(0, EltTy);
1495
1496 return std::pair(0,
1497 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1498 }
1499
1500 // FIXME: We could probably handle weird extending loads better.
1501 if (DstTy.getSizeInBits() > MemSize)
1502 return std::pair(0, EltTy);
1503
1504 unsigned EltSize = EltTy.getSizeInBits();
1505 unsigned DstSize = DstTy.getSizeInBits();
1506 if (!isPowerOf2_32(DstSize)) {
1507 // We're probably decomposing an odd sized store. Try to split
1508 // to the widest type. TODO: Account for alignment. As-is it
1509 // should be OK, since the new parts will be further legalized.
1510 unsigned FloorSize = llvm::bit_floor(DstSize);
1511 return std::pair(
1512 0, LLT::scalarOrVector(
1513 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1514 }
1515
1516 // May need relegalization for the scalars.
1517 return std::pair(0, EltTy);
1518 })
1519 .minScalar(0, S32)
1520 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1521 .widenScalarToNextPow2(0)
1522 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1523 .lower();
1524 }
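// Roughly, these rules first try to keep a load/store legal as-is, then
// bitcast awkward element types, then split accesses that exceed the
// per-address-space limits, and finally narrow or widen whatever is left.
// For example, a <3 x s32> global load on a subtarget without dwordx3
// load/store support is split into a <2 x s32> part and an s32 part.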
1525
1526 // FIXME: Unaligned accesses not lowered.
1527 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1528 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1529 {S32, GlobalPtr, S16, 2 * 8},
1530 {S32, LocalPtr, S8, 8},
1531 {S32, LocalPtr, S16, 16},
1532 {S32, PrivatePtr, S8, 8},
1533 {S32, PrivatePtr, S16, 16},
1534 {S32, ConstantPtr, S8, 8},
1535 {S32, ConstantPtr, S16, 2 * 8}})
1536 .legalIf(
1537 [=](const LegalityQuery &Query) -> bool {
1538 return isLoadStoreLegal(ST, Query);
1539 });
1540
1541 if (ST.hasFlatAddressSpace()) {
1542 ExtLoads.legalForTypesWithMemDesc(
1543 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1544 }
1545
1546 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1547 // 64-bits.
1548 //
1549 // TODO: Should generalize bitcast action into coerce, which will also cover
1550 // inserting addrspacecasts.
1551 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1552
1553 ExtLoads.clampScalar(0, S32, S32)
1554 .widenScalarToNextPow2(0)
1555 .lower();
1556
1557 auto &Atomics = getActionDefinitionsBuilder(
1558 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1559 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1560 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1561 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1562 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1563 {S64, GlobalPtr}, {S64, LocalPtr},
1564 {S32, RegionPtr}, {S64, RegionPtr}});
1565 if (ST.hasFlatAddressSpace()) {
1566 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1567 }
1568
1569 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1570 if (ST.hasLDSFPAtomicAdd()) {
1571 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1572 if (ST.hasGFX90AInsts())
1573 Atomic.legalFor({{S64, LocalPtr}});
1574 if (ST.hasAtomicDsPkAdd16Insts())
1575 Atomic.legalFor({{V2S16, LocalPtr}});
1576 }
1577 if (ST.hasAtomicFaddInsts())
1578 Atomic.legalFor({{S32, GlobalPtr}});
1579 if (ST.hasFlatAtomicFaddF32Inst())
1580 Atomic.legalFor({{S32, FlatPtr}});
1581
1582 if (ST.hasGFX90AInsts()) {
1583 // These are legal with some caveats, and should have undergone expansion in
1584 // the IR in most situations
1585 // TODO: Move atomic expansion into legalizer
1586 Atomic.legalFor({
1587 {S32, GlobalPtr},
1588 {S64, GlobalPtr},
1589 {S64, FlatPtr}
1590 });
1591 }
1592
1593 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1594 // demarshalling
1595 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1596 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1597 {S32, FlatPtr}, {S64, FlatPtr}})
1598 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1599 {S32, RegionPtr}, {S64, RegionPtr}});
1600 // TODO: Pointer types, any 32-bit or 64-bit vector
1601
1602 // Condition should be s32 for scalar, s1 for vector.
1603 getActionDefinitionsBuilder(G_SELECT)
1604 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1605 LocalPtr, FlatPtr, PrivatePtr,
1606 LLT::fixed_vector(2, LocalPtr),
1607 LLT::fixed_vector(2, PrivatePtr)},
1608 {S1, S32})
1609 .clampScalar(0, S16, S64)
1610 .scalarize(1)
1611 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1612 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1613 .clampMaxNumElements(0, S32, 2)
1614 .clampMaxNumElements(0, LocalPtr, 2)
1615 .clampMaxNumElements(0, PrivatePtr, 2)
1616 .scalarize(0)
1617 .widenScalarToNextPow2(0)
1618 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1619
1620 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1621 // be more flexible with the shift amount type.
1622 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1623 .legalFor({{S32, S32}, {S64, S32}});
1624 if (ST.has16BitInsts()) {
1625 if (ST.hasVOP3PInsts()) {
1626 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1627 .clampMaxNumElements(0, S16, 2);
1628 } else
1629 Shifts.legalFor({{S16, S16}});
1630
1631 // TODO: Support 16-bit shift amounts for all types
1632 Shifts.widenScalarIf(
1633 [=](const LegalityQuery &Query) {
1634 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1635 // 32-bit amount.
1636 const LLT ValTy = Query.Types[0];
1637 const LLT AmountTy = Query.Types[1];
1638 return ValTy.getSizeInBits() <= 16 &&
1639 AmountTy.getSizeInBits() < 16;
1640 }, changeTo(1, S16));
1641 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1642 Shifts.clampScalar(1, S32, S32);
1643 Shifts.widenScalarToNextPow2(0, 16);
1644 Shifts.clampScalar(0, S16, S64);
1645
1646 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1647 .minScalar(0, S16)
1648 .scalarize(0)
1649 .lower();
1650 } else {
1651 // Make sure we legalize the shift amount type first, as the general
1652 // expansion for the shifted type will produce much worse code if it hasn't
1653 // been truncated already.
1654 Shifts.clampScalar(1, S32, S32);
1655 Shifts.widenScalarToNextPow2(0, 32);
1656 Shifts.clampScalar(0, S32, S64);
1657
1658 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1659 .minScalar(0, S32)
1660 .scalarize(0)
1661 .lower();
1662 }
1663 Shifts.scalarize(0);
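// For example, an s64 shift with an s64 shift amount has the amount clamped
// to s32 first (clampScalar(1, S32, S32) above), which avoids materializing
// the full 64-bit amount before the expansion of the shifted type.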
1664
1665 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1666 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1667 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1668 unsigned IdxTypeIdx = 2;
1669
1670 getActionDefinitionsBuilder(Op)
1671 .customIf([=](const LegalityQuery &Query) {
1672 const LLT EltTy = Query.Types[EltTypeIdx];
1673 const LLT VecTy = Query.Types[VecTypeIdx];
1674 const LLT IdxTy = Query.Types[IdxTypeIdx];
1675 const unsigned EltSize = EltTy.getSizeInBits();
1676 const bool isLegalVecType =
1677 !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
1678 // Address space 8 pointers are 128-bit wide values, but the logic
1679 // below will try to bitcast them to 2N x s64, which will fail.
1680 // Therefore, as an intermediate step, wrap extracts/insertions from a
1681 // ptrtoint-ing the vector and scalar arguments (or inttoptring the
1682 // extraction result) in order to produce a vector operation that can
1683 // be handled by the logic below.
1684 if (EltTy.isPointer() && EltSize > 64)
1685 return true;
1686 return (EltSize == 32 || EltSize == 64) &&
1687 VecTy.getSizeInBits() % 32 == 0 &&
1688 VecTy.getSizeInBits() <= MaxRegisterSize &&
1689 IdxTy.getSizeInBits() == 32 &&
1690 isLegalVecType;
1691 })
1692 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1693 bitcastToVectorElement32(VecTypeIdx))
1694 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1695 .bitcastIf(
1696 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1697 [=](const LegalityQuery &Query) {
1698 // For > 64-bit element types, try to turn this into a 64-bit
1699 // element vector since we may be able to do better indexing
1700 // if this is scalar. If not, fall back to 32.
1701 const LLT EltTy = Query.Types[EltTypeIdx];
1702 const LLT VecTy = Query.Types[VecTypeIdx];
1703 const unsigned DstEltSize = EltTy.getSizeInBits();
1704 const unsigned VecSize = VecTy.getSizeInBits();
1705
1706 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1707 return std::pair(
1708 VecTypeIdx,
1709 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1710 })
1711 .clampScalar(EltTypeIdx, S32, S64)
1712 .clampScalar(VecTypeIdx, S32, S64)
1713 .clampScalar(IdxTypeIdx, S32, S32)
1714 .clampMaxNumElements(VecTypeIdx, S32, 32)
1715 // TODO: Clamp elements for 64-bit vectors?
1716 .moreElementsIf(
1717 isIllegalRegisterType(VecTypeIdx),
1718 moreElementsToNextExistingRegClass(VecTypeIdx))
1719 // It should only be necessary with variable indexes.
1720 // As a last resort, lower to the stack
1721 .lower();
1722 }
1723
1724 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1725 .unsupportedIf([=](const LegalityQuery &Query) {
1726 const LLT &EltTy = Query.Types[1].getElementType();
1727 return Query.Types[0] != EltTy;
1728 });
1729
1730 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1731 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1732 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1733
1734 // FIXME: Doesn't handle extract of illegal sizes.
1735 getActionDefinitionsBuilder(Op)
1736 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1737 .lowerIf([=](const LegalityQuery &Query) {
1738 // Sub-vector(or single element) insert and extract.
1739 // TODO: verify immediate offset here since lower only works with
1740 // whole elements.
1741 const LLT BigTy = Query.Types[BigTyIdx];
1742 return BigTy.isVector();
1743 })
1744 // FIXME: Multiples of 16 should not be legal.
1745 .legalIf([=](const LegalityQuery &Query) {
1746 const LLT BigTy = Query.Types[BigTyIdx];
1747 const LLT LitTy = Query.Types[LitTyIdx];
1748 return (BigTy.getSizeInBits() % 32 == 0) &&
1749 (LitTy.getSizeInBits() % 16 == 0);
1750 })
1751 .widenScalarIf(
1752 [=](const LegalityQuery &Query) {
1753 const LLT BigTy = Query.Types[BigTyIdx];
1754 return (BigTy.getScalarSizeInBits() < 16);
1755 },
1757 .widenScalarIf(
1758 [=](const LegalityQuery &Query) {
1759 const LLT LitTy = Query.Types[LitTyIdx];
1760 return (LitTy.getScalarSizeInBits() < 16);
1761 },
1763 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1764 .widenScalarToNextPow2(BigTyIdx, 32);
1765
1766 }
1767
1768 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1769 .legalForCartesianProduct(AllS32Vectors, {S32})
1770 .legalForCartesianProduct(AllS64Vectors, {S64})
1771 .clampNumElements(0, V16S32, V32S32)
1772 .clampNumElements(0, V2S64, V16S64)
1773 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1774 .moreElementsIf(
1777
1778 if (ST.hasScalarPackInsts()) {
1779 BuildVector
1780 // FIXME: Should probably widen s1 vectors straight to s32
1781 .minScalarOrElt(0, S16)
1782 .minScalar(1, S16);
1783
1784 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1785 .legalFor({V2S16, S32})
1786 .lower();
1787 } else {
1788 BuildVector.customFor({V2S16, S16});
1789 BuildVector.minScalarOrElt(0, S32);
1790
1791 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1792 .customFor({V2S16, S32})
1793 .lower();
1794 }
1795
1796 BuildVector.legalIf(isRegisterType(0));
1797
1798 // FIXME: Clamp maximum size
1799 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1800 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1801 .clampMaxNumElements(0, S32, 32)
1802 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1803 .clampMaxNumElements(0, S16, 64);
1804
1805 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1806
1807 // Merge/Unmerge
1808 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1809 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1810 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1811
1812 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1813 const LLT Ty = Query.Types[TypeIdx];
1814 if (Ty.isVector()) {
1815 const LLT &EltTy = Ty.getElementType();
1816 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1817 return true;
1818 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1819 return true;
1820 }
1821 return false;
1822 };
1823
1824 auto &Builder = getActionDefinitionsBuilder(Op)
1825 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1826 .lowerFor({{S16, V2S16}})
1827 .lowerIf([=](const LegalityQuery &Query) {
1828 const LLT BigTy = Query.Types[BigTyIdx];
1829 return BigTy.getSizeInBits() == 32;
1830 })
1831 // Try to widen to s16 first for small types.
1832 // TODO: Only do this on targets with legal s16 shifts
1833 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1834 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1835 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1836 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1837 elementTypeIs(1, S16)),
1838 changeTo(1, V2S16))
1839 // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1840 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1841 // valid.
1842 .clampScalar(LitTyIdx, S32, S512)
1843 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1844 // Break up vectors with weird elements into scalars
1845 .fewerElementsIf(
1846 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1847 scalarize(0))
1848 .fewerElementsIf(
1849 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1850 scalarize(1))
1851 .clampScalar(BigTyIdx, S32, MaxScalar);
1852
1853 if (Op == G_MERGE_VALUES) {
1854 Builder.widenScalarIf(
1855 // TODO: Use 16-bit shifts if legal for 8-bit values?
1856 [=](const LegalityQuery &Query) {
1857 const LLT Ty = Query.Types[LitTyIdx];
1858 return Ty.getSizeInBits() < 32;
1859 },
1860 changeTo(LitTyIdx, S32));
1861 }
1862
1863 Builder.widenScalarIf(
1864 [=](const LegalityQuery &Query) {
1865 const LLT Ty = Query.Types[BigTyIdx];
1866 return Ty.getSizeInBits() % 16 != 0;
1867 },
1868 [=](const LegalityQuery &Query) {
1869 // Pick the next power of 2, or a multiple of 64 over 128,
1870 // whichever is smaller.
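// (For illustration: an s90 big type would widen to s128 here, while s270
// would widen to s320, the multiple of 64, rather than to the next power
// of two, s512.)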
1871 const LLT &Ty = Query.Types[BigTyIdx];
1872 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1873 if (NewSizeInBits >= 256) {
1874 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1875 if (RoundedTo < NewSizeInBits)
1876 NewSizeInBits = RoundedTo;
1877 }
1878 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1879 })
1880 // Any vectors left are the wrong size. Scalarize them.
1881 .scalarize(0)
1882 .scalarize(1);
1883 }
1884
1885 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1886 // RegBankSelect.
1887 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1888 .legalFor({{S32}, {S64}});
1889
1890 if (ST.hasVOP3PInsts()) {
1891 SextInReg.lowerFor({{V2S16}})
1892 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1893 // get more vector shift opportunities, since we'll get those when
1894 // expanded.
1895 .clampMaxNumElementsStrict(0, S16, 2);
1896 } else if (ST.has16BitInsts()) {
1897 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1898 } else {
1899 // Prefer to promote to s32 before lowering if we don't have 16-bit
1900 // shifts. This avoids a lot of intermediate truncate and extend operations.
1901 SextInReg.lowerFor({{S32}, {S64}});
1902 }
1903
1904 SextInReg
1905 .scalarize(0)
1906 .clampScalar(0, S32, S64)
1907 .lower();
1908
1909 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1910 .scalarize(0)
1911 .lower();
1912
1913 // TODO: Only try to form v2s16 with legal packed instructions.
1914 getActionDefinitionsBuilder(G_FSHR)
1915 .legalFor({{S32, S32}})
1916 .lowerFor({{V2S16, V2S16}})
1917 .clampMaxNumElementsStrict(0, S16, 2)
1918 .scalarize(0)
1919 .lower();
1920
1921 if (ST.hasVOP3PInsts()) {
1922 getActionDefinitionsBuilder(G_FSHL)
1923 .lowerFor({{V2S16, V2S16}})
1924 .clampMaxNumElementsStrict(0, S16, 2)
1925 .scalarize(0)
1926 .lower();
1927 } else {
1928 getActionDefinitionsBuilder(G_FSHL)
1929 .scalarize(0)
1930 .lower();
1931 }
1932
1933 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1934 .legalFor({S64});
1935
1936 getActionDefinitionsBuilder(G_FENCE)
1937 .alwaysLegal();
1938
1939 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1940 .scalarize(0)
1941 .minScalar(0, S32)
1942 .lower();
1943
1944 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1945 .legalFor({{S32, S32}, {S64, S32}})
1946 .clampScalar(1, S32, S32)
1947 .clampScalar(0, S32, S64)
1948 .widenScalarToNextPow2(0)
1949 .scalarize(0);
1950
1951 getActionDefinitionsBuilder({
1952 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1953 G_FCOPYSIGN,
1954
1955 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1956 G_ATOMICRMW_NAND,
1957 G_ATOMICRMW_FSUB,
1958 G_READ_REGISTER,
1959 G_WRITE_REGISTER,
1960
1961 G_SADDO, G_SSUBO,
1962
1963 // TODO: Implement
1964 G_FMINIMUM, G_FMAXIMUM}).lower();
1965
1966 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1967 .lower();
1968
1969 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1970 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1971 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1972 .unsupported();
1973
1974 getLegacyLegalizerInfo().computeTables();
1975 verify(*ST.getInstrInfo());
1976}
1977
1979 MachineInstr &MI) const {
1980 MachineIRBuilder &B = Helper.MIRBuilder;
1981 MachineRegisterInfo &MRI = *B.getMRI();
1982
1983 switch (MI.getOpcode()) {
1984 case TargetOpcode::G_ADDRSPACE_CAST:
1985 return legalizeAddrSpaceCast(MI, MRI, B);
1986 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
1987 return legalizeFroundeven(MI, MRI, B);
1988 case TargetOpcode::G_FCEIL:
1989 return legalizeFceil(MI, MRI, B);
1990 case TargetOpcode::G_FREM:
1991 return legalizeFrem(MI, MRI, B);
1992 case TargetOpcode::G_INTRINSIC_TRUNC:
1993 return legalizeIntrinsicTrunc(MI, MRI, B);
1994 case TargetOpcode::G_SITOFP:
1995 return legalizeITOFP(MI, MRI, B, true);
1996 case TargetOpcode::G_UITOFP:
1997 return legalizeITOFP(MI, MRI, B, false);
1998 case TargetOpcode::G_FPTOSI:
1999 return legalizeFPTOI(MI, MRI, B, true);
2000 case TargetOpcode::G_FPTOUI:
2001 return legalizeFPTOI(MI, MRI, B, false);
2002 case TargetOpcode::G_FMINNUM:
2003 case TargetOpcode::G_FMAXNUM:
2004 case TargetOpcode::G_FMINNUM_IEEE:
2005 case TargetOpcode::G_FMAXNUM_IEEE:
2006 return legalizeMinNumMaxNum(Helper, MI);
2007 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2008 return legalizeExtractVectorElt(MI, MRI, B);
2009 case TargetOpcode::G_INSERT_VECTOR_ELT:
2010 return legalizeInsertVectorElt(MI, MRI, B);
2011 case TargetOpcode::G_FSIN:
2012 case TargetOpcode::G_FCOS:
2013 return legalizeSinCos(MI, MRI, B);
2014 case TargetOpcode::G_GLOBAL_VALUE:
2015 return legalizeGlobalValue(MI, MRI, B);
2016 case TargetOpcode::G_LOAD:
2017 case TargetOpcode::G_SEXTLOAD:
2018 case TargetOpcode::G_ZEXTLOAD:
2019 return legalizeLoad(Helper, MI);
2020 case TargetOpcode::G_STORE:
2021 return legalizeStore(Helper, MI);
2022 case TargetOpcode::G_FMAD:
2023 return legalizeFMad(MI, MRI, B);
2024 case TargetOpcode::G_FDIV:
2025 return legalizeFDIV(MI, MRI, B);
2026 case TargetOpcode::G_FFREXP:
2027 return legalizeFFREXP(MI, MRI, B);
2028 case TargetOpcode::G_FSQRT:
2029 return legalizeFSQRT(MI, MRI, B);
2030 case TargetOpcode::G_UDIV:
2031 case TargetOpcode::G_UREM:
2032 case TargetOpcode::G_UDIVREM:
2033 return legalizeUnsignedDIV_REM(MI, MRI, B);
2034 case TargetOpcode::G_SDIV:
2035 case TargetOpcode::G_SREM:
2036 case TargetOpcode::G_SDIVREM:
2037 return legalizeSignedDIV_REM(MI, MRI, B);
2038 case TargetOpcode::G_ATOMIC_CMPXCHG:
2039 return legalizeAtomicCmpXChg(MI, MRI, B);
2040 case TargetOpcode::G_FLOG2:
2041 return legalizeFlog2(MI, B);
2042 case TargetOpcode::G_FLOG:
2043 case TargetOpcode::G_FLOG10:
2044 return legalizeFlogCommon(MI, B);
2045 case TargetOpcode::G_FEXP2:
2046 return legalizeFExp2(MI, B);
2047 case TargetOpcode::G_FEXP:
2048 return legalizeFExp(MI, B);
2049 case TargetOpcode::G_FPOW:
2050 return legalizeFPow(MI, B);
2051 case TargetOpcode::G_FFLOOR:
2052 return legalizeFFloor(MI, MRI, B);
2053 case TargetOpcode::G_BUILD_VECTOR:
2054 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2055 return legalizeBuildVector(MI, MRI, B);
2056 case TargetOpcode::G_MUL:
2057 return legalizeMul(Helper, MI);
2058 case TargetOpcode::G_CTLZ:
2059 case TargetOpcode::G_CTTZ:
2060 return legalizeCTLZ_CTTZ(MI, MRI, B);
2061 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2062 return legalizeFPTruncRound(MI, B);
2063 case TargetOpcode::G_STACKSAVE:
2064 return legalizeStackSave(MI, B);
2065 default:
2066 return false;
2067 }
2068
2069 llvm_unreachable("expected switch to return");
2070}
2071
2073 unsigned AS,
2075 MachineIRBuilder &B) const {
2076 MachineFunction &MF = B.getMF();
2077 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2078 const LLT S32 = LLT::scalar(32);
2079 const LLT S64 = LLT::scalar(64);
2080
2082
2083 if (ST.hasApertureRegs()) {
2084 // Note: this register is somewhat broken. When used as a 32-bit operand,
2085 // it only returns zeroes. The real value is in the upper 32 bits.
2086 // Thus, we must extract the high 32 bits.
2087 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2088 ? AMDGPU::SRC_SHARED_BASE
2089 : AMDGPU::SRC_PRIVATE_BASE;
2090 // FIXME: It would be more natural to emit a COPY here, but then copy
2091 // coalescing would kick in and it would think it's okay to use the "HI"
2092 // subregister (instead of extracting the HI 32 bits) which is an artificial
2093 // (unusable) register.
2094 // Register TableGen definitions would need an overhaul to get rid of the
2095 // artificial "HI" aperture registers and prevent this kind of issue from
2096 // happening.
2097 Register Dst = MRI.createGenericVirtualRegister(S64);
2098 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2099 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2100 return B.buildUnmerge(S32, Dst).getReg(1);
2101 }
2102
2103 // TODO: can we be smarter about machine pointer info?
2105 Register LoadAddr = MRI.createGenericVirtualRegister(
2107 // For code object version 5, private_base and shared_base are passed through
2108 // implicit kernargs.
2115 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2116
2117 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2119
2120 if (!loadInputValue(KernargPtrReg, B,
2122 return Register();
2123
2125 PtrInfo,
2129
2130 // Pointer address
2131 B.buildPtrAdd(LoadAddr, KernargPtrReg,
2132 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2133 // Load address
2134 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2135 }
2136
2137 Register QueuePtr = MRI.createGenericVirtualRegister(
2139
2141 return Register();
2142
2143 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2144 // private_segment_aperture_base_hi.
2145 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2146
2148 PtrInfo,
2151 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2152
2153 B.buildPtrAdd(LoadAddr, QueuePtr,
2154 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2155 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2156}
2157
2158/// Return true if the value is a known valid address, such that a null check is
2159/// not necessary.
2161 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2162 MachineInstr *Def = MRI.getVRegDef(Val);
2163 switch (Def->getOpcode()) {
2164 case AMDGPU::G_FRAME_INDEX:
2165 case AMDGPU::G_GLOBAL_VALUE:
2166 case AMDGPU::G_BLOCK_ADDR:
2167 return true;
2168 case AMDGPU::G_CONSTANT: {
2169 const ConstantInt *CI = Def->getOperand(1).getCImm();
2170 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2171 }
2172 default:
2173 return false;
2174 }
2175
2176 return false;
2177}
2178
2181 MachineIRBuilder &B) const {
2182 MachineFunction &MF = B.getMF();
2183
2184 const LLT S32 = LLT::scalar(32);
2185 Register Dst = MI.getOperand(0).getReg();
2186 Register Src = MI.getOperand(1).getReg();
2187
2188 LLT DstTy = MRI.getType(Dst);
2189 LLT SrcTy = MRI.getType(Src);
2190 unsigned DestAS = DstTy.getAddressSpace();
2191 unsigned SrcAS = SrcTy.getAddressSpace();
2192
2193 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2194 // vector element.
2195 assert(!DstTy.isVector());
2196
2197 const AMDGPUTargetMachine &TM
2198 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2199
2200 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2201 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2202 return true;
2203 }
2204
2205 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2206 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2207 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2208 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
2209 // Extract low 32-bits of the pointer.
2210 B.buildExtract(Dst, Src, 0);
2211 MI.eraseFromParent();
2212 return true;
2213 }
2214
2215 unsigned NullVal = TM.getNullPointerValue(DestAS);
2216
2217 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2218 auto FlatNull = B.buildConstant(SrcTy, 0);
2219
2220 // Extract low 32-bits of the pointer.
2221 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2222
2223 auto CmpRes =
2224 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2225 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2226
2227 MI.eraseFromParent();
2228 return true;
2229 }
2230
2231 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2232 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2233 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2234 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2235 if (!ApertureReg.isValid())
2236 return false;
2237
2238 // Coerce the type of the low half of the result so we can use merge_values.
2239 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2240
2241 // TODO: Should we allow mismatched types but matching sizes in merges to
2242 // avoid the ptrtoint?
2243 auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
2244
2245 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
2246 B.buildCopy(Dst, BuildPtr);
2247 MI.eraseFromParent();
2248 return true;
2249 }
2250
2251 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2252 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2253
2254 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2255 SegmentNull.getReg(0));
2256
2257 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2258
2259 MI.eraseFromParent();
2260 return true;
2261 }
2262
2263 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2264 SrcTy.getSizeInBits() == 64) {
2265 // Truncate.
2266 B.buildExtract(Dst, Src, 0);
2267 MI.eraseFromParent();
2268 return true;
2269 }
2270
2271 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2272 DstTy.getSizeInBits() == 64) {
2274 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2275 auto PtrLo = B.buildPtrToInt(S32, Src);
2276 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2277 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2278 MI.eraseFromParent();
2279 return true;
2280 }
2281
2282 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2283 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2284
2285 LLVMContext &Ctx = MF.getFunction().getContext();
2286 Ctx.diagnose(InvalidAddrSpaceCast);
2287 B.buildUndef(Dst);
2288 MI.eraseFromParent();
2289 return true;
2290}
2291
2294 MachineIRBuilder &B) const {
2295 Register Src = MI.getOperand(1).getReg();
2296 LLT Ty = MRI.getType(Src);
2297 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2298
2299 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2300 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2301
2302 auto C1 = B.buildFConstant(Ty, C1Val);
2303 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2304
2305 // TODO: Should this propagate fast-math-flags?
2306 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2307 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2308
2309 auto C2 = B.buildFConstant(Ty, C2Val);
2310 auto Fabs = B.buildFAbs(Ty, Src);
2311
2312 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2313 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2314 MI.eraseFromParent();
2315 return true;
2316}
2317
2320 MachineIRBuilder &B) const {
2321
2322 const LLT S1 = LLT::scalar(1);
2323 const LLT S64 = LLT::scalar(64);
2324
2325 Register Src = MI.getOperand(1).getReg();
2326 assert(MRI.getType(Src) == S64);
2327
2328 // result = trunc(src)
2329 // if (src > 0.0 && src != result)
2330 // result += 1.0
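// (Worked example: for src = 2.5, trunc = 2.0, the condition holds and the
// result is 3.0; for src = -2.5, trunc = -2.0, the condition fails and the
// result stays -2.0, matching ceil.)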
2331
2332 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2333
2334 const auto Zero = B.buildFConstant(S64, 0.0);
2335 const auto One = B.buildFConstant(S64, 1.0);
2336 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2337 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2338 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2339 auto Add = B.buildSelect(S64, And, One, Zero);
2340
2341 // TODO: Should this propagate fast-math-flags?
2342 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2343 MI.eraseFromParent();
2344 return true;
2345}
2346
2349 MachineIRBuilder &B) const {
2350 Register DstReg = MI.getOperand(0).getReg();
2351 Register Src0Reg = MI.getOperand(1).getReg();
2352 Register Src1Reg = MI.getOperand(2).getReg();
2353 auto Flags = MI.getFlags();
2354 LLT Ty = MRI.getType(DstReg);
2355
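// A sketch of the expansion used below: frem(x, y) = x - trunc(x / y) * y,
// with the multiply and subtract folded into one fma(-trunc(x / y), y, x).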
2356 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2357 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2358 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2359 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2360 MI.eraseFromParent();
2361 return true;
2362}
2363
2366 const unsigned FractBits = 52;
2367 const unsigned ExpBits = 11;
2368 LLT S32 = LLT::scalar(32);
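// (In an IEEE-754 double the biased exponent occupies bits 52..62, i.e. bits
// 20..30 of the high 32-bit word; the ubfe below extracts those 11 bits and
// subtracting 1023 removes the bias.)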
2369
2370 auto Const0 = B.buildConstant(S32, FractBits - 32);
2371 auto Const1 = B.buildConstant(S32, ExpBits);
2372
2373 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2374 .addUse(Hi)
2375 .addUse(Const0.getReg(0))
2376 .addUse(Const1.getReg(0));
2377
2378 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2379}
2380
2383 MachineIRBuilder &B) const {
2384 const LLT S1 = LLT::scalar(1);
2385 const LLT S32 = LLT::scalar(32);
2386 const LLT S64 = LLT::scalar(64);
2387
2388 Register Src = MI.getOperand(1).getReg();
2389 assert(MRI.getType(Src) == S64);
2390
2391 // TODO: Should this use extract since the low half is unused?
2392 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2393 Register Hi = Unmerge.getReg(1);
2394
2395 // Extract the upper half, since this is where we will find the sign and
2396 // exponent.
2397 auto Exp = extractF64Exponent(Hi, B);
2398
2399 const unsigned FractBits = 52;
2400
2401 // Extract the sign bit.
2402 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2403 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2404
2405 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2406
2407 const auto Zero32 = B.buildConstant(S32, 0);
2408
2409 // Extend back to 64-bits.
2410 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2411
2412 auto Shr = B.buildAShr(S64, FractMask, Exp);
2413 auto Not = B.buildNot(S64, Shr);
2414 auto Tmp0 = B.buildAnd(S64, Src, Not);
2415 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2416
2417 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2418 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2419
2420 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2421 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2422 MI.eraseFromParent();
2423 return true;
2424}
2425
2428 MachineIRBuilder &B, bool Signed) const {
2429
2430 Register Dst = MI.getOperand(0).getReg();
2431 Register Src = MI.getOperand(1).getReg();
2432
2433 const LLT S64 = LLT::scalar(64);
2434 const LLT S32 = LLT::scalar(32);
2435
2436 assert(MRI.getType(Src) == S64);
2437
2438 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2439 auto ThirtyTwo = B.buildConstant(S32, 32);
2440
2441 if (MRI.getType(Dst) == S64) {
2442 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2443 : B.buildUITOFP(S64, Unmerge.getReg(1));
2444
2445 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2446 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2447
2448 // TODO: Should this propagate fast-math-flags?
2449 B.buildFAdd(Dst, LdExp, CvtLo);
2450 MI.eraseFromParent();
2451 return true;
2452 }
2453
2454 assert(MRI.getType(Dst) == S32);
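// Sketch of the 32-bit result path below: shift the 64-bit value left so its
// significant bits land in the high word, OR any nonzero discarded low bits
// into bit 0 as a sticky bit so rounding still accounts for them, convert the
// high word, and undo the normalization with ldexp(..., 32 - ShAmt).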
2455
2456 auto One = B.buildConstant(S32, 1);
2457
2458 MachineInstrBuilder ShAmt;
2459 if (Signed) {
2460 auto ThirtyOne = B.buildConstant(S32, 31);
2461 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2462 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2463 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2464 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2465 .addUse(Unmerge.getReg(1));
2466 auto LS2 = B.buildSub(S32, LS, One);
2467 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2468 } else
2469 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2470 auto Norm = B.buildShl(S64, Src, ShAmt);
2471 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2472 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2473 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2474 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2475 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2476 B.buildFLdexp(Dst, FVal, Scale);
2477 MI.eraseFromParent();
2478 return true;
2479}
2480
2481// TODO: Copied from DAG implementation. Verify logic and document how this
2482// actually works.
2486 bool Signed) const {
2487
2488 Register Dst = MI.getOperand(0).getReg();
2489 Register Src = MI.getOperand(1).getReg();
2490
2491 const LLT S64 = LLT::scalar(64);
2492 const LLT S32 = LLT::scalar(32);
2493
2494 const LLT SrcLT = MRI.getType(Src);
2495 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2496
2497 unsigned Flags = MI.getFlags();
2498
2499 // The basic idea of converting a floating point number into a pair of 32-bit
2500 // integers is illustrated as follows:
2501 //
2502 // tf := trunc(val);
2503 // hif := floor(tf * 2^-32);
2504 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2505 // hi := fptoi(hif);
2506 // lo := fptoi(lof);
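// (Worked example, for illustration: val = 2^40 + 3 gives tf = 2^40 + 3,
// hif = floor(tf * 2^-32) = 256 and lof = tf - 256 * 2^32 = 3, so hi = 256,
// lo = 3, and the {lo, hi} pair reassembles to 2^40 + 3.)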
2507 //
2508 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2510 if (Signed && SrcLT == S32) {
2511 // However, a 32-bit floating point number has only a 23-bit mantissa,
2512 // which is not enough to hold all the significant bits of `lof` if val is
2513 // negative. To avoid the loss of precision, we need to take the absolute
2514 // value after truncating and flip the result back based on the original
2515 // signedness.
2516 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2517 Trunc = B.buildFAbs(S32, Trunc, Flags);
2518 }
2519 MachineInstrBuilder K0, K1;
2520 if (SrcLT == S64) {
2521 K0 = B.buildFConstant(
2522 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2523 K1 = B.buildFConstant(
2524 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2525 } else {
2526 K0 = B.buildFConstant(
2527 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2528 K1 = B.buildFConstant(
2529 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2530 }
2531
2532 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2533 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2534 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2535
2536 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2537 : B.buildFPTOUI(S32, FloorMul);
2538 auto Lo = B.buildFPTOUI(S32, Fma);
2539
2540 if (Signed && SrcLT == S32) {
2541 // Flip the result based on the signedness, which is either all 0s or 1s.
2542 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2543 // r := xor({lo, hi}, sign) - sign;
2544 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2545 Sign);
2546 } else
2547 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2548 MI.eraseFromParent();
2549
2550 return true;
2551}
2552
2554 MachineInstr &MI) const {
2555 MachineFunction &MF = Helper.MIRBuilder.getMF();
2557
2558 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2559 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2560
2561 // With ieee_mode disabled, the instructions have the correct behavior
2562 // already for G_FMINNUM/G_FMAXNUM
2563 if (!MFI->getMode().IEEE)
2564 return !IsIEEEOp;
2565
2566 if (IsIEEEOp)
2567 return true;
2568
2570}
2571
2574 MachineIRBuilder &B) const {
2575 // TODO: Should move some of this into LegalizerHelper.
2576
2577 // TODO: Promote dynamic indexing of s16 to s32
2578
2579 Register Dst = MI.getOperand(0).getReg();
2580 Register Vec = MI.getOperand(1).getReg();
2581
2582 LLT VecTy = MRI.getType(Vec);
2583 LLT EltTy = VecTy.getElementType();
2584 assert(EltTy == MRI.getType(Dst));
2585
2586 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2587 // but we can't go directly to that logic because you can't bitcast a vector
2588 // of pointers to a vector of integers. Therefore, introduce an intermediate
2589 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2590 // drive the legalization forward.
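// For example (illustrative only): extracting an element from <4 x p8>,
// where p8 is a 128-bit buffer resource pointer, becomes a ptrtoint to
// <4 x s128>, an extract of the s128 element, and an inttoptr back to p8.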
2591 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2592 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2593 LLT IntVecTy = VecTy.changeElementType(IntTy);
2594
2595 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2596 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2597 B.buildIntToPtr(Dst, IntElt);
2598
2599 MI.eraseFromParent();
2600 return true;
2601 }
2602
2603 // FIXME: Artifact combiner probably should have replaced the truncated
2604 // constant before this, so we shouldn't need
2605 // getIConstantVRegValWithLookThrough.
2606 std::optional<ValueAndVReg> MaybeIdxVal =
2607 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2608 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2609 return true;
2610 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2611
2612 if (IdxVal < VecTy.getNumElements()) {
2613 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2614 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2615 } else {
2616 B.buildUndef(Dst);
2617 }
2618
2619 MI.eraseFromParent();
2620 return true;
2621}
2622
2625 MachineIRBuilder &B) const {
2626 // TODO: Should move some of this into LegalizerHelper.
2627
2628 // TODO: Promote dynamic indexing of s16 to s32
2629
2630 Register Dst = MI.getOperand(0).getReg();
2631 Register Vec = MI.getOperand(1).getReg();
2632 Register Ins = MI.getOperand(2).getReg();
2633
2634 LLT VecTy = MRI.getType(Vec);
2635 LLT EltTy = VecTy.getElementType();
2636 assert(EltTy == MRI.getType(Ins));
2637
2638 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2639 // but we can't go directly to that logic becasue you can't bitcast a vector
2640 // of pointers to a vector of integers. Therefore, make the pointer vector
2641 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2642 // new value, and then inttoptr the result vector back. This will then allow
2643 // the rest of legalization to take over.
2644 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2645 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2646 LLT IntVecTy = VecTy.changeElementType(IntTy);
2647
2648 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2649 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2650 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2651 MI.getOperand(3));
2652 B.buildIntToPtr(Dst, IntVecDest);
2653 MI.eraseFromParent();
2654 return true;
2655 }
2656
2657 // FIXME: Artifact combiner probably should have replaced the truncated
2658 // constant before this, so we shouldn't need
2659 // getIConstantVRegValWithLookThrough.
2660 std::optional<ValueAndVReg> MaybeIdxVal =
2661 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2662 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2663 return true;
2664
2665 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2666
2667 unsigned NumElts = VecTy.getNumElements();
2668 if (IdxVal < NumElts) {
2670 for (unsigned i = 0; i < NumElts; ++i)
2671 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2672 B.buildUnmerge(SrcRegs, Vec);
2673
2674 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2675 B.buildMergeLikeInstr(Dst, SrcRegs);
2676 } else {
2677 B.buildUndef(Dst);
2678 }
2679
2680 MI.eraseFromParent();
2681 return true;
2682}
2683
2686 MachineIRBuilder &B) const {
2687
2688 Register DstReg = MI.getOperand(0).getReg();
2689 Register SrcReg = MI.getOperand(1).getReg();
2690 LLT Ty = MRI.getType(DstReg);
2691 unsigned Flags = MI.getFlags();
2692
2693 Register TrigVal;
2694 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2695 if (ST.hasTrigReducedRange()) {
2696 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2697 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2698 .addUse(MulVal.getReg(0))
2699 .setMIFlags(Flags)
2700 .getReg(0);
2701 } else
2702 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2703
2704 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2705 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2706 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2707 .addUse(TrigVal)
2708 .setMIFlags(Flags);
2709 MI.eraseFromParent();
2710 return true;
2711}
2712
2715 const GlobalValue *GV,
2716 int64_t Offset,
2717 unsigned GAFlags) const {
2718 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2719 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2720 // to the following code sequence:
2721 //
2722 // For constant address space:
2723 // s_getpc_b64 s[0:1]
2724 // s_add_u32 s0, s0, $symbol
2725 // s_addc_u32 s1, s1, 0
2726 //
2727 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2728 // a fixup or relocation is emitted to replace $symbol with a literal
2729 // constant, which is a pc-relative offset from the encoding of the $symbol
2730 // operand to the global variable.
2731 //
2732 // For global address space:
2733 // s_getpc_b64 s[0:1]
2734 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2735 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2736 //
2737 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2738 // fixups or relocations are emitted to replace $symbol@*@lo and
2739 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2740 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2741 // operand to the global variable.
2742
2744
2745 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2746 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2747
2748 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2749 .addDef(PCReg);
2750
2751 MIB.addGlobalAddress(GV, Offset, GAFlags);
2752 if (GAFlags == SIInstrInfo::MO_NONE)
2753 MIB.addImm(0);
2754 else
2755 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
2756
2757 if (!B.getMRI()->getRegClassOrNull(PCReg))
2758 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2759
2760 if (PtrTy.getSizeInBits() == 32)
2761 B.buildExtract(DstReg, PCReg, 0);
2762 return true;
2763}
2764
2765 // Emit an ABS32_LO / ABS32_HI relocation stub.
2767 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2768 MachineRegisterInfo &MRI) const {
2769 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2770
2771 LLT S32 = LLT::scalar(32);
2772
2773 // Use the destination directly, if and only if we store the lower address
2774 // part only and we don't have a register class being set.
2775 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2776 ? DstReg
2777 : MRI.createGenericVirtualRegister(S32);
2778
2779 if (!MRI.getRegClassOrNull(AddrLo))
2780 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2781
2782 // Write the lower half.
2783 B.buildInstr(AMDGPU::S_MOV_B32)
2784 .addDef(AddrLo)
2785 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2786
2787 // If required, write the upper half as well.
2788 if (RequiresHighHalf) {
2789 assert(PtrTy.getSizeInBits() == 64 &&
2790 "Must provide a 64-bit pointer type!");
2791
2792 Register AddrHi = MRI.createGenericVirtualRegister(S32);
2793 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2794
2795 B.buildInstr(AMDGPU::S_MOV_B32)
2796 .addDef(AddrHi)
2797 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2798
2799 // Use the destination directly, if and only if we don't have a register
2800 // class being set.
2801 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2802 ? DstReg
2803 : MRI.createGenericVirtualRegister(LLT::scalar(64));
2804
2805 if (!MRI.getRegClassOrNull(AddrDst))
2806 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2807
2808 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2809
2810 // If we created a new register for the destination, cast the result into
2811 // the final output.
2812 if (AddrDst != DstReg)
2813 B.buildCast(DstReg, AddrDst);
2814 } else if (AddrLo != DstReg) {
2815 // If we created a new register for the destination, cast the result into
2816 // the final output.
2817 B.buildCast(DstReg, AddrLo);
2818 }
2819}
2820
2823 MachineIRBuilder &B) const {
2824 Register DstReg = MI.getOperand(0).getReg();
2825 LLT Ty = MRI.getType(DstReg);
2826 unsigned AS = Ty.getAddressSpace();
2827
2828 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2829 MachineFunction &MF = B.getMF();
2831
2833 if (!MFI->isModuleEntryFunction() &&
2834 !GV->getName().equals("llvm.amdgcn.module.lds")) {
2835 const Function &Fn = MF.getFunction();
2836 DiagnosticInfoUnsupported BadLDSDecl(
2837 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2838 DS_Warning);
2839 Fn.getContext().diagnose(BadLDSDecl);
2840
2841 // We currently don't have a way to correctly allocate LDS objects that
2842 // aren't directly associated with a kernel. We do force inlining of
2843 // functions that use local objects. However, if these dead functions are
2844 // not eliminated, we don't want a compile time error. Just emit a warning
2845 // and a trap, since there should be no callable path here.
2846 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>());
2847 B.buildUndef(DstReg);
2848 MI.eraseFromParent();
2849 return true;
2850 }
2851
2852 // TODO: We could emit code to handle the initialization somewhere.
2853 // We ignore the initializer for now and legalize it to allow selection.
2854 // The initializer will be rejected during assembly emission anyway.
2855 const SITargetLowering *TLI = ST.getTargetLowering();
2856 if (!TLI->shouldUseLDSConstAddress(GV)) {
2857 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2858 return true; // Leave in place.
2859 }
2860
2861 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2862 Type *Ty = GV->getValueType();
2863 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2864 // zero-sized type in other languages to declare dynamic shared memory
2865 // whose size is not known at compile time. Such objects are allocated
2866 // by the runtime and placed directly after the statically allocated
2867 // ones, so they all share the same offset.
2868 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2869 // Adjust alignment for that dynamic shared memory array.
2870 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
2871 LLT S32 = LLT::scalar(32);
2872 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
2873 B.buildIntToPtr(DstReg, Sz);
2874 MI.eraseFromParent();
2875 return true;
2876 }
2877 }
2878
2879 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2880 *cast<GlobalVariable>(GV)));
2881 MI.eraseFromParent();
2882 return true;
2883 }
2884
2885 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
2886 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
2887 MI.eraseFromParent();
2888 return true;
2889 }
2890
2891 const SITargetLowering *TLI = ST.getTargetLowering();
2892
2893 if (TLI->shouldEmitFixup(GV)) {
2894 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2895 MI.eraseFromParent();
2896 return true;
2897 }
2898
2899 if (TLI->shouldEmitPCReloc(GV)) {
2900 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2901 MI.eraseFromParent();
2902 return true;
2903 }
2904
2906 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2907
2908 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2913 LoadTy, Align(8));
2914
2915 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2916
2917 if (Ty.getSizeInBits() == 32) {
2918 // Truncate if this is a 32-bit constant address.
2919 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2920 B.buildExtract(DstReg, Load, 0);
2921 } else
2922 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2923
2924 MI.eraseFromParent();
2925 return true;
2926}
2927
2929 if (Ty.isVector())
2930 return Ty.changeElementCount(
2933}
2934
2936 MachineInstr &MI) const {
2937 MachineIRBuilder &B = Helper.MIRBuilder;
2938 MachineRegisterInfo &MRI = *B.getMRI();
2939 GISelChangeObserver &Observer = Helper.Observer;
2940
2941 Register PtrReg = MI.getOperand(1).getReg();
2942 LLT PtrTy = MRI.getType(PtrReg);
2943 unsigned AddrSpace = PtrTy.getAddressSpace();
2944
2945 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2947 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2948 Observer.changingInstr(MI);
2949 MI.getOperand(1).setReg(Cast.getReg(0));
2950 Observer.changedInstr(MI);
2951 return true;
2952 }
2953
2954 if (MI.getOpcode() != AMDGPU::G_LOAD)
2955 return false;
2956
2957 Register ValReg = MI.getOperand(0).getReg();
2958 LLT ValTy = MRI.getType(ValReg);
2959
2960 if (hasBufferRsrcWorkaround(ValTy)) {
2961 Observer.changingInstr(MI);
2963 Observer.changedInstr(MI);
2964 return true;
2965 }
2966
2967 MachineMemOperand *MMO = *MI.memoperands_begin();
2968 const unsigned ValSize = ValTy.getSizeInBits();
2969 const LLT MemTy = MMO->getMemoryType();
2970 const Align MemAlign = MMO->getAlign();
2971 const unsigned MemSize = MemTy.getSizeInBits();
2972 const uint64_t AlignInBits = 8 * MemAlign.value();
2973
2974 // Widen non-power-of-2 loads to the alignment if needed
2975 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2976 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2977
2978 // This was already the correct extending load result type, so just adjust
2979 // the memory type.
2980 if (WideMemSize == ValSize) {
2981 MachineFunction &MF = B.getMF();
2982
2983 MachineMemOperand *WideMMO =
2984 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2985 Observer.changingInstr(MI);
2986 MI.setMemRefs(MF, {WideMMO});
2987 Observer.changedInstr(MI);
2988 return true;
2989 }
2990
2991 // Don't bother handling an edge case that should probably never be produced.
2992 if (ValSize > WideMemSize)
2993 return false;
2994
2995 LLT WideTy = widenToNextPowerOf2(ValTy);
2996
2997 Register WideLoad;
2998 if (!WideTy.isVector()) {
2999 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3000 B.buildTrunc(ValReg, WideLoad).getReg(0);
3001 } else {
3002 // Extract the subvector.
3003
3004 if (isRegisterType(ValTy)) {
3005 // If this is a case where G_EXTRACT is legal, use it.
3006 // (e.g. <3 x s32> -> <4 x s32>)
3007 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3008 B.buildExtract(ValReg, WideLoad, 0);
3009 } else {
3010 // For cases where the widened type isn't a nice register value, unmerge
3011 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3012 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3013 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3014 }
3015 }
3016
3017 MI.eraseFromParent();
3018 return true;
3019 }
3020
3021 return false;
3022}
3023
3025 MachineInstr &MI) const {
3026 MachineIRBuilder &B = Helper.MIRBuilder;
3027 MachineRegisterInfo &MRI = *B.getMRI();
3028 GISelChangeObserver &Observer = Helper.Observer;
3029
3030 Register DataReg = MI.getOperand(0).getReg();
3031 LLT DataTy = MRI.getType(DataReg);
3032
3033 if (hasBufferRsrcWorkaround(DataTy)) {
3034 Observer.changingInstr(MI);
3036 Observer.changedInstr(MI);
3037 return true;
3038 }
3039 return false;
3040}
3041
3044 MachineIRBuilder &B) const {
3045 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3046 assert(Ty.isScalar());
3047
3048 MachineFunction &MF = B.getMF();
3050
3051 // TODO: Always legal with future ftz flag.
3052 // FIXME: Do we need just output?
3053 if (Ty == LLT::float32() &&
3055 return true;
3056 if (Ty == LLT::float16() &&
3058 return true;
3059
3060 MachineIRBuilder HelperBuilder(MI);
3061 GISelObserverWrapper DummyObserver;
3062 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3063 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3064}
3065
3068 Register DstReg = MI.getOperand(0).getReg();
3069 Register PtrReg = MI.getOperand(1).getReg();
3070 Register CmpVal = MI.getOperand(2).getReg();
3071 Register NewVal = MI.getOperand(3).getReg();
3072
3073 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3074 "this should not have been custom lowered");
3075
3076 LLT ValTy = MRI.getType(CmpVal);
3077 LLT VecTy = LLT::fixed_vector(2, ValTy);
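// The new value and the compare value are packed into a two-element vector
// so later selection can present them to the hardware cmpswap as a single
// combined data operand.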
3078
3079 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3080
3081 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3082 .addDef(DstReg)
3083 .addUse(PtrReg)
3084 .addUse(PackedVal)
3085 .setMemRefs(MI.memoperands());
3086
3087 MI.eraseFromParent();
3088 return true;
3089}
3090
3091/// Return true if it's known that \p Src can never be an f32 denormal value.
3093 Register Src) {
3094 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3095 switch (DefMI->getOpcode()) {
3096 case TargetOpcode::G_INTRINSIC: {
3097 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3098 case Intrinsic::amdgcn_frexp_mant:
3099 return true;
3100 default:
3101 break;
3102 }
3103
3104 break;
3105 }
3106 case TargetOpcode::G_FFREXP: {
3107 if (DefMI->getOperand(0).getReg() == Src)
3108 return true;
3109 break;
3110 }
3111 case TargetOpcode::G_FPEXT: {
3112 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3113 }
3114 default:
3115 return false;
3116 }
3117
3118 return false;
3119}
3120
3121static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3122 if (Flags & MachineInstr::FmAfn)
3123 return true;
3124 const auto &Options = MF.getTarget().Options;
3125 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3126}
3127
3129 unsigned Flags) {
3130 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3133}
3134
3135std::pair<Register, Register>
3137 unsigned Flags) const {
3138 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3139 return {};
3140
3141 const LLT F32 = LLT::scalar(32);
3142 auto SmallestNormal = B.buildFConstant(
3144 auto IsLtSmallestNormal =
3145 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3146
3147 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3148 auto One = B.buildFConstant(F32, 1.0);
3149 auto ScaleFactor =
3150 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3151 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3152
3153 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3154}
3155
3157 MachineIRBuilder &B) const {
3158 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3159 // If we have to handle denormals, scale up the input and adjust the result.
3160
3161 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3162 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
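// (The adjustment is exact because log2(x * 2^32) == log2(x) + 32, so
// subtracting 32 on the scaled path recovers log2(x).)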
3163
3164 Register Dst = MI.getOperand(0).getReg();
3165 Register Src = MI.getOperand(1).getReg();
3166 LLT Ty = B.getMRI()->getType(Dst);
3167 unsigned Flags = MI.getFlags();
3168
3169 if (Ty == LLT::scalar(16)) {
3170 const LLT F32 = LLT::scalar(32);
3171 // Nothing in half is a denormal when promoted to f32.
3172 auto Ext = B.buildFPExt(F32, Src, Flags);
3173 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3174 .addUse(Ext.getReg(0))
3175 .setMIFlags(Flags);
3176 B.buildFPTrunc(Dst, Log2, Flags);
3177 MI.eraseFromParent();
3178 return true;
3179 }
3180
3181 assert(Ty == LLT::scalar(32));
3182
3183 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3184 if (!ScaledInput) {
3185 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3186 .addUse(Src)
3187 .setMIFlags(Flags);
3188 MI.eraseFromParent();
3189 return true;
3190 }
3191
3192 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3193 .addUse(ScaledInput)
3194 .setMIFlags(Flags);
3195
3196 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3197 auto Zero = B.buildFConstant(Ty, 0.0);
3198 auto ResultOffset =
3199 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3200 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3201
3202 MI.eraseFromParent();
3203 return true;
3204}
3205
3207 Register Z, unsigned Flags) {
3208 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3209 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3210}
3211
3213 MachineIRBuilder &B) const {
3214 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3215 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3216
3217 MachineRegisterInfo &MRI = *B.getMRI();
3218 Register Dst = MI.getOperand(0).getReg();
3219 Register X = MI.getOperand(1).getReg();
3220 unsigned Flags = MI.getFlags();
3221 const LLT Ty = MRI.getType(X);
3222 MachineFunction &MF = B.getMF();
3223
3224 const LLT F32 = LLT::scalar(32);
3225 const LLT F16 = LLT::scalar(16);
3226
3227 const AMDGPUTargetMachine &TM =
3228 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3229
3230 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3231 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3232 if (Ty == F16 && !ST.has16BitInsts()) {
3233 Register LogVal = MRI.createGenericVirtualRegister(F32);
3234 auto PromoteSrc = B.buildFPExt(F32, X);
3235 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3236 B.buildFPTrunc(Dst, LogVal);
3237 } else {
3238 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3239 }
3240
3241 MI.eraseFromParent();
3242 return true;
3243 }
3244
3245 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3246 if (ScaledInput)
3247 X = ScaledInput;
3248
3249 auto Y =
3250 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3251
3252 Register R;
3253 if (ST.hasFastFMAF32()) {
3254 // c+cc are ln(2)/ln(10) to more than 49 bits
3255 const float c_log10 = 0x1.344134p-2f;
3256 const float cc_log10 = 0x1.09f79ep-26f;
3257
3258 // c + cc is ln(2) to more than 49 bits
3259 const float c_log = 0x1.62e42ep-1f;
3260 const float cc_log = 0x1.efa39ep-25f;
3261
3262 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3263 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3264
3265 R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3266 auto NegR = B.buildFNeg(Ty, R, Flags);
3267 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3268 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3269 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3270 } else {
3271 // ch+ct is ln(2)/ln(10) to more than 36 bits
3272 const float ch_log10 = 0x1.344000p-2f;
3273 const float ct_log10 = 0x1.3509f6p-18f;
3274
3275 // ch + ct is ln(2) to more than 36 bits
3276 const float ch_log = 0x1.62e000p-1f;
3277 const float ct_log = 0x1.0bfbe8p-15f;
3278
3279 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3280 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3281
3282 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3283 auto YH = B.buildAnd(Ty, Y, MaskConst);
3284 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3285 auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3286
3287 Register Mad0 =
3288 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3289 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3290 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3291 }
3292
3293 const bool IsFiniteOnly =
3294 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3295 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3296
3297 if (!IsFiniteOnly) {
3298 // Expand isfinite(x) => fabs(x) < inf
3299 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3300 auto Fabs = B.buildFAbs(Ty, Y);
3301 auto IsFinite =
3302 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3303 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3304 }
3305
3306 if (ScaledInput) {
3307 auto Zero = B.buildFConstant(Ty, 0.0);
3308 auto ShiftK =
3309 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3310 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3311 B.buildFSub(Dst, R, Shift, Flags);
3312 } else {
3313 B.buildCopy(Dst, R);
3314 }
3315
3316 MI.eraseFromParent();
3317 return true;
3318}
3319
3321 Register Src, bool IsLog10,
3322 unsigned Flags) const {
3323 const double Log2BaseInverted =
3325
3326 LLT Ty = B.getMRI()->getType(Dst);
3327
3328 if (Ty == LLT::scalar(32)) {
3329 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3330 if (ScaledInput) {
3331 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3332 .addUse(Src)
3333 .setMIFlags(Flags);
3334 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3335 auto Zero = B.buildFConstant(Ty, 0.0);
3336 auto ResultOffset =
3337 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3338 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3339
3340 if (ST.hasFastFMAF32())
3341 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3342 else {
3343 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3344 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3345 }
3346
3347 return true;
3348 }
3349 }
3350
3351 auto Log2Operand = Ty == LLT::scalar(16)
3352 ? B.buildFLog2(Ty, Src, Flags)
3353 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3354 .addUse(Src)
3355 .setMIFlags(Flags);
3356 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3357 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3358 return true;
3359}
3360
3362 MachineIRBuilder &B) const {
3363 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3364 // If we have to handle denormals, scale up the input and adjust the result.
3365
3366 Register Dst = MI.getOperand(0).getReg();
3367 Register Src = MI.getOperand(1).getReg();
3368 unsigned Flags = MI.getFlags();
3369 LLT Ty = B.getMRI()->getType(Dst);
3370 const LLT F16 = LLT::scalar(16);
3371 const LLT F32 = LLT::scalar(32);
3372
3373 if (Ty == F16) {
3374 // Nothing in half is a denormal when promoted to f32.
3375 auto Ext = B.buildFPExt(F32, Src, Flags);
3376 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3377 .addUse(Ext.getReg(0))
3378 .setMIFlags(Flags);
3379 B.buildFPTrunc(Dst, Log2, Flags);
3380 MI.eraseFromParent();
3381 return true;
3382 }
3383
3384 assert(Ty == F32);
3385
3386 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3387 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3388 .addUse(Src)
3389 .setMIFlags(Flags);
3390 MI.eraseFromParent();
3391 return true;
3392 }
3393
3394 // bool needs_scaling = x < -0x1.f80000p+6f;
3395 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
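// (The scaling is exact since 2^(x + 64) * 2^-64 == 2^x; adding 64 first
// keeps the intermediate v_exp_f32 result out of the denormal range for
// very negative x.)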
3396
3397 // -nextafter(128.0, -1)
3398 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3399 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3400 RangeCheckConst, Flags);
3401
3402 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3403 auto Zero = B.buildFConstant(Ty, 0.0);
3404 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3405 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3406
3407 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3408 .addUse(AddInput.getReg(0))
3409 .setMIFlags(Flags);
3410
3411 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3412 auto One = B.buildFConstant(Ty, 1.0);
3413 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3414 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3415 MI.eraseFromParent();
3416 return true;
3417}
3418
3420 Register X, unsigned Flags) const {
3421 LLT Ty = B.getMRI()->getType(Dst);
3422 LLT F32 = LLT::scalar(32);
3423
3424 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3425 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3426 auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3427
3428 if (Ty == F32) {
3429 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3430 .addUse(Mul.getReg(0))
3431 .setMIFlags(Flags);
3432 } else {
3433 B.buildFExp2(Dst, Mul.getReg(0), Flags);
3434 }
3435
3436 return true;
3437 }
3438
3439 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3440 auto NeedsScaling =
3441 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3442 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3443 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3444 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3445
3446 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3447 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3448
3449 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3450 .addUse(ExpInput.getReg(0))
3451 .setMIFlags(Flags);
3452
3453 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3454 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3455 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3456 return true;
3457}
3458
3460 MachineIRBuilder &B) const {
3461 Register Dst = MI.getOperand(0).getReg();
3462 Register X = MI.getOperand(1).getReg();
3463 const unsigned Flags = MI.getFlags();
3464 MachineFunction &MF = B.getMF();
3465 MachineRegisterInfo &MRI = *B.getMRI();
3466 LLT Ty = MRI.getType(Dst);
3467 const LLT F16 = LLT::scalar(16);
3468 const LLT F32 = LLT::scalar(32);
3469 const bool IsExp10 = false; // TODO: For some reason exp10 is missing
3470
3471 if (Ty == F16) {
3472 // v_exp_f16 (fmul x, log2e)
3473 if (allowApproxFunc(MF, Flags)) {
3474 // TODO: Does this really require fast?
3475 legalizeFExpUnsafe(B, Dst, X, Flags);
3476 MI.eraseFromParent();
3477 return true;
3478 }
3479
3480 // exp(f16 x) ->
3481 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3482
3483 // Nothing in half is a denormal when promoted to f32.
3484 auto Ext = B.buildFPExt(F32, X, Flags);
3485 Register Lowered = MRI.createGenericVirtualRegister(F32);
3486 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3487 B.buildFPTrunc(Dst, Lowered, Flags);
3488 MI.eraseFromParent();
3489 return true;
3490 }
3491
3492 assert(Ty == F32);
3493
3494 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3495 // library behavior. Also, is known-not-daz source sufficient?
3496 if (allowApproxFunc(MF, Flags)) {
3497 legalizeFExpUnsafe(B, Dst, X, Flags);
3498 MI.eraseFromParent();
3499 return true;
3500 }
3501
3502 // Algorithm:
3503 //
3504 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3505 //
3506 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3507 // n = 64*m + j, 0 <= j < 64
3508 //
3509 // e^x = 2^((64*m + j + f)/64)
3510 // = (2^m) * (2^(j/64)) * 2^(f/64)
3511 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3512 //
3513 // f = x*(64/ln(2)) - n
3514 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3515 //
3516 // e^x = (2^m) * (2^(j/64)) * e^r
3517 //
3518 // (2^(j/64)) is precomputed
3519 //
3520 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3521 // e^r = 1 + q
3522 //
3523 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3524 //
3525 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
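//
// In the lowering below this reduces to a table-free form: split
// p = x * log2(e) into a high part PH plus a correction PL, take
// E = roundeven(PH) and A = (PH - E) + PL, and compute
// e^x = ldexp(v_exp_f32(A), (int)E). For example (approximately): x = 10.0f
// gives p ~= 14.427, E = 14, A ~= 0.427, exp2(A) ~= 1.344, and
// ldexp(1.344, 14) ~= 22026.5 ~= e^10.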
3526 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3527 Register PH, PL;
3528
3529 if (ST.hasFastFMAF32()) {
3530 const float c_exp = numbers::log2ef;
3531 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3532 const float c_exp10 = 0x1.a934f0p+1f;
3533 const float cc_exp10 = 0x1.2f346ep-24f;
3534
3535 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3536 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3537 auto NegPH = B.buildFNeg(Ty, PH, Flags);
3538 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3539
3540 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3541 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3542 } else {
3543 const float ch_exp = 0x1.714000p+0f;
3544 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3545
3546 const float ch_exp10 = 0x1.a92000p+1f;
3547 const float cl_exp10 = 0x1.4f0978p-11f;
3548
3549 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3550 auto XH = B.buildAnd(Ty, X, MaskConst);
3551 auto XL = B.buildFSub(Ty, X, XH, Flags);
3552
3553 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3554 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3555
3556 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3557 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3558
3559 Register Mad0 =
3560 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3561 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3562 }
3563
3564 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3565
3566 // It is unsafe to contract this fsub into the PH multiply.
3567 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3568 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3569 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3570
3571 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3572 .addUse(A.getReg(0))
3573 .setMIFlags(Flags);
3574 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3575
3576 auto UnderflowCheckConst =
3577 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3578 auto Zero = B.buildFConstant(Ty, 0.0);
3579 auto Underflow =
3580 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3581
3582 R = B.buildSelect(Ty, Underflow, Zero, R);
3583
3584 const auto &Options = MF.getTarget().Options;
3585
3586 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3587 auto OverflowCheckConst =
3588 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3589
3590 auto Overflow =
3591 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3592 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3593 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3594 }
3595
3596 B.buildCopy(Dst, R);
3597 MI.eraseFromParent();
3598 return true;
3599}
3600
3601bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3602 MachineIRBuilder &B) const {
3603 Register Dst = MI.getOperand(0).getReg();
3604 Register Src0 = MI.getOperand(1).getReg();
3605 Register Src1 = MI.getOperand(2).getReg();
3606 unsigned Flags = MI.getFlags();
3607 LLT Ty = B.getMRI()->getType(Dst);
3608 const LLT F16 = LLT::float16();
3609 const LLT F32 = LLT::float32();
3610
3611 if (Ty == F32) {
3612 auto Log = B.buildFLog2(F32, Src0, Flags);
3613 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3614 .addUse(Log.getReg(0))
3615 .addUse(Src1)
3616 .setMIFlags(Flags);
3617 B.buildFExp2(Dst, Mul, Flags);
3618 } else if (Ty == F16) {
3619 // There's no f16 fmul_legacy, so we need to convert for it.
3620 auto Log = B.buildFLog2(F16, Src0, Flags);
3621 auto Ext0 = B.buildFPExt(F32, Log, Flags);
3622 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3623 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3624 .addUse(Ext0.getReg(0))
3625 .addUse(Ext1.getReg(0))
3626 .setMIFlags(Flags);
3627 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3628 } else
3629 return false;
3630
3631 MI.eraseFromParent();
3632 return true;
3633}
3634
3635// Find a source register, ignoring any possible source modifiers.
3636static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3637 Register ModSrc = OrigSrc;
3638 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3639 ModSrc = SrcFNeg->getOperand(1).getReg();
3640 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3641 ModSrc = SrcFAbs->getOperand(1).getReg();
3642 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3643 ModSrc = SrcFAbs->getOperand(1).getReg();
3644 return ModSrc;
3645}
3646
3647bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3648 MachineRegisterInfo &MRI,
3649 MachineIRBuilder &B) const {
3650
3651 const LLT S1 = LLT::scalar(1);
3652 const LLT F64 = LLT::float64();
3653 Register Dst = MI.getOperand(0).getReg();
3654 Register OrigSrc = MI.getOperand(1).getReg();
3655 unsigned Flags = MI.getFlags();
3656 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3657 "this should not have been custom lowered");
3658
3659 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3660 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3661 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3662 // V_FRACT bug is:
3663 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3664 //
3665 // Convert floor(x) to (x - fract(x))
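// For example, floor(-1.5): fract(-1.5) = -1.5 - floor(-1.5) = 0.5, the min
// against 0x1.fffffffffffffp-1 leaves it unchanged, and -1.5 - 0.5 = -2.0.
// For a NaN input the select below forwards the source value instead.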
3666
3667 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3668 .addUse(OrigSrc)
3669 .setMIFlags(Flags);
3670
3671 // Give source modifier matching some assistance before obscuring a foldable
3672 // pattern.
3673
3674 // TODO: We can avoid the neg on the fract? The input sign to fract
3675 // shouldn't matter?
3676 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3677
3678 auto Const =
3679 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3680
3681 Register Min = MRI.createGenericVirtualRegister(F64);
3682
3683 // We don't need to concern ourselves with the snan handling difference, so
3684 // use the one which will directly select.
3685 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3686 if (MFI->getMode().IEEE)
3687 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3688 else
3689 B.buildFMinNum(Min, Fract, Const, Flags);
3690
3691 Register CorrectedFract = Min;
3692 if (!MI.getFlag(MachineInstr::FmNoNans)) {
3693 auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
3694 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3695 }
3696
3697 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3698 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3699
3700 MI.eraseFromParent();
3701 return true;
3702}
3703
3704// Turn an illegal packed v2s16 build vector into bit operations.
3705// TODO: This should probably be a bitcast action in LegalizerHelper.
3706bool AMDGPULegalizerInfo::legalizeBuildVector(
3707 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3708 Register Dst = MI.getOperand(0).getReg();
3709 const LLT S32 = LLT::scalar(32);
3710 const LLT S16 = LLT::scalar(16);
3711 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3712
3713 Register Src0 = MI.getOperand(1).getReg();
3714 Register Src1 = MI.getOperand(2).getReg();
3715
3716 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3717 assert(MRI.getType(Src0) == S32);
3718 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3719 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3720 }
3721
3722 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3723 B.buildBitcast(Dst, Merge);
3724
3725 MI.eraseFromParent();
3726 return true;
3727}
3728
3729// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3730//
3731// Source and accumulation registers must all be 32-bits.
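// For example, a 64 x 64 -> 64 bit multiply has Src0 = {a0, a1} and
// Src1 = {b0, b1} (illustrative names, least significant 32-bit part first);
// the loops below assemble the schoolbook expansion Accum[0] = lo(a0*b0) and
// Accum[1] = hi(a0*b0) + lo(a0*b1) + lo(a1*b0) out of MAD_64_32 and 32-bit adds.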
3732//
3733// TODO: When the multiply is uniform, we should produce a code sequence
3734// that is better suited to instruction selection on the SALU. Instead of
3735// the outer loop going over parts of the result, the outer loop should go
3736// over parts of one of the factors. This should result in instruction
3737// selection that makes full use of S_ADDC_U32 instructions.
3738void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3739 MutableArrayRef<Register> Accum,
3740 ArrayRef<Register> Src0,
3741 ArrayRef<Register> Src1,
3742 bool UsePartialMad64_32,
3743 bool SeparateOddAlignedProducts) const {
3744 // Use (possibly empty) vectors of S1 registers to represent the set of
3745 // carries from one pair of positions to the next.
3746 using Carry = SmallVector<Register, 2>;
3747
3748 MachineIRBuilder &B = Helper.MIRBuilder;
3749 GISelKnownBits &KB = *Helper.getKnownBits();
3750
3751 const LLT S1 = LLT::scalar(1);
3752 const LLT S32 = LLT::scalar(32);
3753 const LLT S64 = LLT::scalar(64);
3754
3755 Register Zero32;
3756 Register Zero64;
3757
3758 auto getZero32 = [&]() -> Register {
3759 if (!Zero32)
3760 Zero32 = B.buildConstant(S32, 0).getReg(0);
3761 return Zero32;
3762 };
3763 auto getZero64 = [&]() -> Register {
3764 if (!Zero64)
3765 Zero64 = B.buildConstant(S64, 0).getReg(0);
3766 return Zero64;
3767 };
3768
3769 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3770 for (unsigned i = 0; i < Src0.size(); ++i) {
3771 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3772 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3773 }
3774
3775 // Merge the given carries into the 32-bit LocalAccum, which is modified
3776 // in-place.
3777 //
3778 // Returns the carry-out, which is a single S1 register or null.
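 // E.g. with two incoming carries {c0, c1} and an existing LocalAccum value,
 // CarryAccum starts as zext(c0) and a final uadde adds CarryAccum, LocalAccum
 // and c1, leaving the sum in LocalAccum and returning the new carry-out bit.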
3779 auto mergeCarry =
3780 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3781 if (CarryIn.empty())
3782 return Register();
3783
3784 bool HaveCarryOut = true;
3785 Register CarryAccum;
3786 if (CarryIn.size() == 1) {
3787 if (!LocalAccum) {
3788 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3789 return Register();
3790 }
3791
3792 CarryAccum = getZero32();
3793 } else {
3794 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3795 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3796 CarryAccum =
3797 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3798 .getReg(0);
3799 }
3800
3801 if (!LocalAccum) {
3802 LocalAccum = getZero32();
3803 HaveCarryOut = false;
3804 }
3805 }
3806
3807 auto Add =
3808 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3809 LocalAccum = Add.getReg(0);
3810 return HaveCarryOut ? Add.getReg(1) : Register();
3811 };
3812
3813 // Build a multiply-add chain to compute
3814 //
3815 // LocalAccum + (partial products at DstIndex)
3816 // + (opportunistic subset of CarryIn)
3817 //
3818 // LocalAccum is an array of one or two 32-bit registers that are updated
3819 // in-place. The incoming registers may be null.
3820 //
3821 // In some edge cases, carry-ins can be consumed "for free". In that case,
3822 // the consumed carry bits are removed from CarryIn in-place.
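 // E.g. for DstIndex == 2 with a two-part LocalAccum, the chain below folds
 // Src0[0]*Src1[2], Src0[1]*Src1[1] and Src0[2]*Src1[0] into one 64-bit
 // running sum via G_AMDGPU_MAD_U64_U32, recording an S1 carry-out whenever
 // the accumulator is already a full 64-bit value.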
3823 auto buildMadChain =
3824 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3825 -> Carry {
3826 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3827 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3828
3829 Carry CarryOut;
3830 unsigned j0 = 0;
3831
3832 // Use plain 32-bit multiplication for the most significant part of the
3833 // result by default.
3834 if (LocalAccum.size() == 1 &&
3835 (!UsePartialMad64_32 || !CarryIn.empty())) {
3836 do {
3837 // Skip multiplication if one of the operands is 0
3838 unsigned j1 = DstIndex - j0;
3839 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3840 ++j0;
3841 continue;
3842 }
3843 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3844 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3845 LocalAccum[0] = Mul.getReg(0);
3846 } else {
3847 if (CarryIn.empty()) {
3848 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3849 } else {
3850 LocalAccum[0] =
3851 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3852 .getReg(0);
3853 CarryIn.pop_back();
3854 }
3855 }
3856 ++j0;
3857 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3858 }
3859
3860 // Build full 64-bit multiplies.
3861 if (j0 <= DstIndex) {
3862 bool HaveSmallAccum = false;
3863 Register Tmp;
3864
3865 if (LocalAccum[0]) {
3866 if (LocalAccum.size() == 1) {
3867 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3868 HaveSmallAccum = true;
3869 } else if (LocalAccum[1]) {
3870 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
3871 HaveSmallAccum = false;
3872 } else {
3873 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3874 HaveSmallAccum = true;
3875 }
3876 } else {
3877 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3878 Tmp = getZero64();
3879 HaveSmallAccum = true;
3880 }
3881
3882 do {
3883 unsigned j1 = DstIndex - j0;
3884 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3885 ++j0;
3886 continue;
3887 }
3888 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3889 {Src0[j0], Src1[j1], Tmp});
3890 Tmp = Mad.getReg(0);
3891 if (!HaveSmallAccum)
3892 CarryOut.push_back(Mad.getReg(1));
3893 HaveSmallAccum = false;
3894
3895 ++j0;
3896 } while (j0 <= DstIndex);
3897
3898 auto Unmerge = B.buildUnmerge(S32, Tmp);
3899 LocalAccum[0] = Unmerge.getReg(0);
3900 if (LocalAccum.size() > 1)
3901 LocalAccum[1] = Unmerge.getReg(1);
3902 }
3903
3904 return CarryOut;
3905 };
3906
3907 // Outer multiply loop, iterating over destination parts from least
3908 // significant to most significant parts.
3909 //
3910 // The columns of the following diagram correspond to the destination parts
3911 // affected by one iteration of the outer loop (ignoring boundary
3912 // conditions).
3913 //
3914 // Dest index relative to 2 * i:      1  0 -1
3915 //                                    -------
3916 // Carries from previous iteration:      e  o
3917 // Even-aligned partial product sum:  E  E  .
3918 // Odd-aligned partial product sum:      O  O
3919 //
3920 // 'o' is OddCarry, 'e' is EvenCarry.
3921 // EE and OO are computed from partial products via buildMadChain and use
3922 // accumulation where possible and appropriate.
3923 //
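 // E.g. for a 128-bit product (Accum has four 32-bit parts), iteration i == 1
 // accumulates the even-aligned partial products into Accum[2..3] and the
 // odd-aligned ones into Accum[1..2], and the carries it produces are threaded
 // into iteration i == 2 through OddCarry and EvenCarry.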
3924 Register SeparateOddCarry;
3925 Carry EvenCarry;
3926 Carry OddCarry;
3927
3928 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
3929 Carry OddCarryIn = std::move(OddCarry);
3930 Carry EvenCarryIn = std::move(EvenCarry);
3931 OddCarry.clear();
3932 EvenCarry.clear();
3933
3934 // Partial products at offset 2 * i.
3935 if (2 * i < Accum.size()) {
3936 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
3937 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3938 }
3939
3940 // Partial products at offset 2 * i - 1.
3941 if (i > 0) {
3942 if (!SeparateOddAlignedProducts) {
3943 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
3944 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3945 } else {
3946 bool IsHighest = 2 * i >= Accum.size();
3947 Register SeparateOddOut[2];
3948 auto LocalAccum = MutableArrayRef(SeparateOddOut)
3949 .take_front(IsHighest ? 1 : 2);
3950 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3951
3952 MachineInstr *Lo;
3953
3954 if (i == 1) {
3955 if (!IsHighest)
3956 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3957 else
3958 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3959 } else {
3960 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3961 SeparateOddCarry);
3962 }
3963 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
3964
3965 if (!IsHighest) {
3966 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
3967 Lo->getOperand(1).getReg());
3968 Accum[2 * i] = Hi.getReg(0);
3969 SeparateOddCarry = Hi.getReg(1);
3970 }
3971 }
3972 }
3973
3974 // Add in the carries from the previous iteration
3975 if (i > 0) {
3976 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
3977 EvenCarryIn.push_back(CarryOut);
3978
3979 if (2 * i < Accum.size()) {
3980 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
3981 OddCarry.push_back(CarryOut);
3982 }
3983 }
3984 }
3985}
3986
3987// Custom narrowing of wide multiplies using wide multiply-add instructions.
3988//
3989// TODO: If the multiply is followed by an addition, we should attempt to
3990// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
3991bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
3992 MachineInstr &MI) const {
3993 assert(ST.hasMad64_32());
3994 assert(MI.getOpcode() == TargetOpcode::G_MUL);
3995
3996 MachineIRBuilder &B = Helper.MIRBuilder;
3997 MachineRegisterInfo &MRI = *B.getMRI();
3998
3999 Register DstReg = MI.getOperand(0).getReg();
4000 Register Src0 = MI.getOperand(1).getReg();
4001 Register Src1 = MI.getOperand(2).getReg();
4002
4003 LLT Ty = MRI.getType(DstReg);
4004 assert(Ty.isScalar());
4005
4006 unsigned Size = Ty.getSizeInBits();
4007 unsigned NumParts = Size / 32;
4008 assert((Size % 32) == 0);
4009 assert(NumParts >= 2);
4010
4011 // Whether to use MAD_64_32 for partial products whose high half is
4012 // discarded. This avoids some ADD instructions but risks false dependency
4013 // stalls on some subtargets in some cases.
4014 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4015
4016 // Whether to compute odd-aligned partial products separately. This is
4017 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4018 // in an even-aligned VGPR.
4019 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4020
4021 LLT S32 = LLT::scalar(32);
4022 SmallVector<Register, 2> Src0Parts, Src1Parts;
4023 for (unsigned i = 0; i < NumParts; ++i) {
4024 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4025 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4026 }
4027 B.buildUnmerge(Src0Parts, Src0);
4028 B.buildUnmerge(Src1Parts, Src1);
4029
4030 SmallVector<Register, 2> AccumRegs(NumParts);
4031 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4032 SeparateOddAlignedProducts);
4033
4034 B.buildMergeLikeInstr(DstReg, AccumRegs);
4035 MI.eraseFromParent();
4036 return true;
4037}
4038
4039// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4040// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4041// case with a single min instruction instead of a compare+select.
4042bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4043 MachineRegisterInfo &MRI,
4044 MachineIRBuilder &B) const {
4045 Register Dst = MI.getOperand(0).getReg();
4046 Register Src = MI.getOperand(1).getReg();
4047 LLT DstTy = MRI.getType(Dst);
4048 LLT SrcTy = MRI.getType(Src);
4049
4050 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4051 ? AMDGPU::G_AMDGPU_FFBH_U32
4052 : AMDGPU::G_AMDGPU_FFBL_B32;
4053 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4054 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4055
4056 MI.eraseFromParent();
4057 return true;
4058}
4059
4060// Check that this is a G_XOR x, -1
4061static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4062 if (MI.getOpcode() != TargetOpcode::G_XOR)
4063 return false;
4064 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4065 return ConstVal && *ConstVal == -1;
4066}
4067
4068// Return the use branch instruction, otherwise null if the usage is invalid.
4069static MachineInstr *
4070verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4071 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4072 Register CondDef = MI.getOperand(0).getReg();
4073 if (!MRI.hasOneNonDBGUse(CondDef))
4074 return nullptr;
4075
4076 MachineBasicBlock *Parent = MI.getParent();
4077 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4078
4079 if (isNot(MRI, *UseMI)) {
4080 Register NegatedCond = UseMI->getOperand(0).getReg();
4081 if (!MRI.hasOneNonDBGUse(NegatedCond))
4082 return nullptr;
4083
4084 // We're deleting the def of this value, so we need to remove it.
4085 eraseInstr(*UseMI, MRI);
4086
4087 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4088 Negated = true;
4089 }
4090
4091 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4092 return nullptr;
4093
4094 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4095 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4096 if (Next == Parent->end()) {
4097 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4098 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4099 return nullptr;
4100 UncondBrTarget = &*NextMBB;
4101 } else {
4102 if (Next->getOpcode() != AMDGPU::G_BR)
4103 return nullptr;
4104 Br = &*Next;
4105 UncondBrTarget = Br->getOperand(0).getMBB();
4106 }
4107
4108 return UseMI;
4109}
4110
4111bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4112 const ArgDescriptor *Arg,
4113 const TargetRegisterClass *ArgRC,
4114 LLT ArgTy) const {
4115 MCRegister SrcReg = Arg->getRegister();
4116 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4117 assert(DstReg.isVirtual() && "Virtual register expected");
4118
4119 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4120 *ArgRC, B.getDebugLoc(), ArgTy);
4121 if (Arg->isMasked()) {
4122 // TODO: Should we try to emit this once in the entry block?
4123 const LLT S32 = LLT::scalar(32);
4124 const unsigned Mask = Arg->getMask();
4125 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4126
4127 Register AndMaskSrc = LiveIn;
4128
4129 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4130 // 0.
4131 if (Shift != 0) {
4132 auto ShiftAmt = B.buildConstant(S32, Shift);
4133 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4134 }
4135
4136 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4137 } else {
4138 B.buildCopy(DstReg, LiveIn);
4139 }
4140
4141 return true;
4142}
4143
4144bool AMDGPULegalizerInfo::loadInputValue(
4145 Register DstReg, MachineIRBuilder &B,
4146 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4147 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4148 const ArgDescriptor *Arg;
4149 const TargetRegisterClass *ArgRC;
4150 LLT ArgTy;
4151 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4152
4153 if (!Arg) {
4154 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4155 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4156 // case the pointer argument may be missing and we use null.
4157 B.buildConstant(DstReg, 0);
4158 return true;
4159 }
4160
4161 // It's undefined behavior if a function marked with the amdgpu-no-*
4162 // attributes uses the corresponding intrinsic.
4163 B.buildUndef(DstReg);
4164 return true;
4165 }
4166
4167 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4168 return false; // TODO: Handle these
4169 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4170}
4171
4175 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4176 return false;
4177
4178 MI.eraseFromParent();
4179 return true;
4180}
4181
4182static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4183 int64_t C) {
4184 B.buildConstant(MI.getOperand(0).getReg(), C);
4185 MI.eraseFromParent();
4186 return true;
4187}
4188
4191 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4192 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4193 if (MaxID == 0)
4194 return replaceWithConstant(B, MI, 0);
4195
4196 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4197 const ArgDescriptor *Arg;
4198 const TargetRegisterClass *ArgRC;
4199 LLT ArgTy;
4200 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4201
4202 Register DstReg = MI.getOperand(0).getReg();
4203 if (!Arg) {
4204 // It's undefined behavior if a function marked with the amdgpu-no-*
4205 // attributes uses the corresponding intrinsic.
4206 B.buildUndef(DstReg);
4207 MI.eraseFromParent();
4208 return true;
4209 }
4210
4211 if (Arg->isMasked()) {
4212 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4213 // masking operations anyway.
4214 //
4215 // TODO: We could assert the top bit is 0 for the source copy.
4216 if (!loadInputValue(DstReg, B, ArgType))
4217 return false;
4218 } else {
4219 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4220 if (!loadInputValue(TmpReg, B, ArgType))
4221 return false;
4222 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4223 }
4224
4225 MI.eraseFromParent();
4226 return true;
4227}
4228
4229Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4230 int64_t Offset) const {
4231 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4232 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4233
4234 // TODO: If we passed in the base kernel offset we could have a better
4235 // alignment than 4, but we don't really need it.
4236 if (!loadInputValue(KernArgReg, B,
4237 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4238 llvm_unreachable("failed to find kernarg segment ptr");
4239
4240 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4241 // TODO: Should get nuw
4242 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4243}
4244
4245/// Legalize a value that's loaded from kernel arguments. This is only used by
4246/// legacy intrinsics.
4247bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4248 MachineIRBuilder &B,
4249 uint64_t Offset,
4250 Align Alignment) const {
4251 Register DstReg = MI.getOperand(0).getReg();
4252
4253 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4254 "unexpected kernarg parameter type");
4255
4258 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4259 MachineMemOperand::MODereferenceable |
4260 MachineMemOperand::MOInvariant);
4261 MI.eraseFromParent();
4262 return true;
4263}
4264
4265bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4266 MachineRegisterInfo &MRI,
4267 MachineIRBuilder &B) const {
4268 Register Dst = MI.getOperand(0).getReg();
4269 LLT DstTy = MRI.getType(Dst);
4270 LLT S16 = LLT::scalar(16);
4271 LLT S32 = LLT::scalar(32);
4272 LLT S64 = LLT::scalar(64);
4273
4274 if (DstTy == S16)
4275 return legalizeFDIV16(MI, MRI, B);
4276 if (DstTy == S32)
4277 return legalizeFDIV32(MI, MRI, B);
4278 if (DstTy == S64)
4279 return legalizeFDIV64(MI, MRI, B);
4280
4281 return false;
4282}
4283
4284void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4285 Register DstDivReg,
4286 Register DstRemReg,
4287 Register X,
4288 Register Y) const {
4289 const LLT S1 = LLT::scalar(1);
4290 const LLT S32 = LLT::scalar(32);
4291
4292 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4293 // algorithm used here.
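 // In short: start from z ~= 2^32 / y built out of a scaled V_RCP_IFLAG_F32,
 // refine z with one Newton-Raphson style step, take q = umulhi(x, z) and
 // r = x - q * y, then apply at most two conditional +1 / -y corrections.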
4294
4295 // Initial estimate of inv(y).
4296 auto FloatY = B.buildUITOFP(S32, Y);
4297 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4298 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4299 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4300 auto Z = B.buildFPTOUI(S32, ScaledY);
4301
4302 // One round of UNR.
4303 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4304 auto NegYZ = B.buildMul(S32, NegY, Z);
4305 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4306
4307 // Quotient/remainder estimate.
4308 auto Q = B.buildUMulH(S32, X, Z);
4309 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4310
4311 // First quotient/remainder refinement.
4312 auto One = B.buildConstant(S32, 1);
4313 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4314 if (DstDivReg)
4315 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4316 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4317
4318 // Second quotient/remainder refinement.
4319 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4320 if (DstDivReg)
4321 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4322
4323 if (DstRemReg)
4324 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4325}
4326
4327// Build integer reciprocal sequence around V_RCP_IFLAG_F32
4328//
4329// Return lo, hi of result
4330//
4331// %cvt.lo = G_UITOFP Val.lo
4332// %cvt.hi = G_UITOFP Val.hi
4333// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4334// %rcp = G_AMDGPU_RCP_IFLAG %mad
4335// %mul1 = G_FMUL %rcp, 0x5f7ffffc
4336// %mul2 = G_FMUL %mul1, 2**(-32)
4337// %trunc = G_INTRINSIC_TRUNC %mul2
4338// %mad2 = G_FMAD %trunc, -(2**32), %mul1
4339// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
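// A rough host-side model of the sequence above (illustrative sketch only;
// assumes round-to-nearest f32 arithmetic and ignores the IFLAG semantics):
//   float mad  = (float)hi * 0x1.0p+32f + (float)lo;   // ~ (float)Val
//   float rcp  = 1.0f / mad;
//   float mul1 = rcp * 0x1.fffff8p+63f;                // ~ 2^64 / Val
//   float hiE  = truncf(mul1 * 0x1.0p-32f);
//   float loE  = mul1 - hiE * 0x1.0p+32f;              // fmad with -(2**32)
//   // result: {(uint32_t)loE, (uint32_t)hiE}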
4340static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4341 Register Val) {
4342 const LLT S32 = LLT::scalar(32);
4343 auto Unmerge = B.buildUnmerge(S32, Val);
4344
4345 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4346 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4347
4348 auto Mad = B.buildFMAD(
4349 S32, CvtHi, // 2**32
4350 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4351
4352 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4353 auto Mul1 = B.buildFMul(
4354 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4355
4356 // 2**(-32)
4357 auto Mul2 = B.buildFMul(
4358 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4359 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4360
4361 // -(2**32)
4362 auto Mad2 = B.buildFMAD(
4363 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4364 Mul1);
4365
4366 auto ResultLo = B.buildFPTOUI(S32, Mad2);
4367 auto ResultHi = B.buildFPTOUI(S32, Trunc);
4368
4369 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4370}
4371
4372void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4373 Register DstDivReg,
4374 Register DstRemReg,
4375 Register Numer,
4376 Register Denom) const {
4377 const LLT S32 = LLT::scalar(32);
4378 const LLT S64 = LLT::scalar(64);
4379 const LLT S1 = LLT::scalar(1);
4380 Register RcpLo, RcpHi;
4381
4382 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4383
4384 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4385
4386 auto Zero64 = B.buildConstant(S64, 0);
4387 auto NegDenom = B.buildSub(S64, Zero64, Denom);
4388
4389 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4390 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4391
4392 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4393 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4394 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4395
4396 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4397 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4398 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4399
4400 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4401 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4402 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4403 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4404 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4405
4406 auto Zero32 = B.buildConstant(S32, 0);
4407 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4408 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4409 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4410
4411 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4412 Register NumerLo = UnmergeNumer.getReg(0);
4413 Register NumerHi = UnmergeNumer.getReg(1);
4414
4415 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4416 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4417 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4418 Register Mul3_Lo = UnmergeMul3.getReg(0);
4419 Register Mul3_Hi = UnmergeMul3.getReg(1);
4420 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4421 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4422 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4423 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4424
4425 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4426 Register DenomLo = UnmergeDenom.getReg(0);
4427 Register DenomHi = UnmergeDenom.getReg(1);
4428
4429 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4430 auto C1 = B.buildSExt(S32, CmpHi);
4431
4432 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4433 auto C2 = B.buildSExt(S32, CmpLo);
4434
4435 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4436 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4437
4438 // TODO: Here and below portions of the code can be enclosed into if/endif.
4439 // Currently control flow is unconditional and we have 4 selects after
4440 // potential endif to substitute PHIs.
4441
4442 // if C3 != 0 ...
4443 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4444 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4445 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4446 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4447
4448 auto One64 = B.buildConstant(S64, 1);
4449 auto Add3 = B.buildAdd(S64, MulHi3, One64);
4450
4451 auto C4 =
4452 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4453 auto C5 =
4454 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4455 auto C6 = B.buildSelect(
4456 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4457
4458 // if (C6 != 0)
4459 auto Add4 = B.buildAdd(S64, Add3, One64);
4460 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4461
4462 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4463 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4464 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4465
4466 // endif C6
4467 // endif C3
4468
4469 if (DstDivReg) {
4470 auto Sel1 = B.buildSelect(
4471 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4472 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4473 Sel1, MulHi3);
4474 }
4475
4476 if (DstRemReg) {
4477 auto Sel2 = B.buildSelect(
4478 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4479 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4480 Sel2, Sub1);
4481 }
4482}
4483
4484bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4485 MachineRegisterInfo &MRI,
4486 MachineIRBuilder &B) const {
4487 Register DstDivReg, DstRemReg;
4488 switch (MI.getOpcode()) {
4489 default:
4490 llvm_unreachable("Unexpected opcode!");
4491 case AMDGPU::G_UDIV: {
4492 DstDivReg = MI.getOperand(0).getReg();
4493 break;
4494 }
4495 case AMDGPU::G_UREM: {
4496 DstRemReg = MI.getOperand(0).getReg();
4497 break;
4498 }
4499 case AMDGPU::G_UDIVREM: {
4500 DstDivReg = MI.getOperand(0).getReg();
4501 DstRemReg = MI.getOperand(1).getReg();
4502 break;
4503 }
4504 }
4505
4506 const LLT S64 = LLT::scalar(64);
4507 const LLT S32 = LLT::scalar(32);
4508 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4509 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4510 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4511 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4512
4513 if (Ty == S32)
4514 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4515 else if (Ty == S64)
4516 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4517 else
4518 return false;
4519
4520 MI.eraseFromParent();
4521 return true;
4522}
4523
4524bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4525 MachineRegisterInfo &MRI,
4526 MachineIRBuilder &B) const {
4527 const LLT S64 = LLT::scalar(64);
4528 const LLT S32 = LLT::scalar(32);
4529
4530 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4531 if (Ty != S32 && Ty != S64)
4532 return false;
4533
4534 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4535 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4536 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4537
4538 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4539 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
4540 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
4541
4542 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
4543 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
4544
4545 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
4546 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4547
4548 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4549 switch (MI.getOpcode()) {
4550 default:
4551 llvm_unreachable("Unexpected opcode!");
4552 case AMDGPU::G_SDIV: {
4553 DstDivReg = MI.getOperand(0).getReg();
4554 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4555 break;
4556 }
4557 case AMDGPU::G_SREM: {
4558 DstRemReg = MI.getOperand(0).getReg();
4559 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4560 break;
4561 }
4562 case AMDGPU::G_SDIVREM: {
4563 DstDivReg = MI.getOperand(0).getReg();
4564 DstRemReg = MI.getOperand(1).getReg();
4565 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4566 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4567 break;
4568 }
4569 }
4570
4571 if (Ty == S32)
4572 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4573 else
4574 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4575
4576 if (DstDivReg) {
4577 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4578 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4579 B.buildSub(DstDivReg, SignXor, Sign);
4580 }
4581
4582 if (DstRemReg) {
4583 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4584 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4585 B.buildSub(DstRemReg, SignXor, Sign);
4586 }
4587
4588 MI.eraseFromParent();
4589 return true;
4590}
4591
4592bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4593 MachineRegisterInfo &MRI,
4594 MachineIRBuilder &B) const {
4595 Register Res = MI.getOperand(0).getReg();
4596 Register LHS = MI.getOperand(1).getReg();
4597 Register RHS = MI.getOperand(2).getReg();
4598 uint16_t Flags = MI.getFlags();
4599 LLT ResTy = MRI.getType(Res);
4600
4601 const MachineFunction &MF = B.getMF();
4602 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4603 MF.getTarget().Options.UnsafeFPMath;
4604
4605 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
4606 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4607 return false;
4608
4609 // v_rcp_f32 and v_rsq_f32 do not support denormals and, according to
4610 // the CI documentation, have a worst-case error of 1 ulp.
4611 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4612 // use it as long as we aren't trying to use denormals.
4613 //
4614 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
4615
4616 // 1 / x -> RCP(x)
4617 if (CLHS->isExactlyValue(1.0)) {
4618 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4619 .addUse(RHS)
4620 .setMIFlags(Flags);
4621
4622 MI.eraseFromParent();
4623 return true;
4624 }
4625
4626 // -1 / x -> RCP( FNEG(x) )
4627 if (CLHS->isExactlyValue(-1.0)) {
4628 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4629 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4630 .addUse(FNeg.getReg(0))
4631 .setMIFlags(Flags);
4632
4633 MI.eraseFromParent();
4634 return true;
4635 }
4636 }
4637
4638 // For f16 require afn or arcp.
4639 // For f32 require afn.
4640 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4641 !MI.getFlag(MachineInstr::FmArcp)))
4642 return false;
4643
4644 // x / y -> x * (1.0 / y)
4645 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4646 .addUse(RHS)
4647 .setMIFlags(Flags);
4648 B.buildFMul(Res, LHS, RCP, Flags);
4649
4650 MI.eraseFromParent();
4651 return true;
4652}
4653
4654bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4655 MachineRegisterInfo &MRI,
4656 MachineIRBuilder &B) const {
4657 Register Res = MI.getOperand(0).getReg();
4658 Register X = MI.getOperand(1).getReg();
4659 Register Y = MI.getOperand(2).getReg();
4660 uint16_t Flags = MI.getFlags();
4661 LLT ResTy = MRI.getType(Res);
4662
4663 const MachineFunction &MF = B.getMF();
4664 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4665 MI.getFlag(MachineInstr::FmAfn);
4666
4667 if (!AllowInaccurateRcp)
4668 return false;
4669
4670 auto NegY = B.buildFNeg(ResTy, Y);
4671 auto One = B.buildFConstant(ResTy, 1.0);
4672
4673 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4674 .addUse(Y)
4675 .setMIFlags(Flags);
4676
4677 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4678 R = B.buildFMA(ResTy, Tmp0, R, R);
4679
4680 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4681 R = B.buildFMA(ResTy, Tmp1, R, R);
4682
4683 auto Ret = B.buildFMul(ResTy, X, R);
4684 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4685
4686 B.buildFMA(Res, Tmp2, R, Ret);
4687 MI.eraseFromParent();
4688 return true;
4689}
4690
4691bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4692 MachineRegisterInfo &MRI,
4693 MachineIRBuilder &B) const {
4694 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4695 return true;
4696
4697 Register Res = MI.getOperand(0).getReg();
4698 Register LHS = MI.getOperand(1).getReg();
4699 Register RHS = MI.getOperand(2).getReg();
4700
4701 uint16_t Flags = MI.getFlags();
4702
4703 LLT S16 = LLT::scalar(16);
4704 LLT S32 = LLT::scalar(32);
4705
4706 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4707 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4708
4709 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4710 .addUse(RHSExt.getReg(0))
4711 .setMIFlags(Flags);
4712
4713 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
4714 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
4715
4716 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4717 .addUse(RDst.getReg(0))
4718 .addUse(RHS)
4719 .addUse(LHS)
4720 .setMIFlags(Flags);
4721
4722 MI.eraseFromParent();
4723 return true;
4724}
4725
4726static const unsigned SPDenormModeBitField =
4729
4730// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4731// to enable denorm mode. When 'Enable' is false, disable denorm mode.
4732static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4733 const GCNSubtarget &ST,
4734 SIModeRegisterDefaults Mode) {
4735 // Set SP denorm mode to this value.
4736 unsigned SPDenormMode =
4737 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4738
4739 if (ST.hasDenormModeInst()) {
4740 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
4741 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4742
4743 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4744 B.buildInstr(AMDGPU::S_DENORM_MODE)
4745 .addImm(NewDenormModeValue);
4746
4747 } else {
4748 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4749 .addImm(SPDenormMode)
4750 .addImm(SPDenormModeBitField);
4751 }
4752}
4753
4754bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4755 MachineRegisterInfo &MRI,
4756 MachineIRBuilder &B) const {
4757 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4758 return true;
4759
4760 Register Res = MI.getOperand(0).getReg();
4761 Register LHS = MI.getOperand(1).getReg();
4762 Register RHS = MI.getOperand(2).getReg();
4763 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4764 SIModeRegisterDefaults Mode = MFI->getMode();
4765
4766 uint16_t Flags = MI.getFlags();
4767
4768 LLT S32 = LLT::scalar(32);
4769 LLT S1 = LLT::scalar(1);
4770
4771 auto One = B.buildFConstant(S32, 1.0f);
4772
4773 auto DenominatorScaled =
4774 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4775 .addUse(LHS)
4776 .addUse(RHS)
4777 .addImm(0)
4778 .setMIFlags(Flags);
4779 auto NumeratorScaled =
4780 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4781 .addUse(LHS)
4782 .addUse(RHS)
4783 .addImm(1)
4784 .setMIFlags(Flags);
4785
4786 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4787 .addUse(DenominatorScaled.getReg(0))
4788 .setMIFlags(Flags);
4789 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
4790
4791 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
4792 const bool HasDynamicDenormals =
4793 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
4794 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
4795
4796 Register SavedSPDenormMode;
4797 if (!PreservesDenormals) {
4798 if (HasDynamicDenormals) {
4799 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4800 B.buildInstr(AMDGPU::S_GETREG_B32)
4801 .addDef(SavedSPDenormMode)
4802 .addImm(SPDenormModeBitField);
4803 }
4804 toggleSPDenormMode(true, B, ST, Mode);
4805 }
4806
4807 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
4808 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
4809 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
4810 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
4811 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
4812 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
4813
4814 if (!PreservesDenormals) {
4815 if (HasDynamicDenormals) {
4816 assert(SavedSPDenormMode);
4817 B.buildInstr(AMDGPU::S_SETREG_B32)
4818 .addReg(SavedSPDenormMode)
4819 .addImm(SPDenormModeBitField);
4820 } else
4821 toggleSPDenormMode(false, B, ST, Mode);
4822 }
4823
4824 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
4825 .addUse(Fma4.getReg(0))
4826 .addUse(Fma1.getReg(0))
4827 .addUse(Fma3.getReg(0))
4828 .addUse(NumeratorScaled.getReg(1))
4829 .setMIFlags(Flags);
4830
4831 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4832 .addUse(Fmas.getReg(0))
4833 .addUse(RHS)
4834 .addUse(LHS)
4835 .setMIFlags(Flags);
4836
4837 MI.eraseFromParent();
4838 return true;
4839}
4840
4841bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
4842 MachineRegisterInfo &MRI,
4843 MachineIRBuilder &B) const {
4844 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
4845 return true;
4846
4847 Register Res = MI.getOperand(0).getReg();
4848 Register LHS = MI.getOperand(1).getReg();
4849 Register RHS = MI.getOperand(2).getReg();
4850
4851 uint16_t Flags = MI.getFlags();
4852
4853 LLT S64 = LLT::scalar(64);
4854 LLT S1 = LLT::scalar(1);
4855
4856 auto One = B.buildFConstant(S64, 1.0);
4857
4858 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
4859 .addUse(LHS)
4860 .addUse(RHS)
4861 .addImm(0)
4862 .setMIFlags(Flags);
4863
4864 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
4865
4866 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
4867 .addUse(DivScale0.getReg(0))
4868 .setMIFlags(Flags);
4869
4870 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
4871 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
4872 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
4873
4874 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
4875 .addUse(LHS)
4876 .addUse(RHS)
4877 .addImm(1)
4878 .setMIFlags(Flags);
4879
4880 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
4881 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
4882 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
4883
4884 Register Scale;
4885 if (!ST.hasUsableDivScaleConditionOutput()) {
4886 // Workaround a hardware bug on SI where the condition output from div_scale
4887 // is not usable.
4888
4889 LLT S32 = LLT::scalar(32);
4890
4891 auto NumUnmerge = B.buildUnmerge(S32, LHS);
4892 auto DenUnmerge = B.buildUnmerge(S32, RHS);
4893 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4894 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4895
4896