AMDGPULegalizerInfo.cpp
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 #include "llvm/IR/IntrinsicsR600.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
41  "amdgpu-global-isel-new-legality",
42  cl::desc("Use GlobalISel desired legality, rather than try to use"
43  "rules compatible with selection patterns"),
44  cl::init(false),
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
49 // Round the number of elements to the next power of two elements
50 static LLT getPow2VectorType(LLT Ty) {
51  unsigned NElts = Ty.getNumElements();
52  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
54 }
55 
56 // Round the number of bits to the next power of two bits
57 static LLT getPow2ScalarType(LLT Ty) {
58  unsigned Bits = Ty.getSizeInBits();
59  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60  return LLT::scalar(Pow2Bits);
61 }
62 
63 /// \returns true if this is an odd-sized vector which should be widened by adding an
64 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
65 /// excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67  return [=](const LegalityQuery &Query) {
68  const LLT Ty = Query.Types[TypeIdx];
69  if (!Ty.isVector())
70  return false;
71 
72  const LLT EltTy = Ty.getElementType();
73  const unsigned EltSize = EltTy.getSizeInBits();
74  return Ty.getNumElements() % 2 != 0 &&
75  EltSize > 1 && EltSize < 32 &&
76  Ty.getSizeInBits() % 32 != 0;
77  };
78 }
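// Illustrative example (editorial, not part of the original source): <3 x s16>
// has an odd element count, 16-bit elements, and 48 total bits, so this
// predicate fires and the type is widened to <4 x s16> by oneMoreElement()
// below. <4 x s16> (64 bits) and <3 x s32> (a multiple of 32 bits) are left
// alone, and <3 x s1> is excluded and scalarized instead.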
79 
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81  return [=](const LegalityQuery &Query) {
82  const LLT Ty = Query.Types[TypeIdx];
83  return Ty.getSizeInBits() % 32 == 0;
84  };
85 }
86 
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88  return [=](const LegalityQuery &Query) {
89  const LLT Ty = Query.Types[TypeIdx];
90  const LLT EltTy = Ty.getScalarType();
91  return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92  };
93 }
94 
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96  return [=](const LegalityQuery &Query) {
97  const LLT Ty = Query.Types[TypeIdx];
98  const LLT EltTy = Ty.getElementType();
99  return std::make_pair(TypeIdx,
100  LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
101  };
102 }
103 
104 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
105  return [=](const LegalityQuery &Query) {
106  const LLT Ty = Query.Types[TypeIdx];
107  const LLT EltTy = Ty.getElementType();
108  unsigned Size = Ty.getSizeInBits();
109  unsigned Pieces = (Size + 63) / 64;
110  unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
111  return std::make_pair(
112  TypeIdx,
113  LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
114  };
115 }
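// Worked example (illustrative, not from the original file): for <6 x s32>,
// Size = 192, Pieces = 3 and NewNumElts = (6 + 1) / 3 = 2, so the mutation
// requests <2 x s32>, i.e. 64-bit pieces. For <3 x s64>, Pieces = 3 and
// NewNumElts = 1, which scalarOrVector() collapses to a plain s64.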
116 
117 // Increase the number of vector elements to reach the next multiple of 32-bit
118 // type.
119 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
120  return [=](const LegalityQuery &Query) {
121  const LLT Ty = Query.Types[TypeIdx];
122 
123  const LLT EltTy = Ty.getElementType();
124  const int Size = Ty.getSizeInBits();
125  const int EltSize = EltTy.getSizeInBits();
126  const int NextMul32 = (Size + 31) / 32;
127 
128  assert(EltSize < 32);
129 
130  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
131  return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
132  };
133 }
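// Worked example (illustrative): <3 x s8> is 24 bits, so NextMul32 = 1 and
// NewNumElts = (32 + 7) / 8 = 4, giving <4 x s8> (32 bits). Likewise
// <5 x s16> (80 bits) becomes <6 x s16> (96 bits), the next multiple of 32.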
134 
135 static LLT getBitcastRegisterType(const LLT Ty) {
136  const unsigned Size = Ty.getSizeInBits();
137 
138  if (Size <= 32) {
139  // <2 x s8> -> s16
140  // <4 x s8> -> s32
141  return LLT::scalar(Size);
142  }
143 
144  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
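// Further examples (illustrative): <4 x s16> (64 bits) maps to <2 x s32>,
// <6 x s16> (96 bits) maps to <3 x s32>, and a 32-bit-or-smaller type such as
// <4 x s8> becomes a single scalar, matching the comments above.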
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148  return [=](const LegalityQuery &Query) {
149  const LLT Ty = Query.Types[TypeIdx];
150  return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151  };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155  return [=](const LegalityQuery &Query) {
156  const LLT Ty = Query.Types[TypeIdx];
157  unsigned Size = Ty.getSizeInBits();
158  assert(Size % 32 == 0);
159  return std::make_pair(
160  TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161  };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165  return [=](const LegalityQuery &Query) {
166  const LLT QueryTy = Query.Types[TypeIdx];
167  return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168  };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172  return [=](const LegalityQuery &Query) {
173  const LLT QueryTy = Query.Types[TypeIdx];
174  return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175  };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179  return [=](const LegalityQuery &Query) {
180  const LLT QueryTy = Query.Types[TypeIdx];
181  return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182  };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186  return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190  const int EltSize = EltTy.getSizeInBits();
191  return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195  const int EltSize = Ty.getElementType().getSizeInBits();
196  return EltSize == 32 || EltSize == 64 ||
197  (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198  EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202  if (!isRegisterSize(Ty.getSizeInBits()))
203  return false;
204 
205  if (Ty.isVector())
206  return isRegisterVectorType(Ty);
207 
208  return true;
209 }
210 
211 // Any combination of 32- or 64-bit elements up to the maximum register size, and
212 // multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214  return [=](const LegalityQuery &Query) {
215  return isRegisterType(Query.Types[TypeIdx]);
216  };
217 }
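// Illustrative uses of the predicate above (editorial note, not from the
// original source): s96, <2 x s64> and <4 x s16> are register types
// (multiples of 32 bits with allowed element sizes), while <3 x s16>
// (48 bits) and <3 x s8> (24 bits) are not and will be widened or bitcast by
// the rules that use this predicate.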
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220  return [=](const LegalityQuery &Query) {
221  const LLT QueryTy = Query.Types[TypeIdx];
222  if (!QueryTy.isVector())
223  return false;
224  const LLT EltTy = QueryTy.getElementType();
225  return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226  };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32-bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232  return [=](const LegalityQuery &Query) {
233  const LLT Ty = Query.Types[TypeIdx];
234  return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235  Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236  };
237 }
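// Example (illustrative): a G_LOAD producing s64 from a 32-bit memory access,
// or a G_STORE truncating s64 data to 32 bits of memory, matches this
// predicate and is later narrowed to an s32 register type; a plain s64
// load/store of a full 64 bits of memory does not.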
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243  bool IsLoad) {
244  switch (AS) {
245  case AMDGPUAS::PRIVATE_ADDRESS:
246  // FIXME: Private element size.
247  return ST.enableFlatScratch() ? 128 : 32;
248  case AMDGPUAS::LOCAL_ADDRESS:
249  return ST.useDS128() ? 128 : 64;
250  case AMDGPUAS::GLOBAL_ADDRESS:
251  case AMDGPUAS::CONSTANT_ADDRESS:
252  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253  // Treat constant and global as identical. SMRD loads are sometimes usable for
254  // global loads (ideally constant address space should be eliminated)
255  // depending on the context. Legality cannot be context dependent, but
256  // RegBankSelect can split the load as necessary depending on the pointer
257  // register bank/uniformity and if the memory is invariant or not written in a
258  // kernel.
259  return IsLoad ? 512 : 128;
260  default:
261  // Flat addresses may contextually need to be split to 32-bit parts if they
262  // may alias scratch depending on the subtarget.
263  return 128;
264  }
265 }
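// Example (illustrative, assuming a subtarget without flat scratch and
// without DS128): a 64-bit private (scratch) access is capped at 32 bits and
// must be split into dwords, an LDS access is capped at 64 bits, and a
// uniform constant load may be up to 512 bits.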
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268  const LegalityQuery &Query) {
269  const LLT Ty = Query.Types[0];
270 
271  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274  unsigned RegSize = Ty.getSizeInBits();
275  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
277  unsigned AS = Query.Types[1].getAddressSpace();
278 
279  // All of these need to be custom lowered to cast the pointer operand.
280  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281  return false;
282 
283  // Do not handle extending vector loads.
284  if (Ty.isVector() && MemSize != RegSize)
285  return false;
286 
287  // TODO: We should be able to widen loads if the alignment is high enough, but
288  // we also need to modify the memory access size.
289 #if 0
290  // Accept widening loads based on alignment.
291  if (IsLoad && MemSize < Size)
292  MemSize = std::max(MemSize, Align);
293 #endif
294 
295  // Only 1-byte and 2-byte to 32-bit extloads are valid.
296  if (MemSize != RegSize && RegSize != 32)
297  return false;
298 
299  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300  return false;
301 
302  switch (MemSize) {
303  case 8:
304  case 16:
305  case 32:
306  case 64:
307  case 128:
308  break;
309  case 96:
310  if (!ST.hasDwordx3LoadStores())
311  return false;
312  break;
313  case 256:
314  case 512:
315  // These may contextually need to be broken down.
316  break;
317  default:
318  return false;
319  }
320 
321  assert(RegSize >= MemSize);
322 
323  if (AlignBits < MemSize) {
324  const SITargetLowering *TLI = ST.getTargetLowering();
325  if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326  Align(AlignBits / 8)))
327  return false;
328  }
329 
330  return true;
331 }
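// Illustrative consequences of the checks above (editorial note): an s32
// extending load from 8 or 16 bits of global memory is accepted, an s64
// extending load from 32 bits is not (only extloads into 32-bit registers are
// allowed), and a 96-bit access is accepted only when the subtarget has
// dwordx3 load/store instructions.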
332 
333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
334 // workaround this. Eventually it should ignore the type for loads and only care
335 // about the size. Return true in cases where we will workaround this for now by
336 // bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338  if (EnableNewLegality)
339  return false;
340 
341  const unsigned Size = Ty.getSizeInBits();
342  if (Size <= 64)
343  return false;
344  if (!Ty.isVector())
345  return true;
346 
347  LLT EltTy = Ty.getElementType();
348  if (EltTy.isPointer())
349  return true;
350 
351  unsigned EltSize = EltTy.getSizeInBits();
352  return EltSize != 32 && EltSize != 64;
353 }
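// Example (illustrative): with the workaround enabled (the default), an
// <8 x s16> load is reported illegal here, so when the load/store rules are
// built later in the constructor it takes the bitcast path instead and is
// loaded as <4 x s32>, because the current selector only handles 32- and
// 64-bit element vectors for memory operations.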
354 
355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356  const LLT Ty = Query.Types[0];
357  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358  !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364  const LLT MemTy) {
365  const unsigned MemSizeInBits = MemTy.getSizeInBits();
366  const unsigned Size = Ty.getSizeInBits();
367  if (Size != MemSizeInBits)
368  return Size <= 32 && Ty.isVector();
369 
370  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371  return true;
372 
373  // Don't try to handle bitcasting vector ext loads for now.
374  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375  (Size <= 32 || isRegisterSize(Size)) &&
376  !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
379 /// Return true if we should legalize a load by widening an odd sized memory
380 /// access up to the alignment. Note it is the memory access itself that
381 /// changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383  uint64_t AlignInBits, unsigned AddrSpace,
384  unsigned Opcode) {
385  unsigned SizeInBits = MemoryTy.getSizeInBits();
386  // We don't want to widen cases that are naturally legal.
387  if (isPowerOf2_32(SizeInBits))
388  return false;
389 
390  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391  // end up widening these for a scalar load during RegBankSelect, since there
392  // aren't 96-bit scalar loads.
393  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394  return false;
395 
396  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397  return false;
398 
399  // A load is known dereferenceable up to the alignment, so it's legal to widen
400  // to it.
401  //
402  // TODO: Could check dereferenceable for less aligned cases.
403  unsigned RoundedSize = NextPowerOf2(SizeInBits);
404  if (AlignInBits < RoundedSize)
405  return false;
406 
407  // Do not widen if it would introduce a slow unaligned load.
408  const SITargetLowering *TLI = ST.getTargetLowering();
409  bool Fast = false;
410  return TLI->allowsMisalignedMemoryAccessesImpl(
411  RoundedSize, AddrSpace, Align(AlignInBits / 8),
412  MachineMemOperand::MOLoad, &Fast) &&
413  Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417  unsigned Opcode) {
418  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419  return false;
420 
421  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422  Query.MMODescrs[0].AlignInBits,
423  Query.Types[1].getAddressSpace(), Opcode);
424 }
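// Worked example (illustrative): a non-atomic 96-bit load that is 128-bit
// aligned, in an address space allowing 128-bit accesses, on a subtarget
// without dwordx3 load/store, is widened to a 128-bit load (the extra bytes
// are known dereferenceable up to the alignment); the same load with only
// 32-bit alignment is left to be narrowed instead.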
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427  const GCNTargetMachine &TM)
428  : ST(ST_) {
429  using namespace TargetOpcode;
430 
431  auto GetAddrSpacePtr = [&TM](unsigned AS) {
432  return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433  };
434 
435  const LLT S1 = LLT::scalar(1);
436  const LLT S8 = LLT::scalar(8);
437  const LLT S16 = LLT::scalar(16);
438  const LLT S32 = LLT::scalar(32);
439  const LLT S64 = LLT::scalar(64);
440  const LLT S128 = LLT::scalar(128);
441  const LLT S256 = LLT::scalar(256);
442  const LLT S512 = LLT::scalar(512);
443  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445  const LLT V2S8 = LLT::fixed_vector(2, 8);
446  const LLT V2S16 = LLT::fixed_vector(2, 16);
447  const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449  const LLT V2S32 = LLT::fixed_vector(2, 32);
450  const LLT V3S32 = LLT::fixed_vector(3, 32);
451  const LLT V4S32 = LLT::fixed_vector(4, 32);
452  const LLT V5S32 = LLT::fixed_vector(5, 32);
453  const LLT V6S32 = LLT::fixed_vector(6, 32);
454  const LLT V7S32 = LLT::fixed_vector(7, 32);
455  const LLT V8S32 = LLT::fixed_vector(8, 32);
456  const LLT V9S32 = LLT::fixed_vector(9, 32);
457  const LLT V10S32 = LLT::fixed_vector(10, 32);
458  const LLT V11S32 = LLT::fixed_vector(11, 32);
459  const LLT V12S32 = LLT::fixed_vector(12, 32);
460  const LLT V13S32 = LLT::fixed_vector(13, 32);
461  const LLT V14S32 = LLT::fixed_vector(14, 32);
462  const LLT V15S32 = LLT::fixed_vector(15, 32);
463  const LLT V16S32 = LLT::fixed_vector(16, 32);
464  const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466  const LLT V2S64 = LLT::fixed_vector(2, 64);
467  const LLT V3S64 = LLT::fixed_vector(3, 64);
468  const LLT V4S64 = LLT::fixed_vector(4, 64);
469  const LLT V5S64 = LLT::fixed_vector(5, 64);
470  const LLT V6S64 = LLT::fixed_vector(6, 64);
471  const LLT V7S64 = LLT::fixed_vector(7, 64);
472  const LLT V8S64 = LLT::fixed_vector(8, 64);
473  const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475  std::initializer_list<LLT> AllS32Vectors =
476  {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477  V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478  std::initializer_list<LLT> AllS64Vectors =
479  {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489  const LLT CodePtr = FlatPtr;
490 
491  const std::initializer_list<LLT> AddrSpaces64 = {
492  GlobalPtr, ConstantPtr, FlatPtr
493  };
494 
495  const std::initializer_list<LLT> AddrSpaces32 = {
496  LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497  };
498 
499  const std::initializer_list<LLT> FPTypesBase = {
500  S32, S64
501  };
502 
503  const std::initializer_list<LLT> FPTypes16 = {
504  S32, S64, S16
505  };
506 
507  const std::initializer_list<LLT> FPTypesPK16 = {
508  S32, S64, S16, V2S16
509  };
510 
511  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513  // s1 for VCC branches, s32 for SCC branches.
514  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517  // elements for v3s16
518  getActionDefinitionsBuilder(G_PHI)
519  .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520  .legalFor(AllS32Vectors)
521  .legalFor(AllS64Vectors)
522  .legalFor(AddrSpaces64)
523  .legalFor(AddrSpaces32)
524  .legalIf(isPointer(0))
525  .clampScalar(0, S16, S256)
526  .widenScalarToNextPow2(0, 32)
527  .clampMaxNumElements(0, S32, 16)
528  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529  .scalarize(0);
530 
531  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532  // Full set of gfx9 features.
533  getActionDefinitionsBuilder({G_ADD, G_SUB})
534  .legalFor({S32, S16, V2S16})
535  .clampMaxNumElementsStrict(0, S16, 2)
536  .scalarize(0)
537  .minScalar(0, S16)
538  .widenScalarToNextMultipleOf(0, 32)
539  .maxScalar(0, S32);
540 
541  getActionDefinitionsBuilder(G_MUL)
542  .legalFor({S32, S16, V2S16})
543  .clampMaxNumElementsStrict(0, S16, 2)
544  .scalarize(0)
545  .minScalar(0, S16)
546  .widenScalarToNextMultipleOf(0, 32)
547  .custom();
548  assert(ST.hasMad64_32());
549 
550  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
551  .legalFor({S32, S16, V2S16}) // Clamp modifier
552  .minScalarOrElt(0, S16)
553  .clampMaxNumElementsStrict(0, S16, 2)
554  .scalarize(0)
555  .widenScalarToNextPow2(0, 32)
556  .lower();
557  } else if (ST.has16BitInsts()) {
558  getActionDefinitionsBuilder({G_ADD, G_SUB})
559  .legalFor({S32, S16})
560  .minScalar(0, S16)
561  .widenScalarToNextMultipleOf(0, 32)
562  .maxScalar(0, S32)
563  .scalarize(0);
564 
565  getActionDefinitionsBuilder(G_MUL)
566  .legalFor({S32, S16})
567  .scalarize(0)
568  .minScalar(0, S16)
569  .widenScalarToNextMultipleOf(0, 32)
570  .custom();
571  assert(ST.hasMad64_32());
572 
573  // Technically the saturating operations require clamp bit support, but this
574  // was introduced at the same time as 16-bit operations.
575  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
576  .legalFor({S32, S16}) // Clamp modifier
577  .minScalar(0, S16)
578  .scalarize(0)
579  .widenScalarToNextPow2(0, 16)
580  .lower();
581 
582  // We're just lowering this, but it helps get a better result to try to
583  // coerce to the desired type first.
584  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
585  .minScalar(0, S16)
586  .scalarize(0)
587  .lower();
588  } else {
589  getActionDefinitionsBuilder({G_ADD, G_SUB})
590  .legalFor({S32})
591  .widenScalarToNextMultipleOf(0, 32)
592  .clampScalar(0, S32, S32)
593  .scalarize(0);
594 
595  auto &Mul = getActionDefinitionsBuilder(G_MUL)
596  .legalFor({S32})
597  .scalarize(0)
598  .minScalar(0, S32)
599  .widenScalarToNextMultipleOf(0, 32);
600 
601  if (ST.hasMad64_32())
602  Mul.custom();
603  else
604  Mul.maxScalar(0, S32);
605 
606  if (ST.hasIntClamp()) {
607  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
608  .legalFor({S32}) // Clamp modifier.
609  .scalarize(0)
610  .minScalarOrElt(0, S32)
611  .lower();
612  } else {
613  // Clamp bit support was added in VI, along with 16-bit operations.
614  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
615  .minScalar(0, S32)
616  .scalarize(0)
617  .lower();
618  }
619 
620  // FIXME: DAG expansion gets better results. The widening uses the smaller
621  // range values and goes for the min/max lowering directly.
622  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
623  .minScalar(0, S32)
624  .scalarize(0)
625  .lower();
626  }
627 
628  getActionDefinitionsBuilder(
629  {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
630  .customFor({S32, S64})
631  .clampScalar(0, S32, S64)
632  .widenScalarToNextPow2(0, 32)
633  .scalarize(0);
634 
635  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
636  .legalFor({S32})
637  .maxScalar(0, S32);
638 
639  if (ST.hasVOP3PInsts()) {
640  Mulh
641  .clampMaxNumElements(0, S8, 2)
642  .lowerFor({V2S8});
643  }
644 
645  Mulh
646  .scalarize(0)
647  .lower();
648 
649  // Report legal for any types we can handle anywhere. For the cases only legal
650  // on the SALU, RegBankSelect will be able to re-legalize.
651  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
652  .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
653  .clampScalar(0, S32, S64)
654  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
655  .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
656  .widenScalarToNextPow2(0)
657  .scalarize(0);
658 
659  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
660  G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
661  .legalFor({{S32, S1}, {S32, S32}})
662  .minScalar(0, S32)
663  .scalarize(0)
664  .lower();
665 
666  getActionDefinitionsBuilder(G_BITCAST)
667  // Don't worry about the size constraint.
668  .legalIf(all(isRegisterType(0), isRegisterType(1)))
669  .lower();
670 
671 
672  getActionDefinitionsBuilder(G_CONSTANT)
673  .legalFor({S1, S32, S64, S16, GlobalPtr,
674  LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
675  .legalIf(isPointer(0))
676  .clampScalar(0, S32, S64)
677  .widenScalarToNextPow2(0);
678 
679  getActionDefinitionsBuilder(G_FCONSTANT)
680  .legalFor({S32, S64, S16})
681  .clampScalar(0, S16, S64);
682 
683  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
684  .legalIf(isRegisterType(0))
685  // s1 and s16 are special cases because they have legal operations on
686  // them, but don't really occupy registers in the normal way.
687  .legalFor({S1, S16})
688  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
689  .clampScalarOrElt(0, S32, MaxScalar)
690  .widenScalarToNextPow2(0, 32)
691  .clampMaxNumElements(0, S32, 16);
692 
693  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
694 
695  // If the amount is divergent, we have to do a wave reduction to get the
696  // maximum value, so this is expanded during RegBankSelect.
697  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
698  .legalFor({{PrivatePtr, S32}});
699 
700  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
701  .customIf(typeIsNot(0, PrivatePtr));
702 
703  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
704 
705  auto &FPOpActions = getActionDefinitionsBuilder(
706  { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
707  .legalFor({S32, S64});
708  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
709  .customFor({S32, S64});
710  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
711  .customFor({S32, S64});
712 
713  if (ST.has16BitInsts()) {
714  if (ST.hasVOP3PInsts())
715  FPOpActions.legalFor({S16, V2S16});
716  else
717  FPOpActions.legalFor({S16});
718 
719  TrigActions.customFor({S16});
720  FDIVActions.customFor({S16});
721  }
722 
723  auto &MinNumMaxNum = getActionDefinitionsBuilder({
724  G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
725 
726  if (ST.hasVOP3PInsts()) {
727  MinNumMaxNum.customFor(FPTypesPK16)
728  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
729  .clampMaxNumElements(0, S16, 2)
730  .clampScalar(0, S16, S64)
731  .scalarize(0);
732  } else if (ST.has16BitInsts()) {
733  MinNumMaxNum.customFor(FPTypes16)
734  .clampScalar(0, S16, S64)
735  .scalarize(0);
736  } else {
737  MinNumMaxNum.customFor(FPTypesBase)
738  .clampScalar(0, S32, S64)
739  .scalarize(0);
740  }
741 
742  if (ST.hasVOP3PInsts())
743  FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
744 
745  FPOpActions
746  .scalarize(0)
747  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
748 
749  TrigActions
750  .scalarize(0)
751  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
752 
753  FDIVActions
754  .scalarize(0)
755  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
756 
757  getActionDefinitionsBuilder({G_FNEG, G_FABS})
758  .legalFor(FPTypesPK16)
759  .clampMaxNumElementsStrict(0, S16, 2)
760  .scalarize(0)
761  .clampScalar(0, S16, S64);
762 
763  if (ST.has16BitInsts()) {
764  getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
765  .legalFor({S32, S64, S16})
766  .scalarize(0)
767  .clampScalar(0, S16, S64);
768  } else {
769  getActionDefinitionsBuilder(G_FSQRT)
770  .legalFor({S32, S64})
771  .scalarize(0)
772  .clampScalar(0, S32, S64);
773 
774  if (ST.hasFractBug()) {
775  getActionDefinitionsBuilder(G_FFLOOR)
776  .customFor({S64})
777  .legalFor({S32, S64})
778  .scalarize(0)
779  .clampScalar(0, S32, S64);
780  } else {
781  getActionDefinitionsBuilder(G_FFLOOR)
782  .legalFor({S32, S64})
783  .scalarize(0)
784  .clampScalar(0, S32, S64);
785  }
786  }
787 
788  getActionDefinitionsBuilder(G_FPTRUNC)
789  .legalFor({{S32, S64}, {S16, S32}})
790  .scalarize(0)
791  .lower();
792 
793  getActionDefinitionsBuilder(G_FPEXT)
794  .legalFor({{S64, S32}, {S32, S16}})
795  .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
796  .scalarize(0);
797 
798  auto &FSubActions = getActionDefinitionsBuilder(G_FSUB);
799  if (ST.has16BitInsts()) {
800  FSubActions
801  // Use actual fsub instruction
802  .legalFor({S32, S16})
803  // Must use fadd + fneg
804  .lowerFor({S64, V2S16});
805  } else {
806  FSubActions
807  // Use actual fsub instruction
808  .legalFor({S32})
809  // Must use fadd + fneg
810  .lowerFor({S64, S16, V2S16});
811  }
812 
813  FSubActions
814  .scalarize(0)
815  .clampScalar(0, S32, S64);
816 
817  // Whether this is legal depends on the floating point mode for the function.
818  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
819  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
820  FMad.customFor({S32, S16});
821  else if (ST.hasMadMacF32Insts())
822  FMad.customFor({S32});
823  else if (ST.hasMadF16())
824  FMad.customFor({S16});
825  FMad.scalarize(0)
826  .lower();
827 
828  auto &FRem = getActionDefinitionsBuilder(G_FREM);
829  if (ST.has16BitInsts()) {
830  FRem.customFor({S16, S32, S64});
831  } else {
832  FRem.minScalar(0, S32)
833  .customFor({S32, S64});
834  }
835  FRem.scalarize(0);
836 
837  // TODO: Do we need to clamp maximum bitwidth?
838  getActionDefinitionsBuilder(G_TRUNC)
839  .legalIf(isScalar(0))
840  .legalFor({{V2S16, V2S32}})
841  .clampMaxNumElements(0, S16, 2)
842  // Avoid scalarizing in cases that should be truly illegal. In unresolvable
843  // situations (like an invalid implicit use), we don't want to infinite loop
844  // in the legalizer.
845  .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
846  .alwaysLegal();
847 
848  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
849  .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
850  {S32, S1}, {S64, S1}, {S16, S1}})
851  .scalarize(0)
852  .clampScalar(0, S32, S64)
853  .widenScalarToNextPow2(1, 32);
854 
855  // TODO: Split s1->s64 during regbankselect for VALU.
856  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
857  .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
858  .lowerIf(typeIs(1, S1))
859  .customFor({{S32, S64}, {S64, S64}});
860  if (ST.has16BitInsts())
861  IToFP.legalFor({{S16, S16}});
862  IToFP.clampScalar(1, S32, S64)
863  .minScalar(0, S32)
864  .scalarize(0)
865  .widenScalarToNextPow2(1);
866 
867  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
868  .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
869  .customFor({{S64, S32}, {S64, S64}})
870  .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
871  if (ST.has16BitInsts())
872  FPToI.legalFor({{S16, S16}});
873  else
874  FPToI.minScalar(1, S32);
875 
876  FPToI.minScalar(0, S32)
877  .widenScalarToNextPow2(0, 32)
878  .scalarize(0)
879  .lower();
880 
881  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
882  .customFor({S16, S32})
883  .scalarize(0)
884  .lower();
885 
886  // Lower roundeven into G_FRINT
887  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
888  .scalarize(0)
889  .lower();
890 
891  if (ST.has16BitInsts()) {
892  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
893  .legalFor({S16, S32, S64})
894  .clampScalar(0, S16, S64)
895  .scalarize(0);
896  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
897  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
898  .legalFor({S32, S64})
899  .clampScalar(0, S32, S64)
900  .scalarize(0);
901  } else {
902  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
903  .legalFor({S32})
904  .customFor({S64})
905  .clampScalar(0, S32, S64)
906  .scalarize(0);
907  }
908 
909  getActionDefinitionsBuilder(G_PTR_ADD)
910  .legalIf(all(isPointer(0), sameSize(0, 1)))
911  .scalarize(0)
912  .scalarSameSizeAs(1, 0);
913 
914  getActionDefinitionsBuilder(G_PTRMASK)
915  .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
916  .scalarSameSizeAs(1, 0)
917  .scalarize(0);
918 
919  auto &CmpBuilder =
920  getActionDefinitionsBuilder(G_ICMP)
921  // The compare output type differs based on the register bank of the output,
922  // so make both s1 and s32 legal.
923  //
924  // Scalar compares producing output in scc will be promoted to s32, as that
925  // is the allocatable register type that will be needed for the copy from
926  // scc. This will be promoted during RegBankSelect, and we assume something
927  // before that won't try to use s32 result types.
928  //
929  // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
930  // bank.
931  .legalForCartesianProduct(
932  {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
933  .legalForCartesianProduct(
934  {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
935  if (ST.has16BitInsts()) {
936  CmpBuilder.legalFor({{S1, S16}});
937  }
938 
939  CmpBuilder
940  .widenScalarToNextPow2(1)
941  .clampScalar(1, S32, S64)
942  .scalarize(0)
943  .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
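// Illustrative MIR (editorial sketch, not from a real test): after
// legalization both result types can appear; RegBankSelect later decides
// between them based on uniformity:
//   %c:_(s1)  = G_ICMP intpred(eq), %a(s32), %b(s32)   ; VCC-bank compare
//   %c:_(s32) = G_ICMP intpred(eq), %a(s32), %b(s32)   ; SCC-bank compare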
944 
945  getActionDefinitionsBuilder(G_FCMP)
946  .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
947  .widenScalarToNextPow2(1)
948  .clampScalar(1, S32, S64)
949  .scalarize(0);
950 
951  // FIXME: fpow has a selection pattern that should move to custom lowering.
952  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
953  if (ST.has16BitInsts())
954  Exp2Ops.legalFor({S32, S16});
955  else
956  Exp2Ops.legalFor({S32});
957  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
958  Exp2Ops.scalarize(0);
959 
960  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
961  if (ST.has16BitInsts())
962  ExpOps.customFor({{S32}, {S16}});
963  else
964  ExpOps.customFor({S32});
965  ExpOps.clampScalar(0, MinScalarFPTy, S32)
966  .scalarize(0);
967 
968  getActionDefinitionsBuilder(G_FPOWI)
969  .clampScalar(0, MinScalarFPTy, S32)
970  .lower();
971 
972  // The 64-bit versions produce 32-bit results, but only on the SALU.
973  getActionDefinitionsBuilder(G_CTPOP)
974  .legalFor({{S32, S32}, {S32, S64}})
975  .clampScalar(0, S32, S32)
976  .widenScalarToNextPow2(1, 32)
977  .clampScalar(1, S32, S64)
978  .scalarize(0)
979  .widenScalarToNextPow2(0, 32);
980 
981 
982  // The hardware instructions return a different result on 0 than the generic
983  // instructions expect. The hardware produces -1, but these produce the
984  // bitwidth.
985  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
986  .scalarize(0)
987  .clampScalar(0, S32, S32)
988  .clampScalar(1, S32, S64)
989  .widenScalarToNextPow2(0, 32)
990  .widenScalarToNextPow2(1, 32)
991  .custom();
992 
993  // The 64-bit versions produce 32-bit results, but only on the SALU.
994  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
995  .legalFor({{S32, S32}, {S32, S64}})
996  .clampScalar(0, S32, S32)
997  .clampScalar(1, S32, S64)
998  .scalarize(0)
999  .widenScalarToNextPow2(0, 32)
1000  .widenScalarToNextPow2(1, 32);
1001 
1002  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1003  // RegBankSelect.
1004  getActionDefinitionsBuilder(G_BITREVERSE)
1005  .legalFor({S32, S64})
1006  .clampScalar(0, S32, S64)
1007  .scalarize(0)
1008  .widenScalarToNextPow2(0);
1009 
1010  if (ST.has16BitInsts()) {
1011  getActionDefinitionsBuilder(G_BSWAP)
1012  .legalFor({S16, S32, V2S16})
1013  .clampMaxNumElementsStrict(0, S16, 2)
1014  // FIXME: Fixing non-power-of-2 before clamp is workaround for
1015  // narrowScalar limitation.
1016  .widenScalarToNextPow2(0)
1017  .clampScalar(0, S16, S32)
1018  .scalarize(0);
1019 
1020  if (ST.hasVOP3PInsts()) {
1021  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1022  .legalFor({S32, S16, V2S16})
1023  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1024  .clampMaxNumElements(0, S16, 2)
1025  .minScalar(0, S16)
1026  .widenScalarToNextPow2(0)
1027  .scalarize(0)
1028  .lower();
1029  } else {
1030  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1031  .legalFor({S32, S16})
1032  .widenScalarToNextPow2(0)
1033  .minScalar(0, S16)
1034  .scalarize(0)
1035  .lower();
1036  }
1037  } else {
1038  // TODO: Should have same legality without v_perm_b32
1039  getActionDefinitionsBuilder(G_BSWAP)
1040  .legalFor({S32})
1041  .lowerIf(scalarNarrowerThan(0, 32))
1042  // FIXME: Fixing non-power-of-2 before clamp is workaround for
1043  // narrowScalar limitation.
1044  .widenScalarToNextPow2(0)
1045  .maxScalar(0, S32)
1046  .scalarize(0)
1047  .lower();
1048 
1049  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1050  .legalFor({S32})
1051  .minScalar(0, S32)
1052  .widenScalarToNextPow2(0)
1053  .scalarize(0)
1054  .lower();
1055  }
1056 
1057  getActionDefinitionsBuilder(G_INTTOPTR)
1058  // List the common cases
1059  .legalForCartesianProduct(AddrSpaces64, {S64})
1060  .legalForCartesianProduct(AddrSpaces32, {S32})
1061  .scalarize(0)
1062  // Accept any address space as long as the size matches
1063  .legalIf(sameSize(0, 1))
1064  .widenScalarIf(smallerThan(1, 0),
1065  [](const LegalityQuery &Query) {
1066  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1067  })
1068  .narrowScalarIf(largerThan(1, 0),
1069  [](const LegalityQuery &Query) {
1070  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1071  });
1072 
1073  getActionDefinitionsBuilder(G_PTRTOINT)
1074  // List the common cases
1075  .legalForCartesianProduct(AddrSpaces64, {S64})
1076  .legalForCartesianProduct(AddrSpaces32, {S32})
1077  .scalarize(0)
1078  // Accept any address space as long as the size matches
1079  .legalIf(sameSize(0, 1))
1080  .widenScalarIf(smallerThan(0, 1),
1081  [](const LegalityQuery &Query) {
1082  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1083  })
1084  .narrowScalarIf(
1085  largerThan(0, 1),
1086  [](const LegalityQuery &Query) {
1087  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1088  });
1089 
1090  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1091  .scalarize(0)
1092  .custom();
1093 
1094  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1095  bool IsLoad) -> bool {
1096  const LLT DstTy = Query.Types[0];
1097 
1098  // Split vector extloads.
1099  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1100 
1101  if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1102  return true;
1103 
1104  const LLT PtrTy = Query.Types[1];
1105  unsigned AS = PtrTy.getAddressSpace();
1106  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1107  return true;
1108 
1109  // Catch weird sized loads that don't evenly divide into the access sizes
1110  // TODO: May be able to widen depending on alignment etc.
1111  unsigned NumRegs = (MemSize + 31) / 32;
1112  if (NumRegs == 3) {
1113  if (!ST.hasDwordx3LoadStores())
1114  return true;
1115  } else {
1116  // If the alignment allows, these should have been widened.
1117  if (!isPowerOf2_32(NumRegs))
1118  return true;
1119  }
1120 
1121  return false;
1122  };
1123 
1124  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1125  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1126  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1127 
1128  // TODO: Refine based on subtargets which support unaligned access or 128-bit
1129  // LDS
1130  // TODO: Unsupported flat for SI.
1131 
1132  for (unsigned Op : {G_LOAD, G_STORE}) {
1133  const bool IsStore = Op == G_STORE;
1134 
1135  auto &Actions = getActionDefinitionsBuilder(Op);
1136  // Explicitly list some common cases.
1137  // TODO: Does this help compile time at all?
1138  Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1139  {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1140  {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1141  {S64, GlobalPtr, S64, GlobalAlign32},
1142  {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1143  {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1144  {S32, GlobalPtr, S8, GlobalAlign8},
1145  {S32, GlobalPtr, S16, GlobalAlign16},
1146 
1147  {S32, LocalPtr, S32, 32},
1148  {S64, LocalPtr, S64, 32},
1149  {V2S32, LocalPtr, V2S32, 32},
1150  {S32, LocalPtr, S8, 8},
1151  {S32, LocalPtr, S16, 16},
1152  {V2S16, LocalPtr, S32, 32},
1153 
1154  {S32, PrivatePtr, S32, 32},
1155  {S32, PrivatePtr, S8, 8},
1156  {S32, PrivatePtr, S16, 16},
1157  {V2S16, PrivatePtr, S32, 32},
1158 
1159  {S32, ConstantPtr, S32, GlobalAlign32},
1160  {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1161  {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1162  {S64, ConstantPtr, S64, GlobalAlign32},
1163  {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1164  Actions.legalIf(
1165  [=](const LegalityQuery &Query) -> bool {
1166  return isLoadStoreLegal(ST, Query);
1167  });
1168 
1169  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1170  // 64-bits.
1171  //
1172  // TODO: Should generalize bitcast action into coerce, which will also cover
1173  // inserting addrspacecasts.
1174  Actions.customIf(typeIs(1, Constant32Ptr));
1175 
1176  // Turn any illegal element vectors into something easier to deal
1177  // with. These will ultimately produce 32-bit scalar shifts to extract the
1178  // parts anyway.
1179  //
1180  // For odd 16-bit element vectors, prefer to split those into pieces with
1181  // 16-bit vector parts.
1182  Actions.bitcastIf(
1183  [=](const LegalityQuery &Query) -> bool {
1184  return shouldBitcastLoadStoreType(ST, Query.Types[0],
1185  Query.MMODescrs[0].MemoryTy);
1186  }, bitcastToRegisterType(0));
1187 
1188  if (!IsStore) {
1189  // Widen suitably aligned loads by loading extra bytes. The standard
1190  // legalization actions can't properly express widening memory operands.
1191  Actions.customIf([=](const LegalityQuery &Query) -> bool {
1192  return shouldWidenLoad(ST, Query, G_LOAD);
1193  });
1194  }
1195 
1196  // FIXME: load/store narrowing should be moved to lower action
1197  Actions
1198  .narrowScalarIf(
1199  [=](const LegalityQuery &Query) -> bool {
1200  return !Query.Types[0].isVector() &&
1201  needToSplitMemOp(Query, Op == G_LOAD);
1202  },
1203  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1204  const LLT DstTy = Query.Types[0];
1205  const LLT PtrTy = Query.Types[1];
1206 
1207  const unsigned DstSize = DstTy.getSizeInBits();
1208  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1209 
1210  // Split extloads.
1211  if (DstSize > MemSize)
1212  return std::make_pair(0, LLT::scalar(MemSize));
1213 
1214  unsigned MaxSize = maxSizeForAddrSpace(ST,
1215  PtrTy.getAddressSpace(),
1216  Op == G_LOAD);
1217  if (MemSize > MaxSize)
1218  return std::make_pair(0, LLT::scalar(MaxSize));
1219 
1220  uint64_t Align = Query.MMODescrs[0].AlignInBits;
1221  return std::make_pair(0, LLT::scalar(Align));
1222  })
1223  .fewerElementsIf(
1224  [=](const LegalityQuery &Query) -> bool {
1225  return Query.Types[0].isVector() &&
1226  needToSplitMemOp(Query, Op == G_LOAD);
1227  },
1228  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1229  const LLT DstTy = Query.Types[0];
1230  const LLT PtrTy = Query.Types[1];
1231 
1232  LLT EltTy = DstTy.getElementType();
1233  unsigned MaxSize = maxSizeForAddrSpace(ST,
1234  PtrTy.getAddressSpace(),
1235  Op == G_LOAD);
1236 
1237  // FIXME: Handle widened to power of 2 results better. This ends
1238  // up scalarizing.
1239  // FIXME: 3 element stores scalarized on SI
1240 
1241  // Split if it's too large for the address space.
1242  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1243  if (MemSize > MaxSize) {
1244  unsigned NumElts = DstTy.getNumElements();
1245  unsigned EltSize = EltTy.getSizeInBits();
1246 
1247  if (MaxSize % EltSize == 0) {
1248  return std::make_pair(
1249  0, LLT::scalarOrVector(
1250  ElementCount::getFixed(MaxSize / EltSize), EltTy));
1251  }
1252 
1253  unsigned NumPieces = MemSize / MaxSize;
1254 
1255  // FIXME: Refine when odd breakdowns handled
1256  // The scalars will need to be re-legalized.
1257  if (NumPieces == 1 || NumPieces >= NumElts ||
1258  NumElts % NumPieces != 0)
1259  return std::make_pair(0, EltTy);
1260 
1261  return std::make_pair(
1262  0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1263  }
1264 
1265  // FIXME: We could probably handle weird extending loads better.
1266  if (DstTy.getSizeInBits() > MemSize)
1267  return std::make_pair(0, EltTy);
1268 
1269  unsigned EltSize = EltTy.getSizeInBits();
1270  unsigned DstSize = DstTy.getSizeInBits();
1271  if (!isPowerOf2_32(DstSize)) {
1272  // We're probably decomposing an odd sized store. Try to split
1273  // to the widest type. TODO: Account for alignment. As-is it
1274  // should be OK, since the new parts will be further legalized.
1275  unsigned FloorSize = PowerOf2Floor(DstSize);
1276  return std::make_pair(
1277  0, LLT::scalarOrVector(
1278  ElementCount::getFixed(FloorSize / EltSize), EltTy));
1279  }
1280 
1281  // May need relegalization for the scalars.
1282  return std::make_pair(0, EltTy);
1283  })
1284  .minScalar(0, S32)
1285  .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1286  .widenScalarToNextPow2(0)
1287  .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1288  .lower();
1289  }
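// Worked example for the rules above (illustrative): a <3 x s32> global store
// on a subtarget without dwordx3 load/store has MemSize = 96, so NumRegs == 3
// and needToSplitMemOp() returns true; the fewerElementsIf() mutation then
// finds DstSize = 96 is not a power of two, picks FloorSize = 64, and splits
// the value into <2 x s32> pieces plus a re-legalized s32 remainder. On
// targets with dwordx3, the 96-bit access stays legal.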
1290 
1291  // FIXME: Unaligned accesses not lowered.
1292  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1293  .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1294  {S32, GlobalPtr, S16, 2 * 8},
1295  {S32, LocalPtr, S8, 8},
1296  {S32, LocalPtr, S16, 16},
1297  {S32, PrivatePtr, S8, 8},
1298  {S32, PrivatePtr, S16, 16},
1299  {S32, ConstantPtr, S8, 8},
1300  {S32, ConstantPtr, S16, 2 * 8}})
1301  .legalIf(
1302  [=](const LegalityQuery &Query) -> bool {
1303  return isLoadStoreLegal(ST, Query);
1304  });
1305 
1306  if (ST.hasFlatAddressSpace()) {
1307  ExtLoads.legalForTypesWithMemDesc(
1308  {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1309  }
1310 
1311  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1312  // 64-bits.
1313  //
1314  // TODO: Should generalize bitcast action into coerce, which will also cover
1315  // inserting addrspacecasts.
1316  ExtLoads.customIf(typeIs(1, Constant32Ptr));
1317 
1318  ExtLoads.clampScalar(0, S32, S32)
1319  .widenScalarToNextPow2(0)
1320  .lower();
1321 
1322  auto &Atomics = getActionDefinitionsBuilder(
1323  {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1324  G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1325  G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1326  G_ATOMICRMW_UMIN})
1327  .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1328  {S64, GlobalPtr}, {S64, LocalPtr},
1329  {S32, RegionPtr}, {S64, RegionPtr}});
1330  if (ST.hasFlatAddressSpace()) {
1331  Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1332  }
1333 
1334  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1335  if (ST.hasLDSFPAtomicAdd()) {
1336  Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1337  if (ST.hasGFX90AInsts())
1338  Atomic.legalFor({{S64, LocalPtr}});
1339  if (ST.hasGFX940Insts())
1340  Atomic.legalFor({{V2S16, LocalPtr}});
1341  }
1342  if (ST.hasAtomicFaddInsts())
1343  Atomic.legalFor({{S32, GlobalPtr}});
1344 
1345  if (ST.hasGFX90AInsts()) {
1346  // These are legal with some caveats, and should have undergone expansion in
1347  // the IR in most situations
1348  // TODO: Move atomic expansion into legalizer
1349  // TODO: Also supports <2 x f16>
1350  Atomic.legalFor({
1351  {S32, GlobalPtr},
1352  {S64, GlobalPtr},
1353  {S64, FlatPtr}
1354  });
1355  }
1356 
1357  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1358  // demarshalling
1359  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1360  .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1361  {S32, FlatPtr}, {S64, FlatPtr}})
1362  .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1363  {S32, RegionPtr}, {S64, RegionPtr}});
1364  // TODO: Pointer types, any 32-bit or 64-bit vector
1365 
1366  // Condition should be s32 for scalar, s1 for vector.
1367  getActionDefinitionsBuilder(G_SELECT)
1368  .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1369  LocalPtr, FlatPtr, PrivatePtr,
1370  LLT::fixed_vector(2, LocalPtr),
1371  LLT::fixed_vector(2, PrivatePtr)},
1372  {S1, S32})
1373  .clampScalar(0, S16, S64)
1374  .scalarize(1)
1375  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1376  .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1377  .clampMaxNumElements(0, S32, 2)
1378  .clampMaxNumElements(0, LocalPtr, 2)
1379  .clampMaxNumElements(0, PrivatePtr, 2)
1380  .scalarize(0)
1381  .widenScalarToNextPow2(0)
1382  .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1383 
1384  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1385  // be more flexible with the shift amount type.
1386  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1387  .legalFor({{S32, S32}, {S64, S32}});
1388  if (ST.has16BitInsts()) {
1389  if (ST.hasVOP3PInsts()) {
1390  Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1391  .clampMaxNumElements(0, S16, 2);
1392  } else
1393  Shifts.legalFor({{S16, S16}});
1394 
1395  // TODO: Support 16-bit shift amounts for all types
1396  Shifts.widenScalarIf(
1397  [=](const LegalityQuery &Query) {
1398  // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1399  // 32-bit amount.
1400  const LLT ValTy = Query.Types[0];
1401  const LLT AmountTy = Query.Types[1];
1402  return ValTy.getSizeInBits() <= 16 &&
1403  AmountTy.getSizeInBits() < 16;
1404  }, changeTo(1, S16));
1405  Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1406  Shifts.clampScalar(1, S32, S32);
1407  Shifts.widenScalarToNextPow2(0, 16);
1408  Shifts.clampScalar(0, S16, S64);
1409 
1410  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1411  .minScalar(0, S16)
1412  .scalarize(0)
1413  .lower();
1414  } else {
1415  // Make sure we legalize the shift amount type first, as the general
1416  // expansion for the shifted type will produce much worse code if it hasn't
1417  // been truncated already.
1418  Shifts.clampScalar(1, S32, S32);
1419  Shifts.widenScalarToNextPow2(0, 32);
1420  Shifts.clampScalar(0, S32, S64);
1421 
1422  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1423  .minScalar(0, S32)
1424  .scalarize(0)
1425  .lower();
1426  }
1427  Shifts.scalarize(0);
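// Example (illustrative): with 16-bit instructions an s16 shift keeps an s16
// amount, and a v2s16 shift stays packed on VOP3P subtargets, while s64
// shifts always use an s32 amount; without 16-bit instructions the shifted
// value is widened to at least s32 and the amount clamped to s32 first, so
// the expansion of the shifted type does not see an over-wide amount type.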
1428 
1429  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1430  unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1431  unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1432  unsigned IdxTypeIdx = 2;
1433 
1434  getActionDefinitionsBuilder(Op)
1435  .customIf([=](const LegalityQuery &Query) {
1436  const LLT EltTy = Query.Types[EltTypeIdx];
1437  const LLT VecTy = Query.Types[VecTypeIdx];
1438  const LLT IdxTy = Query.Types[IdxTypeIdx];
1439  const unsigned EltSize = EltTy.getSizeInBits();
1440  return (EltSize == 32 || EltSize == 64) &&
1441  VecTy.getSizeInBits() % 32 == 0 &&
1442  VecTy.getSizeInBits() <= MaxRegisterSize &&
1443  IdxTy.getSizeInBits() == 32;
1444  })
1445  .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1446  bitcastToVectorElement32(VecTypeIdx))
1447  //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1448  .bitcastIf(
1449  all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1450  [=](const LegalityQuery &Query) {
1451  // For > 64-bit element types, try to turn this into a 64-bit
1452  // element vector since we may be able to do better indexing
1453  // if this is scalar. If not, fall back to 32.
1454  const LLT EltTy = Query.Types[EltTypeIdx];
1455  const LLT VecTy = Query.Types[VecTypeIdx];
1456  const unsigned DstEltSize = EltTy.getSizeInBits();
1457  const unsigned VecSize = VecTy.getSizeInBits();
1458 
1459  const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1460  return std::make_pair(
1461  VecTypeIdx,
1462  LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1463  })
1464  .clampScalar(EltTypeIdx, S32, S64)
1465  .clampScalar(VecTypeIdx, S32, S64)
1466  .clampScalar(IdxTypeIdx, S32, S32)
1467  .clampMaxNumElements(VecTypeIdx, S32, 32)
1468  // TODO: Clamp elements for 64-bit vectors?
1469  // It should only be necessary with variable indexes.
1470  // As a last resort, lower to the stack
1471  .lower();
1472  }
1473 
1474  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1475  .unsupportedIf([=](const LegalityQuery &Query) {
1476  const LLT &EltTy = Query.Types[1].getElementType();
1477  return Query.Types[0] != EltTy;
1478  });
1479 
1480  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1481  unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1482  unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1483 
1484  // FIXME: Doesn't handle extract of illegal sizes.
1485  getActionDefinitionsBuilder(Op)
1486  .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1487  .lowerIf([=](const LegalityQuery &Query) {
1488  // Sub-vector(or single element) insert and extract.
1489  // TODO: verify immediate offset here since lower only works with
1490  // whole elements.
1491  const LLT BigTy = Query.Types[BigTyIdx];
1492  return BigTy.isVector();
1493  })
1494  // FIXME: Multiples of 16 should not be legal.
1495  .legalIf([=](const LegalityQuery &Query) {
1496  const LLT BigTy = Query.Types[BigTyIdx];
1497  const LLT LitTy = Query.Types[LitTyIdx];
1498  return (BigTy.getSizeInBits() % 32 == 0) &&
1499  (LitTy.getSizeInBits() % 16 == 0);
1500  })
1501  .widenScalarIf(
1502  [=](const LegalityQuery &Query) {
1503  const LLT BigTy = Query.Types[BigTyIdx];
1504  return (BigTy.getScalarSizeInBits() < 16);
1505  },
1506  LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1507  .widenScalarIf(
1508  [=](const LegalityQuery &Query) {
1509  const LLT LitTy = Query.Types[LitTyIdx];
1510  return (LitTy.getScalarSizeInBits() < 16);
1511  },
1512  LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1513  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1514  .widenScalarToNextPow2(BigTyIdx, 32);
1515 
1516  }
1517 
1518  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1519  .legalForCartesianProduct(AllS32Vectors, {S32})
1520  .legalForCartesianProduct(AllS64Vectors, {S64})
1521  .clampNumElements(0, V16S32, V32S32)
1522  .clampNumElements(0, V2S64, V16S64)
1523  .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1524 
1525  if (ST.hasScalarPackInsts()) {
1526  BuildVector
1527  // FIXME: Should probably widen s1 vectors straight to s32
1528  .minScalarOrElt(0, S16)
1529  // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1530  .minScalar(1, S32);
1531 
1532  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1533  .legalFor({V2S16, S32})
1534  .lower();
1535  BuildVector.minScalarOrElt(0, S32);
1536  } else {
1537  BuildVector.customFor({V2S16, S16});
1538  BuildVector.minScalarOrElt(0, S32);
1539 
1540  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1541  .customFor({V2S16, S32})
1542  .lower();
1543  }
1544 
1545  BuildVector.legalIf(isRegisterType(0));
1546 
1547  // FIXME: Clamp maximum size
1548  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1549  .legalIf(all(isRegisterType(0), isRegisterType(1)))
1550  .clampMaxNumElements(0, S32, 32)
1551  .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1552  .clampMaxNumElements(0, S16, 64);
1553 
1554  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1555  // pre-legalize.
1556  if (ST.hasVOP3PInsts()) {
1557  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1558  .customFor({V2S16, V2S16})
1559  .lower();
1560  } else
1561  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1562 
1563  // Merge/Unmerge
1564  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1565  unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1566  unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1567 
1568  auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1569  const LLT Ty = Query.Types[TypeIdx];
1570  if (Ty.isVector()) {
1571  const LLT &EltTy = Ty.getElementType();
1572  if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1573  return true;
1574  if (!isPowerOf2_32(EltTy.getSizeInBits()))
1575  return true;
1576  }
1577  return false;
1578  };
1579 
1580  auto &Builder = getActionDefinitionsBuilder(Op)
1581  .legalIf(all(isRegisterType(0), isRegisterType(1)))
1582  .lowerFor({{S16, V2S16}})
1583  .lowerIf([=](const LegalityQuery &Query) {
1584  const LLT BigTy = Query.Types[BigTyIdx];
1585  return BigTy.getSizeInBits() == 32;
1586  })
1587  // Try to widen to s16 first for small types.
1588  // TODO: Only do this on targets with legal s16 shifts
1589  .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1590  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1591  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1592  .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1593  elementTypeIs(1, S16)),
1594  changeTo(1, V2S16))
1595  // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1596  // worth considering the multiples of 64 since 2*192 and 2*384 are not
1597  // valid.
1598  .clampScalar(LitTyIdx, S32, S512)
1599  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1600  // Break up vectors with weird elements into scalars
1601  .fewerElementsIf(
1602  [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1603  scalarize(0))
1604  .fewerElementsIf(
1605  [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1606  scalarize(1))
1607  .clampScalar(BigTyIdx, S32, MaxScalar);
1608 
1609  if (Op == G_MERGE_VALUES) {
1610  Builder.widenScalarIf(
1611  // TODO: Use 16-bit shifts if legal for 8-bit values?
1612  [=](const LegalityQuery &Query) {
1613  const LLT Ty = Query.Types[LitTyIdx];
1614  return Ty.getSizeInBits() < 32;
1615  },
1616  changeTo(LitTyIdx, S32));
1617  }
1618 
1619  Builder.widenScalarIf(
1620  [=](const LegalityQuery &Query) {
1621  const LLT Ty = Query.Types[BigTyIdx];
1622  return !isPowerOf2_32(Ty.getSizeInBits()) &&
1623  Ty.getSizeInBits() % 16 != 0;
1624  },
1625  [=](const LegalityQuery &Query) {
1626  // Pick the next power of 2, or a multiple of 64 over 128.
1627  // Whichever is smaller.
1628  const LLT &Ty = Query.Types[BigTyIdx];
1629  unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1630  if (NewSizeInBits >= 256) {
1631  unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1632  if (RoundedTo < NewSizeInBits)
1633  NewSizeInBits = RoundedTo;
1634  }
1635  return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1636  })
1637  // Any vectors left are the wrong size. Scalarize them.
1638  .scalarize(0)
1639  .scalarize(1);
1640  }
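// Worked example for the final widenScalarIf above (illustrative): a 96-bit
// "big" type is not a power of two but is a multiple of 16, so it is left
// alone; a 65-bit type rounds up to the next power of two, s128; a 264-bit
// type would round up to 512, but because that result is >= 256 the smaller
// multiple of 64, 320 bits, is chosen instead.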
1641 
1642  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1643  // RegBankSelect.
1644  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1645  .legalFor({{S32}, {S64}});
1646 
1647  if (ST.hasVOP3PInsts()) {
1648  SextInReg.lowerFor({{V2S16}})
1649  // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1650  // get more vector shift opportunities, since we'll get those when
1651  // expanded.
1652  .clampMaxNumElementsStrict(0, S16, 2);
1653  } else if (ST.has16BitInsts()) {
1654  SextInReg.lowerFor({{S32}, {S64}, {S16}});
1655  } else {
1656  // Prefer to promote to s32 before lowering if we don't have 16-bit
1657  // shifts. This avoids a lot of intermediate truncate and extend operations.
1658  SextInReg.lowerFor({{S32}, {S64}});
1659  }
1660 
1661  SextInReg
1662  .scalarize(0)
1663  .clampScalar(0, S32, S64)
1664  .lower();
1665 
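// A standalone sketch (not part of this file) of what G_SEXT_INREG computes,
// to motivate the lowering choices above: sign-extend the low B bits of a
// 64-bit value by shifting left and arithmetic-shifting right. Plain C++,
// assuming two's complement and an arithmetic right shift on int64_t; the
// helper name is illustrative.
#include <cstdint>

int64_t sextInReg64(int64_t X, unsigned B /* 1..64 */) {
  unsigned Sh = 64 - B;
  // Shift the wanted field up to the top bit, then arithmetic-shift it back
  // down so the sign bit of the field fills the upper bits.
  return (int64_t)((uint64_t)X << Sh) >> Sh;
}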
1666  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1667  .scalarize(0)
1668  .lower();
1669 
1670 // TODO: Only try to form v2s16 with legal packed instructions.
1671  getActionDefinitionsBuilder(G_FSHR)
1672  .legalFor({{S32, S32}})
1673  .lowerFor({{V2S16, V2S16}})
1674  .clampMaxNumElementsStrict(0, S16, 2)
1675  .scalarize(0)
1676  .lower();
1677 
1678  if (ST.hasVOP3PInsts()) {
1679  getActionDefinitionsBuilder(G_FSHL)
1680  .lowerFor({{V2S16, V2S16}})
1681  .clampMaxNumElementsStrict(0, S16, 2)
1682  .scalarize(0)
1683  .lower();
1684  } else {
1685  getActionDefinitionsBuilder(G_FSHL)
1686  .scalarize(0)
1687  .lower();
1688  }
1689 
1690  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1691  .legalFor({S64});
1692 
1693  getActionDefinitionsBuilder(G_FENCE)
1694  .alwaysLegal();
1695 
1696  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1697  .scalarize(0)
1698  .minScalar(0, S32)
1699  .lower();
1700 
1701  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1702  .legalFor({{S32, S32}, {S64, S32}})
1703  .clampScalar(1, S32, S32)
1704  .clampScalar(0, S32, S64)
1705  .widenScalarToNextPow2(0)
1706  .scalarize(0);
1707 
1708  getActionDefinitionsBuilder({
1709  // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1710  G_FCOPYSIGN,
1711 
1712  G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1713  G_ATOMICRMW_NAND,
1714  G_ATOMICRMW_FSUB,
1715  G_READ_REGISTER,
1716  G_WRITE_REGISTER,
1717 
1718  G_SADDO, G_SSUBO,
1719 
1720  // TODO: Implement
1721  G_FMINIMUM, G_FMAXIMUM}).lower();
1722 
1723  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1724  .lower();
1725 
1726  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1727  G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1728  G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1729  .unsupported();
1730 
1731  getLegacyLegalizerInfo().computeTables();
1732  verify(*ST.getInstrInfo());
1733 }
1734 
1735 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1736  MachineInstr &MI) const {
1737  MachineIRBuilder &B = Helper.MIRBuilder;
1738  MachineRegisterInfo &MRI = *B.getMRI();
1739 
1740  switch (MI.getOpcode()) {
1741  case TargetOpcode::G_ADDRSPACE_CAST:
1742  return legalizeAddrSpaceCast(MI, MRI, B);
1743  case TargetOpcode::G_FRINT:
1744  return legalizeFrint(MI, MRI, B);
1745  case TargetOpcode::G_FCEIL:
1746  return legalizeFceil(MI, MRI, B);
1747  case TargetOpcode::G_FREM:
1748  return legalizeFrem(MI, MRI, B);
1749  case TargetOpcode::G_INTRINSIC_TRUNC:
1750  return legalizeIntrinsicTrunc(MI, MRI, B);
1751  case TargetOpcode::G_SITOFP:
1752  return legalizeITOFP(MI, MRI, B, true);
1753  case TargetOpcode::G_UITOFP:
1754  return legalizeITOFP(MI, MRI, B, false);
1755  case TargetOpcode::G_FPTOSI:
1756  return legalizeFPTOI(MI, MRI, B, true);
1757  case TargetOpcode::G_FPTOUI:
1758  return legalizeFPTOI(MI, MRI, B, false);
1759  case TargetOpcode::G_FMINNUM:
1760  case TargetOpcode::G_FMAXNUM:
1761  case TargetOpcode::G_FMINNUM_IEEE:
1762  case TargetOpcode::G_FMAXNUM_IEEE:
1763  return legalizeMinNumMaxNum(Helper, MI);
1764  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1765  return legalizeExtractVectorElt(MI, MRI, B);
1766  case TargetOpcode::G_INSERT_VECTOR_ELT:
1767  return legalizeInsertVectorElt(MI, MRI, B);
1768  case TargetOpcode::G_SHUFFLE_VECTOR:
1769  return legalizeShuffleVector(MI, MRI, B);
1770  case TargetOpcode::G_FSIN:
1771  case TargetOpcode::G_FCOS:
1772  return legalizeSinCos(MI, MRI, B);
1773  case TargetOpcode::G_GLOBAL_VALUE:
1774  return legalizeGlobalValue(MI, MRI, B);
1775  case TargetOpcode::G_LOAD:
1776  case TargetOpcode::G_SEXTLOAD:
1777  case TargetOpcode::G_ZEXTLOAD:
1778  return legalizeLoad(Helper, MI);
1779  case TargetOpcode::G_FMAD:
1780  return legalizeFMad(MI, MRI, B);
1781  case TargetOpcode::G_FDIV:
1782  return legalizeFDIV(MI, MRI, B);
1783  case TargetOpcode::G_UDIV:
1784  case TargetOpcode::G_UREM:
1785  case TargetOpcode::G_UDIVREM:
1786  return legalizeUnsignedDIV_REM(MI, MRI, B);
1787  case TargetOpcode::G_SDIV:
1788  case TargetOpcode::G_SREM:
1789  case TargetOpcode::G_SDIVREM:
1790  return legalizeSignedDIV_REM(MI, MRI, B);
1791  case TargetOpcode::G_ATOMIC_CMPXCHG:
1792  return legalizeAtomicCmpXChg(MI, MRI, B);
1793  case TargetOpcode::G_FLOG:
1794  return legalizeFlog(MI, B, numbers::ln2f);
1795  case TargetOpcode::G_FLOG10:
1796  return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1797  case TargetOpcode::G_FEXP:
1798  return legalizeFExp(MI, B);
1799  case TargetOpcode::G_FPOW:
1800  return legalizeFPow(MI, B);
1801  case TargetOpcode::G_FFLOOR:
1802  return legalizeFFloor(MI, MRI, B);
1803  case TargetOpcode::G_BUILD_VECTOR:
1804  return legalizeBuildVector(MI, MRI, B);
1805  case TargetOpcode::G_MUL:
1806  return legalizeMul(Helper, MI);
1807  case TargetOpcode::G_CTLZ:
1808  case TargetOpcode::G_CTTZ:
1809  return legalizeCTLZ_CTTZ(MI, MRI, B);
1810  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
1811  return legalizeFPTruncRound(MI, B);
1812  default:
1813  return false;
1814  }
1815 
1816  llvm_unreachable("expected switch to return");
1817 }
1818 
1819 Register AMDGPULegalizerInfo::getSegmentAperture(
1820  unsigned AS,
1821  MachineRegisterInfo &MRI,
1822  MachineIRBuilder &B) const {
1823  MachineFunction &MF = B.getMF();
1824  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1825  const LLT S32 = LLT::scalar(32);
1826 
1828 
1829  if (ST.hasApertureRegs()) {
1830  // FIXME: Use inline constants (src_{shared, private}_base) instead of
1831  // getreg.
1832  unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1835  unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1838  unsigned Encoding =
1840  Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1841  WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1842 
1843  Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1844 
1845  B.buildInstr(AMDGPU::S_GETREG_B32)
1846  .addDef(GetReg)
1847  .addImm(Encoding);
1848  MRI.setType(GetReg, S32);
1849 
1850  auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1851  return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1852  }
1853 
1854  // TODO: can we be smarter about machine pointer info?
1858  // For code object version 5, private_base and shared_base are passed through
1859  // implicit kernargs.
1864  uint64_t Offset =
1865  ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
1866 
1867  Register KernargPtrReg = MRI.createGenericVirtualRegister(
1869 
1870  if (!loadInputValue(KernargPtrReg, B,
1872  return Register();
1873 
1875  PtrInfo,
1878  LLT::scalar(32), commonAlignment(Align(64), Offset));
1879 
1880  // Pointer address
1881  B.buildPtrAdd(LoadAddr, KernargPtrReg,
1882  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
1883  // Load address
1884  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1885  }
1886 
1889 
1891  return Register();
1892 
1893  // Offset into amd_queue_t for group_segment_aperture_base_hi /
1894  // private_segment_aperture_base_hi.
1895  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1896 
1898  PtrInfo,
1901  LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1902 
1903  B.buildPtrAdd(LoadAddr, QueuePtr,
1904  B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
1905  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1906 }
1907 
1908 /// Return true if the value is a known valid address, such that a null check is
1909 /// not necessary.
1911  const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1912  MachineInstr *Def = MRI.getVRegDef(Val);
1913  switch (Def->getOpcode()) {
1914  case AMDGPU::G_FRAME_INDEX:
1915  case AMDGPU::G_GLOBAL_VALUE:
1916  case AMDGPU::G_BLOCK_ADDR:
1917  return true;
1918  case AMDGPU::G_CONSTANT: {
1919  const ConstantInt *CI = Def->getOperand(1).getCImm();
1920  return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1921  }
1922  default:
1923  return false;
1924  }
1925 
1926  return false;
1927 }
1928 
1929 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1930  MachineInstr &MI, MachineRegisterInfo &MRI,
1931  MachineIRBuilder &B) const {
1932  MachineFunction &MF = B.getMF();
1933 
1934  const LLT S32 = LLT::scalar(32);
1935  Register Dst = MI.getOperand(0).getReg();
1936  Register Src = MI.getOperand(1).getReg();
1937 
1938  LLT DstTy = MRI.getType(Dst);
1939  LLT SrcTy = MRI.getType(Src);
1940  unsigned DestAS = DstTy.getAddressSpace();
1941  unsigned SrcAS = SrcTy.getAddressSpace();
1942 
1943  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1944  // vector element.
1945  assert(!DstTy.isVector());
1946 
1947  const AMDGPUTargetMachine &TM
1948  = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1949 
1950  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1951  MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1952  return true;
1953  }
1954 
1955  if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
1956  (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1957  DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1958  if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1959  // Extract low 32-bits of the pointer.
1960  B.buildExtract(Dst, Src, 0);
1961  MI.eraseFromParent();
1962  return true;
1963  }
1964 
1965  unsigned NullVal = TM.getNullPointerValue(DestAS);
1966 
1967  auto SegmentNull = B.buildConstant(DstTy, NullVal);
1968  auto FlatNull = B.buildConstant(SrcTy, 0);
1969 
1970  // Extract low 32-bits of the pointer.
1971  auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1972 
1973  auto CmpRes =
1974  B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1975  B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1976 
1977  MI.eraseFromParent();
1978  return true;
1979  }
1980 
1981  if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
1982  (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1983  SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1984  if (!ST.hasFlatAddressSpace())
1985  return false;
1986 
1987  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1988  if (!ApertureReg.isValid())
1989  return false;
1990 
1991  // Coerce the type of the low half of the result so we can use merge_values.
1992  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1993 
1994  // TODO: Should we allow mismatched types but matching sizes in merges to
1995  // avoid the ptrtoint?
1996  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1997 
1998  if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1999  B.buildCopy(Dst, BuildPtr);
2000  MI.eraseFromParent();
2001  return true;
2002  }
2003 
2004  auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2005  auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2006 
2007  auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2008  SegmentNull.getReg(0));
2009 
2010  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2011 
2012  MI.eraseFromParent();
2013  return true;
2014  }
2015 
2016  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2017  SrcTy.getSizeInBits() == 64) {
2018  // Truncate.
2019  B.buildExtract(Dst, Src, 0);
2020  MI.eraseFromParent();
2021  return true;
2022  }
2023 
2024  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2025  DstTy.getSizeInBits() == 64) {
2027  uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2028 
2029  // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
2030  // another. Merge operands are required to be the same type, but creating an
2031  // extra ptrtoint would be kind of pointless.
2032  auto HighAddr = B.buildConstant(
2034  B.buildMerge(Dst, {Src, HighAddr});
2035  MI.eraseFromParent();
2036  return true;
2037  }
2038 
2039  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2040  MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2041 
2042  LLVMContext &Ctx = MF.getFunction().getContext();
2043  Ctx.diagnose(InvalidAddrSpaceCast);
2044  B.buildUndef(Dst);
2045  MI.eraseFromParent();
2046  return true;
2047 }
2048 
2049 bool AMDGPULegalizerInfo::legalizeFrint(
2050  MachineInstr &MI, MachineRegisterInfo &MRI,
2051  MachineIRBuilder &B) const {
2052  Register Src = MI.getOperand(1).getReg();
2053  LLT Ty = MRI.getType(Src);
2054  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2055 
2056  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2057  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2058 
2059  auto C1 = B.buildFConstant(Ty, C1Val);
2060  auto CopySign = B.buildFCopysign(Ty, C1, Src);
2061 
2062  // TODO: Should this propagate fast-math-flags?
2063  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2064  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2065 
2066  auto C2 = B.buildFConstant(Ty, C2Val);
2067  auto Fabs = B.buildFAbs(Ty, Src);
2068 
2069  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2070  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2071  MI.eraseFromParent();
2072  return true;
2073 }
2074 
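// A standalone sketch (not part of this file) of the add/subtract-magic-number
// rounding used above for 64-bit G_FRINT: adding copysign(2^52, x) forces the
// FPU to round x to an integer, and values with |x| > 0x1.fffffffffffffp+51 are
// already integral. Assumes round-to-nearest and no fast-math reassociation.
#include <cmath>

double frintSketch(double X) {
  const double C1 = 0x1.0p+52;                 // 2^52
  const double C2 = 0x1.fffffffffffffp+51;     // largest value below 2^52
  double CopySign = std::copysign(C1, X);
  double Rounded = (X + CopySign) - CopySign;  // rounding happens in the FP add
  return std::fabs(X) > C2 ? X : Rounded;      // large values are kept as-is
}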
2075 bool AMDGPULegalizerInfo::legalizeFceil(
2076  MachineInstr &MI, MachineRegisterInfo &MRI,
2077  MachineIRBuilder &B) const {
2078 
2079  const LLT S1 = LLT::scalar(1);
2080  const LLT S64 = LLT::scalar(64);
2081 
2082  Register Src = MI.getOperand(1).getReg();
2083  assert(MRI.getType(Src) == S64);
2084 
2085  // result = trunc(src)
2086  // if (src > 0.0 && src != result)
2087  // result += 1.0
2088 
2089  auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2090 
2091  const auto Zero = B.buildFConstant(S64, 0.0);
2092  const auto One = B.buildFConstant(S64, 1.0);
2093  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2094  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2095  auto And = B.buildAnd(S1, Lt0, NeTrunc);
2096  auto Add = B.buildSelect(S64, And, One, Zero);
2097 
2098  // TODO: Should this propagate fast-math-flags?
2099  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2100  MI.eraseFromParent();
2101  return true;
2102 }
2103 
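// A standalone sketch (not part of this file) of the ceil-from-trunc expansion
// above: result = trunc(src), plus 1.0 when src is positive and not already an
// integer. Plain C++ over IEEE doubles; the helper name is illustrative.
#include <cmath>

double fceilSketch(double Src) {
  double Trunc = std::trunc(Src);
  bool NeedsBump = Src > 0.0 && Src != Trunc;  // positive and non-integral
  return Trunc + (NeedsBump ? 1.0 : 0.0);
}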
2106  MachineIRBuilder &B) const {
2107  Register DstReg = MI.getOperand(0).getReg();
2108  Register Src0Reg = MI.getOperand(1).getReg();
2109  Register Src1Reg = MI.getOperand(2).getReg();
2110  auto Flags = MI.getFlags();
2111  LLT Ty = MRI.getType(DstReg);
2112 
2113  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2114  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2115  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2116  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2117  MI.eraseFromParent();
2118  return true;
2119 }
2120 
2121 static MachineInstrBuilder extractF64Exponent(Register Hi,
2122  MachineIRBuilder &B) {
2123  const unsigned FractBits = 52;
2124  const unsigned ExpBits = 11;
2125  LLT S32 = LLT::scalar(32);
2126 
2127  auto Const0 = B.buildConstant(S32, FractBits - 32);
2128  auto Const1 = B.buildConstant(S32, ExpBits);
2129 
2130  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2131  .addUse(Hi)
2132  .addUse(Const0.getReg(0))
2133  .addUse(Const1.getReg(0));
2134 
2135  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2136 }
2137 
2138 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2139  MachineInstr &MI, MachineRegisterInfo &MRI,
2140  MachineIRBuilder &B) const {
2141  const LLT S1 = LLT::scalar(1);
2142  const LLT S32 = LLT::scalar(32);
2143  const LLT S64 = LLT::scalar(64);
2144 
2145  Register Src = MI.getOperand(1).getReg();
2146  assert(MRI.getType(Src) == S64);
2147 
2148  // TODO: Should this use extract since the low half is unused?
2149  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2150  Register Hi = Unmerge.getReg(1);
2151 
2152  // Extract the upper half, since this is where we will find the sign and
2153  // exponent.
2154  auto Exp = extractF64Exponent(Hi, B);
2155 
2156  const unsigned FractBits = 52;
2157 
2158  // Extract the sign bit.
2159  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2160  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2161 
2162  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2163 
2164  const auto Zero32 = B.buildConstant(S32, 0);
2165 
2166  // Extend back to 64-bits.
2167  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2168 
2169  auto Shr = B.buildAShr(S64, FractMask, Exp);
2170  auto Not = B.buildNot(S64, Shr);
2171  auto Tmp0 = B.buildAnd(S64, Src, Not);
2172  auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2173 
2174  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2175  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2176 
2177  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2178  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2179  MI.eraseFromParent();
2180  return true;
2181 }
2182 
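// A standalone sketch (not part of this file) of the bit-level f64 trunc
// expansion above: recover the unbiased exponent from the upper word, clear
// the fraction bits that sit below it, and handle the |x| < 1 and
// already-integral cases separately. Assumes IEEE-754 binary64.
#include <cstdint>
#include <cstring>

double truncSketch(double X) {
  uint64_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  const uint64_t SignBit = Bits & (UINT64_C(1) << 63);
  const uint64_t FractMask = (UINT64_C(1) << 52) - 1;
  int Exp = (int)((Bits >> 52) & 0x7ff) - 1023;  // unbiased exponent

  if (Exp < 0)
    Bits = SignBit;                              // |X| < 1 -> +/-0.0
  else if (Exp <= 51)
    Bits &= ~(FractMask >> Exp);                 // drop sub-integer fraction bits
  // Exp > 51: X is already integral (or inf/nan); keep it unchanged.

  double Result;
  std::memcpy(&Result, &Bits, sizeof(Result));
  return Result;
}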
2183 bool AMDGPULegalizerInfo::legalizeITOFP(
2184  MachineInstr &MI, MachineRegisterInfo &MRI,
2185  MachineIRBuilder &B, bool Signed) const {
2186 
2187  Register Dst = MI.getOperand(0).getReg();
2188  Register Src = MI.getOperand(1).getReg();
2189 
2190  const LLT S64 = LLT::scalar(64);
2191  const LLT S32 = LLT::scalar(32);
2192 
2193  assert(MRI.getType(Src) == S64);
2194 
2195  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2196  auto ThirtyTwo = B.buildConstant(S32, 32);
2197 
2198  if (MRI.getType(Dst) == S64) {
2199  auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2200  : B.buildUITOFP(S64, Unmerge.getReg(1));
2201 
2202  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2203  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2204  .addUse(CvtHi.getReg(0))
2205  .addUse(ThirtyTwo.getReg(0));
2206 
2207  // TODO: Should this propagate fast-math-flags?
2208  B.buildFAdd(Dst, LdExp, CvtLo);
2209  MI.eraseFromParent();
2210  return true;
2211  }
2212 
2213  assert(MRI.getType(Dst) == S32);
2214 
2215  auto One = B.buildConstant(S32, 1);
2216 
2217  MachineInstrBuilder ShAmt;
2218  if (Signed) {
2219  auto ThirtyOne = B.buildConstant(S32, 31);
2220  auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2221  auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2222  auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2223  auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2224  /*HasSideEffects=*/false)
2225  .addUse(Unmerge.getReg(1));
2226  auto LS2 = B.buildSub(S32, LS, One);
2227  ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2228  } else
2229  ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2230  auto Norm = B.buildShl(S64, Src, ShAmt);
2231  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2232  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2233  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2234  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2235  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2236  B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2237  /*HasSideEffects=*/false)
2238  .addUse(FVal.getReg(0))
2239  .addUse(Scale.getReg(0));
2240  MI.eraseFromParent();
2241  return true;
2242 }
2243 
2244 // TODO: Copied from DAG implementation. Verify logic and document how this
2245 // actually works.
2246 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2247  MachineRegisterInfo &MRI,
2248  MachineIRBuilder &B,
2249  bool Signed) const {
2250 
2251  Register Dst = MI.getOperand(0).getReg();
2252  Register Src = MI.getOperand(1).getReg();
2253 
2254  const LLT S64 = LLT::scalar(64);
2255  const LLT S32 = LLT::scalar(32);
2256 
2257  const LLT SrcLT = MRI.getType(Src);
2258  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2259 
2260  unsigned Flags = MI.getFlags();
2261 
2262  // The basic idea of converting a floating point number into a pair of 32-bit
2263  // integers is illustrated as follows:
2264  //
2265  // tf := trunc(val);
2266  // hif := floor(tf * 2^-32);
2267  // lof := tf - hif * 2^32; // lof is always positive due to floor.
2268  // hi := fptoi(hif);
2269  // lo := fptoi(lof);
2270  //
2271  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2272  MachineInstrBuilder Sign;
2273  if (Signed && SrcLT == S32) {
2274  // However, a 32-bit floating point number has only a 23-bit mantissa,
2275  // which is not enough to hold all the significant bits of `lof` if val is
2276  // negative. To avoid the loss of precision, we need to take the absolute
2277  // value after truncating and flip the result back based on the original
2278  // signedness.
2279  Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2280  Trunc = B.buildFAbs(S32, Trunc, Flags);
2281  }
2282  MachineInstrBuilder K0, K1;
2283  if (SrcLT == S64) {
2284  K0 = B.buildFConstant(S64,
2285  BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2286  K1 = B.buildFConstant(S64,
2287  BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2288  } else {
2289  K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2290  K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2291  }
2292 
2293  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2294  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2295  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2296 
2297  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2298  : B.buildFPTOUI(S32, FloorMul);
2299  auto Lo = B.buildFPTOUI(S32, Fma);
2300 
2301  if (Signed && SrcLT == S32) {
2302  // Flip the result based on the signedness, which is either all 0s or 1s.
2303  Sign = B.buildMerge(S64, {Sign, Sign});
2304  // r := xor({lo, hi}, sign) - sign;
2305  B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2306  } else
2307  B.buildMerge(Dst, {Lo, Hi});
2308  MI.eraseFromParent();
2309 
2310  return true;
2311 }
2312 
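// A standalone sketch (not part of this file) of the unsigned f64 -> i64 split
// implemented above: hif = floor(tf * 2^-32) gives the high 32 bits, and
// lof = fma(hif, -2^32, tf) gives the (always non-negative) low part. Valid for
// values already in [0, 2^64); the helper name is illustrative.
#include <cmath>
#include <cstdint>

uint64_t fptoui64Sketch(double Val) {
  double Tf = std::trunc(Val);
  double Hif = std::floor(Tf * 0x1.0p-32);     // tf * 2^-32, then floor
  double Lof = std::fma(Hif, -0x1.0p+32, Tf);  // tf - hif * 2^32, computed exactly
  uint32_t Hi = (uint32_t)Hif;
  uint32_t Lo = (uint32_t)Lof;
  return ((uint64_t)Hi << 32) | Lo;
}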
2313 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2314  MachineInstr &MI) const {
2315  MachineFunction &MF = Helper.MIRBuilder.getMF();
2316  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2317 
2318  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2319  MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2320 
2321  // With ieee_mode disabled, the instructions already have the correct
2322  // behavior for G_FMINNUM/G_FMAXNUM.
2323  if (!MFI->getMode().IEEE)
2324  return !IsIEEEOp;
2325 
2326  if (IsIEEEOp)
2327  return true;
2328 
2329  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2330 }
2331 
2334  MachineIRBuilder &B) const {
2335  // TODO: Should move some of this into LegalizerHelper.
2336 
2337  // TODO: Promote dynamic indexing of s16 to s32
2338 
2339  // FIXME: Artifact combiner probably should have replaced the truncated
2340  // constant before this, so we shouldn't need
2341  // getIConstantVRegValWithLookThrough.
2342  Optional<ValueAndVReg> MaybeIdxVal =
2343  getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2344  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2345  return true;
2346  const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2347 
2348  Register Dst = MI.getOperand(0).getReg();
2349  Register Vec = MI.getOperand(1).getReg();
2350 
2351  LLT VecTy = MRI.getType(Vec);
2352  LLT EltTy = VecTy.getElementType();
2353  assert(EltTy == MRI.getType(Dst));
2354 
2355  if (IdxVal < VecTy.getNumElements()) {
2356  auto Unmerge = B.buildUnmerge(EltTy, Vec);
2357  B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2358  } else {
2359  B.buildUndef(Dst);
2360  }
2361 
2362  MI.eraseFromParent();
2363  return true;
2364 }
2365 
2368  MachineIRBuilder &B) const {
2369  // TODO: Should move some of this into LegalizerHelper.
2370 
2371  // TODO: Promote dynamic indexing of s16 to s32
2372 
2373  // FIXME: Artifact combiner probably should have replaced the truncated
2374  // constant before this, so we shouldn't need
2375  // getIConstantVRegValWithLookThrough.
2376  Optional<ValueAndVReg> MaybeIdxVal =
2377  getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2378  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2379  return true;
2380 
2381  int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2382  Register Dst = MI.getOperand(0).getReg();
2383  Register Vec = MI.getOperand(1).getReg();
2384  Register Ins = MI.getOperand(2).getReg();
2385 
2386  LLT VecTy = MRI.getType(Vec);
2387  LLT EltTy = VecTy.getElementType();
2388  assert(EltTy == MRI.getType(Ins));
2389  (void)Ins;
2390 
2391  unsigned NumElts = VecTy.getNumElements();
2392  if (IdxVal < NumElts) {
2393  SmallVector<Register, 8> SrcRegs;
2394  for (unsigned i = 0; i < NumElts; ++i)
2395  SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2396  B.buildUnmerge(SrcRegs, Vec);
2397 
2398  SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2399  B.buildMerge(Dst, SrcRegs);
2400  } else {
2401  B.buildUndef(Dst);
2402  }
2403 
2404  MI.eraseFromParent();
2405  return true;
2406 }
2407 
2410  MachineIRBuilder &B) const {
2411  const LLT V2S16 = LLT::fixed_vector(2, 16);
2412 
2413  Register Dst = MI.getOperand(0).getReg();
2414  Register Src0 = MI.getOperand(1).getReg();
2415  LLT DstTy = MRI.getType(Dst);
2416  LLT SrcTy = MRI.getType(Src0);
2417 
2418  if (SrcTy == V2S16 && DstTy == V2S16 &&
2419  AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2420  return true;
2421 
2422  MachineIRBuilder HelperBuilder(MI);
2423  GISelObserverWrapper DummyObserver;
2424  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2425  return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2426 }
2427 
2430  MachineIRBuilder &B) const {
2431 
2432  Register DstReg = MI.getOperand(0).getReg();
2433  Register SrcReg = MI.getOperand(1).getReg();
2434  LLT Ty = MRI.getType(DstReg);
2435  unsigned Flags = MI.getFlags();
2436 
2437  Register TrigVal;
2438  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2439  if (ST.hasTrigReducedRange()) {
2440  auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2441  TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2442  .addUse(MulVal.getReg(0))
2443  .setMIFlags(Flags).getReg(0);
2444  } else
2445  TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2446 
2447  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2448  Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2449  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2450  .addUse(TrigVal)
2451  .setMIFlags(Flags);
2452  MI.eraseFromParent();
2453  return true;
2454 }
2455 
2458  const GlobalValue *GV,
2459  int64_t Offset,
2460  unsigned GAFlags) const {
2461  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2462  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2463  // to the following code sequence:
2464  //
2465  // For constant address space:
2466  // s_getpc_b64 s[0:1]
2467  // s_add_u32 s0, s0, $symbol
2468  // s_addc_u32 s1, s1, 0
2469  //
2470  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2471  // a fixup or relocation is emitted to replace $symbol with a literal
2472  // constant, which is a pc-relative offset from the encoding of the $symbol
2473  // operand to the global variable.
2474  //
2475  // For global address space:
2476  // s_getpc_b64 s[0:1]
2477  // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2478  // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2479  //
2480  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2481  // fixups or relocations are emitted to replace $symbol@*@lo and
2482  // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2483  // which is a 64-bit pc-relative offset from the encoding of the $symbol
2484  // operand to the global variable.
2485  //
2486  // What we want here is an offset from the value returned by s_getpc
2487  // (which is the address of the s_add_u32 instruction) to the global
2488  // variable, but since the encoding of $symbol starts 4 bytes after the start
2489  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2490  // small. This requires us to add 4 to the global variable offset in order to
2491  // compute the correct address. Similarly for the s_addc_u32 instruction, the
2492  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2493  // instruction.
2494 
2495  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2496 
2497  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2498  B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2499 
2500  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2501  .addDef(PCReg);
2502 
2503  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2504  if (GAFlags == SIInstrInfo::MO_NONE)
2505  MIB.addImm(0);
2506  else
2507  MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2508 
2509  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2510 
2511  if (PtrTy.getSizeInBits() == 32)
2512  B.buildExtract(DstReg, PCReg, 0);
2513  return true;
2514  }
2515 
2518  MachineIRBuilder &B) const {
2519  Register DstReg = MI.getOperand(0).getReg();
2520  LLT Ty = MRI.getType(DstReg);
2521  unsigned AS = Ty.getAddressSpace();
2522 
2523  const GlobalValue *GV = MI.getOperand(1).getGlobal();
2524  MachineFunction &MF = B.getMF();
2525  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2526 
2527  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2528  if (!MFI->isModuleEntryFunction() &&
2529  !GV->getName().equals("llvm.amdgcn.module.lds")) {
2530  const Function &Fn = MF.getFunction();
2531  DiagnosticInfoUnsupported BadLDSDecl(
2532  Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2533  DS_Warning);
2534  Fn.getContext().diagnose(BadLDSDecl);
2535 
2536  // We currently don't have a way to correctly allocate LDS objects that
2537  // aren't directly associated with a kernel. We do force inlining of
2538  // functions that use local objects. However, if these dead functions are
2539  // not eliminated, we don't want a compile time error. Just emit a warning
2540  // and a trap, since there should be no callable path here.
2541  B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2542  B.buildUndef(DstReg);
2543  MI.eraseFromParent();
2544  return true;
2545  }
2546 
2547  // TODO: We could emit code to handle the initialization somewhere.
2548  // We ignore the initializer for now and legalize it to allow selection.
2549  // The initializer will be rejected during assembly emission anyway.
2550  const SITargetLowering *TLI = ST.getTargetLowering();
2551  if (!TLI->shouldUseLDSConstAddress(GV)) {
2552  MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2553  return true; // Leave in place;
2554  }
2555 
2556  if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2557  Type *Ty = GV->getValueType();
2558  // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2559  // zero-sized type in other languages to declare dynamic shared
2560  // memory whose size is not known at compile time. These arrays are
2561  // allocated by the runtime and placed directly after the statically
2562  // allocated ones. They all share the same offset.
2563  if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2564  // Adjust alignment for that dynamic shared memory array.
2565  MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2566  LLT S32 = LLT::scalar(32);
2567  auto Sz =
2568  B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2569  B.buildIntToPtr(DstReg, Sz);
2570  MI.eraseFromParent();
2571  return true;
2572  }
2573  }
2574 
2575  B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2576  *cast<GlobalVariable>(GV)));
2577  MI.eraseFromParent();
2578  return true;
2579  }
2580 
2581  const SITargetLowering *TLI = ST.getTargetLowering();
2582 
2583  if (TLI->shouldEmitFixup(GV)) {
2584  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2585  MI.eraseFromParent();
2586  return true;
2587  }
2588 
2589  if (TLI->shouldEmitPCReloc(GV)) {
2590  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2591  MI.eraseFromParent();
2592  return true;
2593  }
2594 
2596  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2597 
2598  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2603  LoadTy, Align(8));
2604 
2605  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2606 
2607  if (Ty.getSizeInBits() == 32) {
2608  // Truncate if this is a 32-bit constant address.
2609  auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2610  B.buildExtract(DstReg, Load, 0);
2611  } else
2612  B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2613 
2614  MI.eraseFromParent();
2615  return true;
2616 }
2617 
2619  if (Ty.isVector())
2620  return Ty.changeElementCount(
2622  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2623 }
2624 
2625 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2626  MachineInstr &MI) const {
2627  MachineIRBuilder &B = Helper.MIRBuilder;
2628  MachineRegisterInfo &MRI = *B.getMRI();
2629  GISelChangeObserver &Observer = Helper.Observer;
2630 
2631  Register PtrReg = MI.getOperand(1).getReg();
2632  LLT PtrTy = MRI.getType(PtrReg);
2633  unsigned AddrSpace = PtrTy.getAddressSpace();
2634 
2635  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2637  auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2638  Observer.changingInstr(MI);
2639  MI.getOperand(1).setReg(Cast.getReg(0));
2640  Observer.changedInstr(MI);
2641  return true;
2642  }
2643 
2644  if (MI.getOpcode() != AMDGPU::G_LOAD)
2645  return false;
2646 
2647  Register ValReg = MI.getOperand(0).getReg();
2648  LLT ValTy = MRI.getType(ValReg);
2649 
2650  MachineMemOperand *MMO = *MI.memoperands_begin();
2651  const unsigned ValSize = ValTy.getSizeInBits();
2652  const LLT MemTy = MMO->getMemoryType();
2653  const Align MemAlign = MMO->getAlign();
2654  const unsigned MemSize = MemTy.getSizeInBits();
2655  const uint64_t AlignInBits = 8 * MemAlign.value();
2656 
2657  // Widen non-power-of-2 loads to the alignment if needed
2658  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2659  const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2660 
2661  // This was already the correct extending load result type, so just adjust
2662  // the memory type.
2663  if (WideMemSize == ValSize) {
2664  MachineFunction &MF = B.getMF();
2665 
2666  MachineMemOperand *WideMMO =
2667  MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2668  Observer.changingInstr(MI);
2669  MI.setMemRefs(MF, {WideMMO});
2670  Observer.changedInstr(MI);
2671  return true;
2672  }
2673 
2674  // Don't bother handling an edge case that should probably never be produced.
2675  if (ValSize > WideMemSize)
2676  return false;
2677 
2678  LLT WideTy = widenToNextPowerOf2(ValTy);
2679 
2680  Register WideLoad;
2681  if (!WideTy.isVector()) {
2682  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2683  B.buildTrunc(ValReg, WideLoad).getReg(0);
2684  } else {
2685  // Extract the subvector.
2686 
2687  if (isRegisterType(ValTy)) {
2688  // If this a case where G_EXTRACT is legal, use it.
2689  // (e.g. <3 x s32> -> <4 x s32>)
2690  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2691  B.buildExtract(ValReg, WideLoad, 0);
2692  } else {
2693  // For cases where the widened type isn't a nice register value, unmerge
2694  // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2695  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2696  B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2697  }
2698  }
2699 
2700  MI.eraseFromParent();
2701  return true;
2702  }
2703 
2704  return false;
2705 }
2706 
2709  MachineIRBuilder &B) const {
2710  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2711  assert(Ty.isScalar());
2712 
2713  MachineFunction &MF = B.getMF();
2715 
2716  // TODO: Always legal with future ftz flag.
2717  // FIXME: Do we need just output?
2718  if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2719  return true;
2720  if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2721  return true;
2722 
2723  MachineIRBuilder HelperBuilder(MI);
2724  GISelObserverWrapper DummyObserver;
2725  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2726  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2727 }
2728 
2731  Register DstReg = MI.getOperand(0).getReg();
2732  Register PtrReg = MI.getOperand(1).getReg();
2733  Register CmpVal = MI.getOperand(2).getReg();
2734  Register NewVal = MI.getOperand(3).getReg();
2735 
2737  "this should not have been custom lowered");
2738 
2739  LLT ValTy = MRI.getType(CmpVal);
2740  LLT VecTy = LLT::fixed_vector(2, ValTy);
2741 
2742  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2743 
2744  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2745  .addDef(DstReg)
2746  .addUse(PtrReg)
2747  .addUse(PackedVal)
2748  .setMemRefs(MI.memoperands());
2749 
2750  MI.eraseFromParent();
2751  return true;
2752 }
2753 
2754 bool AMDGPULegalizerInfo::legalizeFlog(
2755  MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2756  Register Dst = MI.getOperand(0).getReg();
2757  Register Src = MI.getOperand(1).getReg();
2758  LLT Ty = B.getMRI()->getType(Dst);
2759  unsigned Flags = MI.getFlags();
2760 
2761  auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2762  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2763 
2764  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2765  MI.eraseFromParent();
2766  return true;
2767 }
2768 
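// A standalone sketch (not part of this file) of the identity used by
// legalizeFlog above: log_b(x) = log2(x) * Log2BaseInverted, where
// Log2BaseInverted is ln(2) for the natural log and log10(2) for base 10.
// For example, flogSketch(X, 0.6931471805599453) approximates ln(X).
#include <cmath>

double flogSketch(double Src, double Log2BaseInverted) {
  return std::log2(Src) * Log2BaseInverted;
}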
2769 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2770  MachineIRBuilder &B) const {
2771  Register Dst = MI.getOperand(0).getReg();
2772  Register Src = MI.getOperand(1).getReg();
2773  unsigned Flags = MI.getFlags();
2774  LLT Ty = B.getMRI()->getType(Dst);
2775 
2776  auto K = B.buildFConstant(Ty, numbers::log2e);
2777  auto Mul = B.buildFMul(Ty, Src, K, Flags);
2778  B.buildFExp2(Dst, Mul, Flags);
2779  MI.eraseFromParent();
2780  return true;
2781 }
2782 
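// A standalone sketch (not part of this file) of the identity used by
// legalizeFExp above: exp(x) = exp2(x * log2(e)).
#include <cmath>

double fexpSketch(double Src) {
  const double Log2E = 1.4426950408889634;  // log2(e)
  return std::exp2(Src * Log2E);
}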
2784  MachineIRBuilder &B) const {
2785  Register Dst = MI.getOperand(0).getReg();
2786  Register Src0 = MI.getOperand(1).getReg();
2787  Register Src1 = MI.getOperand(2).getReg();
2788  unsigned Flags = MI.getFlags();
2789  LLT Ty = B.getMRI()->getType(Dst);
2790  const LLT S16 = LLT::scalar(16);
2791  const LLT S32 = LLT::scalar(32);
2792 
2793  if (Ty == S32) {
2794  auto Log = B.buildFLog2(S32, Src0, Flags);
2795  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2796  .addUse(Log.getReg(0))
2797  .addUse(Src1)
2798  .setMIFlags(Flags);
2799  B.buildFExp2(Dst, Mul, Flags);
2800  } else if (Ty == S16) {
2801  // There's no f16 fmul_legacy, so we need to convert for it.
2802  auto Log = B.buildFLog2(S16, Src0, Flags);
2803  auto Ext0 = B.buildFPExt(S32, Log, Flags);
2804  auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2805  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2806  .addUse(Ext0.getReg(0))
2807  .addUse(Ext1.getReg(0))
2808  .setMIFlags(Flags);
2809 
2810  B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2811  } else
2812  return false;
2813 
2814  MI.eraseFromParent();
2815  return true;
2816 }
2817 
2818 // Find a source register, ignoring any possible source modifiers.
2819 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2820  Register ModSrc = OrigSrc;
2821  if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2822  ModSrc = SrcFNeg->getOperand(1).getReg();
2823  if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2824  ModSrc = SrcFAbs->getOperand(1).getReg();
2825  } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2826  ModSrc = SrcFAbs->getOperand(1).getReg();
2827  return ModSrc;
2828 }
2829 
2830 bool AMDGPULegalizerInfo::legalizeFFloor(
2831  MachineInstr &MI, MachineRegisterInfo &MRI,
2832  MachineIRBuilder &B) const {
2833 
2834  const LLT S1 = LLT::scalar(1);
2835  const LLT S64 = LLT::scalar(64);
2836  Register Dst = MI.getOperand(0).getReg();
2837  Register OrigSrc = MI.getOperand(1).getReg();
2838  unsigned Flags = MI.getFlags();
2839  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2840  "this should not have been custom lowered");
2841 
2842  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2843  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2844  // efficient way to implement it is using V_FRACT_F64. The workaround for the
2845  // V_FRACT bug is:
2846  // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2847  //
2848  // Convert floor(x) to (x - fract(x))
2849 
2850  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2851  .addUse(OrigSrc)
2852  .setMIFlags(Flags);
2853 
2854  // Give source modifier matching some assistance before obscuring a foldable
2855  // pattern.
2856 
2857  // TODO: Can we avoid the neg on the fract? The input sign to fract
2858  // shouldn't matter?
2859  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2860 
2861  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2862 
2863  Register Min = MRI.createGenericVirtualRegister(S64);
2864 
2865  // We don't need to concern ourselves with the snan handling difference, so
2866  // use the one which will directly select.
2867  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2868  if (MFI->getMode().IEEE)
2869  B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2870  else
2871  B.buildFMinNum(Min, Fract, Const, Flags);
2872 
2873  Register CorrectedFract = Min;
2874  if (!MI.getFlag(MachineInstr::FmNoNans)) {
2875  auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2876  CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2877  }
2878 
2879  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2880  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2881 
2882  MI.eraseFromParent();
2883  return true;
2884 }
2885 
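// A standalone sketch (not part of this file) of the f64 floor expansion above:
// model V_FRACT_F64 as x - floor(x), clamp it just below 1.0 (the hardware
// workaround), pass NaN through, and then compute floor(x) = x - fract(x).
// std::floor is only a stand-in here for the hardware fract result; helper
// names are illustrative.
#include <cmath>

double fractWorkaroundSketch(double X) {
  double Fract = X - std::floor(X);                     // stand-in for V_FRACT_F64
  double Min = std::fmin(Fract, 0x1.fffffffffffffp-1);  // largest double < 1.0
  return std::isnan(X) ? X : Min;
}

double floorSketch(double X) {
  return X - fractWorkaroundSketch(X);                  // floor(x) = x - fract(x)
}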
2886 // Turn an illegal packed v2s16 build vector into bit operations.
2887 // TODO: This should probably be a bitcast action in LegalizerHelper.
2890  Register Dst = MI.getOperand(0).getReg();
2891  const LLT S32 = LLT::scalar(32);
2892  assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2893 
2894  Register Src0 = MI.getOperand(1).getReg();
2895  Register Src1 = MI.getOperand(2).getReg();
2896  assert(MRI.getType(Src0) == LLT::scalar(16));
2897 
2898  auto Merge = B.buildMerge(S32, {Src0, Src1});
2899  B.buildBitcast(Dst, Merge);
2900 
2901  MI.eraseFromParent();
2902  return true;
2903 }
2904 
2905 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
2906 //
2907 // Source and accumulation registers must all be 32-bits.
2908 //
2909 // TODO: When the multiply is uniform, we should produce a code sequence
2910 // that is better suited to instruction selection on the SALU. Instead of
2911 // the outer loop going over parts of the result, the outer loop should go
2912 // over parts of one of the factors. This should result in instruction
2913 // selection that makes full use of S_ADDC_U32 instructions.
2917  bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const {
2918  // Use (possibly empty) vectors of S1 registers to represent the set of
2919  // carries from one pair of positions to the next.
2920  using Carry = SmallVector<Register, 2>;
2921 
2922  MachineIRBuilder &B = Helper.MIRBuilder;
2923 
2924  const LLT S1 = LLT::scalar(1);
2925  const LLT S32 = LLT::scalar(32);
2926  const LLT S64 = LLT::scalar(64);
2927 
2928  Register Zero32;
2929  Register Zero64;
2930 
2931  auto getZero32 = [&]() -> Register {
2932  if (!Zero32)
2933  Zero32 = B.buildConstant(S32, 0).getReg(0);
2934  return Zero32;
2935  };
2936  auto getZero64 = [&]() -> Register {
2937  if (!Zero64)
2938  Zero64 = B.buildConstant(S64, 0).getReg(0);
2939  return Zero64;
2940  };
2941 
2942  // Merge the given carries into the 32-bit LocalAccum, which is modified
2943  // in-place.
2944  //
2945  // Returns the carry-out, which is a single S1 register or null.
2946  auto mergeCarry =
2947  [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
2948  if (CarryIn.empty())
2949  return Register();
2950 
2951  bool HaveCarryOut = true;
2952  Register CarryAccum;
2953  if (CarryIn.size() == 1) {
2954  if (!LocalAccum) {
2955  LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2956  return Register();
2957  }
2958 
2959  CarryAccum = getZero32();
2960  } else {
2961  CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2962  for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
2963  CarryAccum =
2964  B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
2965  .getReg(0);
2966  }
2967 
2968  if (!LocalAccum) {
2969  LocalAccum = getZero32();
2970  HaveCarryOut = false;
2971  }
2972  }
2973 
2974  auto Add =
2975  B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
2976  LocalAccum = Add.getReg(0);
2977  return HaveCarryOut ? Add.getReg(1) : Register();
2978  };
2979 
2980  // Build a multiply-add chain to compute
2981  //
2982  // LocalAccum + (partial products at DstIndex)
2983  // + (opportunistic subset of CarryIn)
2984  //
2985  // LocalAccum is an array of one or two 32-bit registers that are updated
2986  // in-place. The incoming registers may be null.
2987  //
2988  // In some edge cases, carry-ins can be consumed "for free". In that case,
2989  // the consumed carry bits are removed from CarryIn in-place.
2990  auto buildMadChain =
2991  [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
2992  -> Carry {
2993  assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
2994  (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
2995 
2996  Carry CarryOut;
2997  unsigned j0 = 0;
2998 
2999  // Use plain 32-bit multiplication for the most significant part of the
3000  // result by default.
3001  if (LocalAccum.size() == 1 &&
3002  (!UsePartialMad64_32 || !CarryIn.empty())) {
3003  do {
3004  unsigned j1 = DstIndex - j0;
3005  auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3006  if (!LocalAccum[0]) {
3007  LocalAccum[0] = Mul.getReg(0);
3008  } else {
3009  if (CarryIn.empty()) {
3010  LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3011  } else {
3012  LocalAccum[0] =
3013  B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3014  .getReg(0);
3015  CarryIn.pop_back();
3016  }
3017  }
3018  ++j0;
3019  } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3020  }
3021 
3022  // Build full 64-bit multiplies.
3023  if (j0 <= DstIndex) {
3024  bool HaveSmallAccum = false;
3025  Register Tmp;
3026 
3027  if (LocalAccum[0]) {
3028  if (LocalAccum.size() == 1) {
3029  Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3030  HaveSmallAccum = true;
3031  } else if (LocalAccum[1]) {
3032  Tmp = B.buildMerge(S64, LocalAccum).getReg(0);
3033  HaveSmallAccum = false;
3034  } else {
3035  Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3036  HaveSmallAccum = true;
3037  }
3038  } else {
3039  assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3040  Tmp = getZero64();
3041  HaveSmallAccum = true;
3042  }
3043 
3044  do {
3045  unsigned j1 = DstIndex - j0;
3046  auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3047  {Src0[j0], Src1[j1], Tmp});
3048  Tmp = Mad.getReg(0);
3049  if (!HaveSmallAccum)
3050  CarryOut.push_back(Mad.getReg(1));
3051  HaveSmallAccum = false;
3052  ++j0;
3053  } while (j0 <= DstIndex);
3054 
3055  auto Unmerge = B.buildUnmerge(S32, Tmp);
3056  LocalAccum[0] = Unmerge.getReg(0);
3057  if (LocalAccum.size() > 1)
3058  LocalAccum[1] = Unmerge.getReg(1);
3059  }
3060 
3061  return CarryOut;
3062  };
3063 
3064  // Outer multiply loop, iterating over destination parts from least
3065  // significant to most significant parts.
3066  //
3067  // The columns of the following diagram correspond to the destination parts
3068  // affected by one iteration of the outer loop (ignoring boundary
3069  // conditions).
3070  //
3071  // Dest index relative to 2 * i: 1 0 -1
3072  // ------
3073  // Carries from previous iteration: e o
3074  // Even-aligned partial product sum: E E .
3075  // Odd-aligned partial product sum: O O
3076  //
3077  // 'o' is OddCarry, 'e' is EvenCarry.
3078  // EE and OO are computed from partial products via buildMadChain and use
3079  // accumulation where possible and appropriate.
3080  //
3081  Register SeparateOddCarry;
3082  Carry EvenCarry;
3083  Carry OddCarry;
3084 
3085  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
3086  Carry OddCarryIn = std::move(OddCarry);
3087  Carry EvenCarryIn = std::move(EvenCarry);
3088  OddCarry.clear();
3089  EvenCarry.clear();
3090 
3091  // Partial products at offset 2 * i.
3092  if (2 * i < Accum.size()) {
3093  auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
3094  EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3095  }
3096 
3097  // Partial products at offset 2 * i - 1.
3098  if (i > 0) {
3099  if (!SeparateOddAlignedProducts) {
3100  auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
3101  OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3102  } else {
3103  bool IsHighest = 2 * i >= Accum.size();
3104  Register SeparateOddOut[2];
3105  auto LocalAccum = makeMutableArrayRef(SeparateOddOut)
3106  .take_front(IsHighest ? 1 : 2);
3107  OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3108 
3109  MachineInstr *Lo;
3110 
3111  if (i == 1) {
3112  if (!IsHighest)
3113  Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3114  else
3115  Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3116  } else {
3117  Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3118  SeparateOddCarry);
3119  }
3120  Accum[2 * i - 1] = Lo->getOperand(0).getReg();
3121 
3122  if (!IsHighest) {
3123  auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
3124  Lo->getOperand(1).getReg());
3125  Accum[2 * i] = Hi.getReg(0);
3126  SeparateOddCarry = Hi.getReg(1);
3127  }
3128  }
3129  }
3130 
3131  // Add in the carries from the previous iteration
3132  if (i > 0) {
3133  if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
3134  EvenCarryIn.push_back(CarryOut);
3135 
3136  if (2 * i < Accum.size()) {
3137  if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
3138  OddCarry.push_back(CarryOut);
3139  }
3140  }
3141  }
3142 }
3143 
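// A standalone sketch (not part of this file) of the carry structure the
// routine above builds with MAD_64_32: schoolbook multiplication over 32-bit
// limbs (little-endian), where each 32x32->64 multiply-add keeps its low half
// in the current column and carries its high half into the next one. The
// result is truncated to A.size() limbs, matching a plain wide G_MUL; the
// helper name is illustrative.
#include <cstdint>
#include <vector>

std::vector<uint32_t> mulLimbsSketch(const std::vector<uint32_t> &A,
                                     const std::vector<uint32_t> &B) {
  const size_t N = A.size();
  std::vector<uint32_t> Acc(N, 0);
  for (size_t I = 0; I < N; ++I) {
    uint64_t Carry = 0;
    for (size_t J = 0; J < B.size() && I + J < N; ++J) {
      // 32x32->64 multiply-add with carry-in; the sum cannot overflow 64 bits.
      uint64_t Mad = (uint64_t)A[I] * B[J] + Acc[I + J] + Carry;
      Acc[I + J] = (uint32_t)Mad;  // low half stays in this column
      Carry = Mad >> 32;           // high half moves to the next column
    }
    // Carries that would land beyond the truncated width are dropped.
  }
  return Acc;
}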
3144 // Custom narrowing of wide multiplies using wide multiply-add instructions.
3145 //
3146 // TODO: If the multiply is followed by an addition, we should attempt to
3147 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
3148 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
3149  MachineInstr &MI) const {
3150  assert(ST.hasMad64_32());
3151  assert(MI.getOpcode() == TargetOpcode::G_MUL);
3152 
3153  MachineIRBuilder &B = Helper.MIRBuilder;
3154  MachineRegisterInfo &MRI = *B.getMRI();
3155 
3156  Register DstReg = MI.getOperand(0).getReg();
3157  Register Src0 = MI.getOperand(1).getReg();
3158  Register Src1 = MI.getOperand(2).getReg();
3159 
3160  LLT Ty = MRI.getType(DstReg);
3161  assert(Ty.isScalar());
3162 
3163  unsigned Size = Ty.getSizeInBits();
3164  unsigned NumParts = Size / 32;
3165  assert((Size % 32) == 0);
3166  assert(NumParts >= 2);
3167 
3168  // Whether to use MAD_64_32 for partial products whose high half is
3169  // discarded. This avoids some ADD instructions but risks false dependency
3170  // stalls on some subtargets in some cases.
3171  const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
3172 
3173  // Whether to compute odd-aligned partial products separately. This is
3174  // advisable on subtargets where the accumulator of MAD_64_32 must be placed
3175  // in an even-aligned VGPR.
3176  const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
3177 
3178  LLT S32 = LLT::scalar(32);
3179  SmallVector<Register, 2> Src0Parts, Src1Parts;
3180  for (unsigned i = 0; i < NumParts; ++i) {
3181  Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
3182  Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
3183  }
3184  B.buildUnmerge(Src0Parts, Src0);
3185  B.buildUnmerge(Src1Parts, Src1);
3186 
3187  SmallVector<Register, 2> AccumRegs(NumParts);
3188  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
3189  SeparateOddAlignedProducts);
3190 
3191  B.buildMerge(DstReg, AccumRegs);
3192  MI.eraseFromParent();
3193  return true;
3194 
3195 }
3196 
3197 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
3198 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
3199 // case with a single min instruction instead of a compare+select.
3200 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(
3201  MachineInstr &MI, MachineRegisterInfo &MRI,
3202  MachineIRBuilder &B) const {
3203  Register Dst = MI.getOperand(0).getReg();
3204  Register Src = MI.getOperand(1).getReg();
3205  LLT DstTy = MRI.getType(Dst);
3206  LLT SrcTy = MRI.getType(Src);
3207 
3208  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
3209  ? AMDGPU::G_AMDGPU_FFBH_U32
3210  : AMDGPU::G_AMDGPU_FFBL_B32;
3211  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
3212  B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
3213 
3214  MI.eraseFromParent();
3215  return true;
3216 }
3217 
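// A standalone sketch (not part of this file) of the fix-up above: ffbh/ffbl
// style instructions return 0xffffffff for a zero input, so a single unsigned
// min against the bit width produces the value G_CTLZ/G_CTTZ expect for zero.
// Helper names are illustrative.
#include <algorithm>
#include <cstdint>

uint32_t ffbhModel(uint32_t X) {  // leading-zero count, ~0u for X == 0
  if (X == 0)
    return 0xffffffffu;
  uint32_t N = 0;
  while (!(X & 0x80000000u)) {
    X <<= 1;
    ++N;
  }
  return N;
}

uint32_t ctlzSketch(uint32_t X) {
  return std::min(ffbhModel(X), 32u);  // the min folds in the zero-input case
}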
3218 // Check that this is a G_XOR x, -1
3219 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
3220  if (MI.getOpcode() != TargetOpcode::G_XOR)
3221  return false;
3222  auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
3223  return ConstVal && *ConstVal == -1;
3224 }
3225 
3226 // Return the use branch instruction, otherwise null if the usage is invalid.
3227 static MachineInstr *
3229  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
3230  Register CondDef = MI.getOperand(0).getReg();
3231  if (!MRI.hasOneNonDBGUse(CondDef))
3232  return nullptr;
3233 
3234  MachineBasicBlock *Parent = MI.getParent();
3236 
3237  if (isNot(MRI, *UseMI)) {
3238  Register NegatedCond = UseMI->getOperand(0).getReg();
3239  if (!MRI.hasOneNonDBGUse(NegatedCond))
3240  return nullptr;
3241 
3242  // We're deleting the def of this value, so we need to remove it.
3243  eraseInstr(*UseMI, MRI);
3244 
3245  UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
3246  Negated = true;
3247  }
3248 
3249  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
3250  return nullptr;
3251 
3252  // Make sure the cond br is followed by a G_BR, or is the last instruction.
3253  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
3254  if (Next == Parent->end()) {
3255  MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
3256  if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
3257  return nullptr;
3258  UncondBrTarget = &*NextMBB;
3259  } else {
3260  if (Next->getOpcode() != AMDGPU::G_BR)
3261  return nullptr;
3262  Br = &*Next;
3263  UncondBrTarget = Br->getOperand(0).getMBB();
3264  }
3265 
3266  return UseMI;
3267 }
3268 
3270  const ArgDescriptor *Arg,
3271  const TargetRegisterClass *ArgRC,
3272  LLT ArgTy) const {
3273  MCRegister SrcReg = Arg->getRegister();
3274  assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
3275  assert(DstReg.isVirtual() && "Virtual register expected");
3276 
3277  Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
3278  *ArgRC, B.getDebugLoc(), ArgTy);
3279  if (Arg->isMasked()) {
3280  // TODO: Should we try to emit this once in the entry block?
3281  const LLT S32 = LLT::scalar(32);
3282  const unsigned Mask = Arg->getMask();
3283  const unsigned Shift = countTrailingZeros<unsigned>(Mask);
3284 
3285  Register AndMaskSrc = LiveIn;
3286 
3287  // TODO: Avoid clearing the high bits if we know workitem id y/z are always
3288  // 0.
3289  if (Shift != 0) {
3290  auto ShiftAmt = B.buildConstant(S32, Shift);
3291  AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
3292  }
3293 
3294  B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
3295  } else {
3296  B.buildCopy(DstReg, LiveIn);
3297  }
3298 
3299  return true;
3300 }
3301 
3303  Register DstReg, MachineIRBuilder &B,
3304  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3305  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3306  const ArgDescriptor *Arg;
3307  const TargetRegisterClass *ArgRC;
3308  LLT ArgTy;
3309  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3310 
3311  if (!Arg) {
3312  if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
3313  // The intrinsic may appear when we have a 0 sized kernarg segment, in which
3314  // case the pointer argument may be missing and we use null.
3315  B.buildConstant(DstReg, 0);
3316  return true;
3317  }
3318 
3319  // It's undefined behavior if a function marked with the amdgpu-no-*
3320  // attributes uses the corresponding intrinsic.
3321  B.buildUndef(DstReg);
3322  return true;
3323  }
3324 
3325  if (!Arg->isRegister() || !Arg->getRegister().isValid())
3326  return false; // TODO: Handle these
3327  return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
3328 }
3329 
3332  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3333  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3334  return false;
3335 
3336  MI.eraseFromParent();
3337  return true;
3338 }
3339 
3341  int64_t C) {
3342  B.buildConstant(MI.getOperand(0).getReg(), C);
3343  MI.eraseFromParent();
3344  return true;
3345 }
3346 
3349  unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3350  unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
3351  if (MaxID == 0)
3352  return replaceWithConstant(B, MI, 0);
3353 
3354  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3355  const ArgDescriptor *Arg;
3356  const TargetRegisterClass *ArgRC;
3357  LLT ArgTy;
3358  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3359 
3360  Register DstReg = MI.getOperand(0).getReg();
3361  if (!Arg) {
3362  // It's undefined behavior if a function marked with the amdgpu-no-*
3363  // attributes uses the corresponding intrinsic.
3364  B.buildUndef(DstReg);
3365  MI.eraseFromParent();
3366  return true;
3367  }
3368 
3369  if (Arg->isMasked()) {
3370  // Don't bother inserting AssertZext for packed IDs since we're emitting the
3371  // masking operations anyway.
3372  //
3373  // TODO: We could assert the top bit is 0 for the source copy.
3374  if (!loadInputValue(DstReg, B, ArgType))
3375  return false;
3376  } else {
3377  Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
3378  if (!loadInputValue(TmpReg, B, ArgType))
3379  return false;
3380  B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID));
3381  }
3382 
3383  MI.eraseFromParent();
3384  return true;
3385 }
3386 
3387 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
3388  int64_t Offset) const {
3389  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3390  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
3391 
3392  // TODO: If we passed in the base kernel offset we could have a better
3393  // alignment than 4, but we don't really need it.
3394  if (!loadInputValue(KernArgReg, B,
3395  AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3396  llvm_unreachable("failed to find kernarg segment ptr");
3397 
3398  auto COffset = B.buildConstant(LLT::scalar(64), Offset);
3399  // TODO: Should get nuw
3400  return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
3401 }
3402 
3403 /// Legalize a value that's loaded from kernel arguments. This is only used by
3404 /// legacy intrinsics.
3405 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
3406  MachineIRBuilder &B,
3407  uint64_t Offset,
3408  Align Alignment) const {
3409  Register DstReg = MI.getOperand(0).getReg();
3410 
3411  assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
3412  "unexpected kernarg parameter type");
3413 
3414  Register Ptr = getKernargParameterPtr(B, Offset);
3415  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
3416  B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
3417  MachineMemOperand::MODereferenceable |
3418  MachineMemOperand::MOInvariant);
3419  MI.eraseFromParent();
3420  return true;
3421 }
3422 
3423 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
3424  MachineRegisterInfo &MRI,
3425  MachineIRBuilder &B) const {
3426  Register Dst = MI.getOperand(0).getReg();
3427  LLT DstTy = MRI.getType(Dst);
3428  LLT S16 = LLT::scalar(16);
3429  LLT S32 = LLT::scalar(32);
3430  LLT S64 = LLT::scalar(64);
3431 
3432  if (DstTy == S16)
3433  return legalizeFDIV16(MI, MRI, B);
3434  if (DstTy == S32)
3435  return legalizeFDIV32(MI, MRI, B);
3436  if (DstTy == S64)
3437  return legalizeFDIV64(MI, MRI, B);
3438 
3439  return false;
3440 }
3441 
3442 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
3443  Register DstDivReg,
3444  Register DstRemReg,
3445  Register X,
3446  Register Y) const {
3447  const LLT S1 = LLT::scalar(1);
3448  const LLT S32 = LLT::scalar(32);
3449 
3450  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
3451  // algorithm used here.
3452 
3453  // Initial estimate of inv(y).
3454  auto FloatY = B.buildUITOFP(S32, Y);
3455  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
3456  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
3457  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
3458  auto Z = B.buildFPTOUI(S32, ScaledY);
3459 
3460  // One round of UNR (unsigned Newton-Raphson) refinement of the estimate.
3461  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
3462  auto NegYZ = B.buildMul(S32, NegY, Z);
3463  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
3464 
3465  // Quotient/remainder estimate.
3466  auto Q = B.buildUMulH(S32, X, Z);
3467  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
3468 
3469  // First quotient/remainder refinement.
3470  auto One = B.buildConstant(S32, 1);
3471  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3472  if (DstDivReg)
3473  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
3474  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
3475 
3476  // Second quotient/remainder refinement.
3477  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3478  if (DstDivReg)
3479  B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3480 
3481  if (DstRemReg)
3482  B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3483 }
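// Illustrative sketch (not part of the in-tree lowering): the same
// reciprocal-based unsigned 32-bit division written as plain scalar C++, with
// an ordinary float divide standing in for G_AMDGPU_RCP_IFLAG. The helper
// name is hypothetical, and the real sequence relies on the precision
// guarantees of V_RCP_IFLAG_F32, so treat this only as a sketch of the data
// flow rather than a drop-in replacement.
static std::pair<uint32_t, uint32_t> udivrem32Sketch(uint32_t X, uint32_t Y) {
  float Rcp = 1.0f / (float)Y;                      // initial estimate of 1/Y
  uint32_t Z = (uint32_t)(Rcp * 4294966784.0f);     // scale by ~2^32 (0x4f7ffffe)
  uint32_t NegYZ = (0u - Y) * Z;                    // low 32 bits of -Y * Z
  Z += (uint32_t)(((uint64_t)Z * NegYZ) >> 32);     // one Newton-Raphson round
  uint32_t Q = (uint32_t)(((uint64_t)X * Z) >> 32); // quotient estimate
  uint32_t R = X - Q * Y;                           // remainder estimate
  if (R >= Y) { ++Q; R -= Y; }                      // first refinement
  if (R >= Y) { ++Q; R -= Y; }                      // second refinement
  return {Q, R};
}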
3484 
3485 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
3486 //
3487 // Return lo, hi of result
3488 //
3489 // %cvt.lo = G_UITOFP Val.lo
3490 // %cvt.hi = G_UITOFP Val.hi
3491 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3492 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3493 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3494 // %mul2 = G_FMUL %mul1, 2**(-32)
3495 // %trunc = G_INTRINSIC_TRUNC %mul2
3496 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3497 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
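// Put differently (an illustrative summary, not from the original source):
// with Val = hi * 2^32 + lo,
//   %mad   ~= Val as an f32 (hi * 2^32 + lo)
//   %mul1  ~= (2^64 - epsilon) / Val     (0x5f7ffffc is just below 2^64)
//   %trunc =  floor(%mul1 * 2^-32)       -> high 32 bits of ~2^64 / Val
//   %mad2  =  %mul1 - %trunc * 2^32      -> low 32 bits of ~2^64 / Val
// so the returned {lo, hi} pair approximates floor(2^64 / Val) and is used
// below as the starting estimate for the Newton-Raphson refinement in the
// 64-bit division expansion.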
3498 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3499  Register Val) {
3500  const LLT S32 = LLT::scalar(32);
3501  auto Unmerge = B.buildUnmerge(S32, Val);
3502 
3503  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3504  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3505 
3506  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3507  B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3508 
3509  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3510  auto Mul1 =
3511  B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3512 
3513  // 2**(-32)
3514  auto Mul2 =
3515  B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3516  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3517 
3518  // -(2**32)
3519  auto Mad2 = B.buildFMAD(S32, Trunc,
3520  B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3521 
3522  auto ResultLo = B.buildFPTOUI(S32, Mad2);
3523  auto ResultHi = B.buildFPTOUI(S32, Trunc);
3524 
3525  return {ResultLo.getReg(0), ResultHi.getReg(0)};
3526 }
3527 
3528 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3529  Register DstDivReg,
3530  Register DstRemReg,
3531  Register Numer,
3532  Register Denom) const {
3533  const LLT S32 = LLT::scalar(32);
3534  const LLT S64 = LLT::scalar(64);
3535  const LLT S1 = LLT::scalar(1);
3536  Register RcpLo, RcpHi;
3537 
3538  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3539 
3540  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3541 
3542  auto Zero64 = B.buildConstant(S64, 0);
3543  auto NegDenom = B.buildSub(S64, Zero64, Denom);
3544 
3545  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3546  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3547 
3548  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3549  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3550  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3551 
3552  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3553  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3554  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3555 
3556  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3557  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3558  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3559  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3560  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3561 
3562  auto Zero32 = B.buildConstant(S32, 0);
3563  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3564  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3565  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3566 
3567  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3568  Register NumerLo = UnmergeNumer.getReg(0);
3569  Register NumerHi = UnmergeNumer.getReg(1);
3570 
3571  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3572  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3573  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3574  Register Mul3_Lo = UnmergeMul3.getReg(0);
3575  Register Mul3_Hi = UnmergeMul3.getReg(1);
3576  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3577  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3578  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3579  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3580 
3581  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3582  Register DenomLo = UnmergeDenom.getReg(0);
3583  Register DenomHi = UnmergeDenom.getReg(1);
3584 
3585  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3586  auto C1 = B.buildSExt(S32, CmpHi);
3587 
3588  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3589  auto C2 = B.buildSExt(S32, CmpLo);
3590 
3591  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3592  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3593 
3594  // TODO: Here and below portions of the code can be enclosed into if/endif.
3595  // Currently control flow is unconditional and we have 4 selects after
3596  // potential endif to substitute PHIs.
3597 
3598  // if C3 != 0 ...
3599  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3600  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3601  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3602  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3603 
3604  auto One64 = B.buildConstant(S64, 1);
3605  auto Add3 = B.buildAdd(S64, MulHi3, One64);
3606 
3607  auto C4 =
3608  B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3609  auto C5 =
3610  B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3611  auto C6 = B.buildSelect(
3612  S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3613 
3614  // if (C6 != 0)
3615  auto Add4 = B.buildAdd(S64, Add3, One64);
3616  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3617 
3618  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3619  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3620  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3621 
3622  // endif C6
3623  // endif C3
3624 
3625  if (DstDivReg) {
3626  auto Sel1 = B.buildSelect(
3627  S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3628  B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3629  Sel1, MulHi3);
3630  }
3631 
3632  if (DstRemReg) {
3633  auto Sel2 = B.buildSelect(
3634  S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3635  B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3636  Sel2, Sub1);
3637  }
3638 }
3639 
3640 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3641  MachineRegisterInfo &MRI,
3642  MachineIRBuilder &B) const {
3643  Register DstDivReg, DstRemReg;
3644  switch (MI.getOpcode()) {
3645  default:
3646  llvm_unreachable("Unexpected opcode!");
3647  case AMDGPU::G_UDIV: {
3648  DstDivReg = MI.getOperand(0).getReg();
3649  break;
3650  }
3651  case AMDGPU::G_UREM: {
3652  DstRemReg = MI.getOperand(0).getReg();
3653  break;
3654  }
3655  case AMDGPU::G_UDIVREM: {
3656  DstDivReg = MI.getOperand(0).getReg();
3657  DstRemReg = MI.getOperand(1).getReg();
3658  break;
3659  }
3660  }
3661 
3662  const LLT S64 = LLT::scalar(64);
3663  const LLT S32 = LLT::scalar(32);
3664  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3665  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3666  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3667  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3668 
3669  if (Ty == S32)
3670  legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3671  else if (Ty == S64)
3672  legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3673  else
3674  return false;
3675 
3676  MI.eraseFromParent();
3677  return true;
3678 }
3679 
3680 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3681  MachineRegisterInfo &MRI,
3682  MachineIRBuilder &B) const {
3683  const LLT S64 = LLT::scalar(64);
3684  const LLT S32 = LLT::scalar(32);
3685 
3686  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3687  if (Ty != S32 && Ty != S64)
3688  return false;
3689 
3690  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3691  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3692  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3693 
3694  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3695  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3696  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3697 
3698  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3699  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3700 
3701  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3702  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3703 
3704  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3705  switch (MI.getOpcode()) {
3706  default:
3707  llvm_unreachable("Unexpected opcode!");
3708  case AMDGPU::G_SDIV: {
3709  DstDivReg = MI.getOperand(0).getReg();
3710  TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3711  break;
3712  }
3713  case AMDGPU::G_SREM: {
3714  DstRemReg = MI.getOperand(0).getReg();
3715  TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3716  break;
3717  }
3718  case AMDGPU::G_SDIVREM: {
3719  DstDivReg = MI.getOperand(0).getReg();
3720  DstRemReg = MI.getOperand(1).getReg();
3721  TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3722  TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3723  break;
3724  }
3725  }
3726 
3727  if (Ty == S32)
3728  legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3729  else
3730  legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3731 
3732  if (DstDivReg) {
3733  auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3734  auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3735  B.buildSub(DstDivReg, SignXor, Sign);
3736  }
3737 
3738  if (DstRemReg) {
3739  auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3740  auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3741  B.buildSub(DstRemReg, SignXor, Sign);
3742  }
3743 
3744  MI.eraseFromParent();
3745  return true;
3746 }
3747 
3748 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3749  MachineRegisterInfo &MRI,
3750  MachineIRBuilder &B) const {
3751  Register Res = MI.getOperand(0).getReg();
3752  Register LHS = MI.getOperand(1).getReg();
3753  Register RHS = MI.getOperand(2).getReg();
3754  uint16_t Flags = MI.getFlags();
3755  LLT ResTy = MRI.getType(Res);
3756 
3757  const MachineFunction &MF = B.getMF();
3758  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3759  MI.getFlag(MachineInstr::FmAfn);
3760 
3761  if (!AllowInaccurateRcp)
3762  return false;
3763 
3764  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3765  // 1 / x -> RCP(x)
3766  if (CLHS->isExactlyValue(1.0)) {
3767  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3768  .addUse(RHS)
3769  .setMIFlags(Flags);
3770 
3771  MI.eraseFromParent();
3772  return true;
3773  }
3774 
3775  // -1 / x -> RCP( FNEG(x) )
3776  if (CLHS->isExactlyValue(-1.0)) {
3777  auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3778  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3779  .addUse(FNeg.getReg(0))
3780  .setMIFlags(Flags);
3781 
3782  MI.eraseFromParent();
3783  return true;
3784  }
3785  }
3786 
3787  // x / y -> x * (1.0 / y)
3788  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3789  .addUse(RHS)
3790  .setMIFlags(Flags);
3791  B.buildFMul(Res, LHS, RCP, Flags);
3792 
3793  MI.eraseFromParent();
3794  return true;
3795 }
3796 
3797 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3798  MachineRegisterInfo &MRI,
3799  MachineIRBuilder &B) const {
3800  Register Res = MI.getOperand(0).getReg();
3801  Register X = MI.getOperand(1).getReg();
3802  Register Y = MI.getOperand(2).getReg();
3803  uint16_t Flags = MI.getFlags();
3804  LLT ResTy = MRI.getType(Res);
3805 
3806  const MachineFunction &MF = B.getMF();
3807  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3808  MI.getFlag(MachineInstr::FmAfn);
3809 
3810  if (!AllowInaccurateRcp)
3811  return false;
3812 
3813  auto NegY = B.buildFNeg(ResTy, Y);
3814  auto One = B.buildFConstant(ResTy, 1.0);
3815 
3816  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3817  .addUse(Y)
3818  .setMIFlags(Flags);
3819 
3820  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3821  R = B.buildFMA(ResTy, Tmp0, R, R);
3822 
3823  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3824  R = B.buildFMA(ResTy, Tmp1, R, R);
3825 
3826  auto Ret = B.buildFMul(ResTy, X, R);
3827  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3828 
3829  B.buildFMA(Res, Tmp2, R, Ret);
3830  MI.eraseFromParent();
3831  return true;
3832 }
3833 
3834 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3835  MachineRegisterInfo &MRI,
3836  MachineIRBuilder &B) const {
3837  if (legalizeFastUnsafeFDIV(MI, MRI, B))
3838  return true;
3839 
3840  Register Res = MI.getOperand(0).getReg();
3841  Register LHS = MI.getOperand(1).getReg();
3842  Register RHS = MI.getOperand(2).getReg();
3843 
3844  uint16_t Flags = MI.getFlags();
3845 
3846  LLT S16 = LLT::scalar(16);
3847  LLT S32 = LLT::scalar(32);
3848 
3849  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3850  auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3851 
3852  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3853  .addUse(RHSExt.getReg(0))
3854  .setMIFlags(Flags);
3855 
3856  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3857  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3858 
3859  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3860  .addUse(RDst.getReg(0))
3861  .addUse(RHS)
3862  .addUse(LHS)
3863  .setMIFlags(Flags);
3864 
3865  MI.eraseFromParent();
3866  return true;
3867 }
3868 
3869 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3870 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
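// For reference (a sketch, assuming the usual SI mode-register layout): the
// 4-bit FP_DENORM field keeps the FP32 control in bits [1:0] and the
// FP64/FP16 control in bits [3:2], which is why the S_DENORM_MODE immediate
// below is built as SPDenormMode | (DPDenormModeDefault << 2). The
// S_SETREG_IMM32_B32 fallback instead writes only the 2-bit FP32 subfield of
// the MODE hardware register, selected by the hwreg bit-field immediate.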
3871 static void toggleSPDenormMode(bool Enable,
3872  MachineIRBuilder &B,
3873  const GCNSubtarget &ST,
3874  AMDGPU::SIModeRegisterDefaults Mode) {
3875  // Set SP denorm mode to this value.
3876  unsigned SPDenormMode =
3877  Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3878 
3879  if (ST.hasDenormModeInst()) {
3880  // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3881  uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3882 
3883  uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3884  B.buildInstr(AMDGPU::S_DENORM_MODE)
3885  .addImm(NewDenormModeValue);
3886 
3887  } else {
3888  // Select FP32 bit field in mode register.
3889  unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3890  (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3891  (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3892 
3893  B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3894  .addImm(SPDenormMode)
3895  .addImm(SPDenormModeBitField);
3896  }
3897 }
3898 
3899 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3900  MachineRegisterInfo &MRI,
3901  MachineIRBuilder &B) const {
3902  if (legalizeFastUnsafeFDIV(MI, MRI, B))
3903  return true;
3904 
3905  Register Res = MI.getOperand(0).getReg();
3906  Register LHS = MI.getOperand(1).getReg();
3907  Register RHS = MI.getOperand(2).getReg();
3908  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3909  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3910 
3911  uint16_t Flags = MI.getFlags();
3912 
3913  LLT S32 = LLT::scalar(32);
3914  LLT S1 = LLT::scalar(1);
3915 
3916  auto One = B.buildFConstant(S32, 1.0f);
3917 
3918  auto DenominatorScaled =
3919  B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3920  .addUse(LHS)
3921  .addUse(RHS)
3922  .addImm(0)
3923  .setMIFlags(Flags);
3924  auto NumeratorScaled =
3925  B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3926  .addUse(LHS)
3927  .addUse(RHS)
3928  .addImm(1)
3929  .setMIFlags(Flags);
3930 
3931  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3932  .addUse(DenominatorScaled.getReg(0))
3933  .setMIFlags(Flags);
3934  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3935 
3936  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3937  // aren't modeled as reading it.
3938  if (!Mode.allFP32Denormals())
3939  toggleSPDenormMode(true, B, ST, Mode);
3940 
3941  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3942  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3943  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3944  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3945  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3946  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3947 
3948  if (!Mode.allFP32Denormals())
3949  toggleSPDenormMode(false, B, ST, Mode);
3950 
3951  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3952  .addUse(Fma4.getReg(0))
3953  .addUse(Fma1.getReg(0))
3954  .addUse(Fma3.getReg(0))
3955  .addUse(NumeratorScaled.getReg(1))
3956  .setMIFlags(Flags);
3957 
3958  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3959  .addUse(Fmas.getReg(0))
3960  .addUse(RHS)
3961  .addUse(LHS)
3962  .setMIFlags(Flags);
3963 
3964  MI.eraseFromParent();
3965  return true;
3966 }
3967 
3968 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3969  MachineRegisterInfo &MRI,
3970  MachineIRBuilder &B) const {
3971  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3972  return true;
3973 
3974  Register Res = MI.getOperand(0).getReg();
3975  Register LHS = MI.getOperand(1).getReg();
3976  Register RHS = MI.getOperand(2).getReg();
3977 
3978  uint16_t Flags = MI.getFlags();
3979 
3980  LLT S64 = LLT::scalar(64);
3981  LLT S1 = LLT::scalar(1);
3982 
3983  auto One = B.buildFConstant(S64, 1.0);
3984 
3985  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3986  .addUse(LHS)
3987  .addUse(RHS)
3988  .addImm(0)
3989  .setMIFlags(Flags);
3990 
3991  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3992 
3993  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3994  .addUse(DivScale0.getReg(0))
3995  .setMIFlags(Flags);
3996 
3997  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3998  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3999  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
4000 
4001  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
4002  .addUse(LHS)
4003  .addUse(RHS)
4004  .addImm(1)
4005  .setMIFlags(Flags);
4006 
4007  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
4008  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
4009  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
4010 
4011  Register Scale;
4012  if (!ST.hasUsableDivScaleConditionOutput()) {
4013  // Workaround a hardware bug on SI where the condition output from div_scale
4014  // is not usable.
4015 
4016  LLT S32 = LLT::scalar(32);
4017 
4018  auto NumUnmerge = B.buildUnmerge(S32, LHS);
4019  auto DenUnmerge = B.buildUnmerge(S32, RHS);
4020  auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4021  auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4022 
4023  auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4024  Scale1Unmerge.getReg(1));
4025  auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4026  Scale0Unmerge.getReg(1));
4027  Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4028  } else {
4029  Scale = DivScale1.getReg(1);
4030  }
4031 
4032  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
4033  .addUse(Fma4.getReg(0))
4034  .addUse(Fma3.getReg(0))
4035  .addUse(Mul.getReg(0))
4036  .addUse(Scale)
4037  .setMIFlags(Flags);
4038 
4039  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
4040  .addUse(Fmas.getReg(0))
4041  .addUse(RHS)
4042  .addUse(LHS)
4043  .setMIFlags(Flags);
4044 
4045  MI.eraseFromParent();
4046  return true;
4047 }
4048 
4049 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
4050  MachineRegisterInfo &MRI,
4051  MachineIRBuilder &B) const {
4052  Register Res = MI.getOperand(0).getReg();
4053  Register LHS = MI.getOperand(2).getReg();
4054  Register RHS = MI.getOperand(3).getReg();
4055  uint16_t Flags = MI.getFlags();
4056 
4057  LLT S32 = LLT::scalar(32);
4058  LLT S1 = LLT::scalar(1);
4059 
4060  auto Abs = B.buildFAbs(S32, RHS, Flags);
4061  const APFloat C0Val(1.0f);
4062 
4063  auto C0 = B.buildConstant(S32, 0x6f800000);
4064  auto C1 = B.buildConstant(S32, 0x2f800000);
4065  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
4066 
4067  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
4068  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
4069 
4070  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
4071 
4072  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
4073  .addUse(Mul0.getReg(0))
4074  .setMIFlags(Flags);
4075 
4076  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
4077 
4078  B.buildFMul(Res, Sel, Mul1, Flags);
4079 
4080  MI.eraseFromParent();
4081  return true;
4082 }
4083 
4084 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
4085 // FIXME: Why do we handle this one but not other removed instructions?
4086 //
4087 // Reciprocal square root. The clamp prevents infinite results, clamping
4088 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
4089 // +-max_float.
4090 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
4091  MachineRegisterInfo &MRI,
4092  MachineIRBuilder &B) const {
4094  return true;
4095 
4096  Register Dst = MI.getOperand(0).getReg();
4097  Register Src = MI.getOperand(2).getReg();
4098  auto Flags = MI.getFlags();
4099 
4100  LLT Ty = MRI.getType(Dst);
4101 
4102  const fltSemantics *FltSemantics;
4103  if (Ty == LLT::scalar(32))
4104  FltSemantics = &APFloat::IEEEsingle();
4105  else if (Ty == LLT::scalar(64))
4106  FltSemantics = &APFloat::IEEEdouble();
4107  else
4108  return false;
4109 
4110  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
4111  .addUse(Src)
4112  .setMIFlags(Flags);
4113 
4114  // We don't need to concern ourselves with the snan handling difference, since
4115  // the rsq has already quieted it (or not); use whichever min/max form will directly select.
4116  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4117  const bool UseIEEE = MFI->getMode().IEEE;
4118 
4119  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
4120  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
4121  B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
4122 
4123  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
4124 
4125  if (UseIEEE)
4126  B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
4127  else
4128  B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
4129  MI.eraseFromParent();
4130  return true;
4131 }
4132 
4133 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
4134  switch (IID) {
4135  case Intrinsic::amdgcn_ds_fadd:
4136  return AMDGPU::G_ATOMICRMW_FADD;
4137  case Intrinsic::amdgcn_ds_fmin:
4138  return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
4139  case Intrinsic::amdgcn_ds_fmax:
4140  return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
4141  default:
4142  llvm_unreachable("not a DS FP intrinsic");
4143  }
4144 }
4145 
4146 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
4147  MachineInstr &MI,
4148  Intrinsic::ID IID) const {
4149  GISelChangeObserver &Observer = Helper.Observer;
4150  Observer.changingInstr(MI);
4151 
4152  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
4153 
4154  // The remaining operands were used to set fields in the MemOperand on
4155  // construction.
4156  for (int I = 6; I > 3; --I)
4157  MI.removeOperand(I);
4158 
4159  MI.removeOperand(1); // Remove the intrinsic ID.
4160  Observer.changedInstr(MI);
4161  return true;
4162 }
4163 
4164 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
4165  MachineRegisterInfo &MRI,
4166  MachineIRBuilder &B) const {
4167  uint64_t Offset =
4168  ST.getTargetLowering()->getImplicitParameterOffset(
4169  B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
4170  LLT DstTy = MRI.getType(DstReg);
4171  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
4172 
4173  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
4174  if (!loadInputValue(KernargPtrReg, B,
4175  AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4176  return false;
4177 
4178  // FIXME: This should be nuw
4179  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
4180  return true;
4181 }
4182 
4183 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
4184  MachineRegisterInfo &MRI,
4185  MachineIRBuilder &B) const {
4186  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4187  if (!MFI->isEntryFunction()) {
4188  return legalizePreloadedArgIntrin(MI, MRI, B,
4189  AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
4190  }
4191 
4192  Register DstReg = MI.getOperand(0).getReg();
4193  if (!getImplicitArgPtr(DstReg, MRI, B))
4194  return false;
4195 
4196  MI.eraseFromParent();
4197  return true;
4198 }
4199 
4200 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
4201  MachineRegisterInfo &MRI,
4202  MachineIRBuilder &B,
4203  unsigned AddrSpace) const {
4204  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
4205  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
4206  Register Hi32 = Unmerge.getReg(1);
4207 
4208  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
4209  MI.eraseFromParent();
4210  return true;
4211 }
4212 
4213 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
4214 // offset (the offset that is included in bounds checking and swizzling, to be
4215 // split between the instruction's voffset and immoffset fields) and soffset
4216 // (the offset that is excluded from bounds checking and swizzling, to go in
4217 // the instruction's soffset field). This function takes the first kind of
4218 // offset and figures out how to split it between voffset and immoffset.
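// A worked example (illustrative only): if the combined offset folds to the
// constant 5000, it is split as voffset = 4096 and immoffset = 904, keeping
// the voffset part a multiple of 4096 so it can be CSEd across neighboring
// accesses. A constant 8192 becomes voffset = 8192, immoffset = 0, and a
// small constant such as 100 becomes voffset = 0, immoffset = 100.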
4219 std::pair<Register, unsigned>
4220 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
4221  Register OrigOffset) const {
4222  const unsigned MaxImm = 4095;
4223  Register BaseReg;
4224  unsigned ImmOffset;
4225  const LLT S32 = LLT::scalar(32);
4226  MachineRegisterInfo &MRI = *B.getMRI();
4227 
4228  std::tie(BaseReg, ImmOffset) =
4229  AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
4230 
4231  // If BaseReg is a pointer, convert it to int.
4232  if (MRI.getType(BaseReg).isPointer())
4233  BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
4234 
4235  // If the immediate value is too big for the immoffset field, put the value
4236  // and -4096 into the immoffset field so that the value that is copied/added
4237  // for the voffset field is a multiple of 4096, and it stands more chance
4238  // of being CSEd with the copy/add for another similar load/store.
4239  // However, do not do that rounding down to a multiple of 4096 if that is a
4240  // negative number, as it appears to be illegal to have a negative offset
4241  // in the vgpr, even if adding the immediate offset makes it positive.
4242  unsigned Overflow = ImmOffset & ~MaxImm;
4243  ImmOffset -= Overflow;
4244  if ((int32_t)Overflow < 0) {
4245  Overflow += ImmOffset;
4246  ImmOffset = 0;
4247  }
4248 
4249  if (Overflow != 0) {
4250  if (!BaseReg) {
4251  BaseReg = B.buildConstant(S32, Overflow).getReg(0);
4252  } else {
4253  auto OverflowVal = B.buildConstant(S32, Overflow);
4254  BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
4255  }
4256  }
4257 
4258  if (!BaseReg)
4259  BaseReg = B.buildConstant(S32, 0).getReg(0);
4260 
4261  return std::make_pair(BaseReg, ImmOffset);
4262 }
4263 
4264 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
4265 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
4266  Register VOffset, Register SOffset,
4267  unsigned ImmOffset, Register VIndex,
4268  MachineRegisterInfo &MRI) const {
4269  Optional<ValueAndVReg> MaybeVOffsetVal =
4270  getIConstantVRegValWithLookThrough(VOffset, MRI);
4271  Optional<ValueAndVReg> MaybeSOffsetVal =
4272  getIConstantVRegValWithLookThrough(SOffset, MRI);
4273  Optional<ValueAndVReg> MaybeVIndexVal =
4274  getIConstantVRegValWithLookThrough(VIndex, MRI);
4275  // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
4276  // update the MMO with that offset. The stride is unknown so we can only do
4277  // this if VIndex is constant 0.
4278  if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
4279  MaybeVIndexVal->Value == 0) {
4280  uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
4281  MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
4282  MMO->setOffset(TotalOffset);
4283  } else {
4284  // We don't have a constant combined offset to use in the MMO. Give up.
4285  MMO->setValue((Value *)nullptr);
4286  }
4287 }
4288 
4289 /// Handle register layout difference for f16 images for some subtargets.
4290 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
4291  MachineRegisterInfo &MRI,
4292  Register Reg,
4293  bool ImageStore) const {
4294  const LLT S16 = LLT::scalar(16);
4295  const LLT S32 = LLT::scalar(32);
4296  LLT StoreVT = MRI.getType(Reg);
4297  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
4298 
4299  if (ST.hasUnpackedD16VMem()) {
4300  auto Unmerge = B.buildUnmerge(S16, Reg);
4301 
4302  SmallVector<Register, 4> WideRegs;
4303  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4304  WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
4305 
4306  int NumElts = StoreVT.getNumElements();
4307 
4308  return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
4309  .getReg(0);
4310  }
4311 
4312  if (ImageStore && ST.hasImageStoreD16Bug()) {
4313  if (StoreVT.getNumElements() == 2) {
4314  SmallVector<Register, 4> PackedRegs;
4315  Reg = B.buildBitcast(S32, Reg).getReg(0);
4316  PackedRegs.push_back(Reg);
4317  PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
4318  return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
4319  .getReg(0);
4320  }
4321 
4322  if (StoreVT.getNumElements() == 3) {
4323  SmallVector<Register, 4> PackedRegs;
4324  auto Unmerge = B.buildUnmerge(S16, Reg);
4325  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4326  PackedRegs.push_back(Unmerge.getReg(I));
4327  PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
4328  Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
4329  return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
4330  }
4331 
4332  if (StoreVT.getNumElements() == 4) {
4333  SmallVector<Register, 4> PackedRegs;
4334  Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
4335  auto Unmerge = B.buildUnmerge(S32, Reg);
4336  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4337  PackedRegs.push_back(Unmerge.getReg(I));
4338  PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
4339  return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
4340  .getReg(0);
4341  }
4342 
4343  llvm_unreachable("invalid data type");
4344  }
4345 
4346  if (StoreVT == LLT::fixed_vector(3, S16)) {
4347  Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
4348  .getReg(0);
4349  }
4350  return Reg;
4351 }
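// Summary of the repacking above (a descriptive note, not from the original
// source): on subtargets with unpacked D16 VMEM, each s16 element is
// any-extended into its own s32 lane, so <4 x s16> data becomes <4 x s32>.
// On subtargets with the image-store D16 bug, the data is instead padded and
// bitcast so that whole 32-bit registers are written (e.g. <3 x s16> becomes
// <6 x s16> padded with undef, viewed as <3 x s32>). Otherwise only the odd
// <3 x s16> case is padded out to <4 x s16>.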
4352 
4353 Register AMDGPULegalizerInfo::fixStoreSourceType(
4354  MachineIRBuilder &B, Register VData, bool IsFormat) const {
4355  MachineRegisterInfo *MRI = B.getMRI();
4356  LLT Ty = MRI->getType(VData);
4357 
4358  const LLT S16 = LLT::scalar(16);
4359 
4360  // Fixup illegal register types for i8 stores.
4361  if (Ty == LLT::scalar(8) || Ty == S16) {
4362  Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
4363  return AnyExt;
4364  }
4365 
4366  if (Ty.isVector()) {
4367  if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
4368  if (IsFormat)
4369  return handleD16VData(B, *MRI, VData);
4370  }
4371  }
4372 
4373  return VData;
4374 }
4375 
4376 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
4377  MachineRegisterInfo &MRI,
4378  MachineIRBuilder &B,
4379  bool IsTyped,
4380  bool IsFormat) const {
4381  Register VData = MI.getOperand(1).getReg();
4382  LLT Ty = MRI.getType(VData);
4383  LLT EltTy = Ty.getScalarType();
4384  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4385  const LLT S32 = LLT::scalar(32);
4386 
4387  VData = fixStoreSourceType(B, VData, IsFormat);
4388  Register RSrc = MI.getOperand(2).getReg();
4389 
4390  MachineMemOperand *MMO = *MI.memoperands_begin();
4391  const int MemSize = MMO->getSize();
4392 
4393  unsigned ImmOffset;
4394 
4395  // The typed intrinsics add an immediate after the registers.
4396  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4397 
4398  // The struct intrinsic variants add one additional operand over raw.
4399  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4400  Register VIndex;
4401  int OpOffset = 0;
4402  if (HasVIndex) {
4403  VIndex = MI.getOperand(3).getReg();
4404  OpOffset = 1;
4405  } else {
4406  VIndex = B.buildConstant(S32, 0).getReg(0);
4407  }
4408 
4409  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4410  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4411 
4412  unsigned Format = 0;
4413  if (IsTyped) {
4414  Format = MI.getOperand(5 + OpOffset).getImm();
4415  ++OpOffset;
4416  }
4417 
4418  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4419 
4420  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4421  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4422 
4423  unsigned Opc;
4424  if (IsTyped) {
4425  Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
4426  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
4427  } else if (IsFormat) {
4428  Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
4429  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
4430  } else {
4431  switch (MemSize) {
4432  case 1:
4433  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
4434  break;
4435  case 2:
4436  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
4437  break;
4438  default:
4439  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
4440  break;
4441  }
4442  }
4443 
4444  auto MIB = B.buildInstr(Opc)
4445  .addUse(VData) // vdata
4446  .addUse(RSrc) // rsrc
4447  .addUse(VIndex) // vindex
4448  .addUse(VOffset) // voffset
4449  .addUse(SOffset) // soffset
4450  .addImm(ImmOffset); // offset(imm)
4451 
4452  if (IsTyped)
4453  MIB.addImm(Format);
4454 
4455  MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4456  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4457  .addMemOperand(MMO);
4458 
4459  MI.eraseFromParent();
4460  return true;
4461 }
4462 
4463 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
4464  MachineRegisterInfo &MRI,
4465  MachineIRBuilder &B,
4466  bool IsFormat,
4467  bool IsTyped) const {
4468  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
4469  MachineMemOperand *MMO = *MI.memoperands_begin();
4470  const LLT MemTy = MMO->getMemoryType();
4471  const LLT S32 = LLT::scalar(32);
4472 
4473  Register Dst = MI.getOperand(0).getReg();
4474  Register RSrc = MI.getOperand(2).getReg();
4475 
4476  // The typed intrinsics add an immediate after the registers.
4477  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4478 
4479  // The struct intrinsic variants add one additional operand over raw.
4480  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4481  Register VIndex;
4482  int OpOffset = 0;
4483  if (HasVIndex) {
4484  VIndex = MI.getOperand(3).getReg();
4485  OpOffset = 1;
4486  } else {
4487  VIndex = B.buildConstant(S32, 0).getReg(0);
4488  }
4489 
4490  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4491  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4492 
4493  unsigned Format = 0;
4494  if (IsTyped) {
4495  Format = MI.getOperand(5 + OpOffset).getImm();
4496  ++OpOffset;
4497  }
4498 
4499  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4500  unsigned ImmOffset;
4501 
4502  LLT Ty = MRI.getType(Dst);
4503  LLT EltTy = Ty.getScalarType();
4504  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4505  const bool Unpacked = ST.hasUnpackedD16VMem();
4506 
4507  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4508  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4509 
4510  unsigned Opc;
4511 
4512  if (IsTyped) {
4513  Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4514  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4515  } else if (IsFormat) {
4516  Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4517  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4518  } else {
4519  switch (MemTy.getSizeInBits()) {
4520  case 8:
4521  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4522  break;
4523  case 16:
4524  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4525  break;
4526  default:
4527  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4528  break;
4529  }
4530  }
4531 
4532  Register LoadDstReg;
4533 
4534  bool IsExtLoad =
4535  (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4536  LLT UnpackedTy = Ty.changeElementSize(32);
4537 
4538  if (IsExtLoad)
4539  LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4540  else if (Unpacked && IsD16 && Ty.isVector())
4541  LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4542  else
4543  LoadDstReg = Dst;
4544 
4545  auto MIB = B.buildInstr(Opc)
4546  .addDef(LoadDstReg) // vdata
4547  .addUse(RSrc) // rsrc
4548  .addUse(VIndex) // vindex
4549  .addUse(VOffset) // voffset
4550  .addUse(SOffset) // soffset
4551  .addImm(ImmOffset); // offset(imm)
4552 
4553  if (IsTyped)
4554  MIB.addImm(Format);
4555 
4556  MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4557  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4558  .addMemOperand(MMO);
4559 
4560  if (LoadDstReg != Dst) {
4561  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4562 
4563  // The result was widened for an extending load; truncate back to the original type.
4564  if (IsExtLoad)
4565  B.buildTrunc(Dst, LoadDstReg);
4566  else {
4567  // Repack to original 16-bit vector result
4568  // FIXME: G_TRUNC should work, but legalization currently fails
4569  auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4570  SmallVector<Register, 4> Repack;
4571  for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4572  Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4573  B.buildMerge(Dst, Repack);
4574  }
4575  }
4576 
4577  MI.eraseFromParent();
4578  return true;
4579 }
4580 
4581 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4582  MachineIRBuilder &B,
4583  bool IsInc) const {
4584  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4585  AMDGPU::G_AMDGPU_ATOMIC_DEC;
4586  B.buildInstr(Opc)
4587  .addDef(MI.getOperand(0).getReg())
4588  .addUse(MI.getOperand(2).getReg())
4589  .addUse(MI.getOperand(3).getReg())
4590  .cloneMemRefs(MI);
4591  MI.eraseFromParent();
4592  return true;
4593 }
4594 
4595 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4596  switch (IntrID) {
4597  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4598  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4599  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4600  case Intrinsic::amdgcn_raw_buffer_atomic_add:
4601  case Intrinsic::amdgcn_struct_buffer_atomic_add:
4602  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4603  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4604  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4605  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4606  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4607  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4608  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4609  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4610  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4611  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4612  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4613  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4614  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4615  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4616  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4617  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4618  case Intrinsic::amdgcn_raw_buffer_atomic_and:
4619  case Intrinsic::amdgcn_struct_buffer_atomic_and:
4620  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4621  case Intrinsic::amdgcn_raw_buffer_atomic_or:
4622  case Intrinsic::amdgcn_struct_buffer_atomic_or:
4623  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4624  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4625  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4626  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4627  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4628  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4629  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4630  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4631  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4632  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4633  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4634  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4635  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4636  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4637  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4638  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4639  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4640  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4641  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4642  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4643  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4644  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4645  default:
4646  llvm_unreachable("unhandled atomic opcode");
4647  }
4648 }
4649 
4650 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4651  MachineIRBuilder &B,
4652  Intrinsic::ID IID) const {
4653  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4654  IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4655  const bool HasReturn = MI.getNumExplicitDefs() != 0;
4656 
4657  Register Dst;
4658 
4659  int OpOffset = 0;
4660  if (HasReturn) {
4661  // A few FP atomics do not support return values.
4662  Dst = MI.getOperand(0).getReg();
4663  } else {
4664  OpOffset = -1;
4665  }
4666 
4667  Register VData = MI.getOperand(2 + OpOffset).getReg();
4668  Register CmpVal;
4669 
4670  if (IsCmpSwap) {
4671  CmpVal = MI.getOperand(3 + OpOffset).getReg();
4672  ++OpOffset;
4673  }
4674 
4675  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4676  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4677 
4678  // The struct intrinsic variants add one additional operand over raw.
4679  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4680  Register VIndex;
4681  if (HasVIndex) {
4682  VIndex = MI.getOperand(4 + OpOffset).getReg();
4683  ++OpOffset;
4684  } else {
4685  VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4686  }
4687 
4688  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4689  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4690  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4691 
4692  MachineMemOperand *MMO = *MI.memoperands_begin();
4693 
4694  unsigned ImmOffset;
4695  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4696  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4697 
4698  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4699 
4700  if (HasReturn)
4701  MIB.addDef(Dst);
4702 
4703  MIB.addUse(VData); // vdata
4704 
4705  if (IsCmpSwap)
4706  MIB.addReg(CmpVal);
4707 
4708  MIB.addUse(RSrc) // rsrc
4709  .addUse(VIndex) // vindex
4710  .addUse(VOffset) // voffset
4711  .addUse(SOffset) // soffset
4712  .addImm(ImmOffset) // offset(imm)
4713  .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4714  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4715  .addMemOperand(MMO);
4716 
4717  MI.eraseFromParent();
4718  return true;
4719 }
4720 
4721 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4722 /// vector with s16 typed elements.
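/// For example (illustrative): with A16 enabled, a 2D sample's s16
/// coordinates (x, y) are packed into a single v2s16 build_vector, while a
/// trailing odd coordinate or odd gradient component is paired with an undef
/// s16 instead.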
4723 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4724  SmallVectorImpl<Register> &PackedAddrs,
4725  unsigned ArgOffset,
4726  const AMDGPU::ImageDimIntrinsicInfo *Intr,
4727  bool IsA16, bool IsG16) {
4728  const LLT S16 = LLT::scalar(16);
4729  const LLT V2S16 = LLT::fixed_vector(2, 16);
4730  auto EndIdx = Intr->VAddrEnd;
4731 
4732  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4733  MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4734  if (!SrcOp.isReg())
4735  continue; // _L to _LZ may have eliminated this.
4736 
4737  Register AddrReg = SrcOp.getReg();
4738 
4739  if ((I < Intr->GradientStart) ||
4740  (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4741  (I >= Intr->CoordStart && !IsA16)) {
4742  if ((I < Intr->GradientStart) && IsA16 &&
4743  (B.getMRI()->getType(AddrReg) == S16)) {
4744  assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
4745  // Special handling of bias when A16 is on. Bias is of type half but
4746  // occupies full 32-bit.
4747  PackedAddrs.push_back(
4748  B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4749  .getReg(0));
4750  } else {
4751  assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4752  "Bias needs to be converted to 16 bit in A16 mode");
4753  // Handle any gradient or coordinate operands that should not be packed
4754  AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4755  PackedAddrs.push_back(AddrReg);
4756  }
4757  } else {
4758  // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4759  // derivatives dx/dh and dx/dv are packed with undef.
4760  if (((I + 1) >= EndIdx) ||
4761  ((Intr->NumGradients / 2) % 2 == 1 &&
4762  (I == static_cast<unsigned>(Intr->GradientStart +
4763  (Intr->NumGradients / 2) - 1) ||
4764  I == static_cast<unsigned>(Intr->GradientStart +
4765  Intr->NumGradients - 1))) ||
4766  // Check for _L to _LZ optimization
4767  !MI.getOperand(ArgOffset + I + 1).isReg()) {
4768  PackedAddrs.push_back(
4769  B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4770  .getReg(0));
4771  } else {
4772  PackedAddrs.push_back(
4773  B.buildBuildVector(
4774  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4775  .getReg(0));
4776  ++I;
4777  }
4778  }
4779  }
4780 }
4781 
4782 /// Convert from separate vaddr components to a single vector address register,
4783 /// and replace the remaining operands with $noreg.
4784 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4785  int DimIdx, int NumVAddrs) {
4786  const LLT S32 = LLT::scalar(32);
4787 
4788  SmallVector<Register, 8> AddrRegs;
4789  for (int I = 0; I != NumVAddrs; ++I) {
4790  MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4791  if (SrcOp.isReg()) {
4792  AddrRegs.push_back(SrcOp.getReg());
4793  assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4794  }
4795  }
4796 
4797  int NumAddrRegs = AddrRegs.size();
4798  if (NumAddrRegs != 1) {
4799  // Above 8 elements round up to next power of 2 (i.e. 16).
4800  if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4801  const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4802  auto Undef = B.buildUndef(S32);
4803  AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4804  NumAddrRegs = RoundedNumRegs;
4805  }
4806 
4807  auto VAddr =
4808  B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4809  MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4810  }
4811 
4812  for (int I = 1; I != NumVAddrs; ++I) {
4813  MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4814  if (SrcOp.isReg())
4815  MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4816  }
4817 }
4818 
4819 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4820 ///
4821 /// Depending on the subtarget, load/store with 16-bit element data need to be
4822 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4823 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4824 /// registers.
4825 ///
4826 /// We don't want to directly select image instructions just yet, but also want
4827 /// to expose all register repacking to the legalizer/combiners. We also don't
4828 /// want a selected instruction entering RegBankSelect. In order to avoid
4829 /// defining a multitude of intermediate image instructions, directly hack on
4830 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4831 /// padding now unnecessary arguments with $noreg.
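/// As a rough example (not from the original source): on an A16-capable
/// subtarget, s16 address components are packed pairwise into v2s16 operands
/// (an odd trailing component paired with undef) and either kept separate for
/// NSA or concatenated into one wide vector, with the leftover scalar address
/// operands replaced by $noreg; a d16 result on an unpacked subtarget comes
/// back as 32-bit elements and is truncated back to s16 afterwards.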
4832 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4833  MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4834  const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4835 
4836  const unsigned NumDefs = MI.getNumExplicitDefs();
4837  const unsigned ArgOffset = NumDefs + 1;
4838  bool IsTFE = NumDefs == 2;
4839  // We are only processing the operands of d16 image operations on subtargets
4840  // that use the unpacked register layout, or need to repack the TFE result.
4841 
4842  // TODO: Do we need to guard against already legalized intrinsics?
4843  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4844  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4845 
4846  MachineRegisterInfo *MRI = B.getMRI();
4847  const LLT S32 = LLT::scalar(32);
4848  const LLT S16 = LLT::scalar(16);
4849  const LLT V2S16 = LLT::fixed_vector(2, 16);
4850 
4851  unsigned DMask = 0;
4852  Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4853  LLT Ty = MRI->getType(VData);
4854 
4855  // Check for 16 bit addresses and pack if true.
4856  LLT GradTy =
4857  MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4858  LLT AddrTy =
4859  MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4860  const bool IsG16 = GradTy == S16;
4861  const bool IsA16 = AddrTy == S16;
4862  const bool IsD16 = Ty.getScalarType() == S16;
4863 
4864  int DMaskLanes = 0;
4865  if (!BaseOpcode->Atomic) {
4866  DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4867  if (BaseOpcode->Gather4) {
4868  DMaskLanes = 4;
4869  } else if (DMask != 0) {
4870  DMaskLanes = countPopulation(DMask);
4871  } else if (!IsTFE && !BaseOpcode->Store) {
4872  // If dmask is 0, this is a no-op load. This can be eliminated.
4873  B.buildUndef(MI.getOperand(0));
4874  MI.eraseFromParent();
4875  return true;
4876  }
4877  }
4878 
4879  Observer.changingInstr(MI);
4880  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4881 
4882  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4883  : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4884  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4885  : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4886  unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4887 
4888  // Track that we legalized this
4889  MI.setDesc(B.getTII().get(NewOpcode));
4890 
4891  // Expecting to get an error flag since TFC is on and dmask is 0. Force
4892  // dmask to be at least 1, otherwise the instruction will fail.
4893  if (IsTFE && DMask == 0) {
4894  DMask = 0x1;
4895  DMaskLanes = 1;
4896  MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4897  }
4898 
4899  if (BaseOpcode->Atomic) {
4900  Register VData0 = MI.getOperand(2).getReg();
4901  LLT Ty = MRI->getType(VData0);
4902 
4903  // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4904  if (Ty.isVector())
4905  return false;
4906 
4907  if (BaseOpcode->AtomicX2) {
4908  Register VData1 = MI.getOperand(3).getReg();
4909  // The two values are packed in one register.
4910  LLT PackedTy = LLT::fixed_vector(2, Ty);
4911  auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4912  MI.getOperand(2).setReg(Concat.getReg(0));
4913  MI.getOperand(3).setReg(AMDGPU::NoRegister);
4914  }
4915  }
4916 
4917  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4918 
4919  // Rewrite the addressing register layout before doing anything else.
4920  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4921  // 16 bit gradients are supported, but are tied to the A16 control
4922  // so both gradients and addresses must be 16 bit
4923  return false;
4924  }
4925 
4926  if (IsA16 && !ST.hasA16()) {
4927  // A16 not supported
4928  return false;
4929  }
4930 
4931  if (IsA16 || IsG16) {
4932  if (Intr->NumVAddrs > 1) {
4933  SmallVector<Register, 4> PackedRegs;
4934 
4935  packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4936  IsG16);
4937 
4938  // See also below in the non-a16 branch
4939  const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4940  PackedRegs.size() <= ST.getNSAMaxSize();
4941 
4942  if (!UseNSA && PackedRegs.size() > 1) {
4943  LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4944  auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4945  PackedRegs[0] = Concat.getReg(0);
4946  PackedRegs.resize(1);
4947  }
4948 
4949  const unsigned NumPacked = PackedRegs.size();
4950  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4951  MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4952  if (!SrcOp.isReg()) {
4953  assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4954  continue;
4955  }
4956 
4957  assert(SrcOp.getReg() != AMDGPU::NoRegister);
4958 
4959  if (I - Intr->VAddrStart < NumPacked)
4960  SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4961  else
4962  SrcOp.setReg(AMDGPU::NoRegister);
4963  }
4964  }
4965  } else {
4966  // If the register allocator cannot place the address registers contiguously
4967  // without introducing moves, then using the non-sequential address encoding
4968  // is always preferable, since it saves VALU instructions and is usually a
4969  // wash in terms of code size or even better.
4970  //
4971  // However, we currently have no way of hinting to the register allocator
4972  // that MIMG addresses should be placed contiguously when it is possible to
4973  // do so, so force non-NSA for the common 2-address case as a heuristic.
4974  //
4975  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4976  // allocation when possible.
4977  //
4978  // TODO: we can actually allow partial NSA where the final register is a
4979  // contiguous set of the remaining addresses.
4980  // This could help where there are more addresses than supported.
4981  const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4982  CorrectedNumVAddrs <= ST.getNSAMaxSize();
4983 
4984  if (!UseNSA && Intr->NumVAddrs > 1)
4985  convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4986  Intr->NumVAddrs);
4987  }
4988 
4989  int Flags = 0;
4990  if (IsA16)
4991  Flags |= 1;
4992  if (IsG16)
4993  Flags |= 2;
4994  MI.addOperand(MachineOperand::CreateImm(Flags));
4995 
4996  if (BaseOpcode->Store) { // No TFE for stores?
4997  // TODO: Handle dmask trim
4998  if (!Ty.isVector() || !IsD16)
4999  return true;
5000 
5001  Register RepackedReg = handleD16VData(B, *MRI, VData, true);
5002  if (RepackedReg != VData) {
5003  MI.getOperand(1).setReg(RepackedReg);
5004  }
5005 
5006  return true;
5007  }
5008 
5009  Register DstReg = MI.getOperand(0).getReg();
5010  const LLT EltTy = Ty.getScalarType();
5011  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
5012 
5013  // Confirm that the return type is large enough for the dmask specified
5014  if (NumElts < DMaskLanes)
5015  return false;
5016 
5017  if (NumElts > 4 || DMaskLanes > 4)
5018  return false;
5019 
5020  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
5021  const LLT AdjustedTy =
5022  Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
5023 
5024  // The raw dword aligned data component of the load. The only legal cases
5025  // where this matters should be when using the packed D16 format, for
5026  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
5027  LLT RoundedTy;
5028 
5029  // S32 vector to cover all data, plus TFE result element.
5030  LLT TFETy;
5031 
5032  // Register type to use for each loaded component. Will be S32 or V2S16.
5033  LLT RegTy;
5034 
5035  if (IsD16 && ST.hasUnpackedD16VMem()) {
5036  RoundedTy =
5037  LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
5038  TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
5039  RegTy = S32;
5040  } else {
5041  unsigned EltSize = EltTy.getSizeInBits();
5042  unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
5043  unsigned RoundedSize = 32 * RoundedElts;
5044  RoundedTy = LLT::scalarOrVector(
5045  ElementCount::getFixed(RoundedSize / EltSize), EltSize);
5046  TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
5047  RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
5048  }
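 // Worked example (assumed inputs): a D16 load with DMaskLanes == 3 and a
 // <3 x s16> result on packed-D16 hardware gives AdjustedTy == <3 x s16>,
 // RoundedElts == (48 + 31) / 32 == 2 and RoundedSize == 64, so
 // RoundedTy == <4 x s16> and TFETy == <3 x s32>; RegTy is V2S16 without TFE
 // and S32 with TFE. On unpacked-D16 hardware the same load gets
 // RoundedTy == <3 x s32>, TFETy == <4 x s32> and RegTy == S32.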
5049 
5050  // The return type does not need adjustment.
5051  // TODO: Should we change s16 case to s32 or <2 x s16>?
5052  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
5053  return true;
5054 
5055  Register Dst1Reg;
5056 
5057  // Insert after the instruction.
5058  B.setInsertPt(*MI.getParent(), ++MI.getIterator());
5059 
5060  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
5061  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
5062  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
5063  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
5064 
5065  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
5066 
5067  MI.getOperand(0).setReg(NewResultReg);
5068 
5069  // In the IR, TFE is supposed to be used with a 2 element struct return
5070  // type. The instruction really returns these two values in one contiguous
5071  // register, with one additional dword beyond the loaded data. Rewrite the
5072  // return type to use a single register result.
5073 
5074  if (IsTFE) {
5075  Dst1Reg = MI.getOperand(1).getReg();
5076  if (MRI->getType(Dst1Reg) != S32)
5077  return false;
5078 
5079  // TODO: Make sure the TFE operand bit is set.
5080  MI.removeOperand(1);
5081 
5082  // Handle the easy case that requires no repack instructions.
5083  if (Ty == S32) {
5084  B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
5085  return true;
5086  }
5087  }
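 // Worked example (assumed IR): a TFE load declared to return { <4 x s32>, i32 }
 // is rewritten so the instruction defines one NewResultReg of type <5 x s32>;
 // the status dword lands in Dst1Reg via the unmerge built below, and the four
 // data dwords are recombined into the original <4 x s32> destination.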
5088 
5089  // Now figure out how to copy the new result register back into the old
5090  // result.
5091  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
5092 
5093  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
5094 
5095  if (ResultNumRegs == 1) {
5096  assert(!IsTFE);
5097  ResultRegs[0] = NewResultReg;
5098  } else {
5099  // We have to repack into a new vector of some kind.
5100  for (int I = 0; I != NumDataRegs; ++I)
5101  ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
5102  B.buildUnmerge(ResultRegs, NewResultReg);
5103 
5104  // Drop the final TFE element to get the data part. The TFE result is
5105  // directly written to the right place already.
5106  if (IsTFE)
5107  ResultRegs.resize(NumDataRegs);
5108  }
5109 
5110  // For an s16 scalar result, truncate the s32 piece holding the loaded value,
5111  // regardless of packed vs. unpacked.
5112  if (IsD16 && !Ty.isVector()) {
5113  B.buildTrunc(DstReg, ResultRegs[0]);
5114  return true;
5115  }
5116 
5117  // Avoid a build/concat_vector of 1 entry.
5118  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
5119  B.buildBitcast(DstReg, ResultRegs[0]);
5120  return true;
5121  }
5122 
5123  assert(Ty.isVector());
5124 
5125  if (IsD16) {
5126  // For packed D16 results with TFE enabled, all the data components are
5127  // S32. Cast back to the expected type.
5128  //
5129  // TODO: We don't really need to load s32 elements. We would only need one
5130  // cast for the TFE result if a multiple of v2s16 was used.
5131  if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
5132  for (Register &Reg : ResultRegs)
5133  Reg = B.buildBitcast(V2S16, Reg).getReg(0);
5134  } else if (ST.hasUnpackedD16VMem()) {
5135  for (Register &Reg : ResultRegs)
5136  Reg = B.buildTrunc(S16, Reg).getReg(0);
5137  }
5138  }
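 // Illustrative note: on packed-D16 subtargets each repack register already
 // holds a <2 x s16> payload, so S32 pieces (the TFE case) are bitcast back to
 // V2S16; on unpacked-D16 subtargets each 32-bit piece carries a single
 // half-word and is truncated to s16 instead.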
5139 
5140  auto padWithUndef = [&](LLT Ty, int NumElts) {
5141  if (NumElts == 0)
5142  return;
5143  Register Undef = B.buildUndef(Ty).getReg(0);
5144  for (int I = 0; I != NumElts; ++I)
5145  ResultRegs.push_back(Undef);
5146  };
5147 
5148  // Pad out any elements eliminated due to the dmask.
5149  LLT ResTy = MRI->getType(ResultRegs[0]);
5150  if (!ResTy.isVector()) {
5151  padWithUndef(ResTy, NumElts - ResultRegs.size());
5152  B.buildBuildVector(DstReg, ResultRegs);
5153  return true;
5154  }
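 // Worked example (assumed values): a non-D16 load whose dmask selects only two
 // components but whose declared result is <4 x s32> reaches the branch above
 // with ResultRegs.size() == 2, so two undef s32 elements are appended before
 // the final build_vector.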
5155 
5156  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
5157  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
5158 
5159  // Deal with the one annoying legal case.
5160  const LLT V3S16 = LLT::fixed_vector(3, 16);
5161  if (Ty == V3S16) {
5162  if (IsTFE) {
5163  if (ResultRegs.size() == 1) {
5164  NewResultReg = ResultRegs[0];
5165  } else if (ResultRegs.size() == 2) {
5166  LLT V4S16 = LLT::fixed_vector(4, 16);
5167  NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
5168  } else {
5169  return false;
5170  }
5171  }
5172 
5173  if (MRI->getType(DstReg).getNumElements() <
5174  MRI->getType(NewResultReg).getNumElements()) {
5175  B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
5176  } else {
5177  B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
5178  }
5179  return true;
5180  }
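 // Illustrative note: <3 x s16> is the one legal packed-D16 type whose data does
 // not fill a whole number of <2 x s16> registers; the block above either trims
 // a wider <4 x s16> result down to the three destination elements or pads a
 // narrower one with undef.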
5181 
5182  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
5183  B.buildConcatVectors(DstReg, ResultRegs);
5184  return true;
5185 }
5186 
5187 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
5188  LegalizerHelper &Helper, MachineInstr &MI) const {
5189  MachineIRBuilder &B = Helper.MIRBuilder;
5190  GISelChangeObserver &Observer = Helper.Observer;
5191 
5192  Register Dst = MI.getOperand(0).getReg();
5193  LLT Ty = B.getMRI()->getType(Dst);
5194  unsigned Size = Ty.getSizeInBits();
5195  MachineFunction &MF = B.getMF();
5196 
5197  Observer.changingInstr(MI);
5198 
5199  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
5200  Ty = getBitcastRegisterType(Ty);
5201  Helper.bitcastDst(MI, Ty, 0);
5202  Dst = MI.getOperand(0).getReg();
5203  B.setInsertPt(B.getMBB(), MI);
5204  }
5205 
5206  // FIXME: We don't really need this intermediate instruction. The intrinsic
5207  // should be fixed to have a memory operand. Since it's readnone, we're not
5208  // allowed to add one.
5209  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
5210  MI.removeOperand(1); // Remove intrinsic ID
5211 
5212  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
5213  // TODO: Should this use datalayout alignment?
5214  const unsigned MemSize = (Size + 7) / 8;
5215  const Align MemAlign(4);
5216  MachineMemOperand *MMO = MF.getMachineMemOperand(
5217  MachinePointerInfo(),
5218  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5219  MachineMemOperand::MOInvariant,
5220  MemSize, MemAlign);
5221  MI.addMemOperand(MF, MMO);
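 // Worked example (assumed size): an s96 s_buffer_load result gives
 // MemSize == (96 + 7) / 8 == 12 bytes, with the fixed 4-byte alignment chosen
 // above.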
5222 
5223  // There are no 96-bit result scalar loads, but widening to 128-bit should
5224  // always be legal. We may need to restore this to a 96-bit result if it turns
5225  // out this needs to be converted to a vector load during RegBankSelect.
5226  if (!isPowerOf2_32(Size)) {
5227  if (Ty.isVector())
5228  Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
5229  else
5230  Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
5231  }
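 // Worked example (assumed types): an s96 result is widened to s128 via
 // widenScalarDst, and a <3 x s32> result becomes <4 x s32> via
 // moreElementsVectorDst, matching the comment above about the missing 96-bit
 // scalar loads.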
5232 
5233  Observer.changedInstr(MI);
5234  return true;
5235 }
5236 
5237 // TODO: Move to selection
5238 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
5239  MachineRegisterInfo &MRI,
5240  MachineIRBuilder &B) const {
5241  if (!ST.isTrapHandlerEnabled() ||
5242  ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
5243  return legalizeTrapEndpgm(MI, MRI, B);
5244 
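 // Illustrative note: with no enabled trap handler, or a trap-handler ABI other
 // than AMDHSA, the trap is lowered to a simple endpgm above; otherwise the
 // lowering below is selected by the code object's HSA ABI version.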
5245  if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
5246  switch (*HsaAbiVer) {
5247  case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
5248  case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
5249  return legalizeTrapHsaQueuePtr(MI, MRI, B);
5250  case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
5251  case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
5252  return ST.supportsGetDoorbellID() ?
5253  legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);