LLVM 14.0.0git
AMDGPULegalizerInfo.cpp
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40  "amdgpu-global-isel-new-legality",
41  cl::desc("Use GlobalISel desired legality, rather than try to use"
42  "rules compatible with selection patterns"),
43  cl::init(false),
44  cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of elements to the next power of two elements
49 static LLT getPow2VectorType(const LLT Ty) {
50  unsigned NElts = Ty.getNumElements();
51  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
55 // Round the number of bits to the next power of two bits
56 static LLT getPow2ScalarType(const LLT Ty) {
57  unsigned Bits = Ty.getSizeInBits();
58  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59  return LLT::scalar(Pow2Bits);
60 }
61 
62 /// \returns true if this is an odd sized vector which should widen by adding an
63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64 /// excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66  return [=](const LegalityQuery &Query) {
67  const LLT Ty = Query.Types[TypeIdx];
68  if (!Ty.isVector())
69  return false;
70 
71  const LLT EltTy = Ty.getElementType();
72  const unsigned EltSize = EltTy.getSizeInBits();
73  return Ty.getNumElements() % 2 != 0 &&
74  EltSize > 1 && EltSize < 32 &&
75  Ty.getSizeInBits() % 32 != 0;
76  };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80  return [=](const LegalityQuery &Query) {
81  const LLT Ty = Query.Types[TypeIdx];
82  return Ty.getSizeInBits() % 32 == 0;
83  };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87  return [=](const LegalityQuery &Query) {
88  const LLT Ty = Query.Types[TypeIdx];
89  const LLT EltTy = Ty.getScalarType();
90  return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91  };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95  return [=](const LegalityQuery &Query) {
96  const LLT Ty = Query.Types[TypeIdx];
97  const LLT EltTy = Ty.getElementType();
98  return std::make_pair(TypeIdx,
99  LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100  };
101 }
102 
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104  return [=](const LegalityQuery &Query) {
105  const LLT Ty = Query.Types[TypeIdx];
106  const LLT EltTy = Ty.getElementType();
107  unsigned Size = Ty.getSizeInBits();
108  unsigned Pieces = (Size + 63) / 64;
109  unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110  return std::make_pair(
111  TypeIdx,
112  LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113  };
114 }
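// Example (derived from the computation above): a <4 x s32> source (128 bits)
// gives Pieces = (128 + 63) / 64 = 2 and NewNumElts = (4 + 1) / 2 = 2, so the
// mutation requests <2 x s32>, i.e. 64-bit pieces.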
115 
116 // Increase the number of vector elements to reach the next multiple of 32-bit
117 // type.
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119  return [=](const LegalityQuery &Query) {
120  const LLT Ty = Query.Types[TypeIdx];
121 
122  const LLT EltTy = Ty.getElementType();
123  const int Size = Ty.getSizeInBits();
124  const int EltSize = EltTy.getSizeInBits();
125  const int NextMul32 = (Size + 31) / 32;
126 
127  assert(EltSize < 32);
128 
129  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130  return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131  };
132 }
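// Example (derived from the computation above): <3 x s8> (24 bits) is padded
// to <4 x s8> (32 bits), and <5 x s16> (80 bits) is padded to <6 x s16> (96 bits).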
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135  const unsigned Size = Ty.getSizeInBits();
136 
137  LLT CoercedTy;
138  if (Size <= 32) {
139  // <2 x s8> -> s16
140  // <4 x s8> -> s32
141  return LLT::scalar(Size);
142  }
143 
144  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148  return [=](const LegalityQuery &Query) {
149  const LLT Ty = Query.Types[TypeIdx];
150  return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151  };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155  return [=](const LegalityQuery &Query) {
156  const LLT Ty = Query.Types[TypeIdx];
157  unsigned Size = Ty.getSizeInBits();
158  assert(Size % 32 == 0);
159  return std::make_pair(
160  TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161  };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165  return [=](const LegalityQuery &Query) {
166  const LLT QueryTy = Query.Types[TypeIdx];
167  return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168  };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172  return [=](const LegalityQuery &Query) {
173  const LLT QueryTy = Query.Types[TypeIdx];
174  return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175  };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179  return [=](const LegalityQuery &Query) {
180  const LLT QueryTy = Query.Types[TypeIdx];
181  return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182  };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186  return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190  const int EltSize = EltTy.getSizeInBits();
191  return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195  const int EltSize = Ty.getElementType().getSizeInBits();
196  return EltSize == 32 || EltSize == 64 ||
197  (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198  EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202  if (!isRegisterSize(Ty.getSizeInBits()))
203  return false;
204 
205  if (Ty.isVector())
206  return isRegisterVectorType(Ty);
207 
208  return true;
209 }
210 
211 // Any combination of 32 or 64-bit elements up to the maximum register size, and
212 // multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214  return [=](const LegalityQuery &Query) {
215  return isRegisterType(Query.Types[TypeIdx]);
216  };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220  return [=](const LegalityQuery &Query) {
221  const LLT QueryTy = Query.Types[TypeIdx];
222  if (!QueryTy.isVector())
223  return false;
224  const LLT EltTy = QueryTy.getElementType();
225  return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226  };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32-bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232  return [=](const LegalityQuery &Query) {
233  const LLT Ty = Query.Types[TypeIdx];
234  return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235  Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236  };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243  bool IsLoad) {
244  switch (AS) {
245  case AMDGPUAS::PRIVATE_ADDRESS:
246  // FIXME: Private element size.
247  return ST.enableFlatScratch() ? 128 : 32;
248  case AMDGPUAS::LOCAL_ADDRESS:
249  return ST.useDS128() ? 128 : 64;
250  case AMDGPUAS::GLOBAL_ADDRESS:
251  case AMDGPUAS::CONSTANT_ADDRESS:
252  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253  // Treat constant and global as identical. SMRD loads are sometimes usable for
254  // global loads (ideally constant address space should be eliminated)
255  // depending on the context. Legality cannot be context dependent, but
256  // RegBankSelect can split the load as necessary depending on the pointer
257  // register bank/uniformity and if the memory is invariant or not written in a
258  // kernel.
259  return IsLoad ? 512 : 128;
260  default:
261  // Flat addresses may contextually need to be split to 32-bit parts if they
262  // may alias scratch depending on the subtarget.
263  return 128;
264  }
265 }
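// Summarizing the limits above: a single access is kept to at most 32 bits for
// private (128 with flat scratch), 64 bits for LDS (128 when ds128 is usable),
// 512 bits for global/constant loads (128 for stores), and 128 bits for the
// remaining (e.g. flat) address spaces; wider accesses are split by later rules.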
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268  const LegalityQuery &Query) {
269  const LLT Ty = Query.Types[0];
270 
271  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274  unsigned RegSize = Ty.getSizeInBits();
275  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
277  unsigned AS = Query.Types[1].getAddressSpace();
278 
279  // All of these need to be custom lowered to cast the pointer operand.
280  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281  return false;
282 
283  // Do not handle extending vector loads.
284  if (Ty.isVector() && MemSize != RegSize)
285  return false;
286 
287  // TODO: We should be able to widen loads if the alignment is high enough, but
288  // we also need to modify the memory access size.
289 #if 0
290  // Accept widening loads based on alignment.
291  if (IsLoad && MemSize < Size)
292  MemSize = std::max(MemSize, Align);
293 #endif
294 
295  // Only 1-byte and 2-byte to 32-bit extloads are valid.
296  if (MemSize != RegSize && RegSize != 32)
297  return false;
298 
299  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300  return false;
301 
302  switch (MemSize) {
303  case 8:
304  case 16:
305  case 32:
306  case 64:
307  case 128:
308  break;
309  case 96:
310  if (!ST.hasDwordx3LoadStores())
311  return false;
312  break;
313  case 256:
314  case 512:
315  // These may contextually need to be broken down.
316  break;
317  default:
318  return false;
319  }
320 
321  assert(RegSize >= MemSize);
322 
323  if (AlignBits < MemSize) {
324  const SITargetLowering *TLI = ST.getTargetLowering();
325  if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326  Align(AlignBits / 8)))
327  return false;
328  }
329 
330  return true;
331 }
332 
333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
334 // work around this. Eventually it should ignore the type for loads and only care
335 // about the size. Return true in cases where we will workaround this for now by
336 // bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338  if (EnableNewLegality)
339  return false;
340 
341  const unsigned Size = Ty.getSizeInBits();
342  if (Size <= 64)
343  return false;
344  if (!Ty.isVector())
345  return true;
346 
347  LLT EltTy = Ty.getElementType();
348  if (EltTy.isPointer())
349  return true;
350 
351  unsigned EltSize = EltTy.getSizeInBits();
352  return EltSize != 32 && EltSize != 64;
353 }
354 
355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356  const LLT Ty = Query.Types[0];
357  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358  !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364  const LLT MemTy) {
365  const unsigned MemSizeInBits = MemTy.getSizeInBits();
366  const unsigned Size = Ty.getSizeInBits();
367  if (Size != MemSizeInBits)
368  return Size <= 32 && Ty.isVector();
369 
370  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371  return true;
372 
373  // Don't try to handle bitcasting vector ext loads for now.
374  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375  (Size <= 32 || isRegisterSize(Size)) &&
376  !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
379 /// Return true if we should legalize a load by widening an odd sized memory
380 /// access up to the alignment. Note that in this case the memory access
381 /// itself changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383  uint64_t AlignInBits, unsigned AddrSpace,
384  unsigned Opcode) {
385  unsigned SizeInBits = MemoryTy.getSizeInBits();
386  // We don't want to widen cases that are naturally legal.
387  if (isPowerOf2_32(SizeInBits))
388  return false;
389 
390  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391  // end up widening these for a scalar load during RegBankSelect, since there
392  // aren't 96-bit scalar loads.
393  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394  return false;
395 
396  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397  return false;
398 
399  // A load is known dereferenceable up to the alignment, so it's legal to widen
400  // to it.
401  //
402  // TODO: Could check dereferenceable for less aligned cases.
403  unsigned RoundedSize = NextPowerOf2(SizeInBits);
404  if (AlignInBits < RoundedSize)
405  return false;
406 
407  // Do not widen if it would introduce a slow unaligned load.
408  const SITargetLowering *TLI = ST.getTargetLowering();
409  bool Fast = false;
410  return TLI->allowsMisalignedMemoryAccessesImpl(
411  RoundedSize, AddrSpace, Align(AlignInBits / 8),
412  MachineMemOperand::MOLoad, &Fast) &&
413  Fast;
414 }
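// Example: a 96-bit load known to be 128-bit aligned may be widened to 128 bits
// on subtargets without dwordx3 load/stores, provided
// allowsMisalignedMemoryAccessesImpl reports the widened access as fast.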
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417  unsigned Opcode) {
418  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419  return false;
420 
421  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422  Query.MMODescrs[0].AlignInBits,
423  Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427  const GCNTargetMachine &TM)
428  : ST(ST_) {
429  using namespace TargetOpcode;
430 
431  auto GetAddrSpacePtr = [&TM](unsigned AS) {
432  return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433  };
434 
435  const LLT S1 = LLT::scalar(1);
436  const LLT S8 = LLT::scalar(8);
437  const LLT S16 = LLT::scalar(16);
438  const LLT S32 = LLT::scalar(32);
439  const LLT S64 = LLT::scalar(64);
440  const LLT S128 = LLT::scalar(128);
441  const LLT S256 = LLT::scalar(256);
442  const LLT S512 = LLT::scalar(512);
443  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445  const LLT V2S8 = LLT::fixed_vector(2, 8);
446  const LLT V2S16 = LLT::fixed_vector(2, 16);
447  const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449  const LLT V2S32 = LLT::fixed_vector(2, 32);
450  const LLT V3S32 = LLT::fixed_vector(3, 32);
451  const LLT V4S32 = LLT::fixed_vector(4, 32);
452  const LLT V5S32 = LLT::fixed_vector(5, 32);
453  const LLT V6S32 = LLT::fixed_vector(6, 32);
454  const LLT V7S32 = LLT::fixed_vector(7, 32);
455  const LLT V8S32 = LLT::fixed_vector(8, 32);
456  const LLT V9S32 = LLT::fixed_vector(9, 32);
457  const LLT V10S32 = LLT::fixed_vector(10, 32);
458  const LLT V11S32 = LLT::fixed_vector(11, 32);
459  const LLT V12S32 = LLT::fixed_vector(12, 32);
460  const LLT V13S32 = LLT::fixed_vector(13, 32);
461  const LLT V14S32 = LLT::fixed_vector(14, 32);
462  const LLT V15S32 = LLT::fixed_vector(15, 32);
463  const LLT V16S32 = LLT::fixed_vector(16, 32);
464  const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466  const LLT V2S64 = LLT::fixed_vector(2, 64);
467  const LLT V3S64 = LLT::fixed_vector(3, 64);
468  const LLT V4S64 = LLT::fixed_vector(4, 64);
469  const LLT V5S64 = LLT::fixed_vector(5, 64);
470  const LLT V6S64 = LLT::fixed_vector(6, 64);
471  const LLT V7S64 = LLT::fixed_vector(7, 64);
472  const LLT V8S64 = LLT::fixed_vector(8, 64);
473  const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475  std::initializer_list<LLT> AllS32Vectors =
476  {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477  V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478  std::initializer_list<LLT> AllS64Vectors =
479  {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489  const LLT CodePtr = FlatPtr;
490 
491  const std::initializer_list<LLT> AddrSpaces64 = {
492  GlobalPtr, ConstantPtr, FlatPtr
493  };
494 
495  const std::initializer_list<LLT> AddrSpaces32 = {
496  LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497  };
498 
499  const std::initializer_list<LLT> FPTypesBase = {
500  S32, S64
501  };
502 
503  const std::initializer_list<LLT> FPTypes16 = {
504  S32, S64, S16
505  };
506 
507  const std::initializer_list<LLT> FPTypesPK16 = {
508  S32, S64, S16, V2S16
509  };
510 
511  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513  // s1 for VCC branches, s32 for SCC branches.
514  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517  // elements for v3s16
518  getActionDefinitionsBuilder(G_PHI)
519  .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520  .legalFor(AllS32Vectors)
521  .legalFor(AllS64Vectors)
522  .legalFor(AddrSpaces64)
523  .legalFor(AddrSpaces32)
524  .legalIf(isPointer(0))
525  .clampScalar(0, S16, S256)
526  .widenScalarToNextPow2(0, 32)
527  .clampMaxNumElements(0, S32, 16)
528  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529  .scalarize(0);
530 
531  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532  // Full set of gfx9 features.
533  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534  .legalFor({S32, S16, V2S16})
535  .minScalar(0, S16)
536  .clampMaxNumElementsStrict(0, S16, 2)
537  .widenScalarToNextMultipleOf(0, 32)
538  .maxScalar(0, S32)
539  .scalarize(0);
540 
541  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542  .legalFor({S32, S16, V2S16}) // Clamp modifier
543  .minScalarOrElt(0, S16)
544  .clampMaxNumElementsStrict(0, S16, 2)
545  .scalarize(0)
546  .widenScalarToNextPow2(0, 32)
547  .lower();
548  } else if (ST.has16BitInsts()) {
549  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550  .legalFor({S32, S16})
551  .minScalar(0, S16)
552  .widenScalarToNextMultipleOf(0, 32)
553  .maxScalar(0, S32)
554  .scalarize(0);
555 
556  // Technically the saturating operations require clamp bit support, but this
557  // was introduced at the same time as 16-bit operations.
558  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559  .legalFor({S32, S16}) // Clamp modifier
560  .minScalar(0, S16)
561  .scalarize(0)
562  .widenScalarToNextPow2(0, 16)
563  .lower();
564 
565  // We're just lowering this, but it helps get a better result to try to
566  // coerce to the desired type first.
567  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568  .minScalar(0, S16)
569  .scalarize(0)
570  .lower();
571  } else {
572  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573  .legalFor({S32})
574  .widenScalarToNextMultipleOf(0, 32)
575  .clampScalar(0, S32, S32)
576  .scalarize(0);
577 
578  if (ST.hasIntClamp()) {
579  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580  .legalFor({S32}) // Clamp modifier.
581  .scalarize(0)
582  .minScalarOrElt(0, S32)
583  .lower();
584  } else {
585  // Clamp bit support was added in VI, along with 16-bit operations.
586  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587  .minScalar(0, S32)
588  .scalarize(0)
589  .lower();
590  }
591 
592  // FIXME: DAG expansion gets better results. The widening uses the smaller
593  // range values and goes for the min/max lowering directly.
594  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595  .minScalar(0, S32)
596  .scalarize(0)
597  .lower();
598  }
599 
600  getActionDefinitionsBuilder(
601  {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602  .customFor({S32, S64})
603  .clampScalar(0, S32, S64)
604  .widenScalarToNextPow2(0, 32)
605  .scalarize(0);
606 
607  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608  .legalFor({S32})
609  .maxScalar(0, S32);
610 
611  if (ST.hasVOP3PInsts()) {
612  Mulh
613  .clampMaxNumElements(0, S8, 2)
614  .lowerFor({V2S8});
615  }
616 
617  Mulh
618  .scalarize(0)
619  .lower();
620 
621  // Report legal for any types we can handle anywhere. For the cases only legal
622  // on the SALU, RegBankSelect will be able to re-legalize.
623  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624  .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625  .clampScalar(0, S32, S64)
626  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627  .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628  .widenScalarToNextPow2(0)
629  .scalarize(0);
630 
631  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632  G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633  .legalFor({{S32, S1}, {S32, S32}})
634  .minScalar(0, S32)
635  // TODO: .scalarize(0)
636  .lower();
637 
638  getActionDefinitionsBuilder(G_BITCAST)
639  // Don't worry about the size constraint.
640  .legalIf(all(isRegisterType(0), isRegisterType(1)))
641  .lower();
642 
643 
644  getActionDefinitionsBuilder(G_CONSTANT)
645  .legalFor({S1, S32, S64, S16, GlobalPtr,
646  LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647  .legalIf(isPointer(0))
648  .clampScalar(0, S32, S64)
649  .widenScalarToNextPow2(0);
650 
651  getActionDefinitionsBuilder(G_FCONSTANT)
652  .legalFor({S32, S64, S16})
653  .clampScalar(0, S16, S64);
654 
655  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656  .legalIf(isRegisterType(0))
657  // s1 and s16 are special cases because they have legal operations on
658  // them, but don't really occupy registers in the normal way.
659  .legalFor({S1, S16})
660  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661  .clampScalarOrElt(0, S32, MaxScalar)
662  .widenScalarToNextPow2(0, 32)
663  .clampMaxNumElements(0, S32, 16);
664 
665  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666 
667  // If the amount is divergent, we have to do a wave reduction to get the
668  // maximum value, so this is expanded during RegBankSelect.
669  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670  .legalFor({{PrivatePtr, S32}});
671 
672  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673  .customIf(typeIsNot(0, PrivatePtr));
674 
675  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676 
677  auto &FPOpActions = getActionDefinitionsBuilder(
678  { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679  .legalFor({S32, S64});
680  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681  .customFor({S32, S64});
682  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683  .customFor({S32, S64});
684 
685  if (ST.has16BitInsts()) {
686  if (ST.hasVOP3PInsts())
687  FPOpActions.legalFor({S16, V2S16});
688  else
689  FPOpActions.legalFor({S16});
690 
691  TrigActions.customFor({S16});
692  FDIVActions.customFor({S16});
693  }
694 
695  auto &MinNumMaxNum = getActionDefinitionsBuilder({
696  G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697 
698  if (ST.hasVOP3PInsts()) {
699  MinNumMaxNum.customFor(FPTypesPK16)
700  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701  .clampMaxNumElements(0, S16, 2)
702  .clampScalar(0, S16, S64)
703  .scalarize(0);
704  } else if (ST.has16BitInsts()) {
705  MinNumMaxNum.customFor(FPTypes16)
706  .clampScalar(0, S16, S64)
707  .scalarize(0);
708  } else {
709  MinNumMaxNum.customFor(FPTypesBase)
710  .clampScalar(0, S32, S64)
711  .scalarize(0);
712  }
713 
714  if (ST.hasVOP3PInsts())
715  FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
716 
717  FPOpActions
718  .scalarize(0)
719  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721  TrigActions
722  .scalarize(0)
723  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724 
725  FDIVActions
726  .scalarize(0)
727  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728 
729  getActionDefinitionsBuilder({G_FNEG, G_FABS})
730  .legalFor(FPTypesPK16)
731  .clampMaxNumElementsStrict(0, S16, 2)
732  .scalarize(0)
733  .clampScalar(0, S16, S64);
734 
735  if (ST.has16BitInsts()) {
736  getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737  .legalFor({S32, S64, S16})
738  .scalarize(0)
739  .clampScalar(0, S16, S64);
740  } else {
741  getActionDefinitionsBuilder(G_FSQRT)
742  .legalFor({S32, S64})
743  .scalarize(0)
744  .clampScalar(0, S32, S64);
745 
746  if (ST.hasFractBug()) {
747  getActionDefinitionsBuilder(G_FFLOOR)
748  .customFor({S64})
749  .legalFor({S32, S64})
750  .scalarize(0)
751  .clampScalar(0, S32, S64);
752  } else {
753  getActionDefinitionsBuilder(G_FFLOOR)
754  .legalFor({S32, S64})
755  .scalarize(0)
756  .clampScalar(0, S32, S64);
757  }
758  }
759 
760  getActionDefinitionsBuilder(G_FPTRUNC)
761  .legalFor({{S32, S64}, {S16, S32}})
762  .scalarize(0)
763  .lower();
764 
765  getActionDefinitionsBuilder(G_FPEXT)
766  .legalFor({{S64, S32}, {S32, S16}})
767  .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768  .scalarize(0);
769 
770  getActionDefinitionsBuilder(G_FSUB)
771  // Use actual fsub instruction
772  .legalFor({S32})
773  // Must use fadd + fneg
774  .lowerFor({S64, S16, V2S16})
775  .scalarize(0)
776  .clampScalar(0, S32, S64);
777 
778  // Whether this is legal depends on the floating point mode for the function.
779  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781  FMad.customFor({S32, S16});
782  else if (ST.hasMadMacF32Insts())
783  FMad.customFor({S32});
784  else if (ST.hasMadF16())
785  FMad.customFor({S16});
786  FMad.scalarize(0)
787  .lower();
788 
789  auto &FRem = getActionDefinitionsBuilder(G_FREM);
790  if (ST.has16BitInsts()) {
791  FRem.customFor({S16, S32, S64});
792  } else {
793  FRem.minScalar(0, S32)
794  .customFor({S32, S64});
795  }
796  FRem.scalarize(0);
797 
798  // TODO: Do we need to clamp maximum bitwidth?
799  getActionDefinitionsBuilder(G_TRUNC)
800  .legalIf(isScalar(0))
801  .legalFor({{V2S16, V2S32}})
802  .clampMaxNumElements(0, S16, 2)
803  // Avoid scalarizing in cases that should be truly illegal. In unresolvable
804  // situations (like an invalid implicit use), we don't want to infinite loop
805  // in the legalizer.
806  .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
807  .alwaysLegal();
808 
809  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810  .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811  {S32, S1}, {S64, S1}, {S16, S1}})
812  .scalarize(0)
813  .clampScalar(0, S32, S64)
814  .widenScalarToNextPow2(1, 32);
815 
816  // TODO: Split s1->s64 during regbankselect for VALU.
817  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818  .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819  .lowerIf(typeIs(1, S1))
820  .customFor({{S32, S64}, {S64, S64}});
821  if (ST.has16BitInsts())
822  IToFP.legalFor({{S16, S16}});
823  IToFP.clampScalar(1, S32, S64)
824  .minScalar(0, S32)
825  .scalarize(0)
826  .widenScalarToNextPow2(1, 32);
827 
828  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829  .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830  .customFor({{S64, S32}, {S64, S64}})
831  .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832  if (ST.has16BitInsts())
833  FPToI.legalFor({{S16, S16}});
834  else
835  FPToI.minScalar(1, S32);
836 
837  FPToI.minScalar(0, S32)
838  .widenScalarToNextPow2(0, 32)
839  .scalarize(0)
840  .lower();
841 
842  // Lower roundeven into G_FRINT
843  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
844  .scalarize(0)
845  .lower();
846 
847  if (ST.has16BitInsts()) {
848  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
849  .legalFor({S16, S32, S64})
850  .clampScalar(0, S16, S64)
851  .scalarize(0);
852  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
853  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854  .legalFor({S32, S64})
855  .clampScalar(0, S32, S64)
856  .scalarize(0);
857  } else {
858  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859  .legalFor({S32})
860  .customFor({S64})
861  .clampScalar(0, S32, S64)
862  .scalarize(0);
863  }
864 
865  getActionDefinitionsBuilder(G_PTR_ADD)
866  .legalIf(all(isPointer(0), sameSize(0, 1)))
867  .scalarize(0)
868  .scalarSameSizeAs(1, 0);
869 
870  getActionDefinitionsBuilder(G_PTRMASK)
871  .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
872  .scalarSameSizeAs(1, 0)
873  .scalarize(0);
874 
875  auto &CmpBuilder =
876  getActionDefinitionsBuilder(G_ICMP)
877  // The compare output type differs based on the register bank of the output,
878  // so make both s1 and s32 legal.
879  //
880  // Scalar compares producing output in scc will be promoted to s32, as that
881  // is the allocatable register type that will be needed for the copy from
882  // scc. This will be promoted during RegBankSelect, and we assume something
883  // before that won't try to use s32 result types.
884  //
885  // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
886  // bank.
887  .legalForCartesianProduct(
888  {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
889  .legalForCartesianProduct(
890  {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
891  if (ST.has16BitInsts()) {
892  CmpBuilder.legalFor({{S1, S16}});
893  }
894 
895  CmpBuilder
896  .widenScalarToNextPow2(1)
897  .clampScalar(1, S32, S64)
898  .scalarize(0)
899  .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
900 
901  getActionDefinitionsBuilder(G_FCMP)
902  .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
903  .widenScalarToNextPow2(1)
904  .clampScalar(1, S32, S64)
905  .scalarize(0);
906 
907  // FIXME: fpow has a selection pattern that should move to custom lowering.
908  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
909  if (ST.has16BitInsts())
910  Exp2Ops.legalFor({S32, S16});
911  else
912  Exp2Ops.legalFor({S32});
913  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
914  Exp2Ops.scalarize(0);
915 
916  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
917  if (ST.has16BitInsts())
918  ExpOps.customFor({{S32}, {S16}});
919  else
920  ExpOps.customFor({S32});
921  ExpOps.clampScalar(0, MinScalarFPTy, S32)
922  .scalarize(0);
923 
924  getActionDefinitionsBuilder(G_FPOWI)
925  .clampScalar(0, MinScalarFPTy, S32)
926  .lower();
927 
928  // The 64-bit versions produce 32-bit results, but only on the SALU.
929  getActionDefinitionsBuilder(G_CTPOP)
930  .legalFor({{S32, S32}, {S32, S64}})
931  .clampScalar(0, S32, S32)
932  .clampScalar(1, S32, S64)
933  .scalarize(0)
934  .widenScalarToNextPow2(0, 32)
935  .widenScalarToNextPow2(1, 32);
936 
937  // The hardware instructions return a different result on 0 than the generic
938  // instructions expect. The hardware produces -1, but these produce the
939  // bitwidth.
940  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
941  .scalarize(0)
942  .clampScalar(0, S32, S32)
943  .clampScalar(1, S32, S64)
944  .widenScalarToNextPow2(0, 32)
945  .widenScalarToNextPow2(1, 32)
946  .custom();
947 
948  // The 64-bit versions produce 32-bit results, but only on the SALU.
949  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
950  .legalFor({{S32, S32}, {S32, S64}})
951  .clampScalar(0, S32, S32)
952  .clampScalar(1, S32, S64)
953  .scalarize(0)
954  .widenScalarToNextPow2(0, 32)
955  .widenScalarToNextPow2(1, 32);
956 
957  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
958  // RegBankSelect.
959  getActionDefinitionsBuilder(G_BITREVERSE)
960  .legalFor({S32, S64})
961  .clampScalar(0, S32, S64)
962  .scalarize(0)
963  .widenScalarToNextPow2(0);
964 
965  if (ST.has16BitInsts()) {
966  getActionDefinitionsBuilder(G_BSWAP)
967  .legalFor({S16, S32, V2S16})
968  .clampMaxNumElementsStrict(0, S16, 2)
969  // FIXME: Fixing non-power-of-2 before clamp is workaround for
970  // narrowScalar limitation.
971  .widenScalarToNextPow2(0)
972  .clampScalar(0, S16, S32)
973  .scalarize(0);
974 
975  if (ST.hasVOP3PInsts()) {
976  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
977  .legalFor({S32, S16, V2S16})
978  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
979  .clampMaxNumElements(0, S16, 2)
980  .minScalar(0, S16)
981  .widenScalarToNextPow2(0)
982  .scalarize(0)
983  .lower();
984  } else {
985  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
986  .legalFor({S32, S16})
987  .widenScalarToNextPow2(0)
988  .minScalar(0, S16)
989  .scalarize(0)
990  .lower();
991  }
992  } else {
993  // TODO: Should have same legality without v_perm_b32
994  getActionDefinitionsBuilder(G_BSWAP)
995  .legalFor({S32})
996  .lowerIf(scalarNarrowerThan(0, 32))
997  // FIXME: Fixing non-power-of-2 before clamp is workaround for
998  // narrowScalar limitation.
999  .widenScalarToNextPow2(0)
1000  .maxScalar(0, S32)
1001  .scalarize(0)
1002  .lower();
1003 
1004  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1005  .legalFor({S32})
1006  .minScalar(0, S32)
1007  .widenScalarToNextPow2(0)
1008  .scalarize(0)
1009  .lower();
1010  }
1011 
1012  getActionDefinitionsBuilder(G_INTTOPTR)
1013  // List the common cases
1014  .legalForCartesianProduct(AddrSpaces64, {S64})
1015  .legalForCartesianProduct(AddrSpaces32, {S32})
1016  .scalarize(0)
1017  // Accept any address space as long as the size matches
1018  .legalIf(sameSize(0, 1))
1019  .widenScalarIf(smallerThan(1, 0),
1020  [](const LegalityQuery &Query) {
1021  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1022  })
1023  .narrowScalarIf(largerThan(1, 0),
1024  [](const LegalityQuery &Query) {
1025  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1026  });
1027 
1028  getActionDefinitionsBuilder(G_PTRTOINT)
1029  // List the common cases
1030  .legalForCartesianProduct(AddrSpaces64, {S64})
1031  .legalForCartesianProduct(AddrSpaces32, {S32})
1032  .scalarize(0)
1033  // Accept any address space as long as the size matches
1034  .legalIf(sameSize(0, 1))
1035  .widenScalarIf(smallerThan(0, 1),
1036  [](const LegalityQuery &Query) {
1037  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1038  })
1039  .narrowScalarIf(
1040  largerThan(0, 1),
1041  [](const LegalityQuery &Query) {
1042  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1043  });
1044 
1045  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1046  .scalarize(0)
1047  .custom();
1048 
1049  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1050  bool IsLoad) -> bool {
1051  const LLT DstTy = Query.Types[0];
1052 
1053  // Split vector extloads.
1054  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1055 
1056  if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1057  return true;
1058 
1059  const LLT PtrTy = Query.Types[1];
1060  unsigned AS = PtrTy.getAddressSpace();
1061  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1062  return true;
1063 
1064  // Catch weird sized loads that don't evenly divide into the access sizes
1065  // TODO: May be able to widen depending on alignment etc.
1066  unsigned NumRegs = (MemSize + 31) / 32;
1067  if (NumRegs == 3) {
1068  if (!ST.hasDwordx3LoadStores())
1069  return true;
1070  } else {
1071  // If the alignment allows, these should have been widened.
1072  if (!isPowerOf2_32(NumRegs))
1073  return true;
1074  }
1075 
1076  return false;
1077  };
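// Example: a 96-bit (<3 x s32>) access is flagged for splitting on subtargets
// without dwordx3 load/stores, as is any access wider than the per-address-space
// limit from maxSizeForAddrSpace (e.g. a 256-bit LDS load).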
1078 
1079  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1080  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1081  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1082 
1083  // TODO: Refine based on subtargets which support unaligned access or 128-bit
1084  // LDS
1085  // TODO: Unsupported flat for SI.
1086 
1087  for (unsigned Op : {G_LOAD, G_STORE}) {
1088  const bool IsStore = Op == G_STORE;
1089 
1090  auto &Actions = getActionDefinitionsBuilder(Op);
1091  // Explicitly list some common cases.
1092  // TODO: Does this help compile time at all?
1093  Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1094  {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1095  {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1096  {S64, GlobalPtr, S64, GlobalAlign32},
1097  {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1098  {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1099  {S32, GlobalPtr, S8, GlobalAlign8},
1100  {S32, GlobalPtr, S16, GlobalAlign16},
1101 
1102  {S32, LocalPtr, S32, 32},
1103  {S64, LocalPtr, S64, 32},
1104  {V2S32, LocalPtr, V2S32, 32},
1105  {S32, LocalPtr, S8, 8},
1106  {S32, LocalPtr, S16, 16},
1107  {V2S16, LocalPtr, S32, 32},
1108 
1109  {S32, PrivatePtr, S32, 32},
1110  {S32, PrivatePtr, S8, 8},
1111  {S32, PrivatePtr, S16, 16},
1112  {V2S16, PrivatePtr, S32, 32},
1113 
1114  {S32, ConstantPtr, S32, GlobalAlign32},
1115  {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1116  {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1117  {S64, ConstantPtr, S64, GlobalAlign32},
1118  {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1119  Actions.legalIf(
1120  [=](const LegalityQuery &Query) -> bool {
1121  return isLoadStoreLegal(ST, Query);
1122  });
1123 
1124  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1125  // 64-bits.
1126  //
1127  // TODO: Should generalize bitcast action into coerce, which will also cover
1128  // inserting addrspacecasts.
1129  Actions.customIf(typeIs(1, Constant32Ptr));
1130 
1131  // Turn any illegal element vectors into something easier to deal
1132  // with. These will ultimately produce 32-bit scalar shifts to extract the
1133  // parts anyway.
1134  //
1135  // For odd 16-bit element vectors, prefer to split those into pieces with
1136  // 16-bit vector parts.
1137  Actions.bitcastIf(
1138  [=](const LegalityQuery &Query) -> bool {
1139  return shouldBitcastLoadStoreType(ST, Query.Types[0],
1140  Query.MMODescrs[0].MemoryTy);
1141  }, bitcastToRegisterType(0));
1142 
1143  if (!IsStore) {
1144  // Widen suitably aligned loads by loading extra bytes. The standard
1145  // legalization actions can't properly express widening memory operands.
1146  Actions.customIf([=](const LegalityQuery &Query) -> bool {
1147  return shouldWidenLoad(ST, Query, G_LOAD);
1148  });
1149  }
1150 
1151  // FIXME: load/store narrowing should be moved to lower action
1152  Actions
1153  .narrowScalarIf(
1154  [=](const LegalityQuery &Query) -> bool {
1155  return !Query.Types[0].isVector() &&
1156  needToSplitMemOp(Query, Op == G_LOAD);
1157  },
1158  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1159  const LLT DstTy = Query.Types[0];
1160  const LLT PtrTy = Query.Types[1];
1161 
1162  const unsigned DstSize = DstTy.getSizeInBits();
1163  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1164 
1165  // Split extloads.
1166  if (DstSize > MemSize)
1167  return std::make_pair(0, LLT::scalar(MemSize));
1168 
1169  unsigned MaxSize = maxSizeForAddrSpace(ST,
1170  PtrTy.getAddressSpace(),
1171  Op == G_LOAD);
1172  if (MemSize > MaxSize)
1173  return std::make_pair(0, LLT::scalar(MaxSize));
1174 
1175  uint64_t Align = Query.MMODescrs[0].AlignInBits;
1176  return std::make_pair(0, LLT::scalar(Align));
1177  })
1178  .fewerElementsIf(
1179  [=](const LegalityQuery &Query) -> bool {
1180  return Query.Types[0].isVector() &&
1181  needToSplitMemOp(Query, Op == G_LOAD);
1182  },
1183  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1184  const LLT DstTy = Query.Types[0];
1185  const LLT PtrTy = Query.Types[1];
1186 
1187  LLT EltTy = DstTy.getElementType();
1188  unsigned MaxSize = maxSizeForAddrSpace(ST,
1189  PtrTy.getAddressSpace(),
1190  Op == G_LOAD);
1191 
1192  // FIXME: Handle widened to power of 2 results better. This ends
1193  // up scalarizing.
1194  // FIXME: 3 element stores scalarized on SI
1195 
1196  // Split if it's too large for the address space.
1197  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1198  if (MemSize > MaxSize) {
1199  unsigned NumElts = DstTy.getNumElements();
1200  unsigned EltSize = EltTy.getSizeInBits();
1201 
1202  if (MaxSize % EltSize == 0) {
1203  return std::make_pair(
1204  0, LLT::scalarOrVector(
1205  ElementCount::getFixed(MaxSize / EltSize), EltTy));
1206  }
1207 
1208  unsigned NumPieces = MemSize / MaxSize;
1209 
1210  // FIXME: Refine when odd breakdowns handled
1211  // The scalars will need to be re-legalized.
1212  if (NumPieces == 1 || NumPieces >= NumElts ||
1213  NumElts % NumPieces != 0)
1214  return std::make_pair(0, EltTy);
1215 
1216  return std::make_pair(
1217  0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1218  }
1219 
1220  // FIXME: We could probably handle weird extending loads better.
1221  if (DstTy.getSizeInBits() > MemSize)
1222  return std::make_pair(0, EltTy);
1223 
1224  unsigned EltSize = EltTy.getSizeInBits();
1225  unsigned DstSize = DstTy.getSizeInBits();
1226  if (!isPowerOf2_32(DstSize)) {
1227  // We're probably decomposing an odd sized store. Try to split
1228  // to the widest type. TODO: Account for alignment. As-is it
1229  // should be OK, since the new parts will be further legalized.
1230  unsigned FloorSize = PowerOf2Floor(DstSize);
1231  return std::make_pair(
1232  0, LLT::scalarOrVector(
1233  ElementCount::getFixed(FloorSize / EltSize), EltTy));
1234  }
1235 
1236  // May need relegalization for the scalars.
1237  return std::make_pair(0, EltTy);
1238  })
1239  .minScalar(0, S32)
1240  .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1241  .widenScalarToNextPow2(0)
1242  .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1243  .lower();
1244  }
1245 
1246  // FIXME: Unaligned accesses not lowered.
1247  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1248  .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1249  {S32, GlobalPtr, S16, 2 * 8},
1250  {S32, LocalPtr, S8, 8},
1251  {S32, LocalPtr, S16, 16},
1252  {S32, PrivatePtr, S8, 8},
1253  {S32, PrivatePtr, S16, 16},
1254  {S32, ConstantPtr, S8, 8},
1255  {S32, ConstantPtr, S16, 2 * 8}})
1256  .legalIf(
1257  [=](const LegalityQuery &Query) -> bool {
1258  return isLoadStoreLegal(ST, Query);
1259  });
1260 
1261  if (ST.hasFlatAddressSpace()) {
1262  ExtLoads.legalForTypesWithMemDesc(
1263  {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1264  }
1265 
1266  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1267  // 64-bits.
1268  //
1269  // TODO: Should generalize bitcast action into coerce, which will also cover
1270  // inserting addrspacecasts.
1271  ExtLoads.customIf(typeIs(1, Constant32Ptr));
1272 
1273  ExtLoads.clampScalar(0, S32, S32)
1274  .widenScalarToNextPow2(0)
1275  .lower();
1276 
1277  auto &Atomics = getActionDefinitionsBuilder(
1278  {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1279  G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1280  G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1281  G_ATOMICRMW_UMIN})
1282  .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1283  {S64, GlobalPtr}, {S64, LocalPtr},
1284  {S32, RegionPtr}, {S64, RegionPtr}});
1285  if (ST.hasFlatAddressSpace()) {
1286  Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1287  }
1288 
1289  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1290  if (ST.hasLDSFPAtomicAdd()) {
1291  Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1292  if (ST.hasGFX90AInsts())
1293  Atomic.legalFor({{S64, LocalPtr}});
1294  }
1295  if (ST.hasAtomicFaddInsts())
1296  Atomic.legalFor({{S32, GlobalPtr}});
1297 
1298  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1299  // demarshalling
1300  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1301  .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1302  {S32, FlatPtr}, {S64, FlatPtr}})
1303  .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1304  {S32, RegionPtr}, {S64, RegionPtr}});
1305  // TODO: Pointer types, any 32-bit or 64-bit vector
1306 
1307  // Condition should be s32 for scalar, s1 for vector.
1308  getActionDefinitionsBuilder(G_SELECT)
1309  .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1310  LocalPtr, FlatPtr, PrivatePtr,
1311  LLT::fixed_vector(2, LocalPtr),
1312  LLT::fixed_vector(2, PrivatePtr)},
1313  {S1, S32})
1314  .clampScalar(0, S16, S64)
1315  .scalarize(1)
1316  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1317  .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1318  .clampMaxNumElements(0, S32, 2)
1319  .clampMaxNumElements(0, LocalPtr, 2)
1320  .clampMaxNumElements(0, PrivatePtr, 2)
1321  .scalarize(0)
1322  .widenScalarToNextPow2(0)
1323  .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1324 
1325  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1326  // be more flexible with the shift amount type.
1327  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1328  .legalFor({{S32, S32}, {S64, S32}});
1329  if (ST.has16BitInsts()) {
1330  if (ST.hasVOP3PInsts()) {
1331  Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1332  .clampMaxNumElements(0, S16, 2);
1333  } else
1334  Shifts.legalFor({{S16, S16}});
1335 
1336  // TODO: Support 16-bit shift amounts for all types
1337  Shifts.widenScalarIf(
1338  [=](const LegalityQuery &Query) {
1339  // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1340  // 32-bit amount.
1341  const LLT ValTy = Query.Types[0];
1342  const LLT AmountTy = Query.Types[1];
1343  return ValTy.getSizeInBits() <= 16 &&
1344  AmountTy.getSizeInBits() < 16;
1345  }, changeTo(1, S16));
1346  Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1347  Shifts.clampScalar(1, S32, S32);
1348  Shifts.clampScalar(0, S16, S64);
1349  Shifts.widenScalarToNextPow2(0, 16);
1350 
1351  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1352  .minScalar(0, S16)
1353  .scalarize(0)
1354  .lower();
1355  } else {
1356  // Make sure we legalize the shift amount type first, as the general
1357  // expansion for the shifted type will produce much worse code if it hasn't
1358  // been truncated already.
1359  Shifts.clampScalar(1, S32, S32);
1360  Shifts.clampScalar(0, S32, S64);
1361  Shifts.widenScalarToNextPow2(0, 32);
1362 
1363  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1364  .minScalar(0, S32)
1365  .scalarize(0)
1366  .lower();
1367  }
1368  Shifts.scalarize(0);
1369 
1370  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1371  unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1372  unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1373  unsigned IdxTypeIdx = 2;
1374 
1375  getActionDefinitionsBuilder(Op)
1376  .customIf([=](const LegalityQuery &Query) {
1377  const LLT EltTy = Query.Types[EltTypeIdx];
1378  const LLT VecTy = Query.Types[VecTypeIdx];
1379  const LLT IdxTy = Query.Types[IdxTypeIdx];
1380  const unsigned EltSize = EltTy.getSizeInBits();
1381  return (EltSize == 32 || EltSize == 64) &&
1382  VecTy.getSizeInBits() % 32 == 0 &&
1383  VecTy.getSizeInBits() <= MaxRegisterSize &&
1384  IdxTy.getSizeInBits() == 32;
1385  })
1386  .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1387  bitcastToVectorElement32(VecTypeIdx))
1388  //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1389  .bitcastIf(
1390  all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1391  [=](const LegalityQuery &Query) {
1392  // For > 64-bit element types, try to turn this into a 64-bit
1393  // element vector since we may be able to do better indexing
1394  // if this is scalar. If not, fall back to 32.
1395  const LLT EltTy = Query.Types[EltTypeIdx];
1396  const LLT VecTy = Query.Types[VecTypeIdx];
1397  const unsigned DstEltSize = EltTy.getSizeInBits();
1398  const unsigned VecSize = VecTy.getSizeInBits();
1399 
1400  const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1401  return std::make_pair(
1402  VecTypeIdx,
1403  LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1404  })
1405  .clampScalar(EltTypeIdx, S32, S64)
1406  .clampScalar(VecTypeIdx, S32, S64)
1407  .clampScalar(IdxTypeIdx, S32, S32)
1408  .clampMaxNumElements(VecTypeIdx, S32, 32)
1409  // TODO: Clamp elements for 64-bit vectors?
1410  // It should only be necessary with variable indexes.
1411  // As a last resort, lower to the stack
1412  .lower();
1413  }
1414 
1415  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1416  .unsupportedIf([=](const LegalityQuery &Query) {
1417  const LLT &EltTy = Query.Types[1].getElementType();
1418  return Query.Types[0] != EltTy;
1419  });
1420 
1421  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1422  unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1423  unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1424 
1425  // FIXME: Doesn't handle extract of illegal sizes.
1426  getActionDefinitionsBuilder(Op)
1427  .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1428  .lowerIf([=](const LegalityQuery &Query) {
1430  // Sub-vector (or single element) insert and extract.
1430  // TODO: verify immediate offset here since lower only works with
1431  // whole elements.
1432  const LLT BigTy = Query.Types[BigTyIdx];
1433  return BigTy.isVector();
1434  })
1435  // FIXME: Multiples of 16 should not be legal.
1436  .legalIf([=](const LegalityQuery &Query) {
1437  const LLT BigTy = Query.Types[BigTyIdx];
1438  const LLT LitTy = Query.Types[LitTyIdx];
1439  return (BigTy.getSizeInBits() % 32 == 0) &&
1440  (LitTy.getSizeInBits() % 16 == 0);
1441  })
1442  .widenScalarIf(
1443  [=](const LegalityQuery &Query) {
1444  const LLT BigTy = Query.Types[BigTyIdx];
1445  return (BigTy.getScalarSizeInBits() < 16);
1446  },
1447  LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1448  .widenScalarIf(
1449  [=](const LegalityQuery &Query) {
1450  const LLT LitTy = Query.Types[LitTyIdx];
1451  return (LitTy.getScalarSizeInBits() < 16);
1452  },
1453  LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1454  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1455  .widenScalarToNextPow2(BigTyIdx, 32);
1456 
1457  }
1458 
1459  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1460  .legalForCartesianProduct(AllS32Vectors, {S32})
1461  .legalForCartesianProduct(AllS64Vectors, {S64})
1462  .clampNumElements(0, V16S32, V32S32)
1463  .clampNumElements(0, V2S64, V16S64)
1464  .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1465 
1466  if (ST.hasScalarPackInsts()) {
1467  BuildVector
1468  // FIXME: Should probably widen s1 vectors straight to s32
1469  .minScalarOrElt(0, S16)
1470  // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1471  .minScalar(1, S32);
1472 
1473  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1474  .legalFor({V2S16, S32})
1475  .lower();
1476  BuildVector.minScalarOrElt(0, S32);
1477  } else {
1478  BuildVector.customFor({V2S16, S16});
1479  BuildVector.minScalarOrElt(0, S32);
1480 
1481  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1482  .customFor({V2S16, S32})
1483  .lower();
1484  }
1485 
1486  BuildVector.legalIf(isRegisterType(0));
1487 
1488  // FIXME: Clamp maximum size
1489  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1490  .legalIf(all(isRegisterType(0), isRegisterType(1)))
1491  .clampMaxNumElements(0, S32, 32)
1492  .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1493  .clampMaxNumElements(0, S16, 64);
1494 
1495  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1496  // pre-legalize.
1497  if (ST.hasVOP3PInsts()) {
1498  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1499  .customFor({V2S16, V2S16})
1500  .lower();
1501  } else
1502  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1503 
1504  // Merge/Unmerge
1505  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1506  unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1507  unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1508 
1509  auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1510  const LLT Ty = Query.Types[TypeIdx];
1511  if (Ty.isVector()) {
1512  const LLT &EltTy = Ty.getElementType();
1513  if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1514  return true;
1515  if (!isPowerOf2_32(EltTy.getSizeInBits()))
1516  return true;
1517  }
1518  return false;
1519  };
1520 
1521  auto &Builder = getActionDefinitionsBuilder(Op)
1522  .legalIf(all(isRegisterType(0), isRegisterType(1)))
1523  .lowerFor({{S16, V2S16}})
1524  .lowerIf([=](const LegalityQuery &Query) {
1525  const LLT BigTy = Query.Types[BigTyIdx];
1526  return BigTy.getSizeInBits() == 32;
1527  })
1528  // Try to widen to s16 first for small types.
1529  // TODO: Only do this on targets with legal s16 shifts
1530  .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1531  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1532  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1533  .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1534  elementTypeIs(1, S16)),
1535  changeTo(1, V2S16))
1536  // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1537  // worth considering the multiples of 64 since 2*192 and 2*384 are not
1538  // valid.
1539  .clampScalar(LitTyIdx, S32, S512)
1540  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1541  // Break up vectors with weird elements into scalars
1542  .fewerElementsIf(
1543  [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1544  scalarize(0))
1545  .fewerElementsIf(
1546  [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1547  scalarize(1))
1548  .clampScalar(BigTyIdx, S32, MaxScalar);
1549 
1550  if (Op == G_MERGE_VALUES) {
1551  Builder.widenScalarIf(
1552  // TODO: Use 16-bit shifts if legal for 8-bit values?
1553  [=](const LegalityQuery &Query) {
1554  const LLT Ty = Query.Types[LitTyIdx];
1555  return Ty.getSizeInBits() < 32;
1556  },
1557  changeTo(LitTyIdx, S32));
1558  }
1559 
1560  Builder.widenScalarIf(
1561  [=](const LegalityQuery &Query) {
1562  const LLT Ty = Query.Types[BigTyIdx];
1563  return !isPowerOf2_32(Ty.getSizeInBits()) &&
1564  Ty.getSizeInBits() % 16 != 0;
1565  },
1566  [=](const LegalityQuery &Query) {
1567  // Pick the next power of 2, or a multiple of 64 over 128.
1568  // Whichever is smaller.
1569  const LLT &Ty = Query.Types[BigTyIdx];
1570  unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1571  if (NewSizeInBits >= 256) {
1572  unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1573  if (RoundedTo < NewSizeInBits)
1574  NewSizeInBits = RoundedTo;
1575  }
1576  return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1577  })
1578  // Any vectors left are the wrong size. Scalarize them.
1579  .scalarize(0)
1580  .scalarize(1);
1581  }
1582 
1583  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1584  // RegBankSelect.
1585  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1586  .legalFor({{S32}, {S64}});
1587 
1588  if (ST.hasVOP3PInsts()) {
1589  SextInReg.lowerFor({{V2S16}})
1590  // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1591  // get more vector shift opportunities, since we'll get those when
1592  // expanded.
1593  .clampMaxNumElementsStrict(0, S16, 2);
1594  } else if (ST.has16BitInsts()) {
1595  SextInReg.lowerFor({{S32}, {S64}, {S16}});
1596  } else {
1597  // Prefer to promote to s32 before lowering if we don't have 16-bit
1598  // shifts. This avoids a lot of intermediate truncate and extend operations.
1599  SextInReg.lowerFor({{S32}, {S64}});
1600  }
1601 
1602  SextInReg
1603  .scalarize(0)
1604  .clampScalar(0, S32, S64)
1605  .lower();
1606 
1607  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1608  .scalarize(0)
1609  .lower();
1610 
1611  // TODO: Only try to form v2s16 with legal packed instructions.
1612  getActionDefinitionsBuilder(G_FSHR)
1613  .legalFor({{S32, S32}})
1614  .lowerFor({{V2S16, V2S16}})
1615  .clampMaxNumElementsStrict(0, S16, 2)
1616  .scalarize(0)
1617  .lower();
1618 
1619  if (ST.hasVOP3PInsts()) {
1620  getActionDefinitionsBuilder(G_FSHL)
1621  .lowerFor({{V2S16, V2S16}})
1622  .clampMaxNumElementsStrict(0, S16, 2)
1623  .scalarize(0)
1624  .lower();
1625  } else {
1626  getActionDefinitionsBuilder(G_FSHL)
1627  .scalarize(0)
1628  .lower();
1629  }
1630 
1631  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1632  .legalFor({S64});
1633 
1634  getActionDefinitionsBuilder(G_FENCE)
1635  .alwaysLegal();
1636 
1637  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1638  .scalarize(0)
1639  .minScalar(0, S32)
1640  .lower();
1641 
1642  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1643  .legalFor({{S32, S32}, {S64, S32}})
1644  .clampScalar(1, S32, S32)
1645  .clampScalar(0, S32, S64)
1646  .widenScalarToNextPow2(0)
1647  .scalarize(0);
1648 
1649  getActionDefinitionsBuilder({
1650  // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1651  G_FCOPYSIGN,
1652 
1653  G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1654  G_ATOMICRMW_NAND,
1655  G_ATOMICRMW_FSUB,
1656  G_READ_REGISTER,
1657  G_WRITE_REGISTER,
1658 
1659  G_SADDO, G_SSUBO,
1660 
1661  // TODO: Implement
1662  G_FMINIMUM, G_FMAXIMUM}).lower();
1663 
1664  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1665  .lower();
1666 
1667  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1668  G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1669  G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1670  .unsupported();
1671 
1672  getLegacyLegalizerInfo().computeTables();
1673  verify(*ST.getInstrInfo());
1674 }
1675 
1676 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1677  MachineInstr &MI) const {
1678  MachineIRBuilder &B = Helper.MIRBuilder;
1679  MachineRegisterInfo &MRI = *B.getMRI();
1680 
1681  switch (MI.getOpcode()) {
1682  case TargetOpcode::G_ADDRSPACE_CAST:
1683  return legalizeAddrSpaceCast(MI, MRI, B);
1684  case TargetOpcode::G_FRINT:
1685  return legalizeFrint(MI, MRI, B);
1686  case TargetOpcode::G_FCEIL:
1687  return legalizeFceil(MI, MRI, B);
1688  case TargetOpcode::G_FREM:
1689  return legalizeFrem(MI, MRI, B);
1690  case TargetOpcode::G_INTRINSIC_TRUNC:
1691  return legalizeIntrinsicTrunc(MI, MRI, B);
1692  case TargetOpcode::G_SITOFP:
1693  return legalizeITOFP(MI, MRI, B, true);
1694  case TargetOpcode::G_UITOFP:
1695  return legalizeITOFP(MI, MRI, B, false);
1696  case TargetOpcode::G_FPTOSI:
1697  return legalizeFPTOI(MI, MRI, B, true);
1698  case TargetOpcode::G_FPTOUI:
1699  return legalizeFPTOI(MI, MRI, B, false);
1700  case TargetOpcode::G_FMINNUM:
1701  case TargetOpcode::G_FMAXNUM:
1702  case TargetOpcode::G_FMINNUM_IEEE:
1703  case TargetOpcode::G_FMAXNUM_IEEE:
1704  return legalizeMinNumMaxNum(Helper, MI);
1705  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1706  return legalizeExtractVectorElt(MI, MRI, B);
1707  case TargetOpcode::G_INSERT_VECTOR_ELT:
1708  return legalizeInsertVectorElt(MI, MRI, B);
1709  case TargetOpcode::G_SHUFFLE_VECTOR:
1710  return legalizeShuffleVector(MI, MRI, B);
1711  case TargetOpcode::G_FSIN:
1712  case TargetOpcode::G_FCOS:
1713  return legalizeSinCos(MI, MRI, B);
1714  case TargetOpcode::G_GLOBAL_VALUE:
1715  return legalizeGlobalValue(MI, MRI, B);
1716  case TargetOpcode::G_LOAD:
1717  case TargetOpcode::G_SEXTLOAD:
1718  case TargetOpcode::G_ZEXTLOAD:
1719  return legalizeLoad(Helper, MI);
1720  case TargetOpcode::G_FMAD:
1721  return legalizeFMad(MI, MRI, B);
1722  case TargetOpcode::G_FDIV:
1723  return legalizeFDIV(MI, MRI, B);
1724  case TargetOpcode::G_UDIV:
1725  case TargetOpcode::G_UREM:
1726  case TargetOpcode::G_UDIVREM:
1727  return legalizeUnsignedDIV_REM(MI, MRI, B);
1728  case TargetOpcode::G_SDIV:
1729  case TargetOpcode::G_SREM:
1730  case TargetOpcode::G_SDIVREM:
1731  return legalizeSignedDIV_REM(MI, MRI, B);
1732  case TargetOpcode::G_ATOMIC_CMPXCHG:
1733  return legalizeAtomicCmpXChg(MI, MRI, B);
1734  case TargetOpcode::G_FLOG:
1735  return legalizeFlog(MI, B, numbers::ln2f);
1736  case TargetOpcode::G_FLOG10:
1737  return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1738  case TargetOpcode::G_FEXP:
1739  return legalizeFExp(MI, B);
1740  case TargetOpcode::G_FPOW:
1741  return legalizeFPow(MI, B);
1742  case TargetOpcode::G_FFLOOR:
1743  return legalizeFFloor(MI, MRI, B);
1744  case TargetOpcode::G_BUILD_VECTOR:
1745  return legalizeBuildVector(MI, MRI, B);
1746  case TargetOpcode::G_CTLZ:
1747  case TargetOpcode::G_CTTZ:
1748  return legalizeCTLZ_CTTZ(MI, MRI, B);
1749  default:
1750  return false;
1751  }
1752 
1753  llvm_unreachable("expected switch to return");
1754 }
1755 
1756 Register AMDGPULegalizerInfo::getSegmentAperture(
1757  unsigned AS,
1758  MachineRegisterInfo &MRI,
1759  MachineIRBuilder &B) const {
1760  MachineFunction &MF = B.getMF();
1761  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1762  const LLT S32 = LLT::scalar(32);
1763 
1764  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1765 
1766  if (ST.hasApertureRegs()) {
1767  // FIXME: Use inline constants (src_{shared, private}_base) instead of
1768  // getreg.
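 // A rough sketch of what the aperture-register path below emits:
 //   %getreg:_(s32) = S_GETREG_B32 <encoding of the MEM_BASES hwreg field>
 //   %aperture:_(s32) = G_SHL %getreg, (WidthM1 + 1)
 // i.e. the aperture base field is read out of the hardware register and
 // shifted back up into the high bits of a 32-bit value.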
1769  unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1770  AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1771  AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1772  unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1773  AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1774  AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1775  unsigned Encoding =
1776  AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1777  Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1778  WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1779 
1780  Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1781 
1782  B.buildInstr(AMDGPU::S_GETREG_B32)
1783  .addDef(GetReg)
1784  .addImm(Encoding);
1785  MRI.setType(GetReg, S32);
1786 
1787  auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1788  return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1789  }
1790 
1791  Register QueuePtr = MRI.createGenericVirtualRegister(
1792  LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1793 
1794  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1795  return Register();
1796 
1797  // Offset into amd_queue_t for group_segment_aperture_base_hi /
1798  // private_segment_aperture_base_hi.
1799  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1800 
1801  // TODO: can we be smarter about machine pointer info?
1802  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1803  MachineMemOperand *MMO = MF.getMachineMemOperand(
1804  PtrInfo,
1805  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1806  MachineMemOperand::MOInvariant,
1807  LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1808 
1809  Register LoadAddr;
1810 
1811  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1812  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1813 }
1814 
1815 /// Return true if the value is a known valid address, such that a null check is
1816 /// not necessary.
1817 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1818  const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1819  MachineInstr *Def = MRI.getVRegDef(Val);
1820  switch (Def->getOpcode()) {
1821  case AMDGPU::G_FRAME_INDEX:
1822  case AMDGPU::G_GLOBAL_VALUE:
1823  case AMDGPU::G_BLOCK_ADDR:
1824  return true;
1825  case AMDGPU::G_CONSTANT: {
1826  const ConstantInt *CI = Def->getOperand(1).getCImm();
1827  return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1828  }
1829  default:
1830  return false;
1831  }
1832 
1833  return false;
1834 }
1835 
1836 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1837  MachineInstr &MI, MachineRegisterInfo &MRI,
1838  MachineIRBuilder &B) const {
1839  MachineFunction &MF = B.getMF();
1840 
1841  const LLT S32 = LLT::scalar(32);
1842  Register Dst = MI.getOperand(0).getReg();
1843  Register Src = MI.getOperand(1).getReg();
1844 
1845  LLT DstTy = MRI.getType(Dst);
1846  LLT SrcTy = MRI.getType(Src);
1847  unsigned DestAS = DstTy.getAddressSpace();
1848  unsigned SrcAS = SrcTy.getAddressSpace();
1849 
1850  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1851  // vector element.
1852  assert(!DstTy.isVector());
1853 
1854  const AMDGPUTargetMachine &TM
1855  = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1856 
1857  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1858  MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1859  return true;
1860  }
1861 
1862  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1863  // Truncate.
1864  B.buildExtract(Dst, Src, 0);
1865  MI.eraseFromParent();
1866  return true;
1867  }
1868 
1869  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1870  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1871  uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1872 
1873  // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1874  // another. Merge operands are required to be the same type, but creating an
1875  // extra ptrtoint would be kind of pointless.
1876  auto HighAddr = B.buildConstant(
1877  LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1878  B.buildMerge(Dst, {Src, HighAddr});
1879  MI.eraseFromParent();
1880  return true;
1881  }
1882 
1883  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1884  assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1885  DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1886 
1887  if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1888  // Extract low 32-bits of the pointer.
1889  B.buildExtract(Dst, Src, 0);
1890  MI.eraseFromParent();
1891  return true;
1892  }
1893 
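 // The sequence built below is, in effect:
 //   dst = (src != flat_null) ? extract_lo32(src) : segment_null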
1894  unsigned NullVal = TM.getNullPointerValue(DestAS);
1895 
1896  auto SegmentNull = B.buildConstant(DstTy, NullVal);
1897  auto FlatNull = B.buildConstant(SrcTy, 0);
1898 
1899  // Extract low 32-bits of the pointer.
1900  auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1901 
1902  auto CmpRes =
1903  B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1904  B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1905 
1906  MI.eraseFromParent();
1907  return true;
1908  }
1909 
1910  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1911  return false;
1912 
1913  if (!ST.hasFlatAddressSpace())
1914  return false;
1915 
1916  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1917  if (!ApertureReg.isValid())
1918  return false;
1919 
1920  // Coerce the type of the low half of the result so we can use merge_values.
1921  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1922 
1923  // TODO: Should we allow mismatched types but matching sizes in merges to
1924  // avoid the ptrtoint?
1925  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1926 
1927  if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1928  B.buildCopy(Dst, BuildPtr);
1929  MI.eraseFromParent();
1930  return true;
1931  }
1932 
1933  auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1934  auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1935 
1936  auto CmpRes =
1937  B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1938 
1939  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1940 
1941  MI.eraseFromParent();
1942  return true;
1943 }
1944 
1945 bool AMDGPULegalizerInfo::legalizeFrint(
1946  MachineInstr &MI, MachineRegisterInfo &MRI,
1947  MachineIRBuilder &B) const {
1948  Register Src = MI.getOperand(1).getReg();
1949  LLT Ty = MRI.getType(Src);
1950  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1951 
1952  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1953  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1954 
1955  auto C1 = B.buildFConstant(Ty, C1Val);
1956  auto CopySign = B.buildFCopysign(Ty, C1, Src);
1957 
1958  // TODO: Should this propagate fast-math-flags?
1959  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1960  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1961 
1962  auto C2 = B.buildFConstant(Ty, C2Val);
1963  auto Fabs = B.buildFAbs(Ty, Src);
1964 
1965  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1966  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1967  MI.eraseFromParent();
1968  return true;
1969 }
1970 
1971 bool AMDGPULegalizerInfo::legalizeFceil(
1972  MachineInstr &MI, MachineRegisterInfo &MRI,
1973  MachineIRBuilder &B) const {
1974 
1975  const LLT S1 = LLT::scalar(1);
1976  const LLT S64 = LLT::scalar(64);
1977 
1978  Register Src = MI.getOperand(1).getReg();
1979  assert(MRI.getType(Src) == S64);
1980 
1981  // result = trunc(src)
1982  // if (src > 0.0 && src != result)
1983  // result += 1.0
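 //
 // e.g. for src = 2.3: trunc = 2.0, 2.3 > 0.0 and 2.3 != 2.0, so the result
 // is 3.0; for src = -2.3: trunc = -2.0, the condition fails and the result
 // stays -2.0.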
1984 
1985  auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1986 
1987  const auto Zero = B.buildFConstant(S64, 0.0);
1988  const auto One = B.buildFConstant(S64, 1.0);
1989  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1990  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1991  auto And = B.buildAnd(S1, Lt0, NeTrunc);
1992  auto Add = B.buildSelect(S64, And, One, Zero);
1993 
1994  // TODO: Should this propagate fast-math-flags?
1995  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1996  MI.eraseFromParent();
1997  return true;
1998 }
1999 
2000 bool AMDGPULegalizerInfo::legalizeFrem(
2001  MachineInstr &MI, MachineRegisterInfo &MRI,
2002  MachineIRBuilder &B) const {
2003  Register DstReg = MI.getOperand(0).getReg();
2004  Register Src0Reg = MI.getOperand(1).getReg();
2005  Register Src1Reg = MI.getOperand(2).getReg();
2006  auto Flags = MI.getFlags();
2007  LLT Ty = MRI.getType(DstReg);
2008 
2009  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2010  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2011  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2012  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2013  MI.eraseFromParent();
2014  return true;
2015 }
2016 
2017 static MachineInstrBuilder extractF64Exponent(Register Hi,
2018  MachineIRBuilder &B) {
2019  const unsigned FractBits = 52;
2020  const unsigned ExpBits = 11;
2021  LLT S32 = LLT::scalar(32);
2022 
2023  auto Const0 = B.buildConstant(S32, FractBits - 32);
2024  auto Const1 = B.buildConstant(S32, ExpBits);
2025 
2026  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2027  .addUse(Hi)
2028  .addUse(Const0.getReg(0))
2029  .addUse(Const1.getReg(0));
2030 
2031  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2032 }
2033 
2034 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2035  MachineInstr &MI, MachineRegisterInfo &MRI,
2036  MachineIRBuilder &B) const {
2037  const LLT S1 = LLT::scalar(1);
2038  const LLT S32 = LLT::scalar(32);
2039  const LLT S64 = LLT::scalar(64);
2040 
2041  Register Src = MI.getOperand(1).getReg();
2042  assert(MRI.getType(Src) == S64);
2043 
2044  // TODO: Should this use extract since the low half is unused?
2045  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2046  Register Hi = Unmerge.getReg(1);
2047 
2048  // Extract the upper half, since this is where we will find the sign and
2049  // exponent.
2050  auto Exp = extractF64Exponent(Hi, B);
2051 
2052  const unsigned FractBits = 52;
2053 
2054  // Extract the sign bit.
2055  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2056  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2057 
2058  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2059 
2060  const auto Zero32 = B.buildConstant(S32, 0);
2061 
2062  // Extend back to 64-bits.
2063  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2064 
2065  auto Shr = B.buildAShr(S64, FractMask, Exp);
2066  auto Not = B.buildNot(S64, Shr);
2067  auto Tmp0 = B.buildAnd(S64, Src, Not);
2068  auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2069 
2070  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2071  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2072 
2073  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2074  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2075  MI.eraseFromParent();
2076  return true;
2077 }
2078 
2079 bool AMDGPULegalizerInfo::legalizeITOFP(MachineInstr &MI,
2080  MachineRegisterInfo &MRI,
2081  MachineIRBuilder &B, bool Signed) const {
2082 
2083  Register Dst = MI.getOperand(0).getReg();
2084  Register Src = MI.getOperand(1).getReg();
2085 
2086  const LLT S64 = LLT::scalar(64);
2087  const LLT S32 = LLT::scalar(32);
2088 
2089  assert(MRI.getType(Src) == S64);
2090 
2091  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2092  auto ThirtyTwo = B.buildConstant(S32, 32);
2093 
2094  if (MRI.getType(Dst) == S64) {
2095  auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2096  : B.buildUITOFP(S64, Unmerge.getReg(1));
2097 
2098  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2099  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2100  .addUse(CvtHi.getReg(0))
2101  .addUse(ThirtyTwo.getReg(0));
2102 
2103  // TODO: Should this propagate fast-math-flags?
2104  B.buildFAdd(Dst, LdExp, CvtLo);
2105  MI.eraseFromParent();
2106  return true;
2107  }
2108 
2109  assert(MRI.getType(Dst) == S32);
2110 
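 // Rough summary of the 64-bit integer to 32-bit float sequence below: shift
 // the source left by the number of leading zeros of its high word (clamped
 // for the signed case), take the high 32 bits of the shifted value with the
 // low bits folded in as a single sticky bit, convert that to float, and
 // scale the result back with ldexp by (32 - shift amount).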
2111  auto One = B.buildConstant(S32, 1);
2112 
2113  MachineInstrBuilder ShAmt;
2114  if (Signed) {
2115  auto ThirtyOne = B.buildConstant(S32, 31);
2116  auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2117  auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2118  auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2119  auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2120  /*HasSideEffects=*/false)
2121  .addUse(Unmerge.getReg(1));
2122  auto LS2 = B.buildSub(S32, LS, One);
2123  ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2124  } else
2125  ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2126  auto Norm = B.buildShl(S64, Src, ShAmt);
2127  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2128  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2129  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2130  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2131  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2132  B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2133  /*HasSideEffects=*/false)
2134  .addUse(FVal.getReg(0))
2135  .addUse(Scale.getReg(0));
2136  MI.eraseFromParent();
2137  return true;
2138 }
2139 
2140 // TODO: Copied from DAG implementation. Verify logic and document how this
2141 // actually works.
2142 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2143  MachineRegisterInfo &MRI,
2144  MachineIRBuilder &B,
2145  bool Signed) const {
2146 
2147  Register Dst = MI.getOperand(0).getReg();
2148  Register Src = MI.getOperand(1).getReg();
2149 
2150  const LLT S64 = LLT::scalar(64);
2151  const LLT S32 = LLT::scalar(32);
2152 
2153  const LLT SrcLT = MRI.getType(Src);
2154  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2155 
2156  unsigned Flags = MI.getFlags();
2157 
2158  // The basic idea of converting a floating point number into a pair of 32-bit
2159  // integers is illustrated as follows:
2160  //
2161  // tf := trunc(val);
2162  // hif := floor(tf * 2^-32);
2163  // lof := tf - hif * 2^32; // lof is always positive due to floor.
2164  // hi := fptoi(hif);
2165  // lo := fptoi(lof);
2166  //
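 // For example, with val = 2^40 + 7 (exactly representable as a double):
 // tf = 2^40 + 7, hif = floor(tf * 2^-32) = 256, lof = tf - 256 * 2^32 = 7,
 // so hi = 256 and lo = 7, and {lo, hi} reassembles the 64-bit result.
 //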
2167  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2168  MachineInstrBuilder Sign;
2169  if (Signed && SrcLT == S32) {
2170  // However, a 32-bit floating point number has only a 23-bit mantissa, which
2171  // is not enough to hold all the significant bits of `lof` if val is
2172  // negative. To avoid the loss of precision, we need to take the absolute
2173  // value after truncating and flip the result back based on the original
2174  // signedness.
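 // e.g. for val = -3.7: Sign is all ones, the truncated absolute value is
 // 3.0, and the final xor/subtract below turns the unsigned result 3 back
 // into -3.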
2175  Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2176  Trunc = B.buildFAbs(S32, Trunc, Flags);
2177  }
2178  MachineInstrBuilder K0, K1;
2179  if (SrcLT == S64) {
2180  K0 = B.buildFConstant(S64,
2181  BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2182  K1 = B.buildFConstant(S64,
2183  BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2184  } else {
2185  K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2186  K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2187  }
2188 
2189  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2190  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2191  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2192 
2193  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2194  : B.buildFPTOUI(S32, FloorMul);
2195  auto Lo = B.buildFPTOUI(S32, Fma);
2196 
2197  if (Signed && SrcLT == S32) {
2198  // Flip the result based on the signedness, which is either all 0s or 1s.
2199  Sign = B.buildMerge(S64, {Sign, Sign});
2200  // r := xor({lo, hi}, sign) - sign;
2201  B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2202  } else
2203  B.buildMerge(Dst, {Lo, Hi});
2204  MI.eraseFromParent();
2205 
2206  return true;
2207 }
2208 
2209 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2210  MachineInstr &MI) const {
2211  MachineFunction &MF = Helper.MIRBuilder.getMF();
2212  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2213 
2214  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2215  MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2216 
2217  // With ieee_mode disabled, the instructions have the correct behavior
2218  // already for G_FMINNUM/G_FMAXNUM
2219  if (!MFI->getMode().IEEE)
2220  return !IsIEEEOp;
2221 
2222  if (IsIEEEOp)
2223  return true;
2224 
2225  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2226 }
2227 
2228 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2229  MachineInstr &MI, MachineRegisterInfo &MRI,
2230  MachineIRBuilder &B) const {
2231  // TODO: Should move some of this into LegalizerHelper.
2232 
2233  // TODO: Promote dynamic indexing of s16 to s32
2234 
2235  // FIXME: Artifact combiner probably should have replaced the truncated
2236  // constant before this, so we shouldn't need
2237  // getIConstantVRegValWithLookThrough.
2238  Optional<ValueAndVReg> MaybeIdxVal =
2239  getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2240  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2241  return true;
2242  const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2243 
2244  Register Dst = MI.getOperand(0).getReg();
2245  Register Vec = MI.getOperand(1).getReg();
2246 
2247  LLT VecTy = MRI.getType(Vec);
2248  LLT EltTy = VecTy.getElementType();
2249  assert(EltTy == MRI.getType(Dst));
2250 
2251  if (IdxVal < VecTy.getNumElements()) {
2252  auto Unmerge = B.buildUnmerge(EltTy, Vec);
2253  B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2254  } else {
2255  B.buildUndef(Dst);
2256  }
2257 
2258  MI.eraseFromParent();
2259  return true;
2260 }
2261 
2262 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2263  MachineInstr &MI, MachineRegisterInfo &MRI,
2264  MachineIRBuilder &B) const {
2265  // TODO: Should move some of this into LegalizerHelper.
2266 
2267  // TODO: Promote dynamic indexing of s16 to s32
2268 
2269  // FIXME: Artifact combiner probably should have replaced the truncated
2270  // constant before this, so we shouldn't need
2271  // getIConstantVRegValWithLookThrough.
2272  Optional<ValueAndVReg> MaybeIdxVal =
2273  getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2274  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2275  return true;
2276 
2277  int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2278  Register Dst = MI.getOperand(0).getReg();
2279  Register Vec = MI.getOperand(1).getReg();
2280  Register Ins = MI.getOperand(2).getReg();
2281 
2282  LLT VecTy = MRI.getType(Vec);
2283  LLT EltTy = VecTy.getElementType();
2284  assert(EltTy == MRI.getType(Ins));
2285  (void)Ins;
2286 
2287  unsigned NumElts = VecTy.getNumElements();
2288  if (IdxVal < NumElts) {
2289  SmallVector<Register, 8> SrcRegs;
2290  for (unsigned i = 0; i < NumElts; ++i)
2291  SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2292  B.buildUnmerge(SrcRegs, Vec);
2293 
2294  SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2295  B.buildMerge(Dst, SrcRegs);
2296  } else {
2297  B.buildUndef(Dst);
2298  }
2299 
2300  MI.eraseFromParent();
2301  return true;
2302 }
2303 
2304 bool AMDGPULegalizerInfo::legalizeShuffleVector(MachineInstr &MI,
2305  MachineRegisterInfo &MRI,
2306  MachineIRBuilder &B) const {
2307  const LLT V2S16 = LLT::fixed_vector(2, 16);
2308 
2309  Register Dst = MI.getOperand(0).getReg();
2310  Register Src0 = MI.getOperand(1).getReg();
2311  LLT DstTy = MRI.getType(Dst);
2312  LLT SrcTy = MRI.getType(Src0);
2313 
2314  if (SrcTy == V2S16 && DstTy == V2S16 &&
2315  AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2316  return true;
2317 
2318  MachineIRBuilder HelperBuilder(MI);
2319  GISelObserverWrapper DummyObserver;
2320  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2321  return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2322 }
2323 
2324 bool AMDGPULegalizerInfo::legalizeSinCos(
2325  MachineInstr &MI, MachineRegisterInfo &MRI,
2326  MachineIRBuilder &B) const {
2327 
2328  Register DstReg = MI.getOperand(0).getReg();
2329  Register SrcReg = MI.getOperand(1).getReg();
2330  LLT Ty = MRI.getType(DstReg);
2331  unsigned Flags = MI.getFlags();
2332 
2333  Register TrigVal;
2334  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2335  if (ST.hasTrigReducedRange()) {
2336  auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2337  TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2338  .addUse(MulVal.getReg(0))
2339  .setMIFlags(Flags).getReg(0);
2340  } else
2341  TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2342 
2343  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2344  Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2345  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2346  .addUse(TrigVal)
2347  .setMIFlags(Flags);
2348  MI.eraseFromParent();
2349  return true;
2350 }
2351 
2352 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2353  MachineIRBuilder &B,
2354  const GlobalValue *GV,
2355  int64_t Offset,
2356  unsigned GAFlags) const {
2357  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2358  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2359  // to the following code sequence:
2360  //
2361  // For constant address space:
2362  // s_getpc_b64 s[0:1]
2363  // s_add_u32 s0, s0, $symbol
2364  // s_addc_u32 s1, s1, 0
2365  //
2366  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2367  // a fixup or relocation is emitted to replace $symbol with a literal
2368  // constant, which is a pc-relative offset from the encoding of the $symbol
2369  // operand to the global variable.
2370  //
2371  // For global address space:
2372  // s_getpc_b64 s[0:1]
2373  // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2374  // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2375  //
2376  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2377  // fixups or relocations are emitted to replace $symbol@*@lo and
2378  // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2379  // which is a 64-bit pc-relative offset from the encoding of the $symbol
2380  // operand to the global variable.
2381  //
2382  // What we want here is an offset from the value returned by s_getpc
2383  // (which is the address of the s_add_u32 instruction) to the global
2384  // variable, but since the encoding of $symbol starts 4 bytes after the start
2385  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2386  // small. This requires us to add 4 to the global variable offset in order to
2387  // compute the correct address. Similarly for the s_addc_u32 instruction, the
2388  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2389  // instruction.
2390 
2391  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2392 
2393  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2394  B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2395 
2396  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2397  .addDef(PCReg);
2398 
2399  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2400  if (GAFlags == SIInstrInfo::MO_NONE)
2401  MIB.addImm(0);
2402  else
2403  MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2404 
2405  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2406 
2407  if (PtrTy.getSizeInBits() == 32)
2408  B.buildExtract(DstReg, PCReg, 0);
2409  return true;
2410  }
2411 
2412 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2413  MachineInstr &MI, MachineRegisterInfo &MRI,
2414  MachineIRBuilder &B) const {
2415  Register DstReg = MI.getOperand(0).getReg();
2416  LLT Ty = MRI.getType(DstReg);
2417  unsigned AS = Ty.getAddressSpace();
2418 
2419  const GlobalValue *GV = MI.getOperand(1).getGlobal();
2420  MachineFunction &MF = B.getMF();
2421  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2422 
2423  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2424  if (!MFI->isModuleEntryFunction() &&
2425  !GV->getName().equals("llvm.amdgcn.module.lds")) {
2426  const Function &Fn = MF.getFunction();
2427  DiagnosticInfoUnsupported BadLDSDecl(
2428  Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2429  DS_Warning);
2430  Fn.getContext().diagnose(BadLDSDecl);
2431 
2432  // We currently don't have a way to correctly allocate LDS objects that
2433  // aren't directly associated with a kernel. We do force inlining of
2434  // functions that use local objects. However, if these dead functions are
2435  // not eliminated, we don't want a compile time error. Just emit a warning
2436  // and a trap, since there should be no callable path here.
2437  B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2438  B.buildUndef(DstReg);
2439  MI.eraseFromParent();
2440  return true;
2441  }
2442 
2443  // TODO: We could emit code to handle the initialization somewhere.
2444  // We ignore the initializer for now and legalize it to allow selection.
2445  // The initializer will anyway get errored out during assembly emission.
2446  const SITargetLowering *TLI = ST.getTargetLowering();
2447  if (!TLI->shouldUseLDSConstAddress(GV)) {
2448  MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2449  return true; // Leave in place;
2450  }
2451 
2452  if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2453  Type *Ty = GV->getValueType();
2454  // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2455  // zero-sized type in other languages to declare dynamic shared memory
2456  // whose size is not known at compile time. Such variables are allocated
2457  // by the runtime and placed directly after the statically allocated ones.
2458  // They all share the same offset.
2459  if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2460  // Adjust alignment for that dynamic shared memory array.
2461  MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2462  LLT S32 = LLT::scalar(32);
2463  auto Sz =
2464  B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2465  B.buildIntToPtr(DstReg, Sz);
2466  MI.eraseFromParent();
2467  return true;
2468  }
2469  }
2470 
2471  B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2472  *cast<GlobalVariable>(GV)));
2473  MI.eraseFromParent();
2474  return true;
2475  }
2476 
2477  const SITargetLowering *TLI = ST.getTargetLowering();
2478 
2479  if (TLI->shouldEmitFixup(GV)) {
2480  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2481  MI.eraseFromParent();
2482  return true;
2483  }
2484 
2485  if (TLI->shouldEmitPCReloc(GV)) {
2486  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2487  MI.eraseFromParent();
2488  return true;
2489  }
2490 
2491  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2492  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2493 
2494  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2495  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2496  MachinePointerInfo::getGOT(MF),
2497  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2498  MachineMemOperand::MOInvariant,
2499  LoadTy, Align(8));
2500 
2501  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2502 
2503  if (Ty.getSizeInBits() == 32) {
2504  // Truncate if this is a 32-bit constant address.
2505  auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2506  B.buildExtract(DstReg, Load, 0);
2507  } else
2508  B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2509 
2510  MI.eraseFromParent();
2511  return true;
2512 }
2513 
2514 static LLT widenToNextPowerOf2(LLT Ty) {
2515  if (Ty.isVector())
2516  return Ty.changeElementCount(
2517  ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2518  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2519 }
2520 
2521 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2522  MachineInstr &MI) const {
2523  MachineIRBuilder &B = Helper.MIRBuilder;
2524  MachineRegisterInfo &MRI = *B.getMRI();
2525  GISelChangeObserver &Observer = Helper.Observer;
2526 
2527  Register PtrReg = MI.getOperand(1).getReg();
2528  LLT PtrTy = MRI.getType(PtrReg);
2529  unsigned AddrSpace = PtrTy.getAddressSpace();
2530 
2531  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2532  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2533  auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2534  Observer.changingInstr(MI);
2535  MI.getOperand(1).setReg(Cast.getReg(0));
2536  Observer.changedInstr(MI);
2537  return true;
2538  }
2539 
2540  if (MI.getOpcode() != AMDGPU::G_LOAD)
2541  return false;
2542 
2543  Register ValReg = MI.getOperand(0).getReg();
2544  LLT ValTy = MRI.getType(ValReg);
2545 
2546  MachineMemOperand *MMO = *MI.memoperands_begin();
2547  const unsigned ValSize = ValTy.getSizeInBits();
2548  const LLT MemTy = MMO->getMemoryType();
2549  const Align MemAlign = MMO->getAlign();
2550  const unsigned MemSize = MemTy.getSizeInBits();
2551  const uint64_t AlignInBits = 8 * MemAlign.value();
2552 
2553  // Widen non-power-of-2 loads to the alignment if needed
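 // e.g. a 96-bit (<3 x s32>) load from a sufficiently aligned pointer may be
 // widened to a 128-bit load here, with the extra element dropped again
 // further below.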
2554  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2555  const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2556 
2557  // This was already the correct extending load result type, so just adjust
2558  // the memory type.
2559  if (WideMemSize == ValSize) {
2560  MachineFunction &MF = B.getMF();
2561 
2562  MachineMemOperand *WideMMO =
2563  MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2564  Observer.changingInstr(MI);
2565  MI.setMemRefs(MF, {WideMMO});
2566  Observer.changedInstr(MI);
2567  return true;
2568  }
2569 
2570  // Don't bother handling an edge case that should probably never be produced.
2571  if (ValSize > WideMemSize)
2572  return false;
2573 
2574  LLT WideTy = widenToNextPowerOf2(ValTy);
2575 
2576  Register WideLoad;
2577  if (!WideTy.isVector()) {
2578  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2579  B.buildTrunc(ValReg, WideLoad).getReg(0);
2580  } else {
2581  // Extract the subvector.
2582 
2583  if (isRegisterType(ValTy)) {
2584  // If this a case where G_EXTRACT is legal, use it.
2585  // (e.g. <3 x s32> -> <4 x s32>)
2586  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2587  B.buildExtract(ValReg, WideLoad, 0);
2588  } else {
2589  // For cases where the widened type isn't a nice register value, unmerge
2590  // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2591  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2592  B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2593  }
2594  }
2595 
2596  MI.eraseFromParent();
2597  return true;
2598  }
2599 
2600  return false;
2601 }
2602 
2603 bool AMDGPULegalizerInfo::legalizeFMad(
2604  MachineInstr &MI, MachineRegisterInfo &MRI,
2605  MachineIRBuilder &B) const {
2606  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2607  assert(Ty.isScalar());
2608 
2609  MachineFunction &MF = B.getMF();
2610  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2611 
2612  // TODO: Always legal with future ftz flag.
2613  // FIXME: Do we need just output?
2614  if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2615  return true;
2616  if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2617  return true;
2618 
2619  MachineIRBuilder HelperBuilder(MI);
2620  GISelObserverWrapper DummyObserver;
2621  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2622  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2623 }
2624 
2625 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2626  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2627  Register DstReg = MI.getOperand(0).getReg();
2628  Register PtrReg = MI.getOperand(1).getReg();
2629  Register CmpVal = MI.getOperand(2).getReg();
2630  Register NewVal = MI.getOperand(3).getReg();
2631 
2632  assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2633  "this should not have been custom lowered");
2634 
2635  LLT ValTy = MRI.getType(CmpVal);
2636  LLT VecTy = LLT::fixed_vector(2, ValTy);
2637 
2638  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2639 
2640  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2641  .addDef(DstReg)
2642  .addUse(PtrReg)
2643  .addUse(PackedVal)
2644  .setMemRefs(MI.memoperands());
2645 
2646  MI.eraseFromParent();
2647  return true;
2648 }
2649 
2650 bool AMDGPULegalizerInfo::legalizeFlog(
2651  MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2652  Register Dst = MI.getOperand(0).getReg();
2653  Register Src = MI.getOperand(1).getReg();
2654  LLT Ty = B.getMRI()->getType(Dst);
2655  unsigned Flags = MI.getFlags();
2656 
2657  auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2658  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2659 
2660  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2661  MI.eraseFromParent();
2662  return true;
2663 }
2664 
2665 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2666  MachineIRBuilder &B) const {
2667  Register Dst = MI.getOperand(0).getReg();
2668  Register Src = MI.getOperand(1).getReg();
2669  unsigned Flags = MI.getFlags();
2670  LLT Ty = B.getMRI()->getType(Dst);
2671 
2672  auto K = B.buildFConstant(Ty, numbers::log2e);
2673  auto Mul = B.buildFMul(Ty, Src, K, Flags);
2674  B.buildFExp2(Dst, Mul, Flags);
2675  MI.eraseFromParent();
2676  return true;
2677 }
2678 
2679 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2680  MachineIRBuilder &B) const {
2681  Register Dst = MI.getOperand(0).getReg();
2682  Register Src0 = MI.getOperand(1).getReg();
2683  Register Src1 = MI.getOperand(2).getReg();
2684  unsigned Flags = MI.getFlags();
2685  LLT Ty = B.getMRI()->getType(Dst);
2686  const LLT S16 = LLT::scalar(16);
2687  const LLT S32 = LLT::scalar(32);
2688 
2689  if (Ty == S32) {
2690  auto Log = B.buildFLog2(S32, Src0, Flags);
2691  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2692  .addUse(Log.getReg(0))
2693  .addUse(Src1)
2694  .setMIFlags(Flags);
2695  B.buildFExp2(Dst, Mul, Flags);
2696  } else if (Ty == S16) {
2697  // There's no f16 fmul_legacy, so we need to convert for it.
2698  auto Log = B.buildFLog2(S16, Src0, Flags);
2699  auto Ext0 = B.buildFPExt(S32, Log, Flags);
2700  auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2701  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2702  .addUse(Ext0.getReg(0))
2703  .addUse(Ext1.getReg(0))
2704  .setMIFlags(Flags);
2705 
2706  B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2707  } else
2708  return false;
2709 
2710  MI.eraseFromParent();
2711  return true;
2712 }
2713 
2714 // Find a source register, ignoring any possible source modifiers.
2715 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2716  Register ModSrc = OrigSrc;
2717  if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2718  ModSrc = SrcFNeg->getOperand(1).getReg();
2719  if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2720  ModSrc = SrcFAbs->getOperand(1).getReg();
2721  } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2722  ModSrc = SrcFAbs->getOperand(1).getReg();
2723  return ModSrc;
2724 }
2725 
2726 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2727  MachineRegisterInfo &MRI,
2728  MachineIRBuilder &B) const {
2729 
2730  const LLT S1 = LLT::scalar(1);
2731  const LLT S64 = LLT::scalar(64);
2732  Register Dst = MI.getOperand(0).getReg();
2733  Register OrigSrc = MI.getOperand(1).getReg();
2734  unsigned Flags = MI.getFlags();
2735  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2736  "this should not have been custom lowered");
2737 
2738  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2739  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2740  // efficient way to implement it is using V_FRACT_F64. The workaround for the
2741  // V_FRACT bug is:
2742  // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2743  //
2744  // Convert floor(x) to (x - fract(x))
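 //
 // e.g. floor(1.75) becomes 1.75 - fract(1.75) = 1.75 - 0.75 = 1.0, while nan
 // inputs are effectively passed through unchanged by the nan check below.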
2745 
2746  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2747  .addUse(OrigSrc)
2748  .setMIFlags(Flags);
2749 
2750  // Give source modifier matching some assistance before obscuring a foldable
2751  // pattern.
2752 
2753  // TODO: We can avoid the neg on the fract? The input sign to fract
2754  // shouldn't matter?
2755  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2756 
2757  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2758 
2759  Register Min = MRI.createGenericVirtualRegister(S64);
2760 
2761  // We don't need to concern ourselves with the snan handling difference, so
2762  // use the one which will directly select.
2763  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2764  if (MFI->getMode().IEEE)
2765  B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2766  else
2767  B.buildFMinNum(Min, Fract, Const, Flags);
2768 
2769  Register CorrectedFract = Min;
2770  if (!MI.getFlag(MachineInstr::FmNoNans)) {
2771  auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2772  CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2773  }
2774 
2775  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2776  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2777 
2778  MI.eraseFromParent();
2779  return true;
2780 }
2781 
2782 // Turn an illegal packed v2s16 build vector into bit operations.
2783 // TODO: This should probably be a bitcast action in LegalizerHelper.
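 // A sketch of the expansion produced here:
 //   %merge:_(s32) = G_MERGE_VALUES %src0:_(s16), %src1:_(s16)
 //   %dst:_(<2 x s16>) = G_BITCAST %merge:_(s32)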
2784 bool AMDGPULegalizerInfo::legalizeBuildVector(
2785  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2786  Register Dst = MI.getOperand(0).getReg();
2787  const LLT S32 = LLT::scalar(32);
2788  assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2789 
2790  Register Src0 = MI.getOperand(1).getReg();
2791  Register Src1 = MI.getOperand(2).getReg();
2792  assert(MRI.getType(Src0) == LLT::scalar(16));
2793 
2794  auto Merge = B.buildMerge(S32, {Src0, Src1});
2795  B.buildBitcast(Dst, Merge);
2796 
2797  MI.eraseFromParent();
2798  return true;
2799 }
2800 
2801 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2802 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2803 // case with a single min instruction instead of a compare+select.
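 // e.g. G_CTLZ %x:_(s32) becomes roughly umin(ffbh(%x), 32); the ffbh/ffbl
 // operations return -1 for a zero input, so the unsigned min clamps that
 // case to the bit width.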
2804 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2805  MachineRegisterInfo &MRI,
2806  MachineIRBuilder &B) const {
2807  Register Dst = MI.getOperand(0).getReg();
2808  Register Src = MI.getOperand(1).getReg();
2809  LLT DstTy = MRI.getType(Dst);
2810  LLT SrcTy = MRI.getType(Src);
2811 
2812  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2813  ? AMDGPU::G_AMDGPU_FFBH_U32
2814  : AMDGPU::G_AMDGPU_FFBL_B32;
2815  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2816  B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2817 
2818  MI.eraseFromParent();
2819  return true;
2820 }
2821 
2822 // Check that this is a G_XOR x, -1
2823 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2824  if (MI.getOpcode() != TargetOpcode::G_XOR)
2825  return false;
2826  auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2827  return ConstVal && *ConstVal == -1;
2828 }
2829 
2830 // Return the use branch instruction, or null if the usage is invalid.
2831 static MachineInstr *
2832 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2833  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2834  Register CondDef = MI.getOperand(0).getReg();
2835  if (!MRI.hasOneNonDBGUse(CondDef))
2836  return nullptr;
2837 
2838  MachineBasicBlock *Parent = MI.getParent();
2839  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2840 
2841  if (isNot(MRI, *UseMI)) {
2842  Register NegatedCond = UseMI->getOperand(0).getReg();
2843  if (!MRI.hasOneNonDBGUse(NegatedCond))
2844  return nullptr;
2845 
2846  // We're deleting the def of this value, so we need to remove it.
2847  eraseInstr(*UseMI, MRI);
2848 
2849  UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2850  Negated = true;
2851  }
2852 
2853  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2854  return nullptr;
2855 
2856  // Make sure the cond br is followed by a G_BR, or is the last instruction.
2857  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2858  if (Next == Parent->end()) {
2859  MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2860  if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2861  return nullptr;
2862  UncondBrTarget = &*NextMBB;
2863  } else {
2864  if (Next->getOpcode() != AMDGPU::G_BR)
2865  return nullptr;
2866  Br = &*Next;
2867  UncondBrTarget = Br->getOperand(0).getMBB();
2868  }
2869 
2870  return UseMI;
2871 }
2872 
2873 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2874  const ArgDescriptor *Arg,
2875  const TargetRegisterClass *ArgRC,
2876  LLT ArgTy) const {
2877  MCRegister SrcReg = Arg->getRegister();
2878  assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2879  assert(DstReg.isVirtual() && "Virtual register expected");
2880 
2881  Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
2882  *ArgRC, B.getDebugLoc(), ArgTy);
2883  if (Arg->isMasked()) {
2884  // TODO: Should we try to emit this once in the entry block?
2885  const LLT S32 = LLT::scalar(32);
2886  const unsigned Mask = Arg->getMask();
2887  const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2888 
2889  Register AndMaskSrc = LiveIn;
2890 
2891  // TODO: Avoid clearing the high bits if we know workitem id y/z are always
2892  // 0.
2893  if (Shift != 0) {
2894  auto ShiftAmt = B.buildConstant(S32, Shift);
2895  AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2896  }
2897 
2898  B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2899  } else {
2900  B.buildCopy(DstReg, LiveIn);
2901  }
2902 
2903  return true;
2904 }
2905 
2906 bool AMDGPULegalizerInfo::loadInputValue(
2907  Register DstReg, MachineIRBuilder &B,
2908  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2909  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2910  const ArgDescriptor *Arg;
2911  const TargetRegisterClass *ArgRC;
2912  LLT ArgTy;
2913  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2914 
2915  if (!Arg) {
2916  if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2917  // The intrinsic may appear when we have a 0 sized kernarg segment, in which
2918  // case the pointer argument may be missing and we use null.
2919  B.buildConstant(DstReg, 0);
2920  return true;
2921  }
2922 
2923  // It's undefined behavior if a function marked with the amdgpu-no-*
2924  // attributes uses the corresponding intrinsic.
2925  B.buildUndef(DstReg);
2926  return true;
2927  }
2928 
2929  if (!Arg->isRegister() || !Arg->getRegister().isValid())
2930  return false; // TODO: Handle these
2931  return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2932 }
2933 
2934 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2935  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2936  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2937  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2938  return false;
2939 
2940  MI.eraseFromParent();
2941  return true;
2942 }
2943 
2944 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2945  MachineRegisterInfo &MRI,
2946  MachineIRBuilder &B) const {
2947  Register Dst = MI.getOperand(0).getReg();
2948  LLT DstTy = MRI.getType(Dst);
2949  LLT S16 = LLT::scalar(16);
2950  LLT S32 = LLT::scalar(32);
2951  LLT S64 = LLT::scalar(64);
2952 
2953  if (DstTy == S16)
2954  return legalizeFDIV16(MI, MRI, B);
2955  if (DstTy == S32)
2956  return legalizeFDIV32(MI, MRI, B);
2957  if (DstTy == S64)
2958  return legalizeFDIV64(MI, MRI, B);
2959 
2960  return false;
2961 }
2962 
2963 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2964  Register DstDivReg,
2965  Register DstRemReg,
2966  Register X,
2967  Register Y) const {
2968  const LLT S1 = LLT::scalar(1);
2969  const LLT S32 = LLT::scalar(32);
2970 
2971  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2972  // algorithm used here.
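 // In short: start from a float-based estimate of 1/y, do one round of
 // Newton-Raphson refinement in integer arithmetic, form the quotient
 // estimate q = umulh(x, z) and remainder r = x - q * y, and then correct
 // both with at most two conditional adjustments.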
2973 
2974  // Initial estimate of inv(y).
2975  auto FloatY = B.buildUITOFP(S32, Y);
2976  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2977  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2978  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2979  auto Z = B.buildFPTOUI(S32, ScaledY);
2980 
2981  // One round of UNR.
2982  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2983  auto NegYZ = B.buildMul(S32, NegY, Z);
2984  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2985 
2986  // Quotient/remainder estimate.
2987  auto Q = B.buildUMulH(S32, X, Z);
2988  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2989 
2990  // First quotient/remainder refinement.
2991  auto One = B.buildConstant(S32, 1);
2992  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2993  if (DstDivReg)
2994  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2995  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2996 
2997  // Second quotient/remainder refinement.
2998  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2999  if (DstDivReg)
3000  B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3001 
3002  if (DstRemReg)
3003  B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3004 }
3005 
3006 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
3007 //
3008 // Return lo, hi of result
3009 //
3010 // %cvt.lo = G_UITOFP Val.lo
3011 // %cvt.hi = G_UITOFP Val.hi
3012 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3013 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3014 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3015 // %mul2 = G_FMUL %mul1, 2**(-32)
3016 // %trunc = G_INTRINSIC_TRUNC %mul2
3017 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3018 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
3019 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3020  Register Val) {
3021  const LLT S32 = LLT::scalar(32);
3022  auto Unmerge = B.buildUnmerge(S32, Val);
3023 
3024  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3025  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3026 
3027  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3028  B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3029 
3030  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3031  auto Mul1 =
3032  B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3033 
3034  // 2**(-32)
3035  auto Mul2 =
3036  B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3037  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3038 
3039  // -(2**32)
3040  auto Mad2 = B.buildFMAD(S32, Trunc,
3041  B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3042 
3043  auto ResultLo = B.buildFPTOUI(S32, Mad2);
3044  auto ResultHi = B.buildFPTOUI(S32, Trunc);
3045 
3046  return {ResultLo.getReg(0), ResultHi.getReg(0)};
3047 }
3048 
3049 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3050  Register DstDivReg,
3051  Register DstRemReg,
3052  Register Numer,
3053  Register Denom) const {
3054  const LLT S32 = LLT::scalar(32);
3055  const LLT S64 = LLT::scalar(64);
3056  const LLT S1 = LLT::scalar(1);
3057  Register RcpLo, RcpHi;
3058 
3059  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3060 
3061  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3062 
3063  auto Zero64 = B.buildConstant(S64, 0);
3064  auto NegDenom = B.buildSub(S64, Zero64, Denom);
3065 
3066  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3067  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3068 
3069  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3070  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3071  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3072 
3073  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3074  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3075  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3076 
3077  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3078  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3079  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3080  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3081  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3082 
3083  auto Zero32 = B.buildConstant(S32, 0);
3084  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3085  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3086  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3087 
3088  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3089  Register NumerLo = UnmergeNumer.getReg(0);
3090  Register NumerHi = UnmergeNumer.getReg(1);
3091 
3092  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3093  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3094  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3095  Register Mul3_Lo = UnmergeMul3.getReg(0);
3096  Register Mul3_Hi = UnmergeMul3.getReg(1);
3097  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3098  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3099  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3100  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3101 
3102  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3103  Register DenomLo = UnmergeDenom.getReg(0);
3104  Register DenomHi = UnmergeDenom.getReg(1);
3105 
3106  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3107  auto C1 = B.buildSExt(S32, CmpHi);
3108 
3109  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3110  auto C2 = B.buildSExt(S32, CmpLo);
3111 
3112  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3113  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3114 
3115  // TODO: Here and below portions of the code can be enclosed into if/endif.
3116  // Currently control flow is unconditional and we have 4 selects after
3117  // potential endif to substitute PHIs.
3118 
3119  // if C3 != 0 ...
3120  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3121  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3122  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3123  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3124 
3125  auto One64 = B.buildConstant(S64, 1);
3126  auto Add3 = B.buildAdd(S64, MulHi3, One64);
3127 
3128  auto C4 =
3129  B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3130  auto C5 =
3131  B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3132  auto C6 = B.buildSelect(
3133  S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3134 
3135  // if (C6 != 0)
3136  auto Add4 = B.buildAdd(S64, Add3, One64);
3137  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3138 
3139  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3140  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3141  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3142 
3143  // endif C6
3144  // endif C3
3145 
3146  if (DstDivReg) {
3147  auto Sel1 = B.buildSelect(
3148  S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3149  B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3150  Sel1, MulHi3);
3151  }
3152 
3153  if (DstRemReg) {
3154  auto Sel2 = B.buildSelect(
3155  S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3156  B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3157  Sel2, Sub1);
3158  }
3159 }
3160 
3161 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3162  MachineRegisterInfo &MRI,
3163  MachineIRBuilder &B) const {
3164  Register DstDivReg, DstRemReg;
3165  switch (MI.getOpcode()) {
3166  default:
3167  llvm_unreachable("Unexpected opcode!");
3168  case AMDGPU::G_UDIV: {
3169  DstDivReg = MI.getOperand(0).getReg();
3170  break;
3171  }
3172  case AMDGPU::G_UREM: {
3173  DstRemReg = MI.getOperand(0).getReg();
3174  break;
3175  }
3176  case AMDGPU::G_UDIVREM: {
3177  DstDivReg = MI.getOperand(0).getReg();
3178  DstRemReg = MI.getOperand(1).getReg();
3179  break;
3180  }
3181  }
3182 
3183  const LLT S64 = LLT::scalar(64);
3184  const LLT S32 = LLT::scalar(32);
3185  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3186  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3187  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3188  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3189 
3190  if (Ty == S32)
3191  legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3192  else if (Ty == S64)
3193  legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3194  else
3195  return false;
3196 
3197  MI.eraseFromParent();
3198  return true;
3199 }
3200 
3201 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3202  MachineRegisterInfo &MRI,
3203  MachineIRBuilder &B) const {
3204  const LLT S64 = LLT::scalar(64);
3205  const LLT S32 = LLT::scalar(32);
3206 
3207  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3208  if (Ty != S32 && Ty != S64)
3209  return false;
3210 
3211  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3212  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3213  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3214 
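 // Take absolute values using the usual sign trick: for each operand,
 // sign = x >> (bits - 1) and |x| = (x + sign) ^ sign; the signs are
 // reapplied to the unsigned results at the end.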
3215  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3216  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3217  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3218 
3219  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3220  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3221 
3222  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3223  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3224 
3225  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3226  switch (MI.getOpcode()) {
3227  default:
3228  llvm_unreachable("Unexpected opcode!");
3229  case AMDGPU::G_SDIV: {
3230  DstDivReg = MI.getOperand(0).getReg();
3231  TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3232  break;
3233  }
3234  case AMDGPU::G_SREM: {
3235  DstRemReg = MI.getOperand(0).getReg();
3236  TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3237  break;
3238  }
3239  case AMDGPU::G_SDIVREM: {
3240  DstDivReg = MI.getOperand(0).getReg();
3241  DstRemReg = MI.getOperand(1).getReg();
3242  TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3243  TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3244  break;
3245  }
3246  }
3247 
3248  if (Ty == S32)
3249  legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3250  else
3251  legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3252 
3253  if (DstDivReg) {
3254  auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3255  auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3256  B.buildSub(DstDivReg, SignXor, Sign);
3257  }
3258 
3259  if (DstRemReg) {
3260  auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3261  auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3262  B.buildSub(DstRemReg, SignXor, Sign);
3263  }
3264 
3265  MI.eraseFromParent();
3266  return true;
3267 }
3268 
3269 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3270  MachineRegisterInfo &MRI,
3271  MachineIRBuilder &B) const {
3272  Register Res = MI.getOperand(0).getReg();
3273  Register LHS = MI.getOperand(1).getReg();
3274  Register RHS = MI.getOperand(2).getReg();
3275  uint16_t Flags = MI.getFlags();
3276  LLT ResTy = MRI.getType(Res);
3277 
3278  const MachineFunction &MF = B.getMF();
3279  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3280  MI.getFlag(MachineInstr::FmAfn);
3281 
3282  if (!AllowInaccurateRcp)
3283  return false;
3284 
3285  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3286  // 1 / x -> RCP(x)
3287  if (CLHS->isExactlyValue(1.0)) {
3288  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3289  .addUse(RHS)
3290  .setMIFlags(Flags);
3291 
3292  MI.eraseFromParent();
3293  return true;
3294  }
3295 
3296  // -1 / x -> RCP( FNEG(x) )
3297  if (CLHS->isExactlyValue(-1.0)) {
3298  auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3299  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3300  .addUse(FNeg.getReg(0))
3301  .setMIFlags(Flags);
3302 
3303  MI.eraseFromParent();
3304  return true;
3305  }
3306  }
3307 
3308  // x / y -> x * (1.0 / y)
3309  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3310  .addUse(RHS)
3311  .setMIFlags(Flags);
3312  B.buildFMul(Res, LHS, RCP, Flags);
3313 
3314  MI.eraseFromParent();
3315  return true;
3316 }
3317 
3318 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3319  MachineRegisterInfo &MRI,
3320  MachineIRBuilder &B) const {
3321  Register Res = MI.getOperand(0).getReg();
3322  Register X = MI.getOperand(1).getReg();
3323  Register Y = MI.getOperand(2).getReg();
3324  uint16_t Flags = MI.getFlags();
3325  LLT ResTy = MRI.getType(Res);
3326 
3327  const MachineFunction &MF = B.getMF();
3328  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3329  MI.getFlag(MachineInstr::FmAfn);
3330 
3331  if (!AllowInaccurateRcp)
3332  return false;
3333 
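 // Roughly: two Newton-Raphson refinements of rcp(y), a multiply by x, and
 // one final fma-based correction of the quotient.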
3334  auto NegY = B.buildFNeg(ResTy, Y);
3335  auto One = B.buildFConstant(ResTy, 1.0);
3336 
3337  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3338  .addUse(Y)
3339  .setMIFlags(Flags);
3340 
3341  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3342  R = B.buildFMA(ResTy, Tmp0, R, R);
3343 
3344  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3345  R = B.buildFMA(ResTy, Tmp1, R, R);
3346 
3347  auto Ret = B.buildFMul(ResTy, X, R);
3348  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3349 
3350  B.buildFMA(Res, Tmp2, R, Ret);
3351  MI.eraseFromParent();
3352  return true;
3353 }
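// Illustrative sketch (not part of the legalizer): the same refinement
// sequence in plain C++ on doubles, using std::fma. The hardware amdgcn_rcp
// estimate is emulated here with a single-precision reciprocal, and the
// helper name is made up for the example.
#include <cmath>

static double fastUnsafeFDiv64Sketch(double X, double Y) {
  double NegY = -Y;
  // Crude stand-in for the rcp estimate.
  double R = static_cast<double>(1.0f / static_cast<float>(Y));

  // Two Newton-Raphson steps: R <- R + R * (1 - Y * R).
  double Tmp0 = std::fma(NegY, R, 1.0);
  R = std::fma(Tmp0, R, R);
  double Tmp1 = std::fma(NegY, R, 1.0);
  R = std::fma(Tmp1, R, R);

  // Multiply by the numerator and apply one residual correction.
  double Ret = X * R;
  double Tmp2 = std::fma(NegY, Ret, X);
  return std::fma(Tmp2, R, Ret);
}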
3354 
3355 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3356  MachineRegisterInfo &MRI,
3357  MachineIRBuilder &B) const {
3358  if (legalizeFastUnsafeFDIV(MI, MRI, B))
3359  return true;
3360 
3361  Register Res = MI.getOperand(0).getReg();
3362  Register LHS = MI.getOperand(1).getReg();
3363  Register RHS = MI.getOperand(2).getReg();
3364 
3365  uint16_t Flags = MI.getFlags();
3366 
3367  LLT S16 = LLT::scalar(16);
3368  LLT S32 = LLT::scalar(32);
3369 
3370  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3371  auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3372 
3373  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3374  .addUse(RHSExt.getReg(0))
3375  .setMIFlags(Flags);
3376 
3377  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3378  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3379 
3380  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3381  .addUse(RDst.getReg(0))
3382  .addUse(RHS)
3383  .addUse(LHS)
3384  .setMIFlags(Flags);
3385 
3386  MI.eraseFromParent();
3387  return true;
3388 }
3389 
3390 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3391 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3392 static void toggleSPDenormMode(bool Enable,
3393  MachineIRBuilder &B,
3394  const GCNSubtarget &ST,
3395  AMDGPU::SIModeRegisterDefaults Mode) {
3396  // Set SP denorm mode to this value.
3397  unsigned SPDenormMode =
3398  Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3399 
3400  if (ST.hasDenormModeInst()) {
3401  // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3402  uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3403 
3404  uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3405  B.buildInstr(AMDGPU::S_DENORM_MODE)
3406  .addImm(NewDenormModeValue);
3407 
3408  } else {
3409  // Select FP32 bit field in mode register.
3410  unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3413 
3414  B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3415  .addImm(SPDenormMode)
3416  .addImm(SPDenormModeBitField);
3417  }
3418 }
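// Illustrative sketch (not part of the legalizer): how the 4-bit operand for
// S_DENORM_MODE is assembled above. The FP32 field sits in bits [1:0] and the
// FP64/FP16 field in bits [3:2]; the field values themselves are
// target-defined, so only the packing arithmetic is shown.
#include <cstdint>

static uint32_t packDenormModeSketch(uint32_t SPDenormMode,
                                     uint32_t DPDenormModeDefault) {
  // Each field is two bits wide; the FP64/FP16 field sits just above FP32.
  return (SPDenormMode & 0x3u) | ((DPDenormModeDefault & 0x3u) << 2);
}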
3419 
3420 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3421  MachineRegisterInfo &MRI,
3422  MachineIRBuilder &B) const {
3423  if (legalizeFastUnsafeFDIV(MI, MRI, B))
3424  return true;
3425 
3426  Register Res = MI.getOperand(0).getReg();
3427  Register LHS = MI.getOperand(1).getReg();
3428  Register RHS = MI.getOperand(2).getReg();
3429  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3430  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3431 
3432  uint16_t Flags = MI.getFlags();
3433 
3434  LLT S32 = LLT::scalar(32);
3435  LLT S1 = LLT::scalar(1);
3436 
3437  auto One = B.buildFConstant(S32, 1.0f);
3438 
3439  auto DenominatorScaled =
3440  B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3441  .addUse(LHS)
3442  .addUse(RHS)
3443  .addImm(0)
3444  .setMIFlags(Flags);
3445  auto NumeratorScaled =
3446  B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3447  .addUse(LHS)
3448  .addUse(RHS)
3449  .addImm(1)
3450  .setMIFlags(Flags);
3451 
3452  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3453  .addUse(DenominatorScaled.getReg(0))
3454  .setMIFlags(Flags);
3455  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3456 
3457  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3458  // aren't modeled as reading it.
3459  if (!Mode.allFP32Denormals())
3460  toggleSPDenormMode(true, B, ST, Mode);
3461 
3462  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3463  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3464  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3465  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3466  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3467  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3468 
3469  if (!Mode.allFP32Denormals())
3470  toggleSPDenormMode(false, B, ST, Mode);
3471 
3472  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3473  .addUse(Fma4.getReg(0))
3474  .addUse(Fma1.getReg(0))
3475  .addUse(Fma3.getReg(0))
3476  .addUse(NumeratorScaled.getReg(1))
3477  .setMIFlags(Flags);
3478 
3479  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3480  .addUse(Fmas.getReg(0))
3481  .addUse(RHS)
3482  .addUse(LHS)
3483  .setMIFlags(Flags);
3484 
3485  MI.eraseFromParent();
3486  return true;
3487 }
3488 
3489 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3490  MachineRegisterInfo &MRI,
3491  MachineIRBuilder &B) const {
3492  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3493  return true;
3494 
3495  Register Res = MI.getOperand(0).getReg();
3496  Register LHS = MI.getOperand(1).getReg();
3497  Register RHS = MI.getOperand(2).getReg();
3498 
3499  uint16_t Flags = MI.getFlags();
3500 
3501  LLT S64 = LLT::scalar(64);
3502  LLT S1 = LLT::scalar(1);
3503 
3504  auto One = B.buildFConstant(S64, 1.0);
3505 
3506  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3507  .addUse(LHS)
3508  .addUse(RHS)
3509  .addImm(0)
3510  .setMIFlags(Flags);
3511 
3512  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3513 
3514  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3515  .addUse(DivScale0.getReg(0))
3516  .setMIFlags(Flags);
3517 
3518  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3519  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3520  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3521 
3522  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3523  .addUse(LHS)
3524  .addUse(RHS)
3525  .addImm(1)
3526  .setMIFlags(Flags);
3527 
3528  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3529  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3530  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3531 
3532  Register Scale;
3533  if (!ST.hasUsableDivScaleConditionOutput()) {
3534  // Workaround a hardware bug on SI where the condition output from div_scale
3535  // is not usable.
3536 
3537  LLT S32 = LLT::scalar(32);
3538 
3539  auto NumUnmerge = B.buildUnmerge(S32, LHS);
3540  auto DenUnmerge = B.buildUnmerge(S32, RHS);
3541  auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3542  auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3543 
3544  auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3545  Scale1Unmerge.getReg(1));
3546  auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3547  Scale0Unmerge.getReg(1));
3548  Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3549  } else {
3550  Scale = DivScale1.getReg(1);
3551  }
3552 
3553  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3554  .addUse(Fma4.getReg(0))
3555  .addUse(Fma3.getReg(0))
3556  .addUse(Mul.getReg(0))
3557  .addUse(Scale)
3558  .setMIFlags(Flags);
3559 
3560  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3561  .addUse(Fmas.getReg(0))
3562  .addUse(RHS)
3563  .addUse(LHS)
3564  .setMIFlags(Flags);
3565 
3566  MI.eraseFromParent();
3567  return true;
3568 }
3569 
3570 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3571  MachineRegisterInfo &MRI,
3572  MachineIRBuilder &B) const {
3573  Register Res = MI.getOperand(0).getReg();
3574  Register LHS = MI.getOperand(2).getReg();
3575  Register RHS = MI.getOperand(3).getReg();
3576  uint16_t Flags = MI.getFlags();
3577 
3578  LLT S32 = LLT::scalar(32);
3579  LLT S1 = LLT::scalar(1);
3580 
3581  auto Abs = B.buildFAbs(S32, RHS, Flags);
3582  const APFloat C0Val(1.0f);
3583 
3584  auto C0 = B.buildConstant(S32, 0x6f800000);
3585  auto C1 = B.buildConstant(S32, 0x2f800000);
3586  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3587 
3588  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3589  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3590 
3591  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3592 
3593  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3594  .addUse(Mul0.getReg(0))
3595  .setMIFlags(Flags);
3596 
3597  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3598 
3599  B.buildFMul(Res, Sel, Mul1, Flags);
3600 
3601  MI.eraseFromParent();
3602  return true;
3603 }
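// Illustrative sketch (not part of the legalizer): the range-scaling trick
// used by the fdiv.fast expansion above, in plain C++. 0x6f800000 is the bit
// pattern of roughly 2^96 and 0x2f800000 of 2^-32; scaling a huge denominator
// down before taking the reciprocal, then rescaling the quotient, keeps the
// intermediate reciprocal away from underflow. The plain 1.0f / x stands in
// for amdgcn_rcp.
#include <cmath>
#include <cstdint>
#include <cstring>

static float bitsToFloatSketch(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

static float fdivFastSketch(float LHS, float RHS) {
  const float C0 = bitsToFloatSketch(0x6f800000u); // ~2^96
  const float C1 = bitsToFloatSketch(0x2f800000u); // 2^-32
  float Sel = (std::fabs(RHS) > C0) ? C1 : 1.0f;
  float Mul0 = RHS * Sel;
  float Rcp = 1.0f / Mul0;
  float Mul1 = LHS * Rcp;
  return Sel * Mul1;
}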
3604 
3605 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3606 // FIXME: Why do we handle this one but not other removed instructions?
3607 //
3608 // Reciprocal square root. The clamp prevents infinite results, clamping
3609 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3610 // +-max_float.
3613  MachineIRBuilder &B) const {
3615  return true;
3616 
3617  Register Dst = MI.getOperand(0).getReg();
3618  Register Src = MI.getOperand(2).getReg();
3619  auto Flags = MI.getFlags();
3620 
3621  LLT Ty = MRI.getType(Dst);
3622 
3623  const fltSemantics *FltSemantics;
3624  if (Ty == LLT::scalar(32))
3625  FltSemantics = &APFloat::IEEEsingle();
3626  else if (Ty == LLT::scalar(64))
3627  FltSemantics = &APFloat::IEEEdouble();
3628  else
3629  return false;
3630 
3631  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3632  .addUse(Src)
3633  .setMIFlags(Flags);
3634 
3635  // We don't need to concern ourselves with the snan handling difference, since
3636  // the rsq result is quieted (or not) either way; use the one that will directly select.
3637  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3638  const bool UseIEEE = MFI->getMode().IEEE;
3639 
3640  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3641  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3642  B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3643 
3644  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3645 
3646  if (UseIEEE)
3647  B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3648  else
3649  B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3650  MI.eraseFromParent();
3651  return true;
3652 }
3653 
3654 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3655  switch (IID) {
3656  case Intrinsic::amdgcn_ds_fadd:
3657  return AMDGPU::G_ATOMICRMW_FADD;
3658  case Intrinsic::amdgcn_ds_fmin:
3659  return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3660  case Intrinsic::amdgcn_ds_fmax:
3661  return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3662  default:
3663  llvm_unreachable("not a DS FP intrinsic");
3664  }
3665 }
3666 
3668  MachineInstr &MI,
3669  Intrinsic::ID IID) const {
3670  GISelChangeObserver &Observer = Helper.Observer;
3671  Observer.changingInstr(MI);
3672 
3673  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3674 
3675  // The remaining operands were used to set fields in the MemOperand on
3676  // construction.
3677  for (int I = 6; I > 3; --I)
3678  MI.RemoveOperand(I);
3679 
3680  MI.RemoveOperand(1); // Remove the intrinsic ID.
3681  Observer.changedInstr(MI);
3682  return true;
3683 }
3684 
3687  MachineIRBuilder &B) const {
3688  uint64_t Offset =
3691  LLT DstTy = MRI.getType(DstReg);
3692  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3693 
3694  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3695  if (!loadInputValue(KernargPtrReg, B,
3696  AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3697  return false;
3698 
3699  // FIXME: This should be nuw
3700  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3701  return true;
3702 }
3703 
3704 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3705  MachineRegisterInfo &MRI,
3706  MachineIRBuilder &B) const {
3707  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3708  if (!MFI->isEntryFunction()) {
3709  return legalizePreloadedArgIntrin(MI, MRI, B,
3710  AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3711  }
3712 
3713  Register DstReg = MI.getOperand(0).getReg();
3714  if (!getImplicitArgPtr(DstReg, MRI, B))
3715  return false;
3716 
3717  MI.eraseFromParent();
3718  return true;
3719 }
3720 
3724  unsigned AddrSpace) const {
3725  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3726  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3727  Register Hi32 = Unmerge.getReg(1);
3728 
3729  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3730  MI.eraseFromParent();
3731  return true;
3732 }
3733 
3734 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3735 // offset (the offset that is included in bounds checking and swizzling, to be
3736 // split between the instruction's voffset and immoffset fields) and soffset
3737 // (the offset that is excluded from bounds checking and swizzling, to go in
3738 // the instruction's soffset field). This function takes the first kind of
3739 // offset and figures out how to split it between voffset and immoffset.
3740 std::pair<Register, unsigned>
3741 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3742  Register OrigOffset) const {
3743  const unsigned MaxImm = 4095;
3744  Register BaseReg;
3745  unsigned ImmOffset;
3746  const LLT S32 = LLT::scalar(32);
3747  MachineRegisterInfo &MRI = *B.getMRI();
3748 
3749  std::tie(BaseReg, ImmOffset) =
3750  AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3751 
3752  // If BaseReg is a pointer, convert it to int.
3753  if (MRI.getType(BaseReg).isPointer())
3754  BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3755 
3756  // If the immediate value is too big for the immoffset field, keep only its
3757  // low 12 bits there and move the remainder (a multiple of 4096) into the
3758  // voffset register, so that the copy/add feeding voffset stands more chance
3759  // of being CSEd with the copy/add for another similar load/store.
3760  // However, do not do that split if the multiple-of-4096 part is a negative
3761  // number, as it appears to be illegal to have a negative offset in the vgpr,
3762  // even if adding the immediate offset makes it positive.
3763  unsigned Overflow = ImmOffset & ~MaxImm;
3764  ImmOffset -= Overflow;
3765  if ((int32_t)Overflow < 0) {
3766  Overflow += ImmOffset;
3767  ImmOffset = 0;
3768  }
3769 
3770  if (Overflow != 0) {
3771  if (!BaseReg) {
3772  BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3773  } else {
3774  auto OverflowVal = B.buildConstant(S32, Overflow);
3775  BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3776  }
3777  }
3778 
3779  if (!BaseReg)
3780  BaseReg = B.buildConstant(S32, 0).getReg(0);
3781 
3782  return std::make_pair(BaseReg, ImmOffset);
3783 }
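// Illustrative sketch (not part of the legalizer): the arithmetic used above
// to split a constant buffer offset between the 12-bit immediate field and
// the voffset register, including the guard against leaving a negative value
// in the register. Only the constant part of the offset is modeled here.
#include <cstdint>
#include <utility>

// Returns {voffset contribution, immoffset field}.
static std::pair<uint32_t, uint32_t> splitBufferOffsetSketch(uint32_t Imm) {
  const uint32_t MaxImm = 4095;
  uint32_t Overflow = Imm & ~MaxImm; // multiple of 4096, goes in the register
  Imm -= Overflow;
  if (static_cast<int32_t>(Overflow) < 0) {
    // Don't round a negative offset down to a multiple of 4096; put the whole
    // value in the register part and nothing in the immediate field.
    Overflow += Imm;
    Imm = 0;
  }
  return {Overflow, Imm};
}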
3784 
3785 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3786 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3787  Register VOffset, Register SOffset,
3788  unsigned ImmOffset, Register VIndex,
3789  MachineRegisterInfo &MRI) const {
3790  Optional<ValueAndVReg> MaybeVOffsetVal =
3792  Optional<ValueAndVReg> MaybeSOffsetVal =
3794  Optional<ValueAndVReg> MaybeVIndexVal =
3796  // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3797  // update the MMO with that offset. The stride is unknown so we can only do
3798  // this if VIndex is constant 0.
3799  if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3800  MaybeVIndexVal->Value == 0) {
3801  uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3802  MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3803  MMO->setOffset(TotalOffset);
3804  } else {
3805  // We don't have a constant combined offset to use in the MMO. Give up.
3806  MMO->setValue((Value *)nullptr);
3807  }
3808 }
3809 
3810 /// Handle register layout difference for f16 images for some subtargets.
3811 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3812  MachineRegisterInfo &MRI,
3813  Register Reg,
3814  bool ImageStore) const {
3815  const LLT S16 = LLT::scalar(16);
3816  const LLT S32 = LLT::scalar(32);
3817  LLT StoreVT = MRI.getType(Reg);
3818  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3819 
3820  if (ST.hasUnpackedD16VMem()) {
3821  auto Unmerge = B.buildUnmerge(S16, Reg);
3822 
3823  SmallVector<Register, 4> WideRegs;
3824  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3825  WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3826 
3827  int NumElts = StoreVT.getNumElements();
3828 
3829  return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3830  .getReg(0);
3831  }
3832 
3833  if (ImageStore && ST.hasImageStoreD16Bug()) {
3834  if (StoreVT.getNumElements() == 2) {
3835  SmallVector<Register, 4> PackedRegs;
3836  Reg = B.buildBitcast(S32, Reg).getReg(0);
3837  PackedRegs.push_back(Reg);
3838  PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3839  return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3840  .getReg(0);
3841  }
3842 
3843  if (StoreVT.getNumElements() == 3) {
3844  SmallVector<Register, 4> PackedRegs;
3845  auto Unmerge = B.buildUnmerge(S16, Reg);
3846  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3847  PackedRegs.push_back(Unmerge.getReg(I));
3848  PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3849  Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3850  return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3851  }
3852 
3853  if (StoreVT.getNumElements() == 4) {
3854  SmallVector<Register, 4> PackedRegs;
3855  Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3856  auto Unmerge = B.buildUnmerge(S32, Reg);
3857  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3858  PackedRegs.push_back(Unmerge.getReg(I));
3859  PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3860  return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3861  .getReg(0);
3862  }
3863 
3864  llvm_unreachable("invalid data type");
3865  }
3866 
3867  if (StoreVT == LLT::fixed_vector(3, S16)) {
3868  Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
3869  .getReg(0);
3870  }
3871  return Reg;
3872 }
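// Illustrative sketch (not part of the legalizer): the "unpacked" D16 layout
// handled above, where each 16-bit element travels in the low half of its own
// 32-bit register. Plain C++ containers stand in for the unmerge /
// any-extend / build_vector sequence; the high bits are really undefined, and
// a zero-extension is just the simplest stand-in.
#include <cstdint>
#include <vector>

static std::vector<uint32_t> unpackD16Sketch(const std::vector<uint16_t> &Elts) {
  std::vector<uint32_t> Wide;
  Wide.reserve(Elts.size());
  for (uint16_t E : Elts)
    Wide.push_back(static_cast<uint32_t>(E)); // value occupies bits [15:0]
  return Wide;
}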
3873 
3874 Register AMDGPULegalizerInfo::fixStoreSourceType(
3875  MachineIRBuilder &B, Register VData, bool IsFormat) const {
3876  MachineRegisterInfo *MRI = B.getMRI();
3877  LLT Ty = MRI->getType(VData);
3878 
3879  const LLT S16 = LLT::scalar(16);
3880 
3881  // Fixup illegal register types for i8 stores.
3882  if (Ty == LLT::scalar(8) || Ty == S16) {
3883  Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3884  return AnyExt;
3885  }
3886 
3887  if (Ty.isVector()) {
3888  if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3889  if (IsFormat)
3890  return handleD16VData(B, *MRI, VData);
3891  }
3892  }
3893 
3894  return VData;
3895 }
3896 
3897 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3898  MachineRegisterInfo &MRI,
3899  MachineIRBuilder &B,
3900  bool IsTyped,
3901  bool IsFormat) const {
3902  Register VData = MI.getOperand(1).getReg();
3903  LLT Ty = MRI.getType(VData);
3904  LLT EltTy = Ty.getScalarType();
3905  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3906  const LLT S32 = LLT::scalar(32);
3907 
3908  VData = fixStoreSourceType(B, VData, IsFormat);
3909  Register RSrc = MI.getOperand(2).getReg();
3910 
3911  MachineMemOperand *MMO = *MI.memoperands_begin();
3912  const int MemSize = MMO->getSize();
3913 
3914  unsigned ImmOffset;
3915 
3916  // The typed intrinsics add an immediate after the registers.
3917  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3918 
3919  // The struct intrinsic variants add one additional operand over raw.
3920  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3921  Register VIndex;
3922  int OpOffset = 0;
3923  if (HasVIndex) {
3924  VIndex = MI.getOperand(3).getReg();
3925  OpOffset = 1;
3926  } else {
3927  VIndex = B.buildConstant(S32, 0).getReg(0);
3928  }
3929 
3930  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3931  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3932 
3933  unsigned Format = 0;
3934  if (IsTyped) {
3935  Format = MI.getOperand(5 + OpOffset).getImm();
3936  ++OpOffset;
3937  }
3938 
3939  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3940 
3941  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3942  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3943 
3944  unsigned Opc;
3945  if (IsTyped) {
3946  Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3947  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3948  } else if (IsFormat) {
3949  Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3950  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3951  } else {
3952  switch (MemSize) {
3953  case 1:
3954  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3955  break;
3956  case 2:
3957  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3958  break;
3959  default:
3960  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3961  break;
3962  }
3963  }
3964 
3965  auto MIB = B.buildInstr(Opc)
3966  .addUse(VData) // vdata
3967  .addUse(RSrc) // rsrc
3968  .addUse(VIndex) // vindex
3969  .addUse(VOffset) // voffset
3970  .addUse(SOffset) // soffset
3971  .addImm(ImmOffset); // offset(imm)
3972 
3973  if (IsTyped)
3974  MIB.addImm(Format);
3975 
3976  MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3977  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3978  .addMemOperand(MMO);
3979 
3980  MI.eraseFromParent();
3981  return true;
3982 }
3983 
3984 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3985  MachineRegisterInfo &MRI,
3986  MachineIRBuilder &B,
3987  bool IsFormat,
3988  bool IsTyped) const {
3989  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3990  MachineMemOperand *MMO = *MI.memoperands_begin();
3991  const LLT MemTy = MMO->getMemoryType();
3992  const LLT S32 = LLT::scalar(32);
3993 
3994  Register Dst = MI.getOperand(0).getReg();
3995  Register RSrc = MI.getOperand(2).getReg();
3996 
3997  // The typed intrinsics add an immediate after the registers.
3998  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3999 
4000  // The struct intrinsic variants add one additional operand over raw.
4001  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4002  Register VIndex;
4003  int OpOffset = 0;
4004  if (HasVIndex) {
4005  VIndex = MI.getOperand(3).getReg();
4006  OpOffset = 1;
4007  } else {
4008  VIndex = B.buildConstant(S32, 0).getReg(0);
4009  }
4010 
4011  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4012  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4013 
4014  unsigned Format = 0;
4015  if (IsTyped) {
4016  Format = MI.getOperand(5 + OpOffset).getImm();
4017  ++OpOffset;
4018  }
4019 
4020  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4021  unsigned ImmOffset;
4022 
4023  LLT Ty = MRI.getType(Dst);
4024  LLT EltTy = Ty.getScalarType();
4025  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4026  const bool Unpacked = ST.hasUnpackedD16VMem();
4027 
4028  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4029  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4030 
4031  unsigned Opc;
4032 
4033  if (IsTyped) {
4034  Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4035  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4036  } else if (IsFormat) {
4037  Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4038  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4039  } else {
4040  switch (MemTy.getSizeInBits()) {
4041  case 8:
4042  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4043  break;
4044  case 16:
4045  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4046  break;
4047  default:
4048  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4049  break;
4050  }
4051  }
4052 
4053  Register LoadDstReg;
4054 
4055  bool IsExtLoad =
4056  (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4057  LLT UnpackedTy = Ty.changeElementSize(32);
4058 
4059  if (IsExtLoad)
4060  LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4061  else if (Unpacked && IsD16 && Ty.isVector())
4062  LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4063  else
4064  LoadDstReg = Dst;
4065 
4066  auto MIB = B.buildInstr(Opc)
4067  .addDef(LoadDstReg) // vdata
4068  .addUse(RSrc) // rsrc
4069  .addUse(VIndex) // vindex
4070  .addUse(VOffset) // voffset
4071  .addUse(SOffset) // soffset
4072  .addImm(ImmOffset); // offset(imm)
4073 
4074  if (IsTyped)
4075  MIB.addImm(Format);
4076 
4077  MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4078  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4079  .addMemOperand(MMO);
4080 
4081  if (LoadDstReg != Dst) {
4082  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4083 
4084  // The result for an extending load was widened; truncate it back down.
4085  if (IsExtLoad)
4086  B.buildTrunc(Dst, LoadDstReg);
4087  else {
4088  // Repack to original 16-bit vector result
4089  // FIXME: G_TRUNC should work, but legalization currently fails
4090  auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4091  SmallVector<Register, 4> Repack;
4092  for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4093  Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4094  B.buildMerge(Dst, Repack);
4095  }
4096  }
4097 
4098  MI.eraseFromParent();
4099  return true;
4100 }
4101 
4104  bool IsInc) const {
4105  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4106  AMDGPU::G_AMDGPU_ATOMIC_DEC;
4107  B.buildInstr(Opc)
4108  .addDef(MI.getOperand(0).getReg())
4109  .addUse(MI.getOperand(2).getReg())
4110  .addUse(MI.getOperand(3).getReg())
4111  .cloneMemRefs(MI);
4112  MI.eraseFromParent();
4113  return true;
4114 }
4115 
4116 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4117  switch (IntrID) {
4118  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4119  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4120  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4121  case Intrinsic::amdgcn_raw_buffer_atomic_add:
4122  case Intrinsic::amdgcn_struct_buffer_atomic_add:
4123  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4124  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4125  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4126  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4127  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4128  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4129  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4130  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4131  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4132  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4133  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4134  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4135  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4136  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4137  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4138  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4139  case Intrinsic::amdgcn_raw_buffer_atomic_and:
4140  case Intrinsic::amdgcn_struct_buffer_atomic_and:
4141  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4142  case Intrinsic::amdgcn_raw_buffer_atomic_or:
4143  case Intrinsic::amdgcn_struct_buffer_atomic_or:
4144  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4145  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4146  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4147  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4148  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4149  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4150  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4151  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4152  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4153  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4154  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4155  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4156  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4157  case Intrinsic::amdgcn_buffer_atomic_fadd:
4158  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4159  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4160  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4161  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4162  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4163  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4164  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4165  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4166  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4167  default:
4168  llvm_unreachable("unhandled atomic opcode");
4169  }
4170 }
4171 
4174  Intrinsic::ID IID) const {
4175  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4176  IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4177  const bool HasReturn = MI.getNumExplicitDefs() != 0;
4178 
4179  Register Dst;
4180 
4181  int OpOffset = 0;
4182  if (HasReturn) {
4183  // A few FP atomics do not support return values.
4184  Dst = MI.getOperand(0).getReg();
4185  } else {
4186  OpOffset = -1;
4187  }
4188 
4189  Register VData = MI.getOperand(2 + OpOffset).getReg();
4190  Register CmpVal;
4191 
4192  if (IsCmpSwap) {
4193  CmpVal = MI.getOperand(3 + OpOffset).getReg();
4194  ++OpOffset;
4195  }
4196 
4197  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4198  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4199 
4200  // The struct intrinsic variants add one additional operand over raw.
4201  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4202  Register VIndex;
4203  if (HasVIndex) {
4204  VIndex = MI.getOperand(4 + OpOffset).getReg();
4205  ++OpOffset;
4206  } else {
4207  VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4208  }
4209 
4210  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4211  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4212  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4213 
4214  MachineMemOperand *MMO = *MI.memoperands_begin();
4215 
4216  unsigned ImmOffset;
4217  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4218  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4219 
4220  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4221 
4222  if (HasReturn)
4223  MIB.addDef(Dst);
4224 
4225  MIB.addUse(VData); // vdata
4226 
4227  if (IsCmpSwap)
4228  MIB.addReg(CmpVal);
4229 
4230  MIB.addUse(RSrc) // rsrc
4231  .addUse(VIndex) // vindex
4232  .addUse(VOffset) // voffset
4233  .addUse(SOffset) // soffset
4234  .addImm(ImmOffset) // offset(imm)
4235  .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4236  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4237  .addMemOperand(MMO);
4238 
4239  MI.eraseFromParent();
4240  return true;
4241 }
4242 
4243 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4244 /// vector with s16 typed elements.
4245 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4246  SmallVectorImpl<Register> &PackedAddrs,
4247  unsigned ArgOffset,
4248  const AMDGPU::ImageDimIntrinsicInfo *Intr,
4249  bool IsA16, bool IsG16) {
4250  const LLT S16 = LLT::scalar(16);
4251  const LLT V2S16 = LLT::fixed_vector(2, 16);
4252  auto EndIdx = Intr->VAddrEnd;
4253 
4254  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4255  MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4256  if (!SrcOp.isReg())
4257  continue; // _L to _LZ may have eliminated this.
4258 
4259  Register AddrReg = SrcOp.getReg();
4260 
4261  if ((I < Intr->GradientStart) ||
4262  (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4263  (I >= Intr->CoordStart && !IsA16)) {
4264  // Handle any gradient or coordinate operands that should not be packed
4265  if ((I < Intr->GradientStart) && IsA16 &&
4266  (B.getMRI()->getType(AddrReg) == S16)) {
4267  // Special handling of bias when A16 is on. Bias is of type half but
4268  // occupies full 32-bit.
4269  PackedAddrs.push_back(
4270  B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4271  .getReg(0));
4272  } else {
4273  AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4274  PackedAddrs.push_back(AddrReg);
4275  }
4276  } else {
4277  // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4278  // derivatives dx/dh and dx/dv are packed with undef.
4279  if (((I + 1) >= EndIdx) ||
4280  ((Intr->NumGradients / 2) % 2 == 1 &&
4281  (I == static_cast<unsigned>(Intr->GradientStart +
4282  (Intr->NumGradients / 2) - 1) ||
4283  I == static_cast<unsigned>(Intr->GradientStart +
4284  Intr->NumGradients - 1))) ||
4285  // Check for _L to _LZ optimization
4286  !MI.getOperand(ArgOffset + I + 1).isReg()) {
4287  PackedAddrs.push_back(
4288  B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4289  .getReg(0));
4290  } else {
4291  PackedAddrs.push_back(
4292  B.buildBuildVector(
4293  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4294  .getReg(0));
4295  ++I;
4296  }
4297  }
4298  }
4299 }
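// Illustrative sketch (not part of the legalizer): the basic A16/G16 packing
// rule above, where consecutive 16-bit values are paired into 32-bit dwords
// and an element without a partner is padded (with undef in the real code,
// zero here). This is a simplification: the real routine also avoids pairing
// across gradient-group boundaries and gives the bias operand a dword of its
// own.
#include <cstdint>
#include <vector>

static std::vector<uint32_t> packA16Sketch(const std::vector<uint16_t> &Vals) {
  std::vector<uint32_t> Packed;
  for (size_t I = 0; I < Vals.size(); I += 2) {
    uint32_t Lo = Vals[I];
    uint32_t Hi = (I + 1 < Vals.size()) ? Vals[I + 1] : 0; // undef in the real code
    Packed.push_back(Lo | (Hi << 16));
  }
  return Packed;
}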
4300 
4301 /// Convert from separate vaddr components to a single vector address register,
4302 /// and replace the remaining operands with $noreg.
4303 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4304  int DimIdx, int NumVAddrs) {
4305  const LLT S32 = LLT::scalar(32);
4306 
4307  SmallVector<Register, 8> AddrRegs;
4308  for (int I = 0; I != NumVAddrs; ++I) {
4309  MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4310  if (SrcOp.isReg()) {
4311  AddrRegs.push_back(SrcOp.getReg());
4312  assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4313  }
4314  }
4315 
4316  int NumAddrRegs = AddrRegs.size();
4317  if (NumAddrRegs != 1) {
4318  // Above 8 elements round up to next power of 2 (i.e. 16).
4319  if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4320  const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4321  auto Undef = B.buildUndef(S32);
4322  AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4323  NumAddrRegs = RoundedNumRegs;
4324  }
4325 
4326  auto VAddr =
4327  B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4328  MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4329  }
4330 
4331  for (int I = 1; I != NumVAddrs; ++I) {
4332  MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4333  if (SrcOp.isReg())
4334  MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4335  }
4336 }
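// Illustrative sketch (not part of the legalizer): the padding rule above for
// non-NSA vaddr vectors. When more than 8 address registers are present and
// the count is not already a power of two, the list is padded up to the next
// power of two (with undef registers in the real code) so that a build_vector
// of a supported width can be formed.
#include <cstdint>
#include <vector>

static unsigned nextPowerOf2Sketch(unsigned N) {
  unsigned P = 1;
  while (P < N)
    P <<= 1;
  return P;
}

static void padAddrRegsSketch(std::vector<uint32_t> &AddrRegs) {
  const unsigned N = AddrRegs.size();
  const bool IsPow2 = N != 0 && (N & (N - 1)) == 0;
  if (N > 8 && !IsPow2)
    AddrRegs.resize(nextPowerOf2Sketch(N), 0u); // 0 stands in for undef
}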
4337 
4338 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4339 ///
4340 /// Depending on the subtarget, load/store with 16-bit element data need to be
4341 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4342 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4343 /// registers.
4344 ///
4345 /// We don't want to directly select image instructions just yet, but also want
4346 /// to expose all register repacking to the legalizer/combiners. We also don't
4347 /// want a selected instruction entering RegBankSelect. In order to avoid
4348 /// defining a multitude of intermediate image instructions, directly hack on
4349 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4350 /// padding now unnecessary arguments with $noreg.
4351 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4352  MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4353  const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4354 
4355  const unsigned NumDefs = MI.getNumExplicitDefs();
4356  const unsigned ArgOffset = NumDefs + 1;
4357  bool IsTFE = NumDefs == 2;
4358  // We are only processing the operands of d16 image operations on subtargets
4359  // that use the unpacked register layout, or need to repack the TFE result.
4360 
4361  // TODO: Do we need to guard against already legalized intrinsics?
4362  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4363  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4364 
4365  MachineRegisterInfo *MRI = B.getMRI();
4366  const LLT S32 = LLT::scalar(32);
4367  const LLT S16 = LLT::scalar(16);
4368  const LLT V2S16 = LLT::fixed_vector(2, 16);
4369 
4370  unsigned DMask = 0;
4371  Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4372  LLT Ty = MRI->getType(VData);
4373 
4374  // Check for 16 bit addresses and pack if true.
4375  LLT GradTy =
4376  MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4377  LLT AddrTy =
4378  MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4379  const bool IsG16 = GradTy == S16;
4380  const bool IsA16 = AddrTy == S16;
4381  const bool IsD16 = Ty.getScalarType() == S16;
4382 
4383  int DMaskLanes = 0;
4384  if (!BaseOpcode->Atomic) {
4385  DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4386  if (BaseOpcode->Gather4) {
4387  DMaskLanes = 4;
4388  } else if (DMask != 0) {
4389  DMaskLanes = countPopulation(DMask);
4390  } else if (!IsTFE && !BaseOpcode->Store) {
4391  // If dmask is 0, this is a no-op load. This can be eliminated.
4392  B.buildUndef(MI.getOperand(0));
4393  MI.eraseFromParent();
4394  return true;
4395  }
4396  }
4397 
4398  Observer.changingInstr(MI);
4399  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4400 
4401  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4402  : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4403  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4404  : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4405  unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4406 
4407  // Track that we legalized this
4408  MI.setDesc(B.getTII().get(NewOpcode));
4409 
4410  // Expecting to get an error flag since TFC is on and dmask is 0. Force
4411  // dmask to be at least 1, otherwise the instruction will fail.
4412  if (IsTFE && DMask == 0) {
4413  DMask = 0x1;
4414  DMaskLanes = 1;
4415  MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4416  }
4417 
4418  if (BaseOpcode->Atomic) {
4419  Register VData0 = MI.getOperand(2).getReg();
4420  LLT Ty = MRI->getType(VData0);
4421 
4422  // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4423  if (Ty.isVector())
4424  return false;
4425 
4426  if (BaseOpcode->AtomicX2) {
4427  Register VData1 = MI.getOperand(3).getReg();
4428  // The two values are packed in one register.
4429  LLT PackedTy = LLT::fixed_vector(2, Ty);
4430  auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4431  MI.getOperand(2).setReg(Concat.getReg(0));
4432  MI.getOperand(3).setReg(AMDGPU::NoRegister);
4433  }
4434  }
4435 
4436  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4437 
4438  // Optimize _L to _LZ when _L is zero
4439  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4440  AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4441  const ConstantFP *ConstantLod;
4442 
4443  if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4444  m_GFCst(ConstantLod))) {
4445  if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4446  // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4447  const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4448  AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
4449  Intr->Dim);
4450 
4451  // The starting indexes should remain in the same place.
4452  --CorrectedNumVAddrs;
4453 
4454  MI.getOperand(NumDefs).setIntrinsicID(
4455  static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4456  MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4457  Intr = NewImageDimIntr;
4458  }
4459  }
4460  }
4461 
4462  // Optimize _mip away, when 'lod' is zero
4463  if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4464  int64_t ConstantLod;
4465  if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4466  m_ICst(ConstantLod))) {
4467  if (ConstantLod == 0) {
4468  // TODO: Change the intrinsic opcode and remove the operand instead of
4469  // replacing it with 0, as the _L to _LZ handling is done above.
4470  MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4471  --CorrectedNumVAddrs;
4472  }
4473  }
4474  }
4475 
4476  // Rewrite the addressing register layout before doing anything else.
4477  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4478  // 16 bit gradients are supported, but are tied to the A16 control
4479  // so both gradients and addresses must be 16 bit
4480  return false;
4481  }
4482 
4483  if (IsA16 && !ST.hasA16()) {
4484  // A16 not supported
4485  return false;
4486  }
4487 
4488  if (IsA16 || IsG16) {
4489  if (Intr->NumVAddrs > 1) {
4490  SmallVector<Register, 4> PackedRegs;
4491 
4492  packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4493  IsG16);
4494 
4495  // See also below in the non-a16 branch
4496  const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4497  PackedRegs.size() <= ST.getNSAMaxSize();
4498 
4499  if (!UseNSA && PackedRegs.size() > 1) {
4500  LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4501  auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4502  PackedRegs[0] = Concat.getReg(0);
4503  PackedRegs.resize(1);
4504  }
4505 
4506  const unsigned NumPacked = PackedRegs.size();
4507  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4508  MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4509  if (!SrcOp.isReg()) {
4510  assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4511  continue;
4512  }
4513 
4514  assert(SrcOp.getReg() != AMDGPU::NoRegister);
4515 
4516  if (I - Intr->VAddrStart < NumPacked)
4517  SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4518  else
4519  SrcOp.setReg(AMDGPU::NoRegister);
4520  }
4521  }
4522  } else {
4523  // If the register allocator cannot place the address registers contiguously
4524  // without introducing moves, then using the non-sequential address encoding
4525  // is always preferable, since it saves VALU instructions and is usually a
4526  // wash in terms of code size or even better.
4527  //
4528  // However, we currently have no way of hinting to the register allocator
4529  // that MIMG addresses should be placed contiguously when it is possible to
4530  // do so, so force non-NSA for the common 2-address case as a heuristic.
4531  //
4532  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4533  // allocation when possible.
4534  const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4535  CorrectedNumVAddrs <= ST.getNSAMaxSize();
4536 
4537  if (!UseNSA && Intr->NumVAddrs > 1)
4538  convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4539  Intr->NumVAddrs);
4540  }
4541 
4542  int Flags = 0;
4543  if (IsA16)
4544  Flags |= 1;
4545  if (IsG16)
4546  Flags |= 2;
4547  MI.addOperand(MachineOperand::CreateImm(Flags));
4548 
4549  if (BaseOpcode->Store) { // No TFE for stores?
4550  // TODO: Handle dmask trim
4551  if (!Ty.isVector() || !IsD16)
4552  return true;
4553 
4554  Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4555  if (RepackedReg != VData) {
4556  MI.getOperand(1).setReg(RepackedReg);
4557  }
4558 
4559  return true;
4560  }
4561 
4562  Register DstReg = MI.getOperand(0).getReg();
4563  const LLT EltTy = Ty.getScalarType();
4564  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4565 
4566  // Confirm that the return type is large enough for the dmask specified
4567  if (NumElts < DMaskLanes)
4568  return false;
4569 
4570  if (NumElts > 4 || DMaskLanes > 4)
4571  return false;
4572 
4573  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4574  const LLT AdjustedTy =
4575  Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4576 
4577  // The raw dword aligned data component of the load. The only legal cases
4578  // where this matters should be when using the packed D16 format, for
4579  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
4580  LLT RoundedTy;
4581 
4582  // S32 vector to cover all data, plus the TFE result element.
4583  LLT TFETy;
4584 
4585  // Register type to use for each loaded component. Will be S32 or V2S16.
4586  LLT RegTy;
4587 
4588  if (IsD16 && ST.hasUnpackedD16VMem()) {
4589  RoundedTy =
4590  LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4591  TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4592  RegTy = S32;
4593  } else {
4594  unsigned EltSize = EltTy.getSizeInBits();
4595  unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4596  unsigned RoundedSize = 32 * RoundedElts;
4597  RoundedTy = LLT::scalarOrVector(
4598  ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4599  TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4600  RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4601  }
4602 
4603  // The return type does not need adjustment.
4604  // TODO: Should we change s16 case to s32 or <2 x s16>?
4605  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4606  return true;
4607 
4608  Register Dst1Reg;
4609 
4610  // Insert after the instruction.
4611  B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4612 
4613  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4614  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4615  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4616  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4617 
4618  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4619 
4620  MI.getOperand(0).setReg(NewResultReg);
4621 
4622  // In the IR, TFE is supposed to be used with a 2 element struct return
4623  // type. The instruction really returns these two values in one contiguous
4624  // register, with one additional dword beyond the loaded data. Rewrite the
4625  // return type to use a single register result.
4626 
4627  if (IsTFE) {
4628  Dst1Reg = MI.getOperand(1).getReg();
4629  if (MRI->getType(Dst1Reg) != S32)
4630  return false;
4631 
4632  // TODO: Make sure the TFE operand bit is set.
4633  MI.RemoveOperand(1);
4634 
4635  // Handle the easy case that requires no repack instructions.
4636  if (Ty == S32) {
4637  B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4638  return true;
4639  }
4640  }
4641 
4642  // Now figure out how to copy the new result register back into the old
4643  // result.
4644  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4645 
4646  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4647 
4648  if (ResultNumRegs == 1) {
4649  assert(!IsTFE);
4650  ResultRegs[0] = NewResultReg;
4651  } else {
4652  // We have to repack into a new vector of some kind.
4653  for (int I = 0; I != NumDataRegs; ++I)
4654  ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4655  B.buildUnmerge(ResultRegs, NewResultReg);
4656 
4657  // Drop the final TFE element to get the data part. The TFE result is
4658  // directly written to the right place already.
4659  if (IsTFE)
4660  ResultRegs.resize(NumDataRegs);
4661  }
4662 
4663  // For an s16 scalar result, we form an s32 result with a truncate regardless
4664  // of packed vs. unpacked.
4665  if (IsD16 && !Ty.isVector()) {
4666  B.buildTrunc(DstReg, ResultRegs[0]);
4667  return true;
4668  }
4669 
4670  // Avoid a build/concat_vector of 1 entry.
4671  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4672  B.buildBitcast(DstReg, ResultRegs[0]);
4673  return true;
4674  }
4675 
4676  assert(Ty.isVector());
4677 
4678  if (IsD16) {
4679  // For packed D16 results with TFE enabled, all the data components are
4680  // S32. Cast back to the expected type.
4681  //
4682  // TODO: We don't really need to load s32 elements. We would only need one
4683  // cast for the TFE result if a multiple of v2s16 was used.
4684  if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4685  for (Register &Reg : ResultRegs)
4686  Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4687  } else if (ST.hasUnpackedD16VMem()) {
4688  for (Register &Reg : ResultRegs)
4689  Reg = B.buildTrunc(S16, Reg).getReg(0);
4690  }
4691  }
4692 
4693  auto padWithUndef = [&](LLT Ty, int NumElts) {
4694  if (NumElts == 0)
4695  return;
4696  Register Undef = B.buildUndef(Ty).getReg(0);
4697  for (int I = 0; I != NumElts; ++I)
4698  ResultRegs.push_back(Undef);
4699  };
4700 
4701  // Pad out any elements eliminated due to the dmask.
4702  LLT ResTy = MRI->getType(ResultRegs[0]);
4703  if (!ResTy.isVector()) {
4704  padWithUndef(ResTy, NumElts - ResultRegs.size());
4705  B.buildBuildVector(DstReg, ResultRegs);
4706  return true;
4707  }
4708 
4709  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4710  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4711 
4712  // Deal with the one annoying legal case.
4713  const LLT V3S16 = LLT::fixed_vector(3, 16);
4714  if (Ty == V3S16) {
4715  if (IsTFE) {
4716  if (ResultRegs.size() == 1) {
4717  NewResultReg = ResultRegs[0];
4718  } else if (ResultRegs.size() == 2) {
4719  LLT V4S16 = LLT::fixed_vector(4, 16);
4720  NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
4721  } else {
4722  return false;
4723  }
4724  }
4725 
4726  if (MRI->getType(DstReg).getNumElements() <
4727  MRI->getType(NewResultReg).getNumElements()) {
4728  B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
4729  } else {
4730  B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
4731  }
4732  return true;
4733  }
4734 
4735  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4736  B.buildConcatVectors(DstReg, ResultRegs);
4737  return true;
4738 }
4739 
4740 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4741  LegalizerHelper &Helper, MachineInstr &MI) const {
4742  MachineIRBuilder &B = Helper.MIRBuilder;
4743  GISelChangeObserver &Observer = Helper.Observer;
4744 
4745  Register Dst = MI.getOperand(0).getReg();
4746  LLT Ty = B.getMRI()->getType(Dst);
4747  unsigned Size = Ty.getSizeInBits();
4748  MachineFunction &MF = B.getMF();
4749 
4750  Observer.changingInstr(MI);
4751 
4753  Ty = getBitcastRegisterType(Ty);
4754  Helper.bitcastDst(MI, Ty, 0);
4755  Dst = MI.getOperand(0).getReg();
4756  B.setInsertPt(B.getMBB(), MI);
4757  }
4758 
4759  // FIXME: We don't really need this intermediate instruction. The intrinsic
4760  // should be fixed to have a memory operand. Since it's readnone, we're not
4761  // allowed to add one.
4762  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4763  MI.RemoveOperand(1); // Remove intrinsic ID
4764 
4765  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4766  // TODO: Should this use datalayout alignment?
4767  const unsigned MemSize = (Size + 7) / 8;
4768  const Align MemAlign(4);
4769  MachineMemOperand *MMO = MF.getMachineMemOperand(
4770  MachinePointerInfo(),
4771  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4772  MachineMemOperand::MOInvariant,
4773  MemSize, MemAlign);
4774  MI.addMemOperand(MF, MMO);
4775 
4776  // There are no 96-bit result scalar loads, but widening to 128-bit should
4777  // always be legal. We may need to restore this to a 96-bit result if it turns
4778  // out this needs to be converted to a vector load during RegBankSelect.
4779  if (!isPowerOf2_32(Size)) {
4780  if (Ty.isVector())
4781  Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4782  else
4783  Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4784  }
4785 
4786  Observer.changedInstr(MI);
4787  return true;
4788 }
4789 
4790 // TODO: Move to selection
4793  MachineIRBuilder &B) const {
4794  if (!ST.isTrapHandlerEnabled() ||
4796  return legalizeTrapEndpgm(MI, MRI, B);
4797 
4798  if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4799  switch (*HsaAbiVer) {
4802  return legalizeTrapHsaQueuePtr(MI, MRI, B);
4804  return ST.supportsGetDoorbellID() ?
4805  legalizeTrapHsa(MI, MRI, B) :
4807  }
4808  }
4809 
4810  llvm_unreachable("Unknown trap handler");
4811 }
4812 
4813 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4814  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4815  B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4816  MI.eraseFromParent();
4817  return true;
4818 }
4819 
4820 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4821  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4822  // Pass queue pointer to trap handler as input, and insert trap instruction
4823  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4824  Register LiveIn =
4827  return false;
4828 
4829  Register SGPR01(AMDGPU::SGPR0_SGPR1);
4830  B.buildCopy(SGPR01, LiveIn);
4831  B.buildInstr(AMDGPU::S_TRAP)
4832  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4833  .addReg(SGPR01, RegState::Implicit);
4834 
4835  MI.eraseFromParent();
4836  return true;
4837 }
4838 
4839 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4840  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4841  B.buildInstr(AMDGPU::S_TRAP)
4842  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4843  MI.eraseFromParent();
4844  return true;
4845 }
4846 
4847 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4848  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4849  // Is non-HSA path or trap-handler disabled? Then, report a warning
4850  // accordingly
4851  if (!ST.isTrapHandlerEnabled() ||
4853  DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4854  "debugtrap handler not supported",
4855  MI.getDebugLoc(), DS_Warning);
4856  LLVMContext &Ctx = B.getMF().getFunction().getContext();
4857  Ctx.diagnose(NoTrap);
4858  } else {
4859  // Insert debug-trap instruction
4860  B.buildInstr(AMDGPU::S_TRAP)
4861  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4862  }
4863 
4864  MI.eraseFromParent();
4865  return true;
4866 }
4867 
4869  MachineIRBuilder &B) const {
4870  MachineRegisterInfo &MRI = *B.getMRI();
4871  const LLT S16 = LLT::scalar(16);
4872  const LLT S32 = LLT::scalar(32);
4873 
4874  Register DstReg = MI.getOperand(0).getReg();
4875  Register NodePtr = MI.getOperand(2).getReg();
4876  Register RayExtent = MI.getOperand(3).getReg();
4877  Register RayOrigin = MI.getOperand(4).getReg();
4878  Register RayDir = MI.getOperand(5).getReg();
4879  Register RayInvDir = MI.getOperand(6).getReg();
4880  Register TDescr = MI.getOperand(7).getReg();
4881 
4882  if (!ST.hasGFX10_AEncoding()) {
4883  DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4884  "intrinsic not supported on subtarget",
4885  MI.getDebugLoc());
4886  B.getMF().getFunction().getContext().diagnose(BadIntrin);
4887  return false;
4888  }
4889 
4890  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4891  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4892  const unsigned NumVDataDwords = 4;
4893  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4894  const bool UseNSA =
4895  ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4896  const unsigned BaseOpcodes[2][2] = {
4897  {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4898  {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4899  AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4900  int Opcode;
4901  if (UseNSA) {
4902  Opcode =
4903  AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4904  NumVDataDwords, NumVAddrDwords);
4905  } else {
4906  Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4907  AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4908  PowerOf2Ceil(NumVAddrDwords));
4909  }
4910  assert(Opcode != -1);
4911 
4913  if (Is64) {
4914  auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4915  Ops.push_back(Unmerge.getReg(0));
4916  Ops.push_back(Unmerge.getReg(1));
4917  } else {
4918  Ops.push_back(NodePtr);
4919  }
4920  Ops.push_back(RayExtent);
4921 
4922  auto packLanes = [&Ops, &S32, &B](Register Src) {
4923  auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
4924  Ops.push_back(Unmerge.getReg(0));
4925  Ops.push_back(Unmerge.getReg(1));
4926  Ops.push_back(Unmerge.getReg(2));
4927  };
4928 
4929  packLanes(RayOrigin);
4930  if (IsA16) {
4931  auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
4932  auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
4933  Register R1 = MRI.createGenericVirtualRegister(S32);
4934  Register R2 = MRI.createGenericVirtualRegister(S32);
4935  Register R3 = MRI.createGenericVirtualRegister(S32);
4936  B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4937  B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4938  B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4939  Ops.push_back(R1);
4940  Ops.push_back(R2);
4941  Ops.push_back(R3);
4942  } else {
4943  packLanes(RayDir);
4944  packLanes(RayInvDir);
4945  }
4946 
4947  if (!UseNSA) {
4948  // Build a single vector containing all the operands so far prepared.
4949  LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4950  Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4951  Ops.clear();
4952  Ops.push_back(MergedOps);
4953  }
4954 
4955  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4956  .addDef(DstReg)
4957  .addImm(Opcode);
4958 
4959  for (Register R : Ops) {
4960  MIB.addUse(R);
4961  }
4962 
4963  MIB.addUse(TDescr)
4964  .addImm(IsA16 ? 1 : 0)
4965  .cloneMemRefs(MI);
4966 
4967  MI.eraseFromParent();
4968  return true;
4969 }
4970 
4972  B.buildConstant(MI.getOperand(0).getReg(), C);
4973  MI.eraseFromParent();
4974  return true;
4975 }
4976 
4977 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4978  MachineInstr &MI) const {
4979  MachineIRBuilder &B = Helper.MIRBuilder;
4980  MachineRegisterInfo &MRI = *B.getMRI();
4981 
4982  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4983  auto IntrID = MI.getIntrinsicID();
4984  switch (IntrID) {
4985  case Intrinsic::amdgcn_if:
4986  case Intrinsic::amdgcn_else: {
4987  MachineInstr *Br = nullptr;
4988  MachineBasicBlock *UncondBrTarget = nullptr;
4989  bool Negated = false;
4990  if (MachineInstr *BrCond =
4991  verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4992  const SIRegisterInfo *TRI
4993  = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4994 
4995  Register Def = MI.getOperand(1).getReg();
4996  Register Use = MI.getOperand(3).getReg();
4997 
4998  MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4999 
5000  if (Negated)
5001  std::swap(CondBrTarget, UncondBrTarget);
5002 
5003  B.setInsertPt(B.getMBB(), BrCond->getIterator());
5004  if (IntrID == Intrinsic::amdgcn_if) {
5005  B.buildInstr(AMDGPU::SI_IF)
5006  .addDef(Def)
5007  .addUse(Use)
5008  .addMBB(UncondBrTarget);
5009  } else {
5010  B.buildInstr(AMDGPU::SI_ELSE)
5011  .addDef(Def)
5012  .addUse(Use)
5013  .addMBB(UncondBrTarget);
5014  }
5015 
5016  if (Br) {
5017  Br->getOperand(0).setMBB(CondBrTarget);
5018  } else {
5019  // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5020  // since we're swapping branch targets it needs to be reinserted.
5021  // FIXME: IRTranslator should probably not do this
5022  B.buildBr(*CondBrTarget);
5023  }
5024 
5025  MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5026  MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5027  MI.eraseFromParent();
5028  BrCond->eraseFromParent();
5029  return true;
5030  }
5031 
5032  return false;
5033  }
5034  case Intrinsic::amdgcn_loop: {
5035  MachineInstr *Br = nullptr;
5036  MachineBasicBlock *UncondBrTarget = nullptr;
5037  bool Negated = false;
5038  if (MachineInstr *BrCond =
5039  verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5040  const SIRegisterInfo *TRI
5041  = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5042 
5043  MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5044  Register Reg = MI.getOperand(2).getReg();
5045 
5046  if (Negated)
5047  std::swap(CondBrTarget, UncondBrTarget);
5048 
5049  B.setInsertPt(B.getMBB(), BrCond->getIterator());
5050  B.buildInstr(AMDGPU::SI_LOOP)
5051  .addUse(Reg)
5052  .addMBB(UncondBrTarget);
5053 
5054  if (Br)
5055  Br->getOperand(0).setMBB(CondBrTarget);
5056  else
5057  B.buildBr(*CondBrTarget);
5058 
5059  MI.eraseFromParent();
5060  BrCond->eraseFromParent();
5061  MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5062  return true;
5063  }
5064 
5065  return false;
5066  }
5067  case Intrinsic::amdgcn_kernarg_segment_ptr:
5068  if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5069  // This only makes sense to call in a kernel, so just lower to null.
5070  B.buildConstant(MI.getOperand(0).getReg(), 0);
5071  MI.eraseFromParent();
5072  return true;
5073  }
5074 
5077  case Intrinsic::amdgcn_implicitarg_ptr:
5078  return legalizeImplicitArgPtr(MI, MRI, B);
5079  case Intrinsic::amdgcn_workitem_id_x:
5080  if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
5081  return replaceWithConstant(B, MI, 0);
5084  case Intrinsic::amdgcn_workitem_id_y:
5085  if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
5086  return replaceWithConstant(B, MI, 0);
5087 
5090  case Intrinsic::amdgcn_workitem_id_z:
5091  if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
5092  return replaceWithConstant(B, MI, 0);
5093 
5096  case Intrinsic::amdgcn_workgroup_id_x:
5099  case Intrinsic::amdgcn_workgroup_id_y:
5102  case Intrinsic::amdgcn_workgroup_id_z:
5105  case Intrinsic::amdgcn_dispatch_ptr:
5108  case Intrinsic::amdgcn_queue_ptr:
5111  case Intrinsic::amdgcn_implicit_buffer_ptr:
5114  case Intrinsic::amdgcn_dispatch_id:
5117  case Intrinsic::amdgcn_fdiv_fast:
5118  return legalizeFDIVFastIntrin(MI, MRI, B);
5119  case Intrinsic::amdgcn_is_shared:
5121  case Intrinsic::amdgcn_is_private:
5123  case Intrinsic::amdgcn_wavefrontsize: {
5124  B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5125  MI.eraseFromParent();
5126  return true;
5127  }
5128  case Intrinsic::amdgcn_s_buffer_load:
5129  return legalizeSBufferLoad(Helper, MI);
5130  case Intrinsic::amdgcn_raw_buffer_store:
5131  case Intrinsic::amdgcn_struct_buffer_store:
5132  return legalizeBufferStore(MI, MRI, B, false, false);
5133  case Intrinsic::amdgcn_raw_buffer_store_format:
5134  case Intrinsic::amdgcn_struct_buffer_store_format:
5135  return legalizeBufferStore(MI, MRI, B, false, true);
5136  case Intrinsic::amdgcn_raw_tbuffer_store:
5137  case Intrinsic::amdgcn_struct_tbuffer_store:
5138  return legalizeBufferStore(MI, MRI, B, true, true);
5139  case Intrinsic::amdgcn_raw_buffer_load:
5140  case Intrinsic::amdgcn_struct_buffer_load:
5141  return legalizeBufferLoad(MI, MRI, B, false, false);
5142  case Intrinsic::amdgcn_raw_buffer_load_format:
5143  case Intrinsic::amdgcn_struct_buffer_load_format:
5144  return legalizeBufferLoad(MI, MRI, B, true, false);
5145  case Intrinsic::amdgcn_raw_tbuffer_load:
5146  case Intrinsic::amdgcn_struct_tbuffer_load:
5147  return legalizeBufferLoad(MI, MRI, B, true, true);
5148  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5149  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5150  case Intrinsic::amdgcn_raw_buffer_atomic_add:
5151  case Intrinsic::amdgcn_struct_buffer_atomic_add:
5152  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5153  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5154