AMDGPULegalizerInfo.cpp (LLVM 14.0.0git)
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use"
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of elements to the next power of two elements
49 static LLT getPow2VectorType(const LLT Ty) {
50  unsigned NElts = Ty.getNumElements();
51  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
55 // Round the number of bits to the next power of two bits
56 static LLT getPow2ScalarType(const LLT Ty) {
57  unsigned Bits = Ty.getSizeInBits();
58  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59  return LLT::scalar(Pow2Bits);
60 }
61 
62 /// \returns true if this is an odd sized vector which should be widened by adding an
63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64 /// excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66  return [=](const LegalityQuery &Query) {
67  const LLT Ty = Query.Types[TypeIdx];
68  if (!Ty.isVector())
69  return false;
70 
71  const LLT EltTy = Ty.getElementType();
72  const unsigned EltSize = EltTy.getSizeInBits();
73  return Ty.getNumElements() % 2 != 0 &&
74  EltSize > 1 && EltSize < 32 &&
75  Ty.getSizeInBits() % 32 != 0;
76  };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80  return [=](const LegalityQuery &Query) {
81  const LLT Ty = Query.Types[TypeIdx];
82  return Ty.getSizeInBits() % 32 == 0;
83  };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87  return [=](const LegalityQuery &Query) {
88  const LLT Ty = Query.Types[TypeIdx];
89  const LLT EltTy = Ty.getScalarType();
90  return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91  };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95  return [=](const LegalityQuery &Query) {
96  const LLT Ty = Query.Types[TypeIdx];
97  const LLT EltTy = Ty.getElementType();
98  return std::make_pair(TypeIdx,
99  LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100  };
101 }
102 
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104  return [=](const LegalityQuery &Query) {
105  const LLT Ty = Query.Types[TypeIdx];
106  const LLT EltTy = Ty.getElementType();
107  unsigned Size = Ty.getSizeInBits();
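  // Number of 64-bit pieces needed to cover the full vector size, rounding up.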
108  unsigned Pieces = (Size + 63) / 64;
109  unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110  return std::make_pair(
111  TypeIdx,
112  LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113  };
114 }
115 
116 // Increase the number of vector elements so the total size reaches the next
117 // multiple of 32 bits.
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119  return [=](const LegalityQuery &Query) {
120  const LLT Ty = Query.Types[TypeIdx];
121 
122  const LLT EltTy = Ty.getElementType();
123  const int Size = Ty.getSizeInBits();
124  const int EltSize = EltTy.getSizeInBits();
125  const int NextMul32 = (Size + 31) / 32;
126 
127  assert(EltSize < 32);
128 
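  // NextMul32 is the rounded-up size in 32-bit words; compute how many
  // EltSize-bit elements are needed to fill those words, rounding up.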
129  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130  return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131  };
132 }
133 
134 static LLT getBitcastRegisterType(const LLT Ty) {
135  const unsigned Size = Ty.getSizeInBits();
136 
137  LLT CoercedTy;
138  if (Size <= 32) {
139  // <2 x s8> -> s16
140  // <4 x s8> -> s32
141  return LLT::scalar(Size);
142  }
143 
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148  return [=](const LegalityQuery &Query) {
149  const LLT Ty = Query.Types[TypeIdx];
150  return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151  };
152 }
153 
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155  return [=](const LegalityQuery &Query) {
156  const LLT Ty = Query.Types[TypeIdx];
157  unsigned Size = Ty.getSizeInBits();
158  assert(Size % 32 == 0);
159  return std::make_pair(
160  TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161  };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165  return [=](const LegalityQuery &Query) {
166  const LLT QueryTy = Query.Types[TypeIdx];
167  return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168  };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172  return [=](const LegalityQuery &Query) {
173  const LLT QueryTy = Query.Types[TypeIdx];
174  return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175  };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179  return [=](const LegalityQuery &Query) {
180  const LLT QueryTy = Query.Types[TypeIdx];
181  return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182  };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186  return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190  const int EltSize = EltTy.getSizeInBits();
191  return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195  const int EltSize = Ty.getElementType().getSizeInBits();
196  return EltSize == 32 || EltSize == 64 ||
197  (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198  EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202  if (!isRegisterSize(Ty.getSizeInBits()))
203  return false;
204 
205  if (Ty.isVector())
206  return isRegisterVectorType(Ty);
207 
208  return true;
209 }
210 
211 // Any combination of 32 or 64-bit elements up to the maximum register size, and
212 // multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214  return [=](const LegalityQuery &Query) {
215  return isRegisterType(Query.Types[TypeIdx]);
216  };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220  return [=](const LegalityQuery &Query) {
221  const LLT QueryTy = Query.Types[TypeIdx];
222  if (!QueryTy.isVector())
223  return false;
224  const LLT EltTy = QueryTy.getElementType();
225  return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226  };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32-bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232  return [=](const LegalityQuery &Query) {
233  const LLT Ty = Query.Types[TypeIdx];
234  return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235  Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236  };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243  bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246  // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253  // Treat constant and global as identical. SMRD loads are sometimes usable for
254  // global loads (ideally constant address space should be eliminated)
255  // depending on the context. Legality cannot be context dependent, but
256  // RegBankSelect can split the load as necessary depending on the pointer
257  // register bank/uniformity and if the memory is invariant or not written in a
258  // kernel.
259  return IsLoad ? 512 : 128;
260  default:
261  // Flat addresses may contextually need to be split to 32-bit parts if they
262  // may alias scratch depending on the subtarget.
263  return 128;
264  }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268  const LegalityQuery &Query) {
269  const LLT Ty = Query.Types[0];
270 
271  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274  unsigned RegSize = Ty.getSizeInBits();
275  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
277  unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281  return false;
282 
283  // Do not handle extending vector loads.
284  if (Ty.isVector() && MemSize != RegSize)
285  return false;
286 
287  // TODO: We should be able to widen loads if the alignment is high enough, but
288  // we also need to modify the memory access size.
289 #if 0
290  // Accept widening loads based on alignment.
291  if (IsLoad && MemSize < Size)
292  MemSize = std::max(MemSize, Align);
293 #endif
294 
295  // Only 1-byte and 2-byte to 32-bit extloads are valid.
296  if (MemSize != RegSize && RegSize != 32)
297  return false;
298 
299  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300  return false;
301 
302  switch (MemSize) {
303  case 8:
304  case 16:
305  case 32:
306  case 64:
307  case 128:
308  break;
309  case 96:
310  if (!ST.hasDwordx3LoadStores())
311  return false;
312  break;
313  case 256:
314  case 512:
315  // These may contextually need to be broken down.
316  break;
317  default:
318  return false;
319  }
320 
321  assert(RegSize >= MemSize);
322 
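  // Under-aligned accesses are only OK if the target reports that misaligned
  // accesses of this size are allowed in this address space.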
323  if (AlignBits < MemSize) {
324  const SITargetLowering *TLI = ST.getTargetLowering();
325  if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326  Align(AlignBits / 8)))
327  return false;
328  }
329 
330  return true;
331 }
332 
333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
334 // work around this. Eventually it should ignore the type for loads and only care
335 // about the size. Return true in cases where we will work around this for now by
336 // bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338  if (EnableNewLegality)
339  return false;
340 
341  const unsigned Size = Ty.getSizeInBits();
342  if (Size <= 64)
343  return false;
344  if (!Ty.isVector())
345  return true;
346 
347  LLT EltTy = Ty.getElementType();
348  if (EltTy.isPointer())
349  return true;
350 
351  unsigned EltSize = EltTy.getSizeInBits();
352  return EltSize != 32 && EltSize != 64;
353 }
354 
355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356  const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364  const LLT MemTy) {
365  const unsigned MemSizeInBits = MemTy.getSizeInBits();
366  const unsigned Size = Ty.getSizeInBits();
367  if (Size != MemSizeInBits)
368  return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371  return true;
372 
373  // Don't try to handle bitcasting vector ext loads for now.
374  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
379 /// Return true if we should legalize a load by widening an odd sized memory
380 /// access up to the alignment. Note that in this case the memory access itself
381 /// changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383  unsigned AlignInBits, unsigned AddrSpace,
384  unsigned Opcode) {
385  unsigned SizeInBits = MemoryTy.getSizeInBits();
386  // We don't want to widen cases that are naturally legal.
387  if (isPowerOf2_32(SizeInBits))
388  return false;
389 
390  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391  // end up widening these for a scalar load during RegBankSelect, since there
392  // aren't 96-bit scalar loads.
393  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394  return false;
395 
396  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397  return false;
398 
399  // A load is known dereferenceable up to the alignment, so it's legal to widen
400  // to it.
401  //
402  // TODO: Could check dereferenceable for less aligned cases.
403  unsigned RoundedSize = NextPowerOf2(SizeInBits);
404  if (AlignInBits < RoundedSize)
405  return false;
406 
407  // Do not widen if it would introduce a slow unaligned load.
408  const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411  RoundedSize, AddrSpace, Align(AlignInBits / 8),
412  MachineMemOperand::MOLoad, &Fast) &&
413  Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417  unsigned Opcode) {
418  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419  return false;
420 
421  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422  Query.MMODescrs[0].AlignInBits,
423  Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427  const GCNTargetMachine &TM)
428  : ST(ST_) {
429  using namespace TargetOpcode;
430 
431  auto GetAddrSpacePtr = [&TM](unsigned AS) {
432  return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433  };
434 
435  const LLT S1 = LLT::scalar(1);
436  const LLT S8 = LLT::scalar(8);
437  const LLT S16 = LLT::scalar(16);
438  const LLT S32 = LLT::scalar(32);
439  const LLT S64 = LLT::scalar(64);
440  const LLT S128 = LLT::scalar(128);
441  const LLT S256 = LLT::scalar(256);
442  const LLT S512 = LLT::scalar(512);
443  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445  const LLT V2S8 = LLT::fixed_vector(2, 8);
446  const LLT V2S16 = LLT::fixed_vector(2, 16);
447  const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449  const LLT V2S32 = LLT::fixed_vector(2, 32);
450  const LLT V3S32 = LLT::fixed_vector(3, 32);
451  const LLT V4S32 = LLT::fixed_vector(4, 32);
452  const LLT V5S32 = LLT::fixed_vector(5, 32);
453  const LLT V6S32 = LLT::fixed_vector(6, 32);
454  const LLT V7S32 = LLT::fixed_vector(7, 32);
455  const LLT V8S32 = LLT::fixed_vector(8, 32);
456  const LLT V9S32 = LLT::fixed_vector(9, 32);
457  const LLT V10S32 = LLT::fixed_vector(10, 32);
458  const LLT V11S32 = LLT::fixed_vector(11, 32);
459  const LLT V12S32 = LLT::fixed_vector(12, 32);
460  const LLT V13S32 = LLT::fixed_vector(13, 32);
461  const LLT V14S32 = LLT::fixed_vector(14, 32);
462  const LLT V15S32 = LLT::fixed_vector(15, 32);
463  const LLT V16S32 = LLT::fixed_vector(16, 32);
464  const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466  const LLT V2S64 = LLT::fixed_vector(2, 64);
467  const LLT V3S64 = LLT::fixed_vector(3, 64);
468  const LLT V4S64 = LLT::fixed_vector(4, 64);
469  const LLT V5S64 = LLT::fixed_vector(5, 64);
470  const LLT V6S64 = LLT::fixed_vector(6, 64);
471  const LLT V7S64 = LLT::fixed_vector(7, 64);
472  const LLT V8S64 = LLT::fixed_vector(8, 64);
473  const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475  std::initializer_list<LLT> AllS32Vectors =
476  {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477  V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478  std::initializer_list<LLT> AllS64Vectors =
479  {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489  const LLT CodePtr = FlatPtr;
490 
491  const std::initializer_list<LLT> AddrSpaces64 = {
492  GlobalPtr, ConstantPtr, FlatPtr
493  };
494 
495  const std::initializer_list<LLT> AddrSpaces32 = {
496  LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497  };
498 
499  const std::initializer_list<LLT> FPTypesBase = {
500  S32, S64
501  };
502 
503  const std::initializer_list<LLT> FPTypes16 = {
504  S32, S64, S16
505  };
506 
507  const std::initializer_list<LLT> FPTypesPK16 = {
508  S32, S64, S16, V2S16
509  };
510 
511  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513  // s1 for VCC branches, s32 for SCC branches.
514  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517 // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519  .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520  .legalFor(AllS32Vectors)
521  .legalFor(AllS64Vectors)
522  .legalFor(AddrSpaces64)
523  .legalFor(AddrSpaces32)
524  .legalIf(isPointer(0))
525  .clampScalar(0, S16, S256)
526  .widenScalarToNextPow2(0, 32)
527   .clampMaxNumElements(0, S32, 16)
528   .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529  .scalarize(0);
530 
531  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532  // Full set of gfx9 features.
533  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534  .legalFor({S32, S16, V2S16})
535  .minScalar(0, S16)
536   .clampMaxNumElements(0, S16, 2)
537   .widenScalarToNextMultipleOf(0, 32)
538  .maxScalar(0, S32)
539  .scalarize(0);
540 
541  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542  .legalFor({S32, S16, V2S16}) // Clamp modifier
543  .minScalarOrElt(0, S16)
544  .clampMaxNumElements(0, S16, 2)
545  .scalarize(0)
546  .widenScalarToNextPow2(0, 32)
547  .lower();
548  } else if (ST.has16BitInsts()) {
549  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550  .legalFor({S32, S16})
551   .minScalar(0, S16)
552   .widenScalarToNextMultipleOf(0, 32)
553  .maxScalar(0, S32)
554  .scalarize(0);
555 
556  // Technically the saturating operations require clamp bit support, but this
557  // was introduced at the same time as 16-bit operations.
558  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559  .legalFor({S32, S16}) // Clamp modifier
560  .minScalar(0, S16)
561  .scalarize(0)
562  .widenScalarToNextPow2(0, 16)
563  .lower();
564 
565  // We're just lowering this, but it helps get a better result to try to
566  // coerce to the desired type first.
567  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568  .minScalar(0, S16)
569  .scalarize(0)
570  .lower();
571  } else {
572  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573  .legalFor({S32})
574  .widenScalarToNextMultipleOf(0, 32)
575  .clampScalar(0, S32, S32)
576  .scalarize(0);
577 
578  if (ST.hasIntClamp()) {
579  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580  .legalFor({S32}) // Clamp modifier.
581  .scalarize(0)
582  .minScalarOrElt(0, S32)
583  .lower();
584  } else {
585  // Clamp bit support was added in VI, along with 16-bit operations.
586  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587  .minScalar(0, S32)
588  .scalarize(0)
589  .lower();
590  }
591 
592  // FIXME: DAG expansion gets better results. The widening uses the smaller
593  // range values and goes for the min/max lowering directly.
594  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595  .minScalar(0, S32)
596  .scalarize(0)
597  .lower();
598  }
599 
600   getActionDefinitionsBuilder(
601  {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602  .customFor({S32, S64})
603  .clampScalar(0, S32, S64)
604  .widenScalarToNextPow2(0, 32)
605  .scalarize(0);
606 
607  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608  .legalFor({S32})
609  .maxScalarOrElt(0, S32);
610 
611  if (ST.hasVOP3PInsts()) {
612  Mulh
613  .clampMaxNumElements(0, S8, 2)
614  .lowerFor({V2S8});
615  }
616 
617  Mulh
618  .scalarize(0)
619  .lower();
620 
621  // Report legal for any types we can handle anywhere. For the cases only legal
622  // on the SALU, RegBankSelect will be able to re-legalize.
623  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624  .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625   .clampScalar(0, S32, S64)
626   .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627   .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628   .widenScalarToNextPow2(0)
629  .scalarize(0);
630 
631  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632  G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633  .legalFor({{S32, S1}, {S32, S32}})
634  .minScalar(0, S32)
635  // TODO: .scalarize(0)
636  .lower();
637 
638  getActionDefinitionsBuilder(G_BITCAST)
639   // Don't worry about the size constraint.
640   .legalIf(all(isRegisterType(0), isRegisterType(1)))
641  .lower();
642 
643 
644  getActionDefinitionsBuilder(G_CONSTANT)
645  .legalFor({S1, S32, S64, S16, GlobalPtr,
646  LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647  .legalIf(isPointer(0))
648   .clampScalar(0, S32, S64)
649   .widenScalarToNextPow2(0);
650 
651  getActionDefinitionsBuilder(G_FCONSTANT)
652  .legalFor({S32, S64, S16})
653  .clampScalar(0, S16, S64);
654 
655  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656  .legalIf(isRegisterType(0))
657  // s1 and s16 are special cases because they have legal operations on
658  // them, but don't really occupy registers in the normal way.
659  .legalFor({S1, S16})
660  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661  .clampScalarOrElt(0, S32, MaxScalar)
662  .widenScalarToNextPow2(0, 32)
663  .clampMaxNumElements(0, S32, 16);
664 
665  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666 
667  // If the amount is divergent, we have to do a wave reduction to get the
668  // maximum value, so this is expanded during RegBankSelect.
669  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670  .legalFor({{PrivatePtr, S32}});
671 
672  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673  .customIf(typeIsNot(0, PrivatePtr));
674 
675  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676 
677  auto &FPOpActions = getActionDefinitionsBuilder(
678  { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679  .legalFor({S32, S64});
680  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681  .customFor({S32, S64});
682  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683  .customFor({S32, S64});
684 
685  if (ST.has16BitInsts()) {
686  if (ST.hasVOP3PInsts())
687  FPOpActions.legalFor({S16, V2S16});
688  else
689  FPOpActions.legalFor({S16});
690 
691  TrigActions.customFor({S16});
692  FDIVActions.customFor({S16});
693  }
694 
695  auto &MinNumMaxNum = getActionDefinitionsBuilder({
696  G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697 
698  if (ST.hasVOP3PInsts()) {
699   MinNumMaxNum.customFor(FPTypesPK16)
700     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701  .clampMaxNumElements(0, S16, 2)
702  .clampScalar(0, S16, S64)
703  .scalarize(0);
704  } else if (ST.has16BitInsts()) {
705  MinNumMaxNum.customFor(FPTypes16)
706  .clampScalar(0, S16, S64)
707  .scalarize(0);
708  } else {
709  MinNumMaxNum.customFor(FPTypesBase)
710  .clampScalar(0, S32, S64)
711  .scalarize(0);
712  }
713 
714  if (ST.hasVOP3PInsts())
715  FPOpActions.clampMaxNumElements(0, S16, 2);
716 
717  FPOpActions
718  .scalarize(0)
719  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721  TrigActions
722  .scalarize(0)
723  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724 
725  FDIVActions
726  .scalarize(0)
727  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728 
729  getActionDefinitionsBuilder({G_FNEG, G_FABS})
730  .legalFor(FPTypesPK16)
731  .clampMaxNumElements(0, S16, 2)
732  .scalarize(0)
733  .clampScalar(0, S16, S64);
734 
735  if (ST.has16BitInsts()) {
736  getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737  .legalFor({S32, S64, S16})
738  .scalarize(0)
739  .clampScalar(0, S16, S64);
740   } else {
741     getActionDefinitionsBuilder(G_FSQRT)
742  .legalFor({S32, S64})
743  .scalarize(0)
744  .clampScalar(0, S32, S64);
745 
746     if (ST.hasFractBug()) {
747       getActionDefinitionsBuilder(G_FFLOOR)
748  .customFor({S64})
749  .legalFor({S32, S64})
750  .scalarize(0)
751  .clampScalar(0, S32, S64);
752     } else {
753       getActionDefinitionsBuilder(G_FFLOOR)
754  .legalFor({S32, S64})
755  .scalarize(0)
756  .clampScalar(0, S32, S64);
757  }
758  }
759 
760  getActionDefinitionsBuilder(G_FPTRUNC)
761  .legalFor({{S32, S64}, {S16, S32}})
762  .scalarize(0)
763  .lower();
764 
765   getActionDefinitionsBuilder(G_FPEXT)
766  .legalFor({{S64, S32}, {S32, S16}})
767  .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768  .scalarize(0);
769 
770   getActionDefinitionsBuilder(G_FSUB)
771  // Use actual fsub instruction
772  .legalFor({S32})
773  // Must use fadd + fneg
774  .lowerFor({S64, S16, V2S16})
775  .scalarize(0)
776  .clampScalar(0, S32, S64);
777 
778  // Whether this is legal depends on the floating point mode for the function.
779  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781  FMad.customFor({S32, S16});
782  else if (ST.hasMadMacF32Insts())
783  FMad.customFor({S32});
784  else if (ST.hasMadF16())
785  FMad.customFor({S16});
786  FMad.scalarize(0)
787  .lower();
788 
789  auto &FRem = getActionDefinitionsBuilder(G_FREM);
790  if (ST.has16BitInsts()) {
791  FRem.customFor({S16, S32, S64});
792  } else {
793  FRem.minScalar(0, S32)
794  .customFor({S32, S64});
795  }
796  FRem.scalarize(0);
797 
798   // TODO: Do we need to clamp maximum bitwidth?
799   getActionDefinitionsBuilder(G_TRUNC)
800  .legalIf(isScalar(0))
801  .legalFor({{V2S16, V2S32}})
802  .clampMaxNumElements(0, S16, 2)
803  // Avoid scalarizing in cases that should be truly illegal. In unresolvable
804  // situations (like an invalid implicit use), we don't want to infinite loop
805   // in the legalizer.
806   .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
807  .alwaysLegal();
808 
809  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810  .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811  {S32, S1}, {S64, S1}, {S16, S1}})
812  .scalarize(0)
813  .clampScalar(0, S32, S64)
814  .widenScalarToNextPow2(1, 32);
815 
816  // TODO: Split s1->s64 during regbankselect for VALU.
817  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818  .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819  .lowerIf(typeIs(1, S1))
820  .customFor({{S32, S64}, {S64, S64}});
821  if (ST.has16BitInsts())
822  IToFP.legalFor({{S16, S16}});
823  IToFP.clampScalar(1, S32, S64)
824  .minScalar(0, S32)
825   .scalarize(0)
826   .widenScalarToNextPow2(1);
827 
828  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829  .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830  .customFor({{S64, S32}, {S64, S64}})
831  .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832  if (ST.has16BitInsts())
833  FPToI.legalFor({{S16, S16}});
834  else
835  FPToI.minScalar(1, S32);
836 
837  FPToI.minScalar(0, S32)
838  .widenScalarToNextPow2(0, 32)
839  .scalarize(0)
840  .lower();
841 
842  // Lower roundeven into G_FRINT
843  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
844  .scalarize(0)
845  .lower();
846 
847  if (ST.has16BitInsts()) {
848  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
849  .legalFor({S16, S32, S64})
850  .clampScalar(0, S16, S64)
851  .scalarize(0);
852  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
853  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854  .legalFor({S32, S64})
855  .clampScalar(0, S32, S64)
856  .scalarize(0);
857  } else {
858  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859  .legalFor({S32})
860  .customFor({S64})
861  .clampScalar(0, S32, S64)
862  .scalarize(0);
863  }
864 
865  getActionDefinitionsBuilder(G_PTR_ADD)
866  .legalIf(all(isPointer(0), sameSize(0, 1)))
867  .scalarize(0)
868  .scalarSameSizeAs(1, 0);
869 
870  getActionDefinitionsBuilder(G_PTRMASK)
871  .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
872  .scalarSameSizeAs(1, 0)
873  .scalarize(0);
874 
875  auto &CmpBuilder =
876  getActionDefinitionsBuilder(G_ICMP)
877  // The compare output type differs based on the register bank of the output,
878  // so make both s1 and s32 legal.
879  //
880  // Scalar compares producing output in scc will be promoted to s32, as that
881  // is the allocatable register type that will be needed for the copy from
882  // scc. This will be promoted during RegBankSelect, and we assume something
883  // before that won't try to use s32 result types.
884  //
885  // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
886  // bank.
887  .legalForCartesianProduct(
888  {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
889  .legalForCartesianProduct(
890  {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
891  if (ST.has16BitInsts()) {
892  CmpBuilder.legalFor({{S1, S16}});
893  }
894 
895  CmpBuilder
896  .widenScalarToNextPow2(1)
897  .clampScalar(1, S32, S64)
898  .scalarize(0)
899  .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
900 
901  getActionDefinitionsBuilder(G_FCMP)
902  .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
903  .widenScalarToNextPow2(1)
904  .clampScalar(1, S32, S64)
905  .scalarize(0);
906 
907  // FIXME: fpow has a selection pattern that should move to custom lowering.
908  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
909  if (ST.has16BitInsts())
910  Exp2Ops.legalFor({S32, S16});
911  else
912  Exp2Ops.legalFor({S32});
913  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
914  Exp2Ops.scalarize(0);
915 
916  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
917  if (ST.has16BitInsts())
918  ExpOps.customFor({{S32}, {S16}});
919  else
920  ExpOps.customFor({S32});
921  ExpOps.clampScalar(0, MinScalarFPTy, S32)
922  .scalarize(0);
923 
924  getActionDefinitionsBuilder(G_FPOWI)
925  .clampScalar(0, MinScalarFPTy, S32)
926  .lower();
927 
928  // The 64-bit versions produce 32-bit results, but only on the SALU.
929  getActionDefinitionsBuilder(G_CTPOP)
930  .legalFor({{S32, S32}, {S32, S64}})
931  .clampScalar(0, S32, S32)
932  .clampScalar(1, S32, S64)
933  .scalarize(0)
934  .widenScalarToNextPow2(0, 32)
935  .widenScalarToNextPow2(1, 32);
936 
937  // The hardware instructions return a different result on 0 than the generic
938  // instructions expect. The hardware produces -1, but these produce the
939  // bitwidth.
940  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
941  .scalarize(0)
942  .clampScalar(0, S32, S32)
943  .clampScalar(1, S32, S64)
944  .widenScalarToNextPow2(0, 32)
945  .widenScalarToNextPow2(1, 32)
946  .custom();
947 
948  // The 64-bit versions produce 32-bit results, but only on the SALU.
949  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
950  .legalFor({{S32, S32}, {S32, S64}})
951  .clampScalar(0, S32, S32)
952  .clampScalar(1, S32, S64)
953  .scalarize(0)
954  .widenScalarToNextPow2(0, 32)
955  .widenScalarToNextPow2(1, 32);
956 
957  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
958  // RegBankSelect.
959  getActionDefinitionsBuilder(G_BITREVERSE)
960  .legalFor({S32, S64})
961  .clampScalar(0, S32, S64)
962  .scalarize(0)
963  .widenScalarToNextPow2(0);
964 
965  if (ST.has16BitInsts()) {
966  getActionDefinitionsBuilder(G_BSWAP)
967  .legalFor({S16, S32, V2S16})
968  .clampMaxNumElements(0, S16, 2)
969  // FIXME: Fixing non-power-of-2 before clamp is workaround for
970  // narrowScalar limitation.
971  .widenScalarToNextPow2(0)
972  .clampScalar(0, S16, S32)
973  .scalarize(0);
974 
975  if (ST.hasVOP3PInsts()) {
976  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
977  .legalFor({S32, S16, V2S16})
978  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
979  .clampMaxNumElements(0, S16, 2)
980  .minScalar(0, S16)
981  .widenScalarToNextPow2(0)
982  .scalarize(0)
983  .lower();
984  } else {
985  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
986  .legalFor({S32, S16})
987  .widenScalarToNextPow2(0)
988  .minScalar(0, S16)
989  .scalarize(0)
990  .lower();
991  }
992  } else {
993  // TODO: Should have same legality without v_perm_b32
994  getActionDefinitionsBuilder(G_BSWAP)
995  .legalFor({S32})
996  .lowerIf(scalarNarrowerThan(0, 32))
997  // FIXME: Fixing non-power-of-2 before clamp is workaround for
998  // narrowScalar limitation.
999  .widenScalarToNextPow2(0)
1000  .maxScalar(0, S32)
1001  .scalarize(0)
1002  .lower();
1003 
1004  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1005  .legalFor({S32})
1006  .minScalar(0, S32)
1007  .widenScalarToNextPow2(0)
1008  .scalarize(0)
1009  .lower();
1010  }
1011 
1012  getActionDefinitionsBuilder(G_INTTOPTR)
1013  // List the common cases
1014  .legalForCartesianProduct(AddrSpaces64, {S64})
1015  .legalForCartesianProduct(AddrSpaces32, {S32})
1016  .scalarize(0)
1017  // Accept any address space as long as the size matches
1018  .legalIf(sameSize(0, 1))
1019  .widenScalarIf(smallerThan(1, 0),
1020  [](const LegalityQuery &Query) {
1021  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1022  })
1023  .narrowScalarIf(largerThan(1, 0),
1024  [](const LegalityQuery &Query) {
1025  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1026  });
1027 
1028  getActionDefinitionsBuilder(G_PTRTOINT)
1029  // List the common cases
1030  .legalForCartesianProduct(AddrSpaces64, {S64})
1031  .legalForCartesianProduct(AddrSpaces32, {S32})
1032  .scalarize(0)
1033  // Accept any address space as long as the size matches
1034  .legalIf(sameSize(0, 1))
1035  .widenScalarIf(smallerThan(0, 1),
1036  [](const LegalityQuery &Query) {
1037  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1038  })
1039  .narrowScalarIf(
1040  largerThan(0, 1),
1041  [](const LegalityQuery &Query) {
1042  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1043  });
1044 
1045  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1046  .scalarize(0)
1047  .custom();
1048 
1049  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1050  bool IsLoad) -> bool {
1051  const LLT DstTy = Query.Types[0];
1052 
1053  // Split vector extloads.
1054  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1055  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1056 
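  // For extending loads, the access is known dereferenceable up to the
  // alignment, so treat the effective memory size as at least the alignment.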
1057  if (MemSize < DstTy.getSizeInBits())
1058  MemSize = std::max(MemSize, AlignBits);
1059 
1060  if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1061  return true;
1062 
1063  const LLT PtrTy = Query.Types[1];
1064  unsigned AS = PtrTy.getAddressSpace();
1065  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1066  return true;
1067 
1068  // Catch weird sized loads that don't evenly divide into the access sizes
1069  // TODO: May be able to widen depending on alignment etc.
1070  unsigned NumRegs = (MemSize + 31) / 32;
1071  if (NumRegs == 3) {
1072  if (!ST.hasDwordx3LoadStores())
1073  return true;
1074  } else {
1075  // If the alignment allows, these should have been widened.
1076  if (!isPowerOf2_32(NumRegs))
1077  return true;
1078  }
1079 
1080  if (AlignBits < MemSize) {
1081  const SITargetLowering *TLI = ST.getTargetLowering();
1082  return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1083  Align(AlignBits / 8));
1084  }
1085 
1086  return false;
1087  };
1088 
1089  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1090  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1091  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
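  // A required alignment of 0 in the memory descriptors below means the access
  // may have any alignment.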
1092 
1093  // TODO: Refine based on subtargets which support unaligned access or 128-bit
1094  // LDS
1095  // TODO: Unsupported flat for SI.
1096 
1097  for (unsigned Op : {G_LOAD, G_STORE}) {
1098  const bool IsStore = Op == G_STORE;
1099 
1100  auto &Actions = getActionDefinitionsBuilder(Op);
1101  // Explicitly list some common cases.
1102  // TODO: Does this help compile time at all?
1103  Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1104  {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1105  {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1106  {S64, GlobalPtr, S64, GlobalAlign32},
1107  {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1108  {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1109  {S32, GlobalPtr, S8, GlobalAlign8},
1110  {S32, GlobalPtr, S16, GlobalAlign16},
1111 
1112  {S32, LocalPtr, S32, 32},
1113  {S64, LocalPtr, S64, 32},
1114  {V2S32, LocalPtr, V2S32, 32},
1115  {S32, LocalPtr, S8, 8},
1116  {S32, LocalPtr, S16, 16},
1117  {V2S16, LocalPtr, S32, 32},
1118 
1119  {S32, PrivatePtr, S32, 32},
1120  {S32, PrivatePtr, S8, 8},
1121  {S32, PrivatePtr, S16, 16},
1122  {V2S16, PrivatePtr, S32, 32},
1123 
1124  {S32, ConstantPtr, S32, GlobalAlign32},
1125  {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1126  {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1127  {S64, ConstantPtr, S64, GlobalAlign32},
1128  {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1129  Actions.legalIf(
1130  [=](const LegalityQuery &Query) -> bool {
1131  return isLoadStoreLegal(ST, Query);
1132  });
1133 
1134  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1135  // 64-bits.
1136  //
1137  // TODO: Should generalize bitcast action into coerce, which will also cover
1138  // inserting addrspacecasts.
1139  Actions.customIf(typeIs(1, Constant32Ptr));
1140 
1141  // Turn any illegal element vectors into something easier to deal
1142  // with. These will ultimately produce 32-bit scalar shifts to extract the
1143  // parts anyway.
1144  //
1145  // For odd 16-bit element vectors, prefer to split those into pieces with
1146  // 16-bit vector parts.
1147  Actions.bitcastIf(
1148  [=](const LegalityQuery &Query) -> bool {
1149  return shouldBitcastLoadStoreType(ST, Query.Types[0],
1150  Query.MMODescrs[0].MemoryTy);
1151  }, bitcastToRegisterType(0));
1152 
1153  if (!IsStore) {
1154  // Widen suitably aligned loads by loading extra bytes. The standard
1155  // legalization actions can't properly express widening memory operands.
1156  Actions.customIf([=](const LegalityQuery &Query) -> bool {
1157  return shouldWidenLoad(ST, Query, G_LOAD);
1158  });
1159  }
1160 
1161  // FIXME: load/store narrowing should be moved to lower action
1162  Actions
1163  .narrowScalarIf(
1164  [=](const LegalityQuery &Query) -> bool {
1165  return !Query.Types[0].isVector() &&
1166  needToSplitMemOp(Query, Op == G_LOAD);
1167  },
1168  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1169  const LLT DstTy = Query.Types[0];
1170  const LLT PtrTy = Query.Types[1];
1171 
1172  const unsigned DstSize = DstTy.getSizeInBits();
1173  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1174 
1175  // Split extloads.
1176  if (DstSize > MemSize)
1177  return std::make_pair(0, LLT::scalar(MemSize));
1178 
1179  if (!isPowerOf2_32(DstSize)) {
1180  // We're probably decomposing an odd sized store. Try to split
1181  // to the widest type. TODO: Account for alignment. As-is it
1182  // should be OK, since the new parts will be further legalized.
1183  unsigned FloorSize = PowerOf2Floor(DstSize);
1184  return std::make_pair(0, LLT::scalar(FloorSize));
1185  }
1186 
1187  if (DstSize > 32 && (DstSize % 32 != 0)) {
1188  // FIXME: Need a way to specify non-extload of larger size if
1189  // suitably aligned.
1190  return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1191  }
1192 
1193  unsigned MaxSize = maxSizeForAddrSpace(ST,
1194  PtrTy.getAddressSpace(),
1195  Op == G_LOAD);
1196  if (MemSize > MaxSize)
1197  return std::make_pair(0, LLT::scalar(MaxSize));
1198 
1199  unsigned Align = Query.MMODescrs[0].AlignInBits;
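  // As a last resort, split the access down to its known alignment.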
1200  return std::make_pair(0, LLT::scalar(Align));
1201  })
1202  .fewerElementsIf(
1203  [=](const LegalityQuery &Query) -> bool {
1204  return Query.Types[0].isVector() &&
1205  needToSplitMemOp(Query, Op == G_LOAD);
1206  },
1207  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1208  const LLT DstTy = Query.Types[0];
1209  const LLT PtrTy = Query.Types[1];
1210 
1211  LLT EltTy = DstTy.getElementType();
1212  unsigned MaxSize = maxSizeForAddrSpace(ST,
1213  PtrTy.getAddressSpace(),
1214  Op == G_LOAD);
1215 
1216  // FIXME: Handle widened to power of 2 results better. This ends
1217  // up scalarizing.
1218  // FIXME: 3 element stores scalarized on SI
1219 
1220  // Split if it's too large for the address space.
1221  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1222  if (MemSize > MaxSize) {
1223  unsigned NumElts = DstTy.getNumElements();
1224  unsigned EltSize = EltTy.getSizeInBits();
1225 
1226  if (MaxSize % EltSize == 0) {
1227   return std::make_pair(
1228     0, LLT::scalarOrVector(
1229  ElementCount::getFixed(MaxSize / EltSize), EltTy));
1230  }
1231 
1232  unsigned NumPieces = MemSize / MaxSize;
1233 
1234  // FIXME: Refine when odd breakdowns handled
1235  // The scalars will need to be re-legalized.
1236  if (NumPieces == 1 || NumPieces >= NumElts ||
1237  NumElts % NumPieces != 0)
1238  return std::make_pair(0, EltTy);
1239 
1240  return std::make_pair(
1241  0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1242  }
1243 
1244  // FIXME: We could probably handle weird extending loads better.
1245  if (DstTy.getSizeInBits() > MemSize)
1246  return std::make_pair(0, EltTy);
1247 
1248  unsigned EltSize = EltTy.getSizeInBits();
1249  unsigned DstSize = DstTy.getSizeInBits();
1250  if (!isPowerOf2_32(DstSize)) {
1251  // We're probably decomposing an odd sized store. Try to split
1252  // to the widest type. TODO: Account for alignment. As-is it
1253  // should be OK, since the new parts will be further legalized.
1254  unsigned FloorSize = PowerOf2Floor(DstSize);
1255   return std::make_pair(
1256     0, LLT::scalarOrVector(
1257  ElementCount::getFixed(FloorSize / EltSize), EltTy));
1258  }
1259 
1260  // Need to split because of alignment.
1261  unsigned Align = Query.MMODescrs[0].AlignInBits;
1262  if (EltSize > Align &&
1263  (EltSize / Align < DstTy.getNumElements())) {
1264  return std::make_pair(
1265  0, LLT::fixed_vector(EltSize / Align, EltTy));
1266  }
1267 
1268  // May need relegalization for the scalars.
1269  return std::make_pair(0, EltTy);
1270  })
1271  .minScalar(0, S32)
1272  .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1273  .widenScalarToNextPow2(0)
1274  .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1275  .lower();
1276  }
1277 
1278  // FIXME: Unaligned accesses not lowered.
1279  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1280  .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1281  {S32, GlobalPtr, S16, 2 * 8},
1282  {S32, LocalPtr, S8, 8},
1283  {S32, LocalPtr, S16, 16},
1284  {S32, PrivatePtr, S8, 8},
1285  {S32, PrivatePtr, S16, 16},
1286  {S32, ConstantPtr, S8, 8},
1287  {S32, ConstantPtr, S16, 2 * 8}})
1288  .legalIf(
1289  [=](const LegalityQuery &Query) -> bool {
1290  return isLoadStoreLegal(ST, Query);
1291  });
1292 
1293  if (ST.hasFlatAddressSpace()) {
1294  ExtLoads.legalForTypesWithMemDesc(
1295  {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1296  }
1297 
1298  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1299  // 64-bits.
1300  //
1301  // TODO: Should generalize bitcast action into coerce, which will also cover
1302  // inserting addrspacecasts.
1303  ExtLoads.customIf(typeIs(1, Constant32Ptr));
1304 
1305  ExtLoads.clampScalar(0, S32, S32)
1306  .widenScalarToNextPow2(0)
1307  .lower();
1308 
1309  auto &Atomics = getActionDefinitionsBuilder(
1310  {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1311  G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1312  G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1313  G_ATOMICRMW_UMIN})
1314  .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1315  {S64, GlobalPtr}, {S64, LocalPtr},
1316  {S32, RegionPtr}, {S64, RegionPtr}});
1317  if (ST.hasFlatAddressSpace()) {
1318  Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1319  }
1320 
1321  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1322  if (ST.hasLDSFPAtomicAdd()) {
1323  Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1324  if (ST.hasGFX90AInsts())
1325  Atomic.legalFor({{S64, LocalPtr}});
1326  }
1327  if (ST.hasAtomicFaddInsts())
1328  Atomic.legalFor({{S32, GlobalPtr}});
1329 
1330  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1331  // demarshalling
1332  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1333  .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1334  {S32, FlatPtr}, {S64, FlatPtr}})
1335  .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1336  {S32, RegionPtr}, {S64, RegionPtr}});
1337  // TODO: Pointer types, any 32-bit or 64-bit vector
1338 
1339  // Condition should be s32 for scalar, s1 for vector.
1340  getActionDefinitionsBuilder(G_SELECT)
1341  .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1342  LocalPtr, FlatPtr, PrivatePtr,
1343  LLT::fixed_vector(2, LocalPtr),
1344  LLT::fixed_vector(2, PrivatePtr)},
1345  {S1, S32})
1346  .clampScalar(0, S16, S64)
1347  .scalarize(1)
1348  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1349  .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1350  .clampMaxNumElements(0, S32, 2)
1351  .clampMaxNumElements(0, LocalPtr, 2)
1352  .clampMaxNumElements(0, PrivatePtr, 2)
1353  .scalarize(0)
1354  .widenScalarToNextPow2(0)
1355  .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1356 
1357  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1358  // be more flexible with the shift amount type.
1359  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1360  .legalFor({{S32, S32}, {S64, S32}});
1361  if (ST.has16BitInsts()) {
1362  if (ST.hasVOP3PInsts()) {
1363  Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1364  .clampMaxNumElements(0, S16, 2);
1365  } else
1366  Shifts.legalFor({{S16, S16}});
1367 
1368  // TODO: Support 16-bit shift amounts for all types
1369  Shifts.widenScalarIf(
1370  [=](const LegalityQuery &Query) {
1371  // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1372  // 32-bit amount.
1373  const LLT ValTy = Query.Types[0];
1374  const LLT AmountTy = Query.Types[1];
1375  return ValTy.getSizeInBits() <= 16 &&
1376  AmountTy.getSizeInBits() < 16;
1377  }, changeTo(1, S16));
1378  Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1379  Shifts.clampScalar(1, S32, S32);
1380  Shifts.clampScalar(0, S16, S64);
1381  Shifts.widenScalarToNextPow2(0, 16);
1382 
1383  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1384  .minScalar(0, S16)
1385  .scalarize(0)
1386  .lower();
1387  } else {
1388  // Make sure we legalize the shift amount type first, as the general
1389  // expansion for the shifted type will produce much worse code if it hasn't
1390  // been truncated already.
1391  Shifts.clampScalar(1, S32, S32);
1392  Shifts.clampScalar(0, S32, S64);
1393  Shifts.widenScalarToNextPow2(0, 32);
1394 
1395  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1396  .minScalar(0, S32)
1397  .scalarize(0)
1398  .lower();
1399  }
1400  Shifts.scalarize(0);
1401 
1402  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1403  unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1404  unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1405  unsigned IdxTypeIdx = 2;
1406 
1407  getActionDefinitionsBuilder(Op)
1408  .customIf([=](const LegalityQuery &Query) {
1409  const LLT EltTy = Query.Types[EltTypeIdx];
1410  const LLT VecTy = Query.Types[VecTypeIdx];
1411  const LLT IdxTy = Query.Types[IdxTypeIdx];
1412  const unsigned EltSize = EltTy.getSizeInBits();
1413  return (EltSize == 32 || EltSize == 64) &&
1414  VecTy.getSizeInBits() % 32 == 0 &&
1415  VecTy.getSizeInBits() <= MaxRegisterSize &&
1416  IdxTy.getSizeInBits() == 32;
1417  })
1418  .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1419  bitcastToVectorElement32(VecTypeIdx))
1420  //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1421  .bitcastIf(
1422  all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1423  [=](const LegalityQuery &Query) {
1424  // For > 64-bit element types, try to turn this into a 64-bit
1425  // element vector since we may be able to do better indexing
1426  // if this is scalar. If not, fall back to 32.
1427  const LLT EltTy = Query.Types[EltTypeIdx];
1428  const LLT VecTy = Query.Types[VecTypeIdx];
1429  const unsigned DstEltSize = EltTy.getSizeInBits();
1430  const unsigned VecSize = VecTy.getSizeInBits();
1431 
1432  const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1433  return std::make_pair(
1434  VecTypeIdx,
1435  LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1436  })
1437  .clampScalar(EltTypeIdx, S32, S64)
1438  .clampScalar(VecTypeIdx, S32, S64)
1439  .clampScalar(IdxTypeIdx, S32, S32)
1440  .clampMaxNumElements(VecTypeIdx, S32, 32)
1441  // TODO: Clamp elements for 64-bit vectors?
1442  // It should only be necessary with variable indexes.
1443  // As a last resort, lower to the stack
1444  .lower();
1445  }
1446 
1447  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1448  .unsupportedIf([=](const LegalityQuery &Query) {
1449  const LLT &EltTy = Query.Types[1].getElementType();
1450  return Query.Types[0] != EltTy;
1451  });
1452 
1453  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1454  unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1455  unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1456 
1457  // FIXME: Doesn't handle extract of illegal sizes.
1458  getActionDefinitionsBuilder(Op)
1459  .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1460  // FIXME: Multiples of 16 should not be legal.
1461  .legalIf([=](const LegalityQuery &Query) {
1462  const LLT BigTy = Query.Types[BigTyIdx];
1463  const LLT LitTy = Query.Types[LitTyIdx];
1464  return (BigTy.getSizeInBits() % 32 == 0) &&
1465  (LitTy.getSizeInBits() % 16 == 0);
1466  })
1467  .widenScalarIf(
1468  [=](const LegalityQuery &Query) {
1469  const LLT BigTy = Query.Types[BigTyIdx];
1470  return (BigTy.getScalarSizeInBits() < 16);
1471   },
1472   LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1473  .widenScalarIf(
1474  [=](const LegalityQuery &Query) {
1475  const LLT LitTy = Query.Types[LitTyIdx];
1476  return (LitTy.getScalarSizeInBits() < 16);
1477   },
1478   LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1479  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1480  .widenScalarToNextPow2(BigTyIdx, 32);
1481 
1482  }
1483 
1484  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1485  .legalForCartesianProduct(AllS32Vectors, {S32})
1486  .legalForCartesianProduct(AllS64Vectors, {S64})
1487  .clampNumElements(0, V16S32, V32S32)
1488  .clampNumElements(0, V2S64, V16S64)
1489  .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1490 
1491  if (ST.hasScalarPackInsts()) {
1492  BuildVector
1493  // FIXME: Should probably widen s1 vectors straight to s32
1494  .minScalarOrElt(0, S16)
1495  // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1496  .minScalar(1, S32);
1497 
1498  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1499  .legalFor({V2S16, S32})
1500  .lower();
1501  BuildVector.minScalarOrElt(0, S32);
1502  } else {
1503  BuildVector.customFor({V2S16, S16});
1504  BuildVector.minScalarOrElt(0, S32);
1505 
1506  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1507  .customFor({V2S16, S32})
1508  .lower();
1509  }
1510 
1511  BuildVector.legalIf(isRegisterType(0));
1512 
1513  // FIXME: Clamp maximum size
1514  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1515  .legalIf(all(isRegisterType(0), isRegisterType(1)))
1516  .clampMaxNumElements(0, S32, 32)
1517  .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1518  .clampMaxNumElements(0, S16, 64);
1519 
1520 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1521  // pre-legalize.
1522  if (ST.hasVOP3PInsts()) {
1523  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1524  .customFor({V2S16, V2S16})
1525  .lower();
1526  } else
1527  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1528 
1529  // Merge/Unmerge
1530  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1531  unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1532  unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1533 
1534  auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1535  const LLT Ty = Query.Types[TypeIdx];
1536  if (Ty.isVector()) {
1537  const LLT &EltTy = Ty.getElementType();
1538  if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1539  return true;
1540  if (!isPowerOf2_32(EltTy.getSizeInBits()))
1541  return true;
1542  }
1543  return false;
1544  };
1545 
1546  auto &Builder = getActionDefinitionsBuilder(Op)
1547  .legalIf(all(isRegisterType(0), isRegisterType(1)))
1548  .lowerFor({{S16, V2S16}})
1549  .lowerIf([=](const LegalityQuery &Query) {
1550  const LLT BigTy = Query.Types[BigTyIdx];
1551  return BigTy.getSizeInBits() == 32;
1552  })
1553  // Try to widen to s16 first for small types.
1554  // TODO: Only do this on targets with legal s16 shifts
1555  .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1556  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1557  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1558  .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1559  elementTypeIs(1, S16)),
1560  changeTo(1, V2S16))
1561  // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1562  // worth considering the multiples of 64 since 2*192 and 2*384 are not
1563  // valid.
1564  .clampScalar(LitTyIdx, S32, S512)
1565  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1566  // Break up vectors with weird elements into scalars
1567  .fewerElementsIf(
1568  [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1569  scalarize(0))
1570  .fewerElementsIf(
1571  [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1572  scalarize(1))
1573  .clampScalar(BigTyIdx, S32, MaxScalar);
1574 
1575  if (Op == G_MERGE_VALUES) {
1576  Builder.widenScalarIf(
1577  // TODO: Use 16-bit shifts if legal for 8-bit values?
1578  [=](const LegalityQuery &Query) {
1579  const LLT Ty = Query.Types[LitTyIdx];
1580  return Ty.getSizeInBits() < 32;
1581  },
1582  changeTo(LitTyIdx, S32));
1583  }
1584 
1585  Builder.widenScalarIf(
1586  [=](const LegalityQuery &Query) {
1587  const LLT Ty = Query.Types[BigTyIdx];
1588  return !isPowerOf2_32(Ty.getSizeInBits()) &&
1589  Ty.getSizeInBits() % 16 != 0;
1590  },
1591  [=](const LegalityQuery &Query) {
1592   // Pick the next power of 2, or a multiple of 64 over 128,
1593   // whichever is smaller.
1594  const LLT &Ty = Query.Types[BigTyIdx];
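  // The +1 rounds up to the next power of 2 strictly greater than the current size.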
1595  unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1596  if (NewSizeInBits >= 256) {
1597  unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1598  if (RoundedTo < NewSizeInBits)
1599  NewSizeInBits = RoundedTo;
1600  }
1601  return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1602  })
1603  // Any vectors left are the wrong size. Scalarize them.
1604  .scalarize(0)
1605  .scalarize(1);
1606  }
1607 
1608  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1609  // RegBankSelect.
1610  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1611  .legalFor({{S32}, {S64}});
1612 
1613  if (ST.hasVOP3PInsts()) {
1614  SextInReg.lowerFor({{V2S16}})
1615  // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1616  // get more vector shift opportunities, since we'll get those when
1617  // expanded.
1618  .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1619  } else if (ST.has16BitInsts()) {
1620  SextInReg.lowerFor({{S32}, {S64}, {S16}});
1621  } else {
1622  // Prefer to promote to s32 before lowering if we don't have 16-bit
1623 // shifts. This avoids a lot of intermediate truncate and extend operations.
1624  SextInReg.lowerFor({{S32}, {S64}});
1625  }
1626 
1627  SextInReg
1628  .scalarize(0)
1629  .clampScalar(0, S32, S64)
1630  .lower();
1631 
1632  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1633  .scalarize(0)
1634  .lower();
1635 
1636 // TODO: Only try to form v2s16 with legal packed instructions.
1637  getActionDefinitionsBuilder(G_FSHR)
1638  .legalFor({{S32, S32}})
1639  .lowerFor({{V2S16, V2S16}})
1640  .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1641  .scalarize(0)
1642  .lower();
1643 
1644  if (ST.hasVOP3PInsts()) {
1645  getActionDefinitionsBuilder(G_FSHL)
1646  .lowerFor({{V2S16, V2S16}})
1647  .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1648  .scalarize(0)
1649  .lower();
1650  } else {
1651  getActionDefinitionsBuilder(G_FSHL)
1652  .scalarize(0)
1653  .lower();
1654  }
1655 
1656  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1657  .legalFor({S64});
1658 
1659  getActionDefinitionsBuilder(G_FENCE)
1660  .alwaysLegal();
1661 
1662  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1663  .scalarize(0)
1664  .minScalar(0, S32)
1665  .lower();
1666 
1667  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1668  .legalFor({{S32, S32}, {S64, S32}})
1669  .clampScalar(1, S32, S32)
1670  .clampScalar(0, S32, S64)
1671  .widenScalarToNextPow2(0)
1672  .scalarize(0);
1673 
1674  getActionDefinitionsBuilder({
1675  // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1676  G_FCOPYSIGN,
1677 
1678  G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1679  G_ATOMICRMW_NAND,
1680  G_ATOMICRMW_FSUB,
1681  G_READ_REGISTER,
1682  G_WRITE_REGISTER,
1683 
1684  G_SADDO, G_SSUBO,
1685 
1686  // TODO: Implement
1687  G_FMINIMUM, G_FMAXIMUM}).lower();
1688 
1689  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1690  .lower();
1691 
1692  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1693  G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1694  G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1695  .unsupported();
1696 
1697  getLegacyLegalizerInfo().computeTables();
1698  verify(*ST.getInstrInfo());
1699 }
1700 
1701 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1702  MachineInstr &MI) const {
1703  MachineIRBuilder &B = Helper.MIRBuilder;
1704  MachineRegisterInfo &MRI = *B.getMRI();
1705 
1706  switch (MI.getOpcode()) {
1707  case TargetOpcode::G_ADDRSPACE_CAST:
1708  return legalizeAddrSpaceCast(MI, MRI, B);
1709  case TargetOpcode::G_FRINT:
1710  return legalizeFrint(MI, MRI, B);
1711  case TargetOpcode::G_FCEIL:
1712  return legalizeFceil(MI, MRI, B);
1713  case TargetOpcode::G_FREM:
1714  return legalizeFrem(MI, MRI, B);
1715  case TargetOpcode::G_INTRINSIC_TRUNC:
1716  return legalizeIntrinsicTrunc(MI, MRI, B);
1717  case TargetOpcode::G_SITOFP:
1718  return legalizeITOFP(MI, MRI, B, true);
1719  case TargetOpcode::G_UITOFP:
1720  return legalizeITOFP(MI, MRI, B, false);
1721  case TargetOpcode::G_FPTOSI:
1722  return legalizeFPTOI(MI, MRI, B, true);
1723  case TargetOpcode::G_FPTOUI:
1724  return legalizeFPTOI(MI, MRI, B, false);
1725  case TargetOpcode::G_FMINNUM:
1726  case TargetOpcode::G_FMAXNUM:
1727  case TargetOpcode::G_FMINNUM_IEEE:
1728  case TargetOpcode::G_FMAXNUM_IEEE:
1729  return legalizeMinNumMaxNum(Helper, MI);
1730  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1731  return legalizeExtractVectorElt(MI, MRI, B);
1732  case TargetOpcode::G_INSERT_VECTOR_ELT:
1733  return legalizeInsertVectorElt(MI, MRI, B);
1734  case TargetOpcode::G_SHUFFLE_VECTOR:
1735  return legalizeShuffleVector(MI, MRI, B);
1736  case TargetOpcode::G_FSIN:
1737  case TargetOpcode::G_FCOS:
1738  return legalizeSinCos(MI, MRI, B);
1739  case TargetOpcode::G_GLOBAL_VALUE:
1740  return legalizeGlobalValue(MI, MRI, B);
1741  case TargetOpcode::G_LOAD:
1742  case TargetOpcode::G_SEXTLOAD:
1743  case TargetOpcode::G_ZEXTLOAD:
1744  return legalizeLoad(Helper, MI);
1745  case TargetOpcode::G_FMAD:
1746  return legalizeFMad(MI, MRI, B);
1747  case TargetOpcode::G_FDIV:
1748  return legalizeFDIV(MI, MRI, B);
1749  case TargetOpcode::G_UDIV:
1750  case TargetOpcode::G_UREM:
1751  case TargetOpcode::G_UDIVREM:
1752  return legalizeUnsignedDIV_REM(MI, MRI, B);
1753  case TargetOpcode::G_SDIV:
1754  case TargetOpcode::G_SREM:
1755  case TargetOpcode::G_SDIVREM:
1756  return legalizeSignedDIV_REM(MI, MRI, B);
1757  case TargetOpcode::G_ATOMIC_CMPXCHG:
1758  return legalizeAtomicCmpXChg(MI, MRI, B);
1759  case TargetOpcode::G_FLOG:
1760  return legalizeFlog(MI, B, numbers::ln2f);
1761  case TargetOpcode::G_FLOG10:
1762  return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1763  case TargetOpcode::G_FEXP:
1764  return legalizeFExp(MI, B);
1765  case TargetOpcode::G_FPOW:
1766  return legalizeFPow(MI, B);
1767  case TargetOpcode::G_FFLOOR:
1768  return legalizeFFloor(MI, MRI, B);
1769  case TargetOpcode::G_BUILD_VECTOR:
1770  return legalizeBuildVector(MI, MRI, B);
1771  case TargetOpcode::G_CTLZ:
1772  case TargetOpcode::G_CTTZ:
1773  return legalizeCTLZ_CTTZ(MI, MRI, B);
1774  default:
1775  return false;
1776  }
1777 
1778  llvm_unreachable("expected switch to return");
1779 }
1780 
1781 Register AMDGPULegalizerInfo::getSegmentAperture(
1782  unsigned AS,
1783  MachineRegisterInfo &MRI,
1784  MachineIRBuilder &B) const {
1785  MachineFunction &MF = B.getMF();
1786  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1787  const LLT S32 = LLT::scalar(32);
1788 
1789 
1790  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1791  if (ST.hasApertureRegs()) {
1792  // FIXME: Use inline constants (src_{shared, private}_base) instead of
1793  // getreg.
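 // Rough sketch of the getreg path below (a summary of this code, not a
 // hardware spec): S_GETREG_B32 reads a field of WidthM1 + 1 bits holding the
 // upper bits of the aperture base, and the shift-left by WidthM1 + 1
 // reconstructs the 32-bit aperture value, i.e. the high half of the 64-bit
 // segment address.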
1794  unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1797  unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1800  unsigned Encoding =
1803  WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1804 
1805  Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1806 
1807  B.buildInstr(AMDGPU::S_GETREG_B32)
1808  .addDef(GetReg)
1809  .addImm(Encoding);
1810  MRI.setType(GetReg, S32);
1811 
1812  auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1813  return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1814  }
1815 
1818 
1820  return Register();
1821 
1822  // Offset into amd_queue_t for group_segment_aperture_base_hi /
1823  // private_segment_aperture_base_hi.
1824  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1825 
1826  // TODO: can we be smarter about machine pointer info?
1829  PtrInfo,
1832  LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1833 
1834  Register LoadAddr;
1835 
1836  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1837  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1838 }
1839 
1842  MachineIRBuilder &B) const {
1843  MachineFunction &MF = B.getMF();
1844 
1845  const LLT S32 = LLT::scalar(32);
1846  Register Dst = MI.getOperand(0).getReg();
1847  Register Src = MI.getOperand(1).getReg();
1848 
1849  LLT DstTy = MRI.getType(Dst);
1850  LLT SrcTy = MRI.getType(Src);
1851  unsigned DestAS = DstTy.getAddressSpace();
1852  unsigned SrcAS = SrcTy.getAddressSpace();
1853 
1854  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1855  // vector element.
1856  assert(!DstTy.isVector());
1857 
1858  const AMDGPUTargetMachine &TM
1859  = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1860 
1861  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1862  MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1863  return true;
1864  }
1865 
1866  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1867  // Truncate.
1868  B.buildExtract(Dst, Src, 0);
1869  MI.eraseFromParent();
1870  return true;
1871  }
1872 
1873  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1875  uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1876 
1877  // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1878  // another. Merge operands are required to be the same type, but creating an
1879  // extra ptrtoint would be kind of pointless.
1880  auto HighAddr = B.buildConstant(
1882  B.buildMerge(Dst, {Src, HighAddr});
1883  MI.eraseFromParent();
1884  return true;
1885  }
1886 
1887  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1888  assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1889  DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1890  unsigned NullVal = TM.getNullPointerValue(DestAS);
1891 
1892  auto SegmentNull = B.buildConstant(DstTy, NullVal);
1893  auto FlatNull = B.buildConstant(SrcTy, 0);
1894 
1895  // Extract low 32-bits of the pointer.
1896  auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1897 
1898  auto CmpRes =
1899  B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1900  B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1901 
1902  MI.eraseFromParent();
1903  return true;
1904  }
1905 
1906  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1907  return false;
1908 
1909  if (!ST.hasFlatAddressSpace())
1910  return false;
1911 
1912  auto SegmentNull =
1913  B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1914  auto FlatNull =
1915  B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1916 
1917  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1918  if (!ApertureReg.isValid())
1919  return false;
1920 
1921  auto CmpRes =
1922  B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1923 
1924  // Coerce the type of the low half of the result so we can use merge_values.
1925  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1926 
1927  // TODO: Should we allow mismatched types but matching sizes in merges to
1928  // avoid the ptrtoint?
1929  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1930  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1931 
1932  MI.eraseFromParent();
1933  return true;
1934 }
1935 
1938  MachineIRBuilder &B) const {
1939  Register Src = MI.getOperand(1).getReg();
1940  LLT Ty = MRI.getType(Src);
1941  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1942 
1943  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1944  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1945 
1946  auto C1 = B.buildFConstant(Ty, C1Val);
1947  auto CopySign = B.buildFCopysign(Ty, C1, Src);
1948 
1949  // TODO: Should this propagate fast-math-flags?
1950  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1951  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1952 
1953  auto C2 = B.buildFConstant(Ty, C2Val);
1954  auto Fabs = B.buildFAbs(Ty, Src);
1955 
1956  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1957  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1958  MI.eraseFromParent();
1959  return true;
1960 }
1961 
1964  MachineIRBuilder &B) const {
1965 
1966  const LLT S1 = LLT::scalar(1);
1967  const LLT S64 = LLT::scalar(64);
1968 
1969  Register Src = MI.getOperand(1).getReg();
1970  assert(MRI.getType(Src) == S64);
1971 
1972  // result = trunc(src)
1973  // if (src > 0.0 && src != result)
1974  // result += 1.0
1975 
1976  auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1977 
1978  const auto Zero = B.buildFConstant(S64, 0.0);
1979  const auto One = B.buildFConstant(S64, 1.0);
1980  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1981  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1982  auto And = B.buildAnd(S1, Lt0, NeTrunc);
1983  auto Add = B.buildSelect(S64, And, One, Zero);
1984 
1985  // TODO: Should this propagate fast-math-flags?
1986  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1987  return true;
1988 }
1989 
1992  MachineIRBuilder &B) const {
1993  Register DstReg = MI.getOperand(0).getReg();
1994  Register Src0Reg = MI.getOperand(1).getReg();
1995  Register Src1Reg = MI.getOperand(2).getReg();
1996  auto Flags = MI.getFlags();
1997  LLT Ty = MRI.getType(DstReg);
1998 
1999  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2000  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2001  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2002  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2003  MI.eraseFromParent();
2004  return true;
2005 }
2006 
2008  MachineIRBuilder &B) {
2009  const unsigned FractBits = 52;
2010  const unsigned ExpBits = 11;
2011  LLT S32 = LLT::scalar(32);
2012 
2013  auto Const0 = B.buildConstant(S32, FractBits - 32);
2014  auto Const1 = B.buildConstant(S32, ExpBits);
2015 
2016  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2017  .addUse(Hi)
2018  .addUse(Const0.getReg(0))
2019  .addUse(Const1.getReg(0));
2020 
2021  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2022 }
2023 
2026  MachineIRBuilder &B) const {
2027  const LLT S1 = LLT::scalar(1);
2028  const LLT S32 = LLT::scalar(32);
2029  const LLT S64 = LLT::scalar(64);
2030 
2031  Register Src = MI.getOperand(1).getReg();
2032  assert(MRI.getType(Src) == S64);
2033 
2034  // TODO: Should this use extract since the low half is unused?
2035  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2036  Register Hi = Unmerge.getReg(1);
2037 
2038  // Extract the upper half, since this is where we will find the sign and
2039  // exponent.
2040  auto Exp = extractF64Exponent(Hi, B);
2041 
2042  const unsigned FractBits = 52;
2043 
2044  // Extract the sign bit.
2045  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2046  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2047 
2048  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2049 
2050  const auto Zero32 = B.buildConstant(S32, 0);
2051 
2052  // Extend back to 64-bits.
2053  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2054 
2055  auto Shr = B.buildAShr(S64, FractMask, Exp);
2056  auto Not = B.buildNot(S64, Shr);
2057  auto Tmp0 = B.buildAnd(S64, Src, Not);
2058  auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2059 
2060  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2061  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2062 
2063  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2064  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2065  MI.eraseFromParent();
2066  return true;
2067 }
2068 
2071  MachineIRBuilder &B, bool Signed) const {
2072 
2073  Register Dst = MI.getOperand(0).getReg();
2074  Register Src = MI.getOperand(1).getReg();
2075 
2076  const LLT S64 = LLT::scalar(64);
2077  const LLT S32 = LLT::scalar(32);
2078 
2079  assert(MRI.getType(Src) == S64);
2080 
2081  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2082  auto ThirtyTwo = B.buildConstant(S32, 32);
2083 
2084  if (MRI.getType(Dst) == S64) {
2085  auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2086  : B.buildUITOFP(S64, Unmerge.getReg(1));
2087 
2088  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2089  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2090  .addUse(CvtHi.getReg(0))
2091  .addUse(ThirtyTwo.getReg(0));
2092 
2093  // TODO: Should this propagate fast-math-flags?
2094  B.buildFAdd(Dst, LdExp, CvtLo);
2095  MI.eraseFromParent();
2096  return true;
2097  }
2098 
2099  assert(MRI.getType(Dst) == S32);
2100 
2101  auto One = B.buildConstant(S32, 1);
2102 
2103  MachineInstrBuilder ShAmt;
2104  if (Signed) {
2105  auto ThirtyOne = B.buildConstant(S32, 31);
2106  auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2107  auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2108  auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2109  auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2110  /*HasSideEffects=*/false)
2111  .addUse(Unmerge.getReg(1));
2112  auto LS2 = B.buildSub(S32, LS, One);
2113  ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2114  } else
2115  ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2116  auto Norm = B.buildShl(S64, Src, ShAmt);
2117  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2118  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2119  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2120  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2121  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2122  B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2123  /*HasSideEffects=*/false)
2124  .addUse(FVal.getReg(0))
2125  .addUse(Scale.getReg(0));
2126  MI.eraseFromParent();
2127  return true;
2128 }
2129 
2130 // TODO: Copied from DAG implementation. Verify logic and document how this
2131 // actually works.
2135  bool Signed) const {
2136 
2137  Register Dst = MI.getOperand(0).getReg();
2138  Register Src = MI.getOperand(1).getReg();
2139 
2140  const LLT S64 = LLT::scalar(64);
2141  const LLT S32 = LLT::scalar(32);
2142 
2143  const LLT SrcLT = MRI.getType(Src);
2144  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2145 
2146  unsigned Flags = MI.getFlags();
2147 
2148  // The basic idea of converting a floating point number into a pair of 32-bit
2149  // integers is illustrated as follows:
2150  //
2151  // tf := trunc(val);
2152  // hif := floor(tf * 2^-32);
2153  // lof := tf - hif * 2^32; // lof is always positive due to floor.
2154  // hi := fptoi(hif);
2155  // lo := fptoi(lof);
2156  //
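 // Worked example (illustrative): for val = 2^40 + 3,
 //   tf = 2^40 + 3
 //   hif = floor(tf * 2^-32) = 256
 //   lof = tf - 256 * 2^32 = 3
 // so hi = 256, lo = 3, and the merged result is (256 << 32) | 3.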
2157  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2158  MachineInstrBuilder Sign;
2159  if (Signed && SrcLT == S32) {
2160  // However, a 32-bit floating point number has only 23 bits mantissa and
2161  // it's not enough to hold all the significant bits of `lof` if val is
2162  // negative. To avoid the loss of precision, we need to take the absolute
2163  // value after truncating and flip the result back based on the original
2164  // signedness.
2165  Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2166  Trunc = B.buildFAbs(S32, Trunc, Flags);
2167  }
2168  MachineInstrBuilder K0, K1;
2169  if (SrcLT == S64) {
2170  K0 = B.buildFConstant(S64,
2171  BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2172  K1 = B.buildFConstant(S64,
2173  BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2174  } else {
2175  K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2176  K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2177  }
2178 
2179  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2180  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2181  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2182 
2183  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2184  : B.buildFPTOUI(S32, FloorMul);
2185  auto Lo = B.buildFPTOUI(S32, Fma);
2186 
2187  if (Signed && SrcLT == S32) {
2188  // Flip the result based on the signedness, which is either all 0s or 1s.
2189  Sign = B.buildMerge(S64, {Sign, Sign});
2190  // r := xor({lo, hi}, sign) - sign;
2191  B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2192  } else
2193  B.buildMerge(Dst, {Lo, Hi});
2194  MI.eraseFromParent();
2195 
2196  return true;
2197 }
2198 
2200  MachineInstr &MI) const {
2201  MachineFunction &MF = Helper.MIRBuilder.getMF();
2202  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2203 
2204  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2205  MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2206 
2207  // With ieee_mode disabled, the instructions have the correct behavior
2208  // already for G_FMINNUM/G_FMAXNUM
2209  if (!MFI->getMode().IEEE)
2210  return !IsIEEEOp;
2211 
2212  if (IsIEEEOp)
2213  return true;
2214 
2215  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2216  }
2217 
2220  MachineIRBuilder &B) const {
2221  // TODO: Should move some of this into LegalizerHelper.
2222 
2223  // TODO: Promote dynamic indexing of s16 to s32
2224 
2225  // FIXME: Artifact combiner probably should have replaced the truncated
2226  // constant before this, so we shouldn't need
2227  // getIConstantVRegValWithLookThrough.
2228  Optional<ValueAndVReg> MaybeIdxVal =
2229  getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2230  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2231  return true;
2232  const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2233 
2234  Register Dst = MI.getOperand(0).getReg();
2235  Register Vec = MI.getOperand(1).getReg();
2236 
2237  LLT VecTy = MRI.getType(Vec);
2238  LLT EltTy = VecTy.getElementType();
2239  assert(EltTy == MRI.getType(Dst));
2240 
2241  if (IdxVal < VecTy.getNumElements())
2242  B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2243  else
2244  B.buildUndef(Dst);
2245 
2246  MI.eraseFromParent();
2247  return true;
2248 }
2249 
2252  MachineIRBuilder &B) const {
2253  // TODO: Should move some of this into LegalizerHelper.
2254 
2255  // TODO: Promote dynamic indexing of s16 to s32
2256 
2257  // FIXME: Artifact combiner probably should have replaced the truncated
2258  // constant before this, so we shouldn't need
2259  // getIConstantVRegValWithLookThrough.
2260  Optional<ValueAndVReg> MaybeIdxVal =
2261  getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2262  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2263  return true;
2264 
2265  int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2266  Register Dst = MI.getOperand(0).getReg();
2267  Register Vec = MI.getOperand(1).getReg();
2268  Register Ins = MI.getOperand(2).getReg();
2269 
2270  LLT VecTy = MRI.getType(Vec);
2271  LLT EltTy = VecTy.getElementType();
2272  assert(EltTy == MRI.getType(Ins));
2273 
2274  if (IdxVal < VecTy.getNumElements())
2275  B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2276  else
2277  B.buildUndef(Dst);
2278 
2279  MI.eraseFromParent();
2280  return true;
2281 }
2282 
2285  MachineIRBuilder &B) const {
2286  const LLT V2S16 = LLT::fixed_vector(2, 16);
2287 
2288  Register Dst = MI.getOperand(0).getReg();
2289  Register Src0 = MI.getOperand(1).getReg();
2290  LLT DstTy = MRI.getType(Dst);
2291  LLT SrcTy = MRI.getType(Src0);
2292 
2293  if (SrcTy == V2S16 && DstTy == V2S16 &&
2294  AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2295  return true;
2296 
2297  MachineIRBuilder HelperBuilder(MI);
2298  GISelObserverWrapper DummyObserver;
2299  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2300  return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2301 }
2302 
2305  MachineIRBuilder &B) const {
2306 
2307  Register DstReg = MI.getOperand(0).getReg();
2308  Register SrcReg = MI.getOperand(1).getReg();
2309  LLT Ty = MRI.getType(DstReg);
2310  unsigned Flags = MI.getFlags();
2311 
2312  Register TrigVal;
2313  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2314  if (ST.hasTrigReducedRange()) {
2315  auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2316  TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2317  .addUse(MulVal.getReg(0))
2318  .setMIFlags(Flags).getReg(0);
2319  } else
2320  TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2321 
2322  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2323  Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2324  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2325  .addUse(TrigVal)
2326  .setMIFlags(Flags);
2327  MI.eraseFromParent();
2328  return true;
2329 }
2330 
2333  const GlobalValue *GV,
2334  int64_t Offset,
2335  unsigned GAFlags) const {
2336  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2337  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2338  // to the following code sequence:
2339  //
2340  // For constant address space:
2341  // s_getpc_b64 s[0:1]
2342  // s_add_u32 s0, s0, $symbol
2343  // s_addc_u32 s1, s1, 0
2344  //
2345  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2346  // a fixup or relocation is emitted to replace $symbol with a literal
2347  // constant, which is a pc-relative offset from the encoding of the $symbol
2348  // operand to the global variable.
2349  //
2350  // For global address space:
2351  // s_getpc_b64 s[0:1]
2352  // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2353  // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2354  //
2355  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2356  // fixups or relocations are emitted to replace $symbol@*@lo and
2357  // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2358  // which is a 64-bit pc-relative offset from the encoding of the $symbol
2359  // operand to the global variable.
2360  //
2361  // What we want here is an offset from the value returned by s_getpc
2362  // (which is the address of the s_add_u32 instruction) to the global
2363  // variable, but since the encoding of $symbol starts 4 bytes after the start
2364  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2365  // small. This requires us to add 4 to the global variable offset in order to
2366  // compute the correct address. Similarly for the s_addc_u32 instruction, the
2367  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2368  // instruction.
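 // Concretely, the builder code below produces roughly
 //   %pc = SI_PC_ADD_REL_OFFSET @sym + 4 (lo fixup), @sym + 12 (hi fixup)
 // (an approximate rendering), with the +4/+12 covering the byte distance from
 // the s_getpc_b64 result to the $symbol operands described above.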
2369 
2370  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2371 
2372  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2373  B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2374 
2375  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2376  .addDef(PCReg);
2377 
2378  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2379  if (GAFlags == SIInstrInfo::MO_NONE)
2380  MIB.addImm(0);
2381  else
2382  MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2383 
2384  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2385 
2386  if (PtrTy.getSizeInBits() == 32)
2387  B.buildExtract(DstReg, PCReg, 0);
2388  return true;
2389  }
2390 
2393  MachineIRBuilder &B) const {
2394  Register DstReg = MI.getOperand(0).getReg();
2395  LLT Ty = MRI.getType(DstReg);
2396  unsigned AS = Ty.getAddressSpace();
2397 
2398  const GlobalValue *GV = MI.getOperand(1).getGlobal();
2399  MachineFunction &MF = B.getMF();
2401 
2403  if (!MFI->isModuleEntryFunction() &&
2404  !GV->getName().equals("llvm.amdgcn.module.lds")) {
2405  const Function &Fn = MF.getFunction();
2406  DiagnosticInfoUnsupported BadLDSDecl(
2407  Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2408  DS_Warning);
2409  Fn.getContext().diagnose(BadLDSDecl);
2410 
2411  // We currently don't have a way to correctly allocate LDS objects that
2412  // aren't directly associated with a kernel. We do force inlining of
2413  // functions that use local objects. However, if these dead functions are
2414  // not eliminated, we don't want a compile time error. Just emit a warning
2415  // and a trap, since there should be no callable path here.
2416  B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2417  B.buildUndef(DstReg);
2418  MI.eraseFromParent();
2419  return true;
2420  }
2421 
2422  // TODO: We could emit code to handle the initialization somewhere.
2424  const SITargetLowering *TLI = ST.getTargetLowering();
2425  if (!TLI->shouldUseLDSConstAddress(GV)) {
2426  MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2427  return true; // Leave in place;
2428  }
2429 
2430  if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2431  Type *Ty = GV->getValueType();
2432  // HIP uses an unsized array `extern __shared__ T s[]` or similar
2433  // zero-sized type in other languages to declare the dynamic shared
2434  // memory whose size is not known at compile time. They will be
2435  // allocated by the runtime and placed directly after the statically
2436  // allocated ones. They all share the same offset.
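 // For example, in HIP:
 //   extern __shared__ float dynShared[]; // size chosen at kernel launch
 // produces such a zero-sized external LDS global and is handled by the path
 // below (the array name here is just illustrative).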
2437  if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2438  // Adjust alignment for that dynamic shared memory array.
2439  MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2440  LLT S32 = LLT::scalar(32);
2441  auto Sz =
2442  B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2443  B.buildIntToPtr(DstReg, Sz);
2444  MI.eraseFromParent();
2445  return true;
2446  }
2447  }
2448 
2449  B.buildConstant(
2450  DstReg,
2451  MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2452  MI.eraseFromParent();
2453  return true;
2454  }
2455 
2456  const Function &Fn = MF.getFunction();
2457  DiagnosticInfoUnsupported BadInit(
2458  Fn, "unsupported initializer for address space", MI.getDebugLoc());
2459  Fn.getContext().diagnose(BadInit);
2460  return true;
2461  }
2462 
2463  const SITargetLowering *TLI = ST.getTargetLowering();
2464 
2465  if (TLI->shouldEmitFixup(GV)) {
2466  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2467  MI.eraseFromParent();
2468  return true;
2469  }
2470 
2471  if (TLI->shouldEmitPCReloc(GV)) {
2472  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2473  MI.eraseFromParent();
2474  return true;
2475  }
2476 
2478  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2479 
2480  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2485  LoadTy, Align(8));
2486 
2487  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2488 
2489  if (Ty.getSizeInBits() == 32) {
2490  // Truncate if this is a 32-bit constant address.
2491  auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2492  B.buildExtract(DstReg, Load, 0);
2493  } else
2494  B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2495 
2496  MI.eraseFromParent();
2497  return true;
2498 }
2499 
2500 static LLT widenToNextPowerOf2(LLT Ty) {
2501  if (Ty.isVector())
2502  return Ty.changeElementCount(
2503  ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2504  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2505 }
2506 
2508  MachineInstr &MI) const {
2509  MachineIRBuilder &B = Helper.MIRBuilder;
2510  MachineRegisterInfo &MRI = *B.getMRI();
2511  GISelChangeObserver &Observer = Helper.Observer;
2512 
2513  Register PtrReg = MI.getOperand(1).getReg();
2514  LLT PtrTy = MRI.getType(PtrReg);
2515  unsigned AddrSpace = PtrTy.getAddressSpace();
2516 
2517  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2519  auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2520  Observer.changingInstr(MI);
2521  MI.getOperand(1).setReg(Cast.getReg(0));
2522  Observer.changedInstr(MI);
2523  return true;
2524  }
2525 
2526  if (MI.getOpcode() != AMDGPU::G_LOAD)
2527  return false;
2528 
2529  Register ValReg = MI.getOperand(0).getReg();
2530  LLT ValTy = MRI.getType(ValReg);
2531 
2532  MachineMemOperand *MMO = *MI.memoperands_begin();
2533  const unsigned ValSize = ValTy.getSizeInBits();
2534  const LLT MemTy = MMO->getMemoryType();
2535  const Align MemAlign = MMO->getAlign();
2536  const unsigned MemSize = MemTy.getSizeInBits();
2537  const unsigned AlignInBits = 8 * MemAlign.value();
2538 
2539  // Widen non-power-of-2 loads to the alignment if needed
2540  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2541  const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2542 
2543  // This was already the correct extending load result type, so just adjust
2544  // the memory type.
2545  if (WideMemSize == ValSize) {
2546  MachineFunction &MF = B.getMF();
2547 
2548  MachineMemOperand *WideMMO =
2549  MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2550  Observer.changingInstr(MI);
2551  MI.setMemRefs(MF, {WideMMO});
2552  Observer.changedInstr(MI);
2553  return true;
2554  }
2555 
2556  // Don't bother handling edge case that should probably never be produced.
2557  if (ValSize > WideMemSize)
2558  return false;
2559 
2560  LLT WideTy = widenToNextPowerOf2(ValTy);
2561 
2562  Register WideLoad;
2563  if (!WideTy.isVector()) {
2564  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2565  B.buildTrunc(ValReg, WideLoad).getReg(0);
2566  } else {
2567  // Extract the subvector.
2568 
2569  if (isRegisterType(ValTy)) {
2570  // If this is a case where G_EXTRACT is legal, use it.
2571  // (e.g. <3 x s32> -> <4 x s32>)
2572  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2573  B.buildExtract(ValReg, WideLoad, 0);
2574  } else {
2575  // For cases where the widened type isn't a nice register value, unmerge
2576  // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2577  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2578  WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2579  B.setInsertPt(B.getMBB(), MI.getIterator());
2580  B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2581  }
2582  }
2583 
2584  MI.eraseFromParent();
2585  return true;
2586  }
2587 
2588  return false;
2589 }
2590 
2593  MachineIRBuilder &B) const {
2594  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2595  assert(Ty.isScalar());
2596 
2597  MachineFunction &MF = B.getMF();
2598  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2599 
2600  // TODO: Always legal with future ftz flag.
2601  // FIXME: Do we need just output?
2602  if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2603  return true;
2604  if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2605  return true;
2606 
2607  MachineIRBuilder HelperBuilder(MI);
2608  GISelObserverWrapper DummyObserver;
2609  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2610  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2611 }
2612 
2613 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2614  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2615  Register DstReg = MI.getOperand(0).getReg();
2616  Register PtrReg = MI.getOperand(1).getReg();
2617  Register CmpVal = MI.getOperand(2).getReg();
2618  Register NewVal = MI.getOperand(3).getReg();
2619 
2621  "this should not have been custom lowered");
2622 
2623  LLT ValTy = MRI.getType(CmpVal);
2624  LLT VecTy = LLT::fixed_vector(2, ValTy);
2625 
2626  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2627 
2628  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2629  .addDef(DstReg)
2630  .addUse(PtrReg)
2631  .addUse(PackedVal)
2632  .setMemRefs(MI.memoperands());
2633 
2634  MI.eraseFromParent();
2635  return true;
2636 }
2637 
2639  MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2640  Register Dst = MI.getOperand(0).getReg();
2641  Register Src = MI.getOperand(1).getReg();
2642  LLT Ty = B.getMRI()->getType(Dst);
2643  unsigned Flags = MI.getFlags();
2644 
2645  auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2646  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2647 
2648  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2649  MI.eraseFromParent();
2650  return true;
2651 }
2652 
2654  MachineIRBuilder &B) const {
2655  Register Dst = MI.getOperand(0).getReg();
2656  Register Src = MI.getOperand(1).getReg();
2657  unsigned Flags = MI.getFlags();
2658  LLT Ty = B.getMRI()->getType(Dst);
2659 
2660  auto K = B.buildFConstant(Ty, numbers::log2e);
2661  auto Mul = B.buildFMul(Ty, Src, K, Flags);
2662  B.buildFExp2(Dst, Mul, Flags);
2663  MI.eraseFromParent();
2664  return true;
2665 }
2666 
2668  MachineIRBuilder &B) const {
2669  Register Dst = MI.getOperand(0).getReg();
2670  Register Src0 = MI.getOperand(1).getReg();
2671  Register Src1 = MI.getOperand(2).getReg();
2672  unsigned Flags = MI.getFlags();
2673  LLT Ty = B.getMRI()->getType(Dst);
2674  const LLT S16 = LLT::scalar(16);
2675  const LLT S32 = LLT::scalar(32);
2676 
2677  if (Ty == S32) {
2678  auto Log = B.buildFLog2(S32, Src0, Flags);
2679  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2680  .addUse(Log.getReg(0))
2681  .addUse(Src1)
2682  .setMIFlags(Flags);
2683  B.buildFExp2(Dst, Mul, Flags);
2684  } else if (Ty == S16) {
2685  // There's no f16 fmul_legacy, so we need to convert for it.
2686  auto Log = B.buildFLog2(S16, Src0, Flags);
2687  auto Ext0 = B.buildFPExt(S32, Log, Flags);
2688  auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2689  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2690  .addUse(Ext0.getReg(0))
2691  .addUse(Ext1.getReg(0))
2692  .setMIFlags(Flags);
2693 
2694  B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2695  } else
2696  return false;
2697 
2698  MI.eraseFromParent();
2699  return true;
2700 }
2701 
2702 // Find a source register, ignoring any possible source modifiers.
2704  Register ModSrc = OrigSrc;
2705  if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2706  ModSrc = SrcFNeg->getOperand(1).getReg();
2707  if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2708  ModSrc = SrcFAbs->getOperand(1).getReg();
2709  } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2710  ModSrc = SrcFAbs->getOperand(1).getReg();
2711  return ModSrc;
2712 }
2713 
2716  MachineIRBuilder &B) const {
2717 
2718  const LLT S1 = LLT::scalar(1);
2719  const LLT S64 = LLT::scalar(64);
2720  Register Dst = MI.getOperand(0).getReg();
2721  Register OrigSrc = MI.getOperand(1).getReg();
2722  unsigned Flags = MI.getFlags();
2723  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2724  "this should not have been custom lowered");
2725 
2726  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2727  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2728  // efficient way to implement it is using V_FRACT_F64. The workaround for the
2729  // V_FRACT bug is:
2730  // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2731  //
2732  // Convert floor(x) to (x - fract(x))
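 // Note on the constant used below: 0x3fefffffffffffff encodes the largest
 // double strictly less than 1.0 (1 - 2^-53), so the clamp keeps the computed
 // fract(x) from ever reaching exactly 1.0.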
2733 
2734  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2735  .addUse(OrigSrc)
2736  .setMIFlags(Flags);
2737 
2738  // Give source modifier matching some assistance before obscuring a foldable
2739  // pattern.
2740 
2741  // TODO: We can avoid the neg on the fract? The input sign to fract
2742  // shouldn't matter?
2743  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2744 
2745  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2746 
2747  Register Min = MRI.createGenericVirtualRegister(S64);
2748 
2749  // We don't need to concern ourselves with the snan handling difference, so
2750  // use the one which will directly select.
2751  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2752  if (MFI->getMode().IEEE)
2753  B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2754  else
2755  B.buildFMinNum(Min, Fract, Const, Flags);
2756 
2757  Register CorrectedFract = Min;
2758  if (!MI.getFlag(MachineInstr::FmNoNans)) {
2759  auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2760  CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2761  }
2762 
2763  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2764  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2765 
2766  MI.eraseFromParent();
2767  return true;
2768 }
2769 
2770 // Turn an illegal packed v2s16 build vector into bit operations.
2771 // TODO: This should probably be a bitcast action in LegalizerHelper.
2774  Register Dst = MI.getOperand(0).getReg();
2775  const LLT S32 = LLT::scalar(32);
2776  assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2777 
2778  Register Src0 = MI.getOperand(1).getReg();
2779  Register Src1 = MI.getOperand(2).getReg();
2780  assert(MRI.getType(Src0) == LLT::scalar(16));
2781 
2782  auto Merge = B.buildMerge(S32, {Src0, Src1});
2783  B.buildBitcast(Dst, Merge);
2784 
2785  MI.eraseFromParent();
2786  return true;
2787 }
2788 
2789 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2790 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2791 // case with a single min instruction instead of a compare+select.
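 // Illustrative case: for a zero s32 input, ffbh/ffbl are assumed to return -1
 // (all ones), so the unsigned min against 32 built below yields the required
 // ctlz/cttz result of 32 without a separate compare+select.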
2794  MachineIRBuilder &B) const {
2795  Register Dst = MI.getOperand(0).getReg();
2796  Register Src = MI.getOperand(1).getReg();
2797  LLT DstTy = MRI.getType(Dst);
2798  LLT SrcTy = MRI.getType(Src);
2799 
2800  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2801  ? AMDGPU::G_AMDGPU_FFBH_U32
2802  : AMDGPU::G_AMDGPU_FFBL_B32;
2803  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2804  B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2805 
2806  MI.eraseFromParent();
2807  return true;
2808 }
2809 
2810 // Check that this is a G_XOR x, -1
2811 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2812  if (MI.getOpcode() != TargetOpcode::G_XOR)
2813  return false;
2814  auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2815  return ConstVal && *ConstVal == -1;
2816 }
2817 
2818 // Return the use branch instruction, otherwise null if the usage is invalid.
2819 static MachineInstr *
2821  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2822  Register CondDef = MI.getOperand(0).getReg();
2823  if (!MRI.hasOneNonDBGUse(CondDef))
2824  return nullptr;
2825 
2826  MachineBasicBlock *Parent = MI.getParent();
2827  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2828 
2829  if (isNot(MRI, *UseMI)) {
2830  Register NegatedCond = UseMI->getOperand(0).getReg();
2831  if (!MRI.hasOneNonDBGUse(NegatedCond))
2832  return nullptr;
2833 
2834  // We're deleting the def of this value, so we need to remove it.
2836 
2837  UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2838  Negated = true;
2839  }
2840 
2841  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2842  return nullptr;
2843 
2844  // Make sure the cond br is followed by a G_BR, or is the last instruction.
2845  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2846  if (Next == Parent->end()) {
2847  MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2848  if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2849  return nullptr;
2850  UncondBrTarget = &*NextMBB;
2851  } else {
2852  if (Next->getOpcode() != AMDGPU::G_BR)
2853  return nullptr;
2854  Br = &*Next;
2855  UncondBrTarget = Br->getOperand(0).getMBB();
2856  }
2857 
2858  return UseMI;
2859 }
2860 
2862  const ArgDescriptor *Arg,
2863  const TargetRegisterClass *ArgRC,
2864  LLT ArgTy) const {
2865  MCRegister SrcReg = Arg->getRegister();
2866  assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2867  assert(DstReg.isVirtual() && "Virtual register expected");
2868 
2869  Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2870  ArgTy);
2871  if (Arg->isMasked()) {
2872  // TODO: Should we try to emit this once in the entry block?
2873  const LLT S32 = LLT::scalar(32);
2874  const unsigned Mask = Arg->getMask();
2875  const unsigned Shift = countTrailingZeros<unsigned>(Mask);
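 // For a hypothetical argument packed at bits [10..19] of the input register,
 // Mask would be 0xFFC00 and Shift 10, so the code below computes
 // (LiveIn >> 10) & 0x3FF.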
2876 
2877  Register AndMaskSrc = LiveIn;
2878 
2879  if (Shift != 0) {
2880  auto ShiftAmt = B.buildConstant(S32, Shift);
2881  AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2882  }
2883 
2884  B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2885  } else {
2886  B.buildCopy(DstReg, LiveIn);
2887  }
2888 
2889  return true;
2890 }
2891 
2893  Register DstReg, MachineIRBuilder &B,
2894  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2895  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2896  const ArgDescriptor *Arg;
2897  const TargetRegisterClass *ArgRC;
2898  LLT ArgTy;
2899  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2900 
2901  if (!Arg) {
2902  if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2903  // The intrinsic may appear when we have a 0 sized kernarg segment, in which
2904  // case the pointer argument may be missing and we use null.
2905  B.buildConstant(DstReg, 0);
2906  return true;
2907  }
2908 
2909  // It's undefined behavior if a function marked with the amdgpu-no-*
2910  // attributes uses the corresponding intrinsic.
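 // (For instance, a function carrying "amdgpu-no-workitem-id-x" that still
 // calls llvm.amdgcn.workitem.id.x would typically end up here.)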
2911  B.buildUndef(DstReg);
2912  return true;
2913  }
2914 
2915  if (!Arg->isRegister() || !Arg->getRegister().isValid())
2916  return false; // TODO: Handle these
2917  return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2918 }
2919 
2922  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2923  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2924  return false;
2925 
2926  MI.eraseFromParent();
2927  return true;
2928 }
2929 
2932  MachineIRBuilder &B) const {
2933  Register Dst = MI.getOperand(0).getReg();
2934  LLT DstTy = MRI.getType(Dst);
2935  LLT S16 = LLT::scalar(16);
2936  LLT S32 = LLT::scalar(32);
2937  LLT S64 = LLT::scalar(64);
2938 
2939  if (DstTy == S16)
2940  return legalizeFDIV16(MI, MRI, B);
2941  if (DstTy == S32)
2942  return legalizeFDIV32(MI, MRI, B);
2943  if (DstTy == S64)
2944  return legalizeFDIV64(MI, MRI, B);
2945 
2946  return false;
2947 }
2948 
2950  Register DstDivReg,
2951  Register DstRemReg,
2952  Register X,
2953  Register Y) const {
2954  const LLT S1 = LLT::scalar(1);
2955  const LLT S32 = LLT::scalar(32);
2956 
2957  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2958  // algorithm used here.
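 // Rough outline of the expansion that follows: estimate z ~= 2^32 / y using
 // V_RCP_IFLAG_F32 scaled by 0x4f7ffffe (a float just below 2^32), refine z
 // with one Newton-Raphson style step, form q = umulh(x, z) and r = x - q * y,
 // then correct q and r at most twice.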
2959 
2960  // Initial estimate of inv(y).
2961  auto FloatY = B.buildUITOFP(S32, Y);
2962  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2963  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2964  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2965  auto Z = B.buildFPTOUI(S32, ScaledY);
2966 
2967  // One round of UNR.
2968  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2969  auto NegYZ = B.buildMul(S32, NegY, Z);
2970  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2971 
2972  // Quotient/remainder estimate.
2973  auto Q = B.buildUMulH(S32, X, Z);
2974  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2975 
2976  // First quotient/remainder refinement.
2977  auto One = B.buildConstant(S32, 1);
2978  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2979  if (DstDivReg)
2980  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2981  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2982 
2983  // Second quotient/remainder refinement.
2984  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2985  if (DstDivReg)
2986  B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2987 
2988  if (DstRemReg)
2989  B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2990 }
2991 
2992 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2993 //
2994 // Return lo, hi of result
2995 //
2996 // %cvt.lo = G_UITOFP Val.lo
2997 // %cvt.hi = G_UITOFP Val.hi
2998 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2999 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3000 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3001 // %mul2 = G_FMUL %mul1, 2**(-32)
3002 // %trunc = G_INTRINSIC_TRUNC %mul2
3003 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3004 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
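 // The float bit patterns used below are powers of two: 0x4f800000 = 2^32,
 // 0x2f800000 = 2^-32, 0xcf800000 = -(2^32); 0x5f7ffffc is a value just below
 // 2^64 (presumably chosen so the scaled reciprocal cannot overflow).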
3005 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3006  Register Val) {
3007  const LLT S32 = LLT::scalar(32);
3008  auto Unmerge = B.buildUnmerge(S32, Val);
3009 
3010  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3011  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3012 
3013  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3014  B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3015 
3016  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3017  auto Mul1 =
3018  B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3019 
3020  // 2**(-32)
3021  auto Mul2 =
3022  B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3023  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3024 
3025  // -(2**32)
3026  auto Mad2 = B.buildFMAD(S32, Trunc,
3027  B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3028 
3029  auto ResultLo = B.buildFPTOUI(S32, Mad2);
3030  auto ResultHi = B.buildFPTOUI(S32, Trunc);
3031 
3032  return {ResultLo.getReg(0), ResultHi.getReg(0)};
3033 }
3034 
3036  Register DstDivReg,
3037  Register DstRemReg,
3038  Register Numer,
3039  Register Denom) const {
3040  const LLT S32 = LLT::scalar(32);
3041  const LLT S64 = LLT::scalar(64);
3042  const LLT S1 = LLT::scalar(1);
3043  Register RcpLo, RcpHi;
3044 
3045  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3046 
3047  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3048 
3049  auto Zero64 = B.buildConstant(S64, 0);
3050  auto NegDenom = B.buildSub(S64, Zero64, Denom);
3051 
3052  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3053  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3054 
3055  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3056  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3057  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3058 
3059  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3060  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3061  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
3062  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3063 
3064  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3065  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3066  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3067  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3068  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3069 
3070  auto Zero32 = B.buildConstant(S32, 0);
3071  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3072  auto Add2_HiC =
3073  B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
3074  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
3075  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3076 
3077  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3078  Register NumerLo = UnmergeNumer.getReg(0);
3079  Register NumerHi = UnmergeNumer.getReg(1);
3080 
3081  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3082  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3083  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3084  Register Mul3_Lo = UnmergeMul3.getReg(0);
3085  Register Mul3_Hi = UnmergeMul3.getReg(1);
3086  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3087  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3088  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3089  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3090 
3091  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3092  Register DenomLo = UnmergeDenom.getReg(0);
3093  Register DenomHi = UnmergeDenom.getReg(1);
3094 
3095  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3096  auto C1 = B.buildSExt(S32, CmpHi);
3097 
3098  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3099  auto C2 = B.buildSExt(S32, CmpLo);
3100 
3101  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3102  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3103 
3104  // TODO: Here and below, portions of the code could be enclosed in if/endif
3105  // blocks. Currently the control flow is unconditional and we end up with 4
3106  // selects after the potential endif to substitute for PHIs.
3107 
3108  // if C3 != 0 ...
3109  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3110  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3111  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3112  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3113 
3114  auto One64 = B.buildConstant(S64, 1);
3115  auto Add3 = B.buildAdd(S64, MulHi3, One64);
3116 
3117  auto C4 =
3118  B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3119  auto C5 =
3120  B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3121  auto C6 = B.buildSelect(
3122  S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3123 
3124  // if (C6 != 0)
3125  auto Add4 = B.buildAdd(S64, Add3, One64);
3126  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3127 
3128  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3129  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3130  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3131 
3132  // endif C6
3133  // endif C3
3134 
3135  if (DstDivReg) {
3136  auto Sel1 = B.buildSelect(
3137  S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3138  B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3139  Sel1, MulHi3);
3140  }
3141 
3142  if (DstRemReg) {
3143  auto Sel2 = B.buildSelect(
3144  S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3145  B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3146  Sel2, Sub1);
3147  }
3148 }
3149 
3152  MachineIRBuilder &B) const {
3153  Register DstDivReg, DstRemReg;
3154  switch (MI.getOpcode()) {
3155  default:
3156  llvm_unreachable("Unexpected opcode!");
3157  case AMDGPU::G_UDIV: {
3158  DstDivReg = MI.getOperand(0).getReg();
3159  break;
3160  }
3161  case AMDGPU::G_UREM: {
3162  DstRemReg = MI.getOperand(0).getReg();
3163  break;
3164  }
3165  case AMDGPU::G_UDIVREM: {
3166  DstDivReg = MI.getOperand(0).getReg();
3167  DstRemReg = MI.getOperand(1).getReg();
3168  break;
3169  }
3170  }
3171 
3172  const LLT S64 = LLT::scalar(64);
3173  const LLT S32 = LLT::scalar(32);
3174  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3175  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3176  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3177  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3178 
3179  if (Ty == S32)
3180  legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3181  else if (Ty == S64)
3182  legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3183  else
3184  return false;
3185 
3186  MI.eraseFromParent();
3187  return true;
3188 }
3189 
3192  MachineIRBuilder &B) const {
3193  const LLT S64 = LLT::scalar(64);
3194  const LLT S32 = LLT::scalar(32);
3195 
3196  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3197  if (Ty != S32 && Ty != S64)
3198  return false;
3199 
3200  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3201  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3202  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3203 
3204  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3205  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3206  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3207 
3208  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3209  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3210 
3211  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3212  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3213 
3214  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3215  switch (MI.getOpcode()) {
3216  default:
3217  llvm_unreachable("Unexpected opcode!");
3218  case AMDGPU::G_SDIV: {
3219  DstDivReg = MI.getOperand(0).getReg();
3220  TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3221  break;
3222  }
3223  case AMDGPU::G_SREM: {
3224  DstRemReg = MI.getOperand(0).getReg();
3225  TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3226  break;
3227  }
3228  case AMDGPU::G_SDIVREM: {
3229  DstDivReg = MI.getOperand(0).getReg();
3230  DstRemReg = MI.getOperand(1).getReg();
3231  TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3232  TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3233  break;
3234  }
3235  }
3236 
3237  if (Ty == S32)
3238  legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3239  else
3240  legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3241 
3242  if (DstDivReg) {
3243  auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3244  auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3245  B.buildSub(DstDivReg, SignXor, Sign);
3246  }
3247 
3248  if (DstRemReg) {
3249  auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3250  auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3251  B.buildSub(DstRemReg, SignXor, Sign);
3252  }
3253 
3254  MI.eraseFromParent();
3255  return true;
3256 }
3257 
3260  MachineIRBuilder &B) const {
3261  Register Res = MI.getOperand(0).getReg();
3262  Register LHS = MI.getOperand(1).getReg();
3263  Register RHS = MI.getOperand(2).getReg();
3264  uint16_t Flags = MI.getFlags();
3265  LLT ResTy = MRI.getType(Res);
3266 
3267  const MachineFunction &MF = B.getMF();
3268  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3269  MI.getFlag(MachineInstr::FmAfn);
3270 
3271  if (!AllowInaccurateRcp)
3272  return false;
3273 
3274  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3275  // 1 / x -> RCP(x)
3276  if (CLHS->isExactlyValue(1.0)) {
3277  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3278  .addUse(RHS)
3279  .setMIFlags(Flags);
3280 
3281  MI.eraseFromParent();
3282  return true;
3283  }
3284 
3285  // -1 / x -> RCP( FNEG(x) )
3286  if (CLHS->isExactlyValue(-1.0)) {
3287  auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3288  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3289  .addUse(FNeg.getReg(0))
3290  .setMIFlags(Flags);
3291 
3292  MI.eraseFromParent();
3293  return true;
3294  }
3295  }
3296 
3297  // x / y -> x * (1.0 / y)
3298  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3299  .addUse(RHS)
3300  .setMIFlags(Flags);
3301  B.buildFMul(Res, LHS, RCP, Flags);
3302 
3303  MI.eraseFromParent();
3304  return true;
3305 }
3306 
3309  MachineIRBuilder &B) const {
3310  Register Res = MI.getOperand(0).getReg();
3311  Register X = MI.getOperand(1).getReg();
3312  Register Y = MI.getOperand(2).getReg();
3313  uint16_t Flags = MI.getFlags();
3314  LLT ResTy = MRI.getType(Res);
3315 
3316  const MachineFunction &MF = B.getMF();
3317  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3318  MI.getFlag(MachineInstr::FmAfn);
3319 
3320  if (!AllowInaccurateRcp)
3321  return false;
3322 
3323  auto NegY = B.buildFNeg(ResTy, Y);
3324  auto One = B.buildFConstant(ResTy, 1.0);
3325 
3326  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3327  .addUse(Y)
3328  .setMIFlags(Flags);
3329 
3330  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3331  R = B.buildFMA(ResTy, Tmp0, R, R);
3332 
3333  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3334  R = B.buildFMA(ResTy, Tmp1, R, R);
3335 
3336  auto Ret = B.buildFMul(ResTy, X, R);
3337  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3338 
3339  B.buildFMA(Res, Tmp2, R, Ret);
3340  MI.eraseFromParent();
3341  return true;
3342 }
3343 
3346  MachineIRBuilder &B) const {
3347  if (legalizeFastUnsafeFDIV(MI, MRI, B))
3348  return true;
3349 
3350  Register Res = MI.getOperand(0).getReg();
3351  Register LHS = MI.getOperand(1).getReg();
3352  Register RHS = MI.getOperand(2).getReg();
3353 
3354  uint16_t Flags = MI.getFlags();
3355 
3356  LLT S16 = LLT::scalar(16);
3357  LLT S32 = LLT::scalar(32);
3358 
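  // f16 division is legalized by widening to f32: compute
  // fptrunc(fpext(lhs) * rcp(fpext(rhs))) and let div_fixup take care of the
  // special cases.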
3359  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3360  auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3361 
3362  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3363  .addUse(RHSExt.getReg(0))
3364  .setMIFlags(Flags);
3365 
3366  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3367  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3368 
3369  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3370  .addUse(RDst.getReg(0))
3371  .addUse(RHS)
3372  .addUse(LHS)
3373  .setMIFlags(Flags);
3374 
3375  MI.eraseFromParent();
3376  return true;
3377 }
3378 
3379 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3380 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
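// On targets with S_DENORM_MODE the immediate packs the FP32 denorm mode in
// bits [1:0] and the FP64/FP16 mode in bits [3:2]; older targets instead write
// the FP32 denorm field of the MODE register with S_SETREG.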
3381 static void toggleSPDenormMode(bool Enable,
3382  MachineIRBuilder &B,
3383  const GCNSubtarget &ST,
3384  AMDGPU::SIModeRegisterDefaults Mode) {
3385  // Set SP denorm mode to this value.
3386  unsigned SPDenormMode =
3387  Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3388 
3389  if (ST.hasDenormModeInst()) {
3390  // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3391  uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3392 
3393  uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3394  B.buildInstr(AMDGPU::S_DENORM_MODE)
3395  .addImm(NewDenormModeValue);
3396 
3397  } else {
3398  // Select FP32 bit field in mode register.
3399  unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3402 
3403  B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3404  .addImm(SPDenormMode)
3405  .addImm(SPDenormModeBitField);
3406  }
3407 }
3408 
3409 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3410  MachineRegisterInfo &MRI,
3411  MachineIRBuilder &B) const {
3412  if (legalizeFastUnsafeFDIV(MI, MRI, B))
3413  return true;
3414 
3415  Register Res = MI.getOperand(0).getReg();
3416  Register LHS = MI.getOperand(1).getReg();
3417  Register RHS = MI.getOperand(2).getReg();
3418  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3419  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3420 
3421  uint16_t Flags = MI.getFlags();
3422 
3423  LLT S32 = LLT::scalar(32);
3424  LLT S1 = LLT::scalar(1);
3425 
3426  auto One = B.buildFConstant(S32, 1.0f);
3427 
3428  auto DenominatorScaled =
3429  B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3430  .addUse(LHS)
3431  .addUse(RHS)
3432  .addImm(0)
3433  .setMIFlags(Flags);
3434  auto NumeratorScaled =
3435  B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3436  .addUse(LHS)
3437  .addUse(RHS)
3438  .addImm(1)
3439  .setMIFlags(Flags);
3440 
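  // Standard f32 division expansion: div_scale pre-scales the operands into a
  // safe range, an initial rcp estimate is refined with a chain of FMAs,
  // div_fmas consumes the scale flag produced by div_scale, and div_fixup
  // handles the special cases.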
3441  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3442  .addUse(DenominatorScaled.getReg(0))
3443  .setMIFlags(Flags);
3444  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3445 
3446  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3447  // aren't modeled as reading it.
3448  if (!Mode.allFP32Denormals())
3449  toggleSPDenormMode(true, B, ST, Mode);
3450 
3451  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3452  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3453  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3454  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3455  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3456  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3457 
3458  if (!Mode.allFP32Denormals())
3459  toggleSPDenormMode(false, B, ST, Mode);
3460 
3461  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3462  .addUse(Fma4.getReg(0))
3463  .addUse(Fma1.getReg(0))
3464  .addUse(Fma3.getReg(0))
3465  .addUse(NumeratorScaled.getReg(1))
3466  .setMIFlags(Flags);
3467 
3468  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3469  .addUse(Fmas.getReg(0))
3470  .addUse(RHS)
3471  .addUse(LHS)
3472  .setMIFlags(Flags);
3473 
3474  MI.eraseFromParent();
3475  return true;
3476 }
3477 
3478 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3479  MachineRegisterInfo &MRI,
3480  MachineIRBuilder &B) const {
3481  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3482  return true;
3483 
3484  Register Res = MI.getOperand(0).getReg();
3485  Register LHS = MI.getOperand(1).getReg();
3486  Register RHS = MI.getOperand(2).getReg();
3487 
3488  uint16_t Flags = MI.getFlags();
3489 
3490  LLT S64 = LLT::scalar(64);
3491  LLT S1 = LLT::scalar(1);
3492 
3493  auto One = B.buildFConstant(S64, 1.0);
3494 
3495  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3496  .addUse(LHS)
3497  .addUse(RHS)
3498  .addImm(0)
3499  .setMIFlags(Flags);
3500 
3501  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3502 
3503  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3504  .addUse(DivScale0.getReg(0))
3505  .setMIFlags(Flags);
3506 
3507  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3508  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3509  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3510 
3511  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3512  .addUse(LHS)
3513  .addUse(RHS)
3514  .addImm(1)
3515  .setMIFlags(Flags);
3516 
3517  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3518  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3519  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3520 
3521  Register Scale;
3522  if (!ST.hasUsableDivScaleConditionOutput()) {
3523  // Workaround a hardware bug on SI where the condition output from div_scale
3524  // is not usable.
3525 
3526  LLT S32 = LLT::scalar(32);
3527 
3528  auto NumUnmerge = B.buildUnmerge(S32, LHS);
3529  auto DenUnmerge = B.buildUnmerge(S32, RHS);
3530  auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3531  auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3532 
3533  auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3534  Scale1Unmerge.getReg(1));
3535  auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3536  Scale0Unmerge.getReg(1));
3537  Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3538  } else {
3539  Scale = DivScale1.getReg(1);
3540  }
3541 
3542  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3543  .addUse(Fma4.getReg(0))
3544  .addUse(Fma3.getReg(0))
3545  .addUse(Mul.getReg(0))
3546  .addUse(Scale)
3547  .setMIFlags(Flags);
3548 
3549  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3550  .addUse(Fmas.getReg(0))
3551  .addUse(RHS)
3552  .addUse(LHS)
3553  .setMIFlags(Flags);
3554 
3555  MI.eraseFromParent();
3556  return true;
3557 }
3558 
3559 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3560  MachineRegisterInfo &MRI,
3561  MachineIRBuilder &B) const {
3562  Register Res = MI.getOperand(0).getReg();
3563  Register LHS = MI.getOperand(2).getReg();
3564  Register RHS = MI.getOperand(3).getReg();
3565  uint16_t Flags = MI.getFlags();
3566 
3567  LLT S32 = LLT::scalar(32);
3568  LLT S1 = LLT::scalar(1);
3569 
3570  auto Abs = B.buildFAbs(S32, RHS, Flags);
3571  const APFloat C0Val(1.0f);
3572 
3573  auto C0 = B.buildConstant(S32, 0x6f800000);
3574  auto C1 = B.buildConstant(S32, 0x2f800000);
3575  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
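  // 0x6f800000 is 2^+96 and 0x2f800000 is 2^-32 as f32 bit patterns: when |rhs|
  // is very large, pre-scale it by 2^-32 before the rcp and scale the final
  // product by the same factor, so the result is Sel * (lhs * rcp(Sel * rhs)).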
3576 
3577  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3578  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3579 
3580  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3581 
3582  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3583  .addUse(Mul0.getReg(0))
3584  .setMIFlags(Flags);
3585 
3586  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3587 
3588  B.buildFMul(Res, Sel, Mul1, Flags);
3589 
3590  MI.eraseFromParent();
3591  return true;
3592 }
3593 
3594 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3595 // FIXME: Why do we handle this one but not other removed instructions?
3596 //
3597 // Reciprocal square root. The clamp prevents infinite results, clamping
3598 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3599 // +-max_float.
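// The expansion below computes rsq(x) and clamps it to +/-max_float with
// fminnum/fmaxnum, using the IEEE variants when the function runs in IEEE mode
// so the result selects directly.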
3600 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3601  MachineRegisterInfo &MRI,
3602  MachineIRBuilder &B) const {
3603  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3604  return true;
3605 
3606  Register Dst = MI.getOperand(0).getReg();
3607  Register Src = MI.getOperand(2).getReg();
3608  auto Flags = MI.getFlags();
3609 
3610  LLT Ty = MRI.getType(Dst);
3611 
3612  const fltSemantics *FltSemantics;
3613  if (Ty == LLT::scalar(32))
3614  FltSemantics = &APFloat::IEEEsingle();
3615  else if (Ty == LLT::scalar(64))
3616  FltSemantics = &APFloat::IEEEdouble();
3617  else
3618  return false;
3619 
3620  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3621  .addUse(Src)
3622  .setMIFlags(Flags);
3623 
3624  // We don't need to concern ourselves with the sNaN handling difference here,
3625  // since rsq has already quieted (or not) any NaN; use the variant that will directly select.
3626  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3627  const bool UseIEEE = MFI->getMode().IEEE;
3628 
3629  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3630  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3631  B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3632 
3633  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3634 
3635  if (UseIEEE)
3636  B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3637  else
3638  B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3639  MI.eraseFromParent();
3640  return true;
3641 }
3642 
3643 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3644  switch (IID) {
3645  case Intrinsic::amdgcn_ds_fadd:
3646  return AMDGPU::G_ATOMICRMW_FADD;
3647  case Intrinsic::amdgcn_ds_fmin:
3648  return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3649  case Intrinsic::amdgcn_ds_fmax:
3650  return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3651  default:
3652  llvm_unreachable("not a DS FP intrinsic");
3653  }
3654 }
3655 
3656 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3657  MachineInstr &MI,
3658  Intrinsic::ID IID) const {
3659  GISelChangeObserver &Observer = Helper.Observer;
3660  Observer.changingInstr(MI);
3661 
3662  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3663 
3664  // The remaining operands were used to set fields in the MemOperand on
3665  // construction.
3666  for (int I = 6; I > 3; --I)
3667  MI.RemoveOperand(I);
3668 
3669  MI.RemoveOperand(1); // Remove the intrinsic ID.
3670  Observer.changedInstr(MI);
3671  return true;
3672 }
3673 
3674 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3675  MachineRegisterInfo &MRI,
3676  MachineIRBuilder &B) const {
3677  uint64_t Offset =
3678  ST.getTargetLowering()->getImplicitParameterOffset(
3679  B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3680  LLT DstTy = MRI.getType(DstReg);
3681  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3682 
3683  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3684  if (!loadInputValue(KernargPtrReg, B,
3685  AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3686  return false;
3687 
3688  // FIXME: This should be nuw
3689  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3690  return true;
3691 }
3692 
3693 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3694  MachineRegisterInfo &MRI,
3695  MachineIRBuilder &B) const {
3696  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3697  if (!MFI->isEntryFunction()) {
3698  return legalizePreloadedArgIntrin(MI, MRI, B,
3699  AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3700  }
3701 
3702  Register DstReg = MI.getOperand(0).getReg();
3703  if (!getImplicitArgPtr(DstReg, MRI, B))
3704  return false;
3705 
3706  MI.eraseFromParent();
3707  return true;
3708 }
3709 
3710 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3711  MachineRegisterInfo &MRI,
3712  MachineIRBuilder &B,
3713  unsigned AddrSpace) const {
3714  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3715  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3716  Register Hi32 = Unmerge.getReg(1);
3717 
3718  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3719  MI.eraseFromParent();
3720  return true;
3721 }
3722 
3723 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3724 // offset (the offset that is included in bounds checking and swizzling, to be
3725 // split between the instruction's voffset and immoffset fields) and soffset
3726 // (the offset that is excluded from bounds checking and swizzling, to go in
3727 // the instruction's soffset field). This function takes the first kind of
3728 // offset and figures out how to split it between voffset and immoffset.
3729 std::pair<Register, unsigned>
3730 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3731  Register OrigOffset) const {
3732  const unsigned MaxImm = 4095;
3733  Register BaseReg;
3734  unsigned ImmOffset;
3735  const LLT S32 = LLT::scalar(32);
3736  MachineRegisterInfo &MRI = *B.getMRI();
3737 
3738  std::tie(BaseReg, ImmOffset) =
3739  AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3740 
3741  // If BaseReg is a pointer, convert it to int.
3742  if (MRI.getType(BaseReg).isPointer())
3743  BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3744 
3745  // If the immediate value is too big for the immoffset field, put the value
3746  // and -4096 into the immoffset field so that the value that is copied/added
3747  // for the voffset field is a multiple of 4096, and it stands more chance
3748  // of being CSEd with the copy/add for another similar load/store.
3749  // However, do not do that rounding down to a multiple of 4096 if that is a
3750  // negative number, as it appears to be illegal to have a negative offset
3751  // in the vgpr, even if adding the immediate offset makes it positive.
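  // For example, a constant offset of 4160 (0x1040) is split into Overflow =
  // 4096, which is added into the voffset register, and ImmOffset = 64.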
3752  unsigned Overflow = ImmOffset & ~MaxImm;
3753  ImmOffset -= Overflow;
3754  if ((int32_t)Overflow < 0) {
3755  Overflow += ImmOffset;
3756  ImmOffset = 0;
3757  }
3758 
3759  if (Overflow != 0) {
3760  if (!BaseReg) {
3761  BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3762  } else {
3763  auto OverflowVal = B.buildConstant(S32, Overflow);
3764  BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3765  }
3766  }
3767 
3768  if (!BaseReg)
3769  BaseReg = B.buildConstant(S32, 0).getReg(0);
3770 
3771  return std::make_pair(BaseReg, ImmOffset);
3772 }
3773 
3774 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3775 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3776  Register VOffset, Register SOffset,
3777  unsigned ImmOffset, Register VIndex,
3778  MachineRegisterInfo &MRI) const {
3779  Optional<ValueAndVReg> MaybeVOffsetVal =
3781  Optional<ValueAndVReg> MaybeSOffsetVal =
3783  Optional<ValueAndVReg> MaybeVIndexVal =
3785  // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3786  // update the MMO with that offset. The stride is unknown so we can only do
3787  // this if VIndex is constant 0.
3788  if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3789  MaybeVIndexVal->Value == 0) {
3790  uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3791  MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3792  MMO->setOffset(TotalOffset);
3793  } else {
3794  // We don't have a constant combined offset to use in the MMO. Give up.
3795  MMO->setValue((Value *)nullptr);
3796  }
3797 }
3798 
3799 /// Handle register layout difference for f16 images for some subtargets.
3800 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3801  MachineRegisterInfo &MRI,
3802  Register Reg,
3803  bool ImageStore) const {
3804  const LLT S16 = LLT::scalar(16);
3805  const LLT S32 = LLT::scalar(32);
3806  LLT StoreVT = MRI.getType(Reg);
3807  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3808 
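  // On subtargets with unpacked D16 memory operations each 16-bit element is
  // any-extended into the low half of its own 32-bit register; packed
  // subtargets keep <N x s16>, padded out when the image-store D16 workaround
  // below applies.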
3809  if (ST.hasUnpackedD16VMem()) {
3810  auto Unmerge = B.buildUnmerge(S16, Reg);
3811 
3812  SmallVector<Register, 4> WideRegs;
3813  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3814  WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3815 
3816  int NumElts = StoreVT.getNumElements();
3817 
3818  return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3819  .getReg(0);
3820  }
3821 
3822  if (ImageStore && ST.hasImageStoreD16Bug()) {
3823  if (StoreVT.getNumElements() == 2) {
3824  SmallVector<Register, 4> PackedRegs;
3825  Reg = B.buildBitcast(S32, Reg).getReg(0);
3826  PackedRegs.push_back(Reg);
3827  PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3828  return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3829  .getReg(0);
3830  }
3831 
3832  if (StoreVT.getNumElements() == 3) {
3833  SmallVector<Register, 4> PackedRegs;
3834  auto Unmerge = B.buildUnmerge(S16, Reg);
3835  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3836  PackedRegs.push_back(Unmerge.getReg(I));
3837  PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3838  Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3839  return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3840  }
3841 
3842  if (StoreVT.getNumElements() == 4) {
3843  SmallVector<Register, 4> PackedRegs;
3844  Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3845  auto Unmerge = B.buildUnmerge(S32, Reg);
3846  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3847  PackedRegs.push_back(Unmerge.getReg(I));
3848  PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3849  return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3850  .getReg(0);
3851  }
3852 
3853  llvm_unreachable("invalid data type");
3854  }
3855 
3856  return Reg;
3857 }
3858 
3859 Register AMDGPULegalizerInfo::fixStoreSourceType(
3860  MachineIRBuilder &B, Register VData, bool IsFormat) const {
3861  MachineRegisterInfo *MRI = B.getMRI();
3862  LLT Ty = MRI->getType(VData);
3863 
3864  const LLT S16 = LLT::scalar(16);
3865 
3866  // Fix up illegal register types for i8 and i16 stores by extending to i32.
3867  if (Ty == LLT::scalar(8) || Ty == S16) {
3868  Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3869  return AnyExt;
3870  }
3871 
3872  if (Ty.isVector()) {
3873  if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3874  if (IsFormat)
3875  return handleD16VData(B, *MRI, VData);
3876  }
3877  }
3878 
3879  return VData;
3880 }
3881 
3882 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3883  MachineRegisterInfo &MRI,
3884  MachineIRBuilder &B,
3885  bool IsTyped,
3886  bool IsFormat) const {
3887  Register VData = MI.getOperand(1).getReg();
3888  LLT Ty = MRI.getType(VData);
3889  LLT EltTy = Ty.getScalarType();
3890  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3891  const LLT S32 = LLT::scalar(32);
3892 
3893  VData = fixStoreSourceType(B, VData, IsFormat);
3894  Register RSrc = MI.getOperand(2).getReg();
3895 
3896  MachineMemOperand *MMO = *MI.memoperands_begin();
3897  const int MemSize = MMO->getSize();
3898 
3899  unsigned ImmOffset;
3900 
3901  // The typed intrinsics add an immediate after the registers.
3902  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
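  // Counting the intrinsic ID operand, a struct tbuffer store has 8 operands
  // and a struct buffer store has 7; the raw forms have one fewer since they
  // lack vindex, which is how the operand count tells raw from struct below.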
3903 
3904  // The struct intrinsic variants add one additional operand over raw.
3905  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3906  Register VIndex;
3907  int OpOffset = 0;
3908  if (HasVIndex) {
3909  VIndex = MI.getOperand(3).getReg();
3910  OpOffset = 1;
3911  } else {
3912  VIndex = B.buildConstant(S32, 0).getReg(0);
3913  }
3914 
3915  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3916  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3917 
3918  unsigned Format = 0;
3919  if (IsTyped) {
3920  Format = MI.getOperand(5 + OpOffset).getImm();
3921  ++OpOffset;
3922  }
3923 
3924  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3925 
3926  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3927  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3928 
3929  unsigned Opc;
3930  if (IsTyped) {
3931  Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3932  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3933  } else if (IsFormat) {
3934  Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3935  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3936  } else {
3937  switch (MemSize) {
3938  case 1:
3939  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3940  break;
3941  case 2:
3942  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3943  break;
3944  default:
3945  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3946  break;
3947  }
3948  }
3949 
3950  auto MIB = B.buildInstr(Opc)
3951  .addUse(VData) // vdata
3952  .addUse(RSrc) // rsrc
3953  .addUse(VIndex) // vindex
3954  .addUse(VOffset) // voffset
3955  .addUse(SOffset) // soffset
3956  .addImm(ImmOffset); // offset(imm)
3957 
3958  if (IsTyped)
3959  MIB.addImm(Format);
3960 
3961  MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3962  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3963  .addMemOperand(MMO);
3964 
3965  MI.eraseFromParent();
3966  return true;
3967 }
3968 
3969 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3970  MachineRegisterInfo &MRI,
3971  MachineIRBuilder &B,
3972  bool IsFormat,
3973  bool IsTyped) const {
3974  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3975  MachineMemOperand *MMO = *MI.memoperands_begin();
3976  const LLT MemTy = MMO->getMemoryType();
3977  const LLT S32 = LLT::scalar(32);
3978 
3979  Register Dst = MI.getOperand(0).getReg();
3980  Register RSrc = MI.getOperand(2).getReg();
3981 
3982  // The typed intrinsics add an immediate after the registers.
3983  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3984 
3985  // The struct intrinsic variants add one additional operand over raw.
3986  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3987  Register VIndex;
3988  int OpOffset = 0;
3989  if (HasVIndex) {
3990  VIndex = MI.getOperand(3).getReg();
3991  OpOffset = 1;
3992  } else {
3993  VIndex = B.buildConstant(S32, 0).getReg(0);
3994  }
3995 
3996  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3997  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3998 
3999  unsigned Format = 0;
4000  if (IsTyped) {
4001  Format = MI.getOperand(5 + OpOffset).getImm();
4002  ++OpOffset;
4003  }
4004 
4005  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4006  unsigned ImmOffset;
4007 
4008  LLT Ty = MRI.getType(Dst);
4009  LLT EltTy = Ty.getScalarType();
4010  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4011  const bool Unpacked = ST.hasUnpackedD16VMem();
4012 
4013  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4014  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4015 
4016  unsigned Opc;
4017 
4018  if (IsTyped) {
4019  Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4020  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4021  } else if (IsFormat) {
4022  Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4023  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4024  } else {
4025  switch (MemTy.getSizeInBits()) {
4026  case 8:
4027  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4028  break;
4029  case 16:
4030  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4031  break;
4032  default:
4033  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4034  break;
4035  }
4036  }
4037 
4038  Register LoadDstReg;
4039 
4040  bool IsExtLoad =
4041  (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4042  LLT UnpackedTy = Ty.changeElementSize(32);
4043 
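  // Sub-dword scalar loads go into a temporary s32 and are truncated
  // afterwards; unpacked D16 vector loads go into an <N x s32> temporary and
  // are repacked to <N x s16> below.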
4044  if (IsExtLoad)
4045  LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4046  else if (Unpacked && IsD16 && Ty.isVector())
4047  LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4048  else
4049  LoadDstReg = Dst;
4050 
4051  auto MIB = B.buildInstr(Opc)
4052  .addDef(LoadDstReg) // vdata
4053  .addUse(RSrc) // rsrc
4054  .addUse(VIndex) // vindex
4055  .addUse(VOffset) // voffset
4056  .addUse(SOffset) // soffset
4057  .addImm(ImmOffset); // offset(imm)
4058 
4059  if (IsTyped)
4060  MIB.addImm(Format);
4061 
4062  MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4063  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4064  .addMemOperand(MMO);
4065 
4066  if (LoadDstReg != Dst) {
4067  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4068 
4069  // The result for an extending load was widened to s32; truncate it back down.
4070  if (IsExtLoad)
4071  B.buildTrunc(Dst, LoadDstReg);
4072  else {
4073  // Repack to original 16-bit vector result
4074  // FIXME: G_TRUNC should work, but legalization currently fails
4075  auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4076  SmallVector<Register, 4> Repack;
4077  for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4078  Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4079  B.buildMerge(Dst, Repack);
4080  }
4081  }
4082 
4083  MI.eraseFromParent();
4084  return true;
4085 }
4086 
4087 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4088  MachineIRBuilder &B,
4089  bool IsInc) const {
4090  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4091  AMDGPU::G_AMDGPU_ATOMIC_DEC;
4092  B.buildInstr(Opc)
4093  .addDef(MI.getOperand(0).getReg())
4094  .addUse(MI.getOperand(2).getReg())
4095  .addUse(MI.getOperand(3).getReg())
4096  .cloneMemRefs(MI);
4097  MI.eraseFromParent();
4098  return true;
4099 }
4100 
4101 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4102  switch (IntrID) {
4103  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4104  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4105  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4106  case Intrinsic::amdgcn_raw_buffer_atomic_add:
4107  case Intrinsic::amdgcn_struct_buffer_atomic_add:
4108  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4109  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4110  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4111  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4112  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4113  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4114  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4115  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4116  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4117  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4118  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4119  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4120  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4121  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4122  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4123  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4124  case Intrinsic::amdgcn_raw_buffer_atomic_and:
4125  case Intrinsic::amdgcn_struct_buffer_atomic_and:
4126  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4127  case Intrinsic::amdgcn_raw_buffer_atomic_or:
4128  case Intrinsic::amdgcn_struct_buffer_atomic_or:
4129  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4130  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4131  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4132  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4133  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4134  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4135  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4136  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4137  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4138  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4139  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4140  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4141  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4142  case Intrinsic::amdgcn_buffer_atomic_fadd:
4143  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4144  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4145  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4146  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4147  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4148  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4149  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4150  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4151  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4152  default:
4153  llvm_unreachable("unhandled atomic opcode");
4154  }
4155 }
4156 
4157 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4158  MachineIRBuilder &B,
4159  Intrinsic::ID IID) const {
4160  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4161  IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4162  const bool HasReturn = MI.getNumExplicitDefs() != 0;
4163 
4164  Register Dst;
4165 
4166  int OpOffset = 0;
4167  if (HasReturn) {
4168  // A few FP atomics do not support return values.
4169  Dst = MI.getOperand(0).getReg();
4170  } else {
4171  OpOffset = -1;
4172  }
4173 
4174  Register VData = MI.getOperand(2 + OpOffset).getReg();
4175  Register CmpVal;
4176 
4177  if (IsCmpSwap) {
4178  CmpVal = MI.getOperand(3 + OpOffset).getReg();
4179  ++OpOffset;
4180  }
4181 
4182  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4183  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4184 
4185  // The struct intrinsic variants add one additional operand over raw.
4186  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4187  Register VIndex;
4188  if (HasVIndex) {
4189  VIndex = MI.getOperand(4 + OpOffset).getReg();
4190  ++OpOffset;
4191  } else {
4192  VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4193  }
4194 
4195  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4196  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4197  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4198 
4199  MachineMemOperand *MMO = *MI.memoperands_begin();
4200 
4201  unsigned ImmOffset;
4202  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4203  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4204 
4205  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4206 
4207  if (HasReturn)
4208  MIB.addDef(Dst);
4209 
4210  MIB.addUse(VData); // vdata
4211 
4212  if (IsCmpSwap)
4213  MIB.addReg(CmpVal);
4214 
4215  MIB.addUse(RSrc) // rsrc
4216  .addUse(VIndex) // vindex
4217  .addUse(VOffset) // voffset
4218  .addUse(SOffset) // soffset
4219  .addImm(ImmOffset) // offset(imm)
4220  .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4221  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4222  .addMemOperand(MMO);
4223 
4224  MI.eraseFromParent();
4225  return true;
4226 }
4227 
4228 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4229 /// vector with s16 typed elements.
4230 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4231  SmallVectorImpl<Register> &PackedAddrs,
4232  unsigned ArgOffset,
4233  const AMDGPU::ImageDimIntrinsicInfo *Intr,
4234  bool IsA16, bool IsG16) {
4235  const LLT S16 = LLT::scalar(16);
4236  const LLT V2S16 = LLT::fixed_vector(2, 16);
4237  auto EndIdx = Intr->VAddrEnd;
4238 
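  // For example, with A16 a 2D sample's (u, v) coordinates end up in a single
  // <2 x s16> register, while a lone trailing coordinate is paired with undef.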
4239  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4240  MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4241  if (!SrcOp.isReg())
4242  continue; // _L to _LZ may have eliminated this.
4243 
4244  Register AddrReg = SrcOp.getReg();
4245 
4246  if ((I < Intr->GradientStart) ||
4247  (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4248  (I >= Intr->CoordStart && !IsA16)) {
4249  // Handle any gradient or coordinate operands that should not be packed
4250  AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4251  PackedAddrs.push_back(AddrReg);
4252  } else {
4253  // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4254  // derivatives dx/dh and dx/dv are packed with undef.
4255  if (((I + 1) >= EndIdx) ||
4256  ((Intr->NumGradients / 2) % 2 == 1 &&
4257  (I == static_cast<unsigned>(Intr->GradientStart +
4258  (Intr->NumGradients / 2) - 1) ||
4259  I == static_cast<unsigned>(Intr->GradientStart +
4260  Intr->NumGradients - 1))) ||
4261  // Check for _L to _LZ optimization
4262  !MI.getOperand(ArgOffset + I + 1).isReg()) {
4263  PackedAddrs.push_back(
4264  B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4265  .getReg(0));
4266  } else {
4267  PackedAddrs.push_back(
4268  B.buildBuildVector(
4269  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4270  .getReg(0));
4271  ++I;
4272  }
4273  }
4274  }
4275 }
4276 
4277 /// Convert from separate vaddr components to a single vector address register,
4278 /// and replace the remaining operands with $noreg.
4279 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4280  int DimIdx, int NumVAddrs) {
4281  const LLT S32 = LLT::scalar(32);
4282 
4283  SmallVector<Register, 8> AddrRegs;
4284  for (int I = 0; I != NumVAddrs; ++I) {
4285  MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4286  if (SrcOp.isReg()) {
4287  AddrRegs.push_back(SrcOp.getReg());
4288  assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4289  }
4290  }
4291 
4292  int NumAddrRegs = AddrRegs.size();
4293  if (NumAddrRegs != 1) {
4294  // Above 8 elements, round up to the next power of 2 (i.e. 16).
4295  if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4296  const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4297  auto Undef = B.buildUndef(S32);
4298  AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4299  NumAddrRegs = RoundedNumRegs;
4300  }
4301 
4302  auto VAddr =
4303  B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4304  MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4305  }
4306 
4307  for (int I = 1; I != NumVAddrs; ++I) {
4308  MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4309  if (SrcOp.isReg())
4310  MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4311  }
4312 }
4313 
4314 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4315 ///
4316 /// Depending on the subtarget, load/store with 16-bit element data need to be
4317 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4318 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4319 /// registers.
4320 ///
4321 /// We don't want to directly select image instructions just yet, but also want
4322 /// to expose all register repacking to the legalizer/combiners. We also don't
4323 /// want a selected instruction entering RegBankSelect. In order to avoid
4324 /// defining a multitude of intermediate image instructions, directly hack on
4325 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4326 /// now unnecessary arguments with $noreg.
4327 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4328  MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4329  const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4330 
4331  const unsigned NumDefs = MI.getNumExplicitDefs();
4332  const unsigned ArgOffset = NumDefs + 1;
4333  bool IsTFE = NumDefs == 2;
4334  // We are only processing the operands of d16 image operations on subtargets
4335  // that use the unpacked register layout, or need to repack the TFE result.
4336 
4337  // TODO: Do we need to guard against already legalized intrinsics?
4338  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4339  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4340 
4341  MachineRegisterInfo *MRI = B.getMRI();
4342  const LLT S32 = LLT::scalar(32);
4343  const LLT S16 = LLT::scalar(16);
4344  const LLT V2S16 = LLT::fixed_vector(2, 16);
4345 
4346  unsigned DMask = 0;
4347 
4348  // Check for 16 bit addresses and pack if true.
4349  LLT GradTy =
4350  MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4351  LLT AddrTy =
4352  MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4353  const bool IsG16 = GradTy == S16;
4354  const bool IsA16 = AddrTy == S16;
4355 
4356  int DMaskLanes = 0;
4357  if (!BaseOpcode->Atomic) {
4358  DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4359  if (BaseOpcode->Gather4) {
4360  DMaskLanes = 4;
4361  } else if (DMask != 0) {
4362  DMaskLanes = countPopulation(DMask);
4363  } else if (!IsTFE && !BaseOpcode->Store) {
4364  // If dmask is 0, this is a no-op load. This can be eliminated.
4365  B.buildUndef(MI.getOperand(0));
4366  MI.eraseFromParent();
4367  return true;
4368  }
4369  }
4370 
4371  Observer.changingInstr(MI);
4372  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4373 
4374  unsigned NewOpcode = NumDefs == 0 ?
4375  AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4376 
4377  // Track that we legalized this
4378  MI.setDesc(B.getTII().get(NewOpcode));
4379 
4380  // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
4381  // to be at least 1, otherwise the instruction will fail.
4382  if (IsTFE && DMask == 0) {
4383  DMask = 0x1;
4384  DMaskLanes = 1;
4385  MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4386  }
4387 
4388  if (BaseOpcode->Atomic) {
4389  Register VData0 = MI.getOperand(2).getReg();
4390  LLT Ty = MRI->getType(VData0);
4391 
4392  // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4393  if (Ty.isVector())
4394  return false;
4395 
4396  if (BaseOpcode->AtomicX2) {
4397  Register VData1 = MI.getOperand(3).getReg();
4398  // The two values are packed in one register.
4399  LLT PackedTy = LLT::fixed_vector(2, Ty);
4400  auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4401  MI.getOperand(2).setReg(Concat.getReg(0));
4402  MI.getOperand(3).setReg(AMDGPU::NoRegister);
4403  }
4404  }
4405 
4406  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4407 
4408  // Optimize _L to _LZ when _L is zero
4409  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4410  AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4411  const ConstantFP *ConstantLod;
4412 
4413  if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4414  m_GFCst(ConstantLod))) {
4415  if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4416  // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4417  const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4418  AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
4419  Intr->Dim);
4420 
4421  // The starting indexes should remain in the same place.
4422  --CorrectedNumVAddrs;
4423 
4424  MI.getOperand(MI.getNumExplicitDefs())
4425  .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4426  MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4427  Intr = NewImageDimIntr;
4428  }
4429  }
4430  }
4431 
4432  // Optimize _mip away, when 'lod' is zero
4433  if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4434  int64_t ConstantLod;
4435  if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4436  m_ICst(ConstantLod))) {
4437  if (ConstantLod == 0) {
4438  // TODO: Change the intrinsic opcode and remove the operand instead of
4439  // replacing it with 0, the way the _L to _LZ handling above does.
4440  MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4441  --CorrectedNumVAddrs;
4442  }
4443  }
4444  }
4445 
4446  // Rewrite the addressing register layout before doing anything else.
4447  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4448  // 16 bit gradients are supported, but are tied to the A16 control
4449  // so both gradients and addresses must be 16 bit
4450  return false;
4451  }
4452 
4453  if (IsA16 && !ST.hasA16()) {
4454  // A16 not supported
4455  return false;
4456  }
4457 
4458  if (IsA16 || IsG16) {
4459  if (Intr->NumVAddrs > 1) {
4460  SmallVector<Register, 4> PackedRegs;
4461 
4462  packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4463  IsG16);
4464 
4465  // See also below in the non-a16 branch
4466  const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4467  PackedRegs.size() <= ST.getNSAMaxSize();
4468 
4469  if (!UseNSA && PackedRegs.size() > 1) {
4470  LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4471  auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4472  PackedRegs[0] = Concat.getReg(0);
4473  PackedRegs.resize(1);
4474  }
4475 
4476  const unsigned NumPacked = PackedRegs.size();
4477  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4478  MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4479  if (!SrcOp.isReg()) {
4480  assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4481  continue;
4482  }
4483 
4484  assert(SrcOp.getReg() != AMDGPU::NoRegister);
4485 
4486  if (I - Intr->VAddrStart < NumPacked)
4487  SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4488  else
4489  SrcOp.setReg(AMDGPU::NoRegister);
4490  }
4491  }
4492  } else {
4493  // If the register allocator cannot place the address registers contiguously
4494  // without introducing moves, then using the non-sequential address encoding
4495  // is always preferable, since it saves VALU instructions and is usually a
4496  // wash in terms of code size or even better.
4497  //
4498  // However, we currently have no way of hinting to the register allocator
4499  // that MIMG addresses should be placed contiguously when it is possible to
4500  // do so, so force non-NSA for the common 2-address case as a heuristic.
4501  //
4502  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4503  // allocation when possible.
4504  const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4505  CorrectedNumVAddrs <= ST.getNSAMaxSize();
4506 
4507  if (!UseNSA && Intr->NumVAddrs > 1)
4508  convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4509  Intr->NumVAddrs);
4510  }
4511 
4512  int Flags = 0;
4513  if (IsA16)
4514  Flags |= 1;
4515  if (IsG16)
4516  Flags |= 2;
4517  MI.addOperand(MachineOperand::CreateImm(Flags));
4518 
4519  if (BaseOpcode->Store) { // No TFE for stores?
4520  // TODO: Handle dmask trim
4521  Register VData = MI.getOperand(1).getReg();
4522  LLT Ty = MRI->getType(VData);
4523  if (!Ty.isVector() || Ty.getElementType() != S16)
4524  return true;
4525 
4526  Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4527  if (RepackedReg != VData) {
4528  MI.getOperand(1).setReg(RepackedReg);
4529  }
4530 
4531  return true;
4532  }
4533 
4534  Register DstReg = MI.getOperand(0).getReg();
4535  LLT Ty = MRI->getType(DstReg);
4536  const LLT EltTy = Ty.getScalarType();
4537  const bool IsD16 = Ty.getScalarType() == S16;
4538  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4539 
4540  // Confirm that the return type is large enough for the dmask specified
4541  if (NumElts < DMaskLanes)
4542  return false;
4543 
4544  if (NumElts > 4 || DMaskLanes > 4)
4545  return false;
4546 
4547  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4548  const LLT AdjustedTy =
4549  Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4550 
4551  // The raw dword aligned data component of the load. The only legal cases
4552  // where this matters should be when using the packed D16 format, for
4553  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4554  LLT RoundedTy;
4555 
4556  // S32 vector to cover all data, plus the TFE result element.
4557  LLT TFETy;
4558 
4559  // Register type to use for each loaded component. Will be S32 or V2S16.
4560  LLT RegTy;
4561 
4562  if (IsD16 && ST.hasUnpackedD16VMem()) {
4563  RoundedTy =
4564  LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4565  TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4566  RegTy = S32;
4567  } else {
4568  unsigned EltSize = EltTy.getSizeInBits();
4569  unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4570  unsigned RoundedSize = 32 * RoundedElts;
4571  RoundedTy = LLT::scalarOrVector(
4572  ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4573  TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4574  RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4575  }
4576 
4577  // The return type does not need adjustment.
4578  // TODO: Should we change s16 case to s32 or <2 x s16>?
4579  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4580  return true;
4581 
4582  Register Dst1Reg;
4583 
4584  // Insert after the instruction.
4585  B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4586 
4587  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4588  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4589  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4590  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4591 
4592  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4593 
4594  MI.getOperand(0).setReg(NewResultReg);
4595 
4596  // In the IR, TFE is supposed to be used with a 2 element struct return
4597  // type. The instruction really returns these two values in one contiguous
4598  // register, with one additional dword beyond the loaded data. Rewrite the
4599  // return type to use a single register result.
4600 
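  // For example, a TFE load declared as {<4 x s32>, i32} in the IR becomes a
  // single <5 x s32> definition here and is split back apart below.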
4601  if (IsTFE) {
4602  Dst1Reg = MI.getOperand(1).getReg();
4603  if (MRI->getType(Dst1Reg) != S32)
4604  return false;
4605 
4606  // TODO: Make sure the TFE operand bit is set.
4607  MI.RemoveOperand(1);
4608 
4609  // Handle the easy case that requires no repack instructions.
4610  if (Ty == S32) {
4611  B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4612  return true;
4613  }
4614  }
4615 
4616  // Now figure out how to copy the new result register back into the old
4617  // result.
4618  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4619 
4620  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4621 
4622  if (ResultNumRegs == 1) {
4623  assert(!IsTFE);
4624  ResultRegs[0] = NewResultReg;
4625  } else {
4626  // We have to repack into a new vector of some kind.
4627  for (int I = 0; I != NumDataRegs; ++I)
4628  ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4629  B.buildUnmerge(ResultRegs, NewResultReg);
4630 
4631  // Drop the final TFE element to get the data part. The TFE result is
4632  // directly written to the right place already.
4633  if (IsTFE)
4634  ResultRegs.resize(NumDataRegs);
4635  }
4636 
4637  // For an s16 scalar result, we form an s32 result with a truncate regardless
4638  // of packed vs. unpacked.
4639  if (IsD16 && !Ty.isVector()) {
4640  B.buildTrunc(DstReg, ResultRegs[0]);
4641  return true;
4642  }
4643 
4644  // Avoid a build/concat_vector of 1 entry.
4645  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4646  B.buildBitcast(DstReg, ResultRegs[0]);
4647  return true;
4648  }
4649 
4650  assert(Ty.isVector());
4651 
4652  if (IsD16) {
4653  // For packed D16 results with TFE enabled, all the data components are
4654  // S32. Cast back to the expected type.
4655  //
4656  // TODO: We don't really need to load s32 elements. We would only need one
4657  // cast for the TFE result if a multiple of v2s16 was used.
4658  if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4659  for (Register &Reg : ResultRegs)
4660  Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4661  } else if (ST.hasUnpackedD16VMem()) {
4662  for (Register &Reg : ResultRegs)
4663  Reg = B.buildTrunc(S16, Reg).getReg(0);
4664  }
4665  }
4666 
4667  auto padWithUndef = [&](LLT Ty, int NumElts) {
4668  if (NumElts == 0)
4669  return;
4670  Register Undef = B.buildUndef(Ty).getReg(0);
4671  for (int I = 0; I != NumElts; ++I)
4672  ResultRegs.push_back(Undef);
4673  };
4674 
4675  // Pad out any elements eliminated due to the dmask.
4676  LLT ResTy = MRI->getType(ResultRegs[0]);
4677  if (!ResTy.isVector()) {
4678  padWithUndef(ResTy, NumElts - ResultRegs.size());
4679  B.buildBuildVector(DstReg, ResultRegs);
4680  return true;
4681  }
4682 
4683  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4684  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4685 
4686  // Deal with the one annoying legal case.
4687  const LLT V3S16 = LLT::fixed_vector(3, 16);
4688  if (Ty == V3S16) {
4689  padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4690  auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
4691  B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4692  return true;
4693  }
4694 
4695  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4696  B.buildConcatVectors(DstReg, ResultRegs);
4697  return true;
4698 }
4699 
4700 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4701  LegalizerHelper &Helper, MachineInstr &MI) const {
4702  MachineIRBuilder &B = Helper.MIRBuilder;
4703  GISelChangeObserver &Observer = Helper.Observer;
4704 
4705  Register Dst = MI.getOperand(0).getReg();
4706  LLT Ty = B.getMRI()->getType(Dst);
4707  unsigned Size = Ty.getSizeInBits();
4708  MachineFunction &MF = B.getMF();
4709 
4710  Observer.changingInstr(MI);
4711 
4712  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4713  Ty = getBitcastRegisterType(Ty);
4714  Helper.bitcastDst(MI, Ty, 0);
4715  Dst = MI.getOperand(0).getReg();
4716  B.setInsertPt(B.getMBB(), MI);
4717  }
4718 
4719  // FIXME: We don't really need this intermediate instruction. The intrinsic
4720  // should be fixed to have a memory operand. Since it's readnone, we're not
4721  // allowed to add one.
4722  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4723  MI.RemoveOperand(1); // Remove intrinsic ID
4724 
4725  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4726  // TODO: Should this use datalayout alignment?
4727  const unsigned MemSize = (Size + 7) / 8;
4728  const Align MemAlign(4);
4729  MachineMemOperand *MMO = MF.getMachineMemOperand(
4730  MachinePointerInfo(),
4731  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4732  MachineMemOperand::MOInvariant,
4733  MemSize, MemAlign);
4734  MI.addMemOperand(MF, MMO);
4735 
4736  // There are no 96-bit result scalar loads, but widening to 128-bit should
4737  // always be legal. We may need to restore this to a 96-bit result if it turns
4738  // out this needs to be converted to a vector load during RegBankSelect.
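  // For example, an s96 result is widened to s128 and a <3 x s32> result to
  // <4 x s32>.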
4739  if (!isPowerOf2_32(Size)) {
4740  if (Ty.isVector())
4741  Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4742  else
4743  Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4744  }
4745 
4746  Observer.changedInstr(MI);
4747  return true;
4748 }
4749 
4750 // TODO: Move to selection
4751 bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
4752  MachineRegisterInfo &MRI,
4753  MachineIRBuilder &B) const {
4754  if (!ST.isTrapHandlerEnabled() ||
4755  ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4756  return legalizeTrapEndpgm(MI, MRI, B);
4757 
4758  if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4759  switch (*HsaAbiVer) {
4760  case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4761  case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4762  return legalizeTrapHsaQueuePtr(MI, MRI, B);
4763  case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4764  return ST.supportsGetDoorbellID() ?
4765  legalizeTrapHsa(MI, MRI, B) :
4766  legalizeTrapHsaQueuePtr(MI, MRI, B);
4767  }
4768  }
4769 
4770  llvm_unreachable("Unknown trap handler");
4771 }
4772 
4773 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4774  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4775  B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4776  MI.eraseFromParent();
4777  return true;
4778 }
4779 
4780 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4781  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4782  // Pass queue pointer to trap handler as input, and insert trap instruction
4783  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4784  Register LiveIn =
4787  return false;
4788 
4789  Register SGPR01(AMDGPU::SGPR0_SGPR1);
4790  B.buildCopy(SGPR01, LiveIn);
4791  B.buildInstr(AMDGPU::S_TRAP)
4792  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4793  .addReg(SGPR01, RegState::Implicit);
4794 
4795  MI.eraseFromParent();
4796  return true;
4797 }
4798 
4799 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4800  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4801  B.buildInstr(AMDGPU::S_TRAP)
4802  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4803  MI.eraseFromParent();
4804  return true;
4805 }
4806 
4807 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4808  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4809  // If this is a non-HSA path or the trap handler is disabled, report a
4810  // warning accordingly.
4811  if (!ST.isTrapHandlerEnabled() ||
4812  ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4813  DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4814  "debugtrap handler not supported",
4815  MI.getDebugLoc(), DS_Warning);
4816  LLVMContext &Ctx = B.getMF().getFunction().getContext();
4817  Ctx.diagnose(NoTrap);
4818  } else {
4819  // Insert debug-trap instruction
4820  B.buildInstr(AMDGPU::S_TRAP)
4821  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4822  }
4823 
4824  MI.eraseFromParent();
4825  return true;
4826 }
4827 
4828 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4829  MachineIRBuilder &B) const {
4830  MachineRegisterInfo &MRI = *B.getMRI();
4831  const LLT S16 = LLT::scalar(16);
4832  const LLT S32 = LLT::scalar(32);
4833 
4834  Register DstReg = MI.getOperand(0).getReg();
4835  Register NodePtr = MI.getOperand(2).getReg();
4836  Register RayExtent = MI.getOperand(3).getReg();
4837  Register RayOrigin = MI.getOperand(4).getReg();
4838  Register RayDir = MI.getOperand(5).getReg();
4839  Register RayInvDir = MI.getOperand(6).getReg();
4840  Register TDescr = MI.getOperand(7).getReg();
4841 
4842  if (!ST.hasGFX10_AEncoding()) {
4843  DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4844  "intrinsic not supported on subtarget",
4845  MI.getDebugLoc());
4846  B.getMF().getFunction().getContext().diagnose(BadIntrin);
4847  return false;
4848  }
4849 
4850  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4851  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4852  const unsigned NumVDataDwords = 4;
4853  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
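  // Address dwords: node pointer (1 or 2), ray extent (1), ray origin (3), then
  // either 3 dwords of packed a16 dir + inv_dir or 3 + 3 full-precision dwords.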
4854  const bool UseNSA =
4855  ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4856  const unsigned BaseOpcodes[2][2] = {
4857  {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4858  {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4859  AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4860  int Opcode;
4861  if (UseNSA) {
4862  Opcode =
4863  AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4864  NumVDataDwords, NumVAddrDwords);
4865  } else {
4866  Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4867  AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4868  PowerOf2Ceil(NumVAddrDwords));
4869  }
4870  assert(Opcode != -1);
4871 
4872  SmallVector<Register, 12> Ops;
4873  if (Is64) {
4874  auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4875  Ops.push_back(Unmerge.getReg(0));
4876  Ops.push_back(Unmerge.getReg(1));
4877  } else {
4878  Ops.push_back(NodePtr);
4879  }
4880  Ops.push_back(RayExtent);
4881 
4882  auto packLanes = [&Ops, &S32, &B] (Register Src) {
4883  auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4884  Ops.push_back(Unmerge.getReg(0));
4885  Ops.push_back(Unmerge.getReg(1));
4886  Ops.push_back(Unmerge.getReg(2));
4887  };
4888 
4889  packLanes(RayOrigin);
4890  if (IsA16) {
4891  auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4892  auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4893  Register R1 = MRI.createGenericVirtualRegister(S32);
4894  Register R2 = MRI.createGenericVirtualRegister(S32);
4895  Register R3 = MRI.createGenericVirtualRegister(S32);
4896  B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4897  B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4898  B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4899  Ops.push_back(R1);
4900  Ops.push_back(R2);
4901  Ops.push_back(R3);
4902  } else {
4903  packLanes(RayDir);
4904  packLanes(RayInvDir);
4905  }
4906 
4907  if (!UseNSA) {
4908  // Build a single vector containing all the operands so far prepared.
4909  LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4910  Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4911  Ops.clear();
4912  Ops.push_back(MergedOps);
4913  }
4914 
4915  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4916  .addDef(DstReg)
4917  .addImm(Opcode);
4918 
4919  for (Register R : Ops) {
4920  MIB.addUse(R);
4921  }
4922 
4923  MIB.addUse(TDescr)
4924  .addImm(IsA16 ? 1 : 0)
4925  .cloneMemRefs(MI);
4926 
4927  MI.eraseFromParent();
4928  return true;
4929 }
4930 
4931 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4932  MachineInstr &MI) const {
4933  MachineIRBuilder &B = Helper.MIRBuilder;
4934  MachineRegisterInfo &MRI = *B.getMRI();
4935 
4936  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4937  auto IntrID = MI.getIntrinsicID();
4938  switch (IntrID) {
4939  case Intrinsic::amdgcn_if:
4940  case Intrinsic::amdgcn_else: {
4941  MachineInstr *Br = nullptr;
4942  MachineBasicBlock *UncondBrTarget = nullptr;
4943  bool Negated = false;
4944  if (MachineInstr *BrCond =
4945  verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4946  const SIRegisterInfo *TRI
4947  = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4948 
4949  Register Def = MI.getOperand(1).getReg();
4950  Register Use = MI.getOperand(3).getReg();
4951 
4952  MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4953 
4954  if (Negated)
4955  std::swap(CondBrTarget, UncondBrTarget);
4956 
4957  B.setInsertPt(B.getMBB(), BrCond->getIterator());
4958  if (IntrID == Intrinsic::amdgcn_if) {
4959  B.buildInstr(AMDGPU::SI_IF)
4960  .addDef(Def)
4961  .addUse(Use)
4962  .addMBB(UncondBrTarget);
4963  } else {
4964  B.buildInstr(AMDGPU::SI_ELSE)
4965  .addDef(Def)
4966  .addUse(Use)
4967  .addMBB(UncondBrTarget);
4968  }
4969 
4970  if (Br) {
4971  Br->getOperand(0).setMBB(CondBrTarget);
4972  } else {
4973  // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4974  // since we're swapping branch targets it needs to be reinserted.
4975  // FIXME: IRTranslator should probably not do this
4976  B.buildBr(*CondBrTarget);
4977  }
4978 
4979  MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4980  MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4981  MI.eraseFromParent();
4982  BrCond->eraseFromParent();
4983  return true;
4984  }
4985 
4986  return false;
4987  }
4988  case Intrinsic::amdgcn_loop: {
4989  MachineInstr *Br = nullptr;
4990  MachineBasicBlock *UncondBrTarget = nullptr;
4991  bool Negated = false;
4992  if (MachineInstr *BrCond =
4993  verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4994  const SIRegisterInfo *TRI
4995  = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4996 
4997  MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4998  Register Reg = MI.getOperand(2).getReg();
4999 
5000  if (Negated)
5001  std::swap(CondBrTarget, UncondBrTarget);
5002 
5003  B.setInsertPt(B.getMBB(), BrCond->getIterator());
5004  B.buildInstr(AMDGPU::SI_LOOP)
5005  .addUse(Reg)
5006  .addMBB(UncondBrTarget);
5007 
5008  if (Br)
5009  Br->getOperand(0).setMBB(CondBrTarget);
5010  else
5011  B.buildBr(*CondBrTarget);
5012 
5013  MI.eraseFromParent();
5014  BrCond->eraseFromParent();
5015  MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5016  return true;
5017  }
5018 
5019  return false;
5020  }
5021  case Intrinsic::amdgcn_kernarg_segment_ptr:
5022  if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5023  // This only makes sense to call in a kernel, so just lower to null.
5024  B.buildConstant(MI.getOperand(0).getReg(), 0);
5025  MI.eraseFromParent();
5026  return true;
5027  }
5028 
5029  return legalizePreloadedArgIntrin(
5030  MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5031  case Intrinsic::amdgcn_implicitarg_ptr:
5032  return legalizeImplicitArgPtr(MI, MRI, B);
5033  case Intrinsic::amdgcn_workitem_id_x:
5034  return legalizePreloadedArgIntrin(MI, MRI, B,
5035  AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5036  case Intrinsic::amdgcn_workitem_id_y:
5037  return legalizePreloadedArgIntrin(MI, MRI, B,
5038  AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5039  case Intrinsic::amdgcn_workitem_id_z:
5040  return legalizePreloadedArgIntrin(MI, MRI, B,
5041  AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5042  case Intrinsic::amdgcn_workgroup_id_x:
5043  return legalizePreloadedArgIntrin(MI, MRI, B,
5044  AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5045  case Intrinsic::amdgcn_workgroup_id_y:
5046  return legalizePreloadedArgIntrin(MI, MRI, B,
5047  AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5048  case Intrinsic::amdgcn_workgroup_id_z:
5049  return legalizePreloadedArgIntrin(MI, MRI, B,
5050  AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5051  case Intrinsic::amdgcn_dispatch_ptr:
5052  return legalizePreloadedArgIntrin(MI, MRI, B,
5053  AMDGPUFunctionArgInfo::DISPATCH_PTR);
5054  case Intrinsic::amdgcn_queue_ptr:
5055  return legalizePreloadedArgIntrin(MI, MRI, B,
5056  AMDGPUFunctionArgInfo::QUEUE_PTR);
5057  case Intrinsic::amdgcn_implicit_buffer_ptr:
5058  return legalizePreloadedArgIntrin(
5059  MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5060  case Intrinsic::amdgcn_dispatch_id:
5061  return legalizePreloadedArgIntrin(MI, MRI, B,
5062  AMDGPUFunctionArgInfo::DISPATCH_ID);
5063  case Intrinsic::amdgcn_fdiv_fast:
5064  return legalizeFDIVFastIntrin(MI, MRI, B);
5065  case Intrinsic::amdgcn_is_shared:
5066  return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5067  case Intrinsic::amdgcn_is_private:
5068  return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5069  case Intrinsic::amdgcn_wavefrontsize: {
5070  B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5071  MI.eraseFromParent();
5072  return true;
5073  }
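 // Note: the wavefront size is a fixed subtarget property (64 in wave64 mode,
 // 32 in wave32 mode), so this intrinsic folds to a plain G_CONSTANT with no
 // runtime cost.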
5074  case Intrinsic::amdgcn_s_buffer_load:
5075  return legalizeSBufferLoad(Helper, MI);
5076  case Intrinsic::amdgcn_raw_buffer_store:
5077  case Intrinsic::amdgcn_struct_buffer_store:
5078  return legalizeBufferStore(MI, MRI, B, false, false);
5079  case Intrinsic::amdgcn_raw_buffer_store_format:
5080  case Intrinsic::amdgcn_struct_buffer_store_format:
5081  return legalizeBufferStore(MI, MRI, B, false, true);
5082  case Intrinsic::amdgcn_raw_tbuffer_store:
5083  case Intrinsic::amdgcn_struct_tbuffer_store:
5084  return legalizeBufferStore(MI, MRI, B, true, true);
5085  case Intrinsic::amdgcn_raw_buffer_load:
5086  case Intrinsic::amdgcn_struct_buffer_load:
5087  return legalizeBufferLoad(MI, MRI, B, false, false);
5088  case Intrinsic::amdgcn_raw_buffer_load_format:
5089  case Intrinsic::amdgcn_struct_buffer_load_format:
5090  return legalizeBufferLoad(MI, MRI, B, true, false);
5091  case Intrinsic::amdgcn_raw_tbuffer_load:
5092  case Intrinsic::amdgcn_struct_tbuffer_load:
5093  return legalizeBufferLoad(MI, MRI, B, true, true);
5094  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5095  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5096  case Intrinsic::amdgcn_raw_buffer_atomic_add:
5097  case Intrinsic::amdgcn_struct_buffer_atomic_add:
5098  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5099  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5100  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5101  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5102  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5103  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5104  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5105  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5106  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5107  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5108  case Intrinsic::amdgcn_raw_buffer_atomic_and:
5109  case Intrinsic::amdgcn_struct_buffer_atomic_and:
5110  case Intrinsic::amdgcn_raw_buffer_atomic_or:
5111  case Intrinsic::amdgcn_struct_buffer_atomic_or:
5112  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5113  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5114  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5115  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5116  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5117  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5118  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5119  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5120  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5121  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5122  case Intrinsic::amdgcn_buffer_atomic_fadd:
5123  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5124  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5125  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5126  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5127  return legalizeBufferAtomic(MI, B, IntrID);
5128  case Intrinsic::amdgcn_atomic_inc:
5129  return legalizeAtomicIncDec(MI, B, true);
5130  case Intrinsic::amdgcn_atomic_dec:
5131  return legalizeAtomicIncDec(MI, B, false);
5132  case Intrinsic::trap:
5133  return legalizeTrapIntrinsic(MI, MRI, B);
5134  case Intrinsic::debugtrap:
5135  return legalizeDebugTrapIntrinsic(MI, MRI, B);
5136  case Intrinsic::amdgcn_rsq_clamp:
5137  return legalizeRsqClampIntrinsic(MI, MRI, B);
5138  case Intrinsic::amdgcn_ds_fadd:
5139  case Intrinsic::amdgcn_ds_fmin:
5140  case Intrinsic::amdgcn_ds_fmax:
5141  return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5142  case Intrinsic::amdgcn_image_bvh_intersect_ray:
5143  return legalizeBVHIntrinsic(MI, B);
5144  default: {
5145  if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5146  AMDGPU::getImageDimIntrinsicInfo(IntrID))
5147  return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5148  return true;
5149  }
5150  }
5151 
5152  return true;
5153 }
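Most of the simple cases in the switch above share one shape: emit the replacement code at the intrinsic's position with MachineIRBuilder, erase the original G_INTRINSIC, and return true so the LegalizerHelper treats the instruction as handled. A minimal sketch of that shape, mirroring the amdgcn_wavefrontsize case, is shown below. The helper name lowerToConstant is hypothetical, and the sketch assumes the builder's insertion point is already at MI, as LegalizerHelper arranges for the builder it passes into target legalization hooks.

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstr.h"

using namespace llvm;

// Hypothetical helper: replace an intrinsic that defines a single scalar
// result with a compile-time constant. Assumes B is already positioned at MI.
static bool lowerToConstant(MachineInstr &MI, MachineIRBuilder &B,
                            int64_t Value) {
  // Materialize the constant into the intrinsic's result register.
  B.buildConstant(MI.getOperand(0).getReg(), Value);
  // The original G_INTRINSIC is now dead; erase it and report success.
  MI.eraseFromParent();
  return true;
}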