LLVM  13.0.0git
AMDGPULegalizerInfo.cpp
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/Utils.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use "
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of elements to the next power of two elements
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
51   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeNumElements(Pow2NElts);
53 }
54 
55 // Round the number of bits to the next power of two bits
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
58   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
62 /// \returns true if this is an odd sized vector which should widen by adding an
63 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64 /// excludes s1 vectors, which should always be scalarized.
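/// For example, <3 x s16> (48 bits, odd element count, 16-bit elements) matches
/// and is widened to <4 x s16>, while <3 x s32> does not match because its size
/// is already a multiple of 32 bits.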
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66  return [=](const LegalityQuery &Query) {
67  const LLT Ty = Query.Types[TypeIdx];
68  if (!Ty.isVector())
69  return false;
70 
71  const LLT EltTy = Ty.getElementType();
72  const unsigned EltSize = EltTy.getSizeInBits();
73  return Ty.getNumElements() % 2 != 0 &&
74  EltSize > 1 && EltSize < 32 &&
75  Ty.getSizeInBits() % 32 != 0;
76  };
77 }
78 
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80  return [=](const LegalityQuery &Query) {
81  const LLT Ty = Query.Types[TypeIdx];
82  return Ty.getSizeInBits() % 32 == 0;
83  };
84 }
85 
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87  return [=](const LegalityQuery &Query) {
88  const LLT Ty = Query.Types[TypeIdx];
89  const LLT EltTy = Ty.getScalarType();
90  return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91  };
92 }
93 
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95  return [=](const LegalityQuery &Query) {
96  const LLT Ty = Query.Types[TypeIdx];
97  const LLT EltTy = Ty.getElementType();
98  return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
99  };
100 }
101 
102 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
103  return [=](const LegalityQuery &Query) {
104  const LLT Ty = Query.Types[TypeIdx];
105  const LLT EltTy = Ty.getElementType();
106  unsigned Size = Ty.getSizeInBits();
107  unsigned Pieces = (Size + 63) / 64;
108  unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
109  return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
110  };
111 }
112 
113 // Increase the number of vector elements to reach the next multiple of 32-bit
114 // type.
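// For example, <3 x s8> (24 bits) becomes <4 x s8> (32 bits), and <5 x s16>
// (80 bits) becomes <6 x s16> (96 bits).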
115 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
116  return [=](const LegalityQuery &Query) {
117  const LLT Ty = Query.Types[TypeIdx];
118 
119  const LLT EltTy = Ty.getElementType();
120  const int Size = Ty.getSizeInBits();
121  const int EltSize = EltTy.getSizeInBits();
122  const int NextMul32 = (Size + 31) / 32;
123 
124  assert(EltSize < 32);
125 
126  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
127  return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
128  };
129 }
130 
131 static LLT getBitcastRegisterType(const LLT Ty) {
132  const unsigned Size = Ty.getSizeInBits();
133 
134  LLT CoercedTy;
135  if (Size <= 32) {
136  // <2 x s8> -> s16
137  // <4 x s8> -> s32
138  return LLT::scalar(Size);
139  }
140 
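  // Larger sizes become a vector of 32-bit pieces, e.g. <6 x s16> (96 bits)
  // -> <3 x s32> and <8 x s16> (128 bits) -> <4 x s32>.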
141  return LLT::scalarOrVector(Size / 32, 32);
142 }
143 
144 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
145  return [=](const LegalityQuery &Query) {
146  const LLT Ty = Query.Types[TypeIdx];
147  return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
148  };
149 }
150 
151 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
152  return [=](const LegalityQuery &Query) {
153  const LLT Ty = Query.Types[TypeIdx];
154  unsigned Size = Ty.getSizeInBits();
155  assert(Size % 32 == 0);
156  return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
157  };
158 }
159 
160 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
161  return [=](const LegalityQuery &Query) {
162  const LLT QueryTy = Query.Types[TypeIdx];
163  return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
164  };
165 }
166 
167 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
168  return [=](const LegalityQuery &Query) {
169  const LLT QueryTy = Query.Types[TypeIdx];
170  return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
171  };
172 }
173 
174 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
175  return [=](const LegalityQuery &Query) {
176  const LLT QueryTy = Query.Types[TypeIdx];
177  return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
178  };
179 }
180 
181 static bool isRegisterSize(unsigned Size) {
182  return Size % 32 == 0 && Size <= MaxRegisterSize;
183 }
184 
185 static bool isRegisterVectorElementType(LLT EltTy) {
186  const int EltSize = EltTy.getSizeInBits();
187  return EltSize == 16 || EltSize % 32 == 0;
188 }
189 
190 static bool isRegisterVectorType(LLT Ty) {
191  const int EltSize = Ty.getElementType().getSizeInBits();
192  return EltSize == 32 || EltSize == 64 ||
193  (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
194  EltSize == 128 || EltSize == 256;
195 }
196 
197 static bool isRegisterType(LLT Ty) {
198  if (!isRegisterSize(Ty.getSizeInBits()))
199  return false;
200 
201  if (Ty.isVector())
202  return isRegisterVectorType(Ty);
203 
204  return true;
205 }
206 
207 // Any combination of 32 or 64-bit elements up to the maximum register size, and
208 // multiples of v2s16.
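// For example, s96, <3 x s32>, <2 x s16>, and <4 x s16> are register types,
// while <3 x s16> (48 bits) and s40 are not.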
209 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
210  return [=](const LegalityQuery &Query) {
211  return isRegisterType(Query.Types[TypeIdx]);
212  };
213 }
214 
215 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
216  return [=](const LegalityQuery &Query) {
217  const LLT QueryTy = Query.Types[TypeIdx];
218  if (!QueryTy.isVector())
219  return false;
220  const LLT EltTy = QueryTy.getElementType();
221  return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
222  };
223 }
224 
225 // If we have a truncating store or an extending load with a data size larger
226 // than 32-bits, we need to reduce to a 32-bit type.
227 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
228   return [=](const LegalityQuery &Query) {
229  const LLT Ty = Query.Types[TypeIdx];
230  return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
231  Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
232  };
233 }
234 
235 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
236 // handle some operations by just promoting the register during
237 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
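// Roughly: private is limited to a single dword unless flat scratch is enabled,
// LDS to 64 or 128 bits depending on ds_read/write_b128 support, and
// global/constant loads may be up to 512 bits while stores are capped at 128.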
238 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
239  bool IsLoad) {
240   switch (AS) {
241   case AMDGPUAS::PRIVATE_ADDRESS:
242     // FIXME: Private element size.
243     return ST.enableFlatScratch() ? 128 : 32;
244   case AMDGPUAS::LOCAL_ADDRESS:
245     return ST.useDS128() ? 128 : 64;
246   case AMDGPUAS::GLOBAL_ADDRESS:
247   case AMDGPUAS::CONSTANT_ADDRESS:
248   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
249     // Treat constant and global as identical. SMRD loads are sometimes usable for
250     // global loads (ideally constant address space should be eliminated)
251     // depending on the context. Legality cannot be context dependent, but
252     // RegBankSelect can split the load as necessary depending on the pointer
253     // register bank/uniformity and if the memory is invariant or not written in a
254     // kernel.
255     return IsLoad ? 512 : 128;
256  default:
257  // Flat addresses may contextually need to be split to 32-bit parts if they
258  // may alias scratch depending on the subtarget.
259  return 128;
260  }
261 }
262 
263 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
264                                  const LegalityQuery &Query) {
265  const LLT Ty = Query.Types[0];
266 
267  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
268  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
269 
270  unsigned RegSize = Ty.getSizeInBits();
271  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
272  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
273  unsigned AS = Query.Types[1].getAddressSpace();
274 
275   // All of these need to be custom lowered to cast the pointer operand.
276   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
277     return false;
278 
279  // Do not handle extending vector loads.
280  if (Ty.isVector() && MemSize != RegSize)
281  return false;
282 
283  // TODO: We should be able to widen loads if the alignment is high enough, but
284  // we also need to modify the memory access size.
285 #if 0
286  // Accept widening loads based on alignment.
287  if (IsLoad && MemSize < Size)
288  MemSize = std::max(MemSize, Align);
289 #endif
290 
291  // Only 1-byte and 2-byte to 32-bit extloads are valid.
292  if (MemSize != RegSize && RegSize != 32)
293  return false;
294 
295  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
296  return false;
297 
298  switch (MemSize) {
299  case 8:
300  case 16:
301  case 32:
302  case 64:
303  case 128:
304  break;
305  case 96:
306  if (!ST.hasDwordx3LoadStores())
307  return false;
308  break;
309  case 256:
310  case 512:
311  // These may contextually need to be broken down.
312  break;
313  default:
314  return false;
315  }
316 
317  assert(RegSize >= MemSize);
318 
319  if (AlignBits < MemSize) {
320  const SITargetLowering *TLI = ST.getTargetLowering();
321  if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
322  Align(AlignBits / 8)))
323  return false;
324  }
325 
326  return true;
327 }
328 
329 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
330 // work around this. Eventually it should ignore the type for loads and only care
331 // about the size. Return true in cases where we will work around this for now by
332 // bitcasting.
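// For example, with the workaround enabled (the default), a 96-bit <6 x s16>
// is bitcast (its element size is neither 32 nor 64), whereas <3 x s32> and
// <2 x s64> are left alone.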
333 static bool loadStoreBitcastWorkaround(const LLT Ty) {
334  if (EnableNewLegality)
335  return false;
336 
337  const unsigned Size = Ty.getSizeInBits();
338  if (Size <= 64)
339  return false;
340  if (!Ty.isVector())
341  return true;
342 
343  LLT EltTy = Ty.getElementType();
344  if (EltTy.isPointer())
345  return true;
346 
347  unsigned EltSize = EltTy.getSizeInBits();
348  return EltSize != 32 && EltSize != 64;
349 }
350 
351 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
352  const LLT Ty = Query.Types[0];
353   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
354          !loadStoreBitcastWorkaround(Ty);
355 }
356 
357 /// Return true if a load or store of the type should be lowered with a bitcast
358 /// to a different type.
359 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
360  const unsigned MemSizeInBits) {
361  const unsigned Size = Ty.getSizeInBits();
362  if (Size != MemSizeInBits)
363  return Size <= 32 && Ty.isVector();
364 
365   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
366     return true;
367   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
368          !isRegisterVectorElementType(Ty.getElementType());
369 }
370 
371 /// Return true if we should legalize a load by widening an odd-sized memory
372 /// access up to the alignment. Note that in this case the memory access itself
373 /// changes, not the size of the result register.
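/// For example, a 48-bit (6-byte) load with 8-byte alignment may be widened to a
/// 64-bit load, provided the target allows the resulting access and the address
/// space size limit is not exceeded.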
374 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
375  unsigned AlignInBits, unsigned AddrSpace,
376  unsigned Opcode) {
377  // We don't want to widen cases that are naturally legal.
378  if (isPowerOf2_32(SizeInBits))
379  return false;
380 
381  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
382  // end up widening these for a scalar load during RegBankSelect, since there
383  // aren't 96-bit scalar loads.
384  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
385  return false;
386 
387  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
388  return false;
389 
390  // A load is known dereferenceable up to the alignment, so it's legal to widen
391  // to it.
392  //
393  // TODO: Could check dereferenceable for less aligned cases.
394  unsigned RoundedSize = NextPowerOf2(SizeInBits);
395  if (AlignInBits < RoundedSize)
396  return false;
397 
398  // Do not widen if it would introduce a slow unaligned load.
399  const SITargetLowering *TLI = ST.getTargetLowering();
400   bool Fast = false;
401   return TLI->allowsMisalignedMemoryAccessesImpl(
402       RoundedSize, AddrSpace, Align(AlignInBits / 8),
403  MachineMemOperand::MOLoad, &Fast) &&
404  Fast;
405 }
406 
407 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
408  unsigned Opcode) {
409  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
410  return false;
411 
412  return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
413  Query.MMODescrs[0].AlignInBits,
414  Query.Types[1].getAddressSpace(), Opcode);
415 }
416 
417 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
418                                          const GCNTargetMachine &TM)
419  : ST(ST_) {
420  using namespace TargetOpcode;
421 
422  auto GetAddrSpacePtr = [&TM](unsigned AS) {
423  return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
424  };
425 
426  const LLT S1 = LLT::scalar(1);
427  const LLT S8 = LLT::scalar(8);
428  const LLT S16 = LLT::scalar(16);
429  const LLT S32 = LLT::scalar(32);
430  const LLT S64 = LLT::scalar(64);
431  const LLT S128 = LLT::scalar(128);
432  const LLT S256 = LLT::scalar(256);
433  const LLT S512 = LLT::scalar(512);
434  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
435 
436  const LLT V2S8 = LLT::vector(2, 8);
437  const LLT V2S16 = LLT::vector(2, 16);
438  const LLT V4S16 = LLT::vector(4, 16);
439 
440  const LLT V2S32 = LLT::vector(2, 32);
441  const LLT V3S32 = LLT::vector(3, 32);
442  const LLT V4S32 = LLT::vector(4, 32);
443  const LLT V5S32 = LLT::vector(5, 32);
444  const LLT V6S32 = LLT::vector(6, 32);
445  const LLT V7S32 = LLT::vector(7, 32);
446  const LLT V8S32 = LLT::vector(8, 32);
447  const LLT V9S32 = LLT::vector(9, 32);
448  const LLT V10S32 = LLT::vector(10, 32);
449  const LLT V11S32 = LLT::vector(11, 32);
450  const LLT V12S32 = LLT::vector(12, 32);
451  const LLT V13S32 = LLT::vector(13, 32);
452  const LLT V14S32 = LLT::vector(14, 32);
453  const LLT V15S32 = LLT::vector(15, 32);
454  const LLT V16S32 = LLT::vector(16, 32);
455  const LLT V32S32 = LLT::vector(32, 32);
456 
457  const LLT V2S64 = LLT::vector(2, 64);
458  const LLT V3S64 = LLT::vector(3, 64);
459  const LLT V4S64 = LLT::vector(4, 64);
460  const LLT V5S64 = LLT::vector(5, 64);
461  const LLT V6S64 = LLT::vector(6, 64);
462  const LLT V7S64 = LLT::vector(7, 64);
463  const LLT V8S64 = LLT::vector(8, 64);
464  const LLT V16S64 = LLT::vector(16, 64);
465 
466  std::initializer_list<LLT> AllS32Vectors =
467  {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
468  V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
469  std::initializer_list<LLT> AllS64Vectors =
470  {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
471 
472  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
473  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
474  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
475  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
476  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
477  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
478  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
479 
480  const LLT CodePtr = FlatPtr;
481 
482  const std::initializer_list<LLT> AddrSpaces64 = {
483  GlobalPtr, ConstantPtr, FlatPtr
484  };
485 
486  const std::initializer_list<LLT> AddrSpaces32 = {
487  LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
488  };
489 
490  const std::initializer_list<LLT> FPTypesBase = {
491  S32, S64
492  };
493 
494  const std::initializer_list<LLT> FPTypes16 = {
495  S32, S64, S16
496  };
497 
498  const std::initializer_list<LLT> FPTypesPK16 = {
499  S32, S64, S16, V2S16
500  };
501 
502  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
503 
504  auto &LegacyInfo = getLegacyLegalizerInfo();
505  LegacyInfo.setAction({G_BRCOND, S1},
506  LegacyLegalizeActions::Legal); // VCC branches
507  LegacyInfo.setAction({G_BRCOND, S32},
508  LegacyLegalizeActions::Legal); // SCC branches
509 
510  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
511 // elements for v3s16
512   getActionDefinitionsBuilder(G_PHI)
513     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
514  .legalFor(AllS32Vectors)
515  .legalFor(AllS64Vectors)
516  .legalFor(AddrSpaces64)
517  .legalFor(AddrSpaces32)
518  .legalIf(isPointer(0))
519  .clampScalar(0, S16, S256)
520  .widenScalarToNextPow2(0, 32)
521     .clampMaxNumElements(0, S32, 16)
522     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
523     .scalarize(0);
524 
525  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
526  // Full set of gfx9 features.
527  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
528  .legalFor({S32, S16, V2S16})
529  .clampScalar(0, S16, S32)
530  .clampMaxNumElements(0, S16, 2)
531  .scalarize(0)
532  .widenScalarToNextPow2(0, 32);
533 
534  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
535  .legalFor({S32, S16, V2S16}) // Clamp modifier
536  .minScalarOrElt(0, S16)
537  .clampMaxNumElements(0, S16, 2)
538  .scalarize(0)
539  .widenScalarToNextPow2(0, 32)
540  .lower();
541  } else if (ST.has16BitInsts()) {
542  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
543  .legalFor({S32, S16})
544  .clampScalar(0, S16, S32)
545  .scalarize(0)
546  .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
547 
548  // Technically the saturating operations require clamp bit support, but this
549  // was introduced at the same time as 16-bit operations.
550  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
551  .legalFor({S32, S16}) // Clamp modifier
552  .minScalar(0, S16)
553  .scalarize(0)
554  .widenScalarToNextPow2(0, 16)
555  .lower();
556 
557  // We're just lowering this, but it helps get a better result to try to
558  // coerce to the desired type first.
559  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
560  .minScalar(0, S16)
561  .scalarize(0)
562  .lower();
563  } else {
564  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
565  .legalFor({S32})
566  .clampScalar(0, S32, S32)
567  .scalarize(0);
568 
569  if (ST.hasIntClamp()) {
570  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
571  .legalFor({S32}) // Clamp modifier.
572  .scalarize(0)
573  .minScalarOrElt(0, S32)
574  .lower();
575  } else {
576  // Clamp bit support was added in VI, along with 16-bit operations.
577  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
578  .minScalar(0, S32)
579  .scalarize(0)
580  .lower();
581  }
582 
583  // FIXME: DAG expansion gets better results. The widening uses the smaller
584  // range values and goes for the min/max lowering directly.
585  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
586  .minScalar(0, S32)
587  .scalarize(0)
588  .lower();
589  }
590 
591   getActionDefinitionsBuilder(
592       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
593  .customFor({S32, S64})
594  .clampScalar(0, S32, S64)
595  .widenScalarToNextPow2(0, 32)
596  .scalarize(0);
597 
598  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
599  .legalFor({S32})
600  .maxScalarOrElt(0, S32);
601 
602  if (ST.hasVOP3PInsts()) {
603  Mulh
604  .clampMaxNumElements(0, S8, 2)
605  .lowerFor({V2S8});
606  }
607 
608  Mulh
609  .scalarize(0)
610  .lower();
611 
612  // Report legal for any types we can handle anywhere. For the cases only legal
613  // on the SALU, RegBankSelect will be able to re-legalize.
614  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
615  .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
616     .clampScalar(0, S32, S64)
617     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
618     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
619     .widenScalarToNextPow2(0)
620     .scalarize(0);
621 
622  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
623  G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
624  .legalFor({{S32, S1}, {S32, S32}})
625  .minScalar(0, S32)
626  // TODO: .scalarize(0)
627  .lower();
628 
629  getActionDefinitionsBuilder(G_BITCAST)
630     // Don't worry about the size constraint.
631     .legalIf(all(isRegisterType(0), isRegisterType(1)))
632     .lower();
633 
634 
635  getActionDefinitionsBuilder(G_CONSTANT)
636  .legalFor({S1, S32, S64, S16, GlobalPtr,
637  LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
638  .legalIf(isPointer(0))
639     .clampScalar(0, S32, S64)
640     .widenScalarToNextPow2(0);
641 
642  getActionDefinitionsBuilder(G_FCONSTANT)
643  .legalFor({S32, S64, S16})
644  .clampScalar(0, S16, S64);
645 
646  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
647  .legalIf(isRegisterType(0))
648  // s1 and s16 are special cases because they have legal operations on
649  // them, but don't really occupy registers in the normal way.
650  .legalFor({S1, S16})
651  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
652  .clampScalarOrElt(0, S32, MaxScalar)
653  .widenScalarToNextPow2(0, 32)
654  .clampMaxNumElements(0, S32, 16);
655 
656   LegacyInfo.setAction({G_FRAME_INDEX, PrivatePtr},
657                        LegacyLegalizeActions::Legal);
658 
659  // If the amount is divergent, we have to do a wave reduction to get the
660  // maximum value, so this is expanded during RegBankSelect.
661  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
662  .legalFor({{PrivatePtr, S32}});
663 
664  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
665  .customIf(typeIsNot(0, PrivatePtr));
666 
667  LegacyInfo.setAction({G_BLOCK_ADDR, CodePtr}, LegacyLegalizeActions::Legal);
668 
669  auto &FPOpActions = getActionDefinitionsBuilder(
670  { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
671  .legalFor({S32, S64});
672  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
673  .customFor({S32, S64});
674  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
675  .customFor({S32, S64});
676 
677  if (ST.has16BitInsts()) {
678  if (ST.hasVOP3PInsts())
679  FPOpActions.legalFor({S16, V2S16});
680  else
681  FPOpActions.legalFor({S16});
682 
683  TrigActions.customFor({S16});
684  FDIVActions.customFor({S16});
685  }
686 
687  auto &MinNumMaxNum = getActionDefinitionsBuilder({
688  G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
689 
690  if (ST.hasVOP3PInsts()) {
691   MinNumMaxNum.customFor(FPTypesPK16)
692     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
693     .clampMaxNumElements(0, S16, 2)
694  .clampScalar(0, S16, S64)
695  .scalarize(0);
696  } else if (ST.has16BitInsts()) {
697  MinNumMaxNum.customFor(FPTypes16)
698  .clampScalar(0, S16, S64)
699  .scalarize(0);
700  } else {
701  MinNumMaxNum.customFor(FPTypesBase)
702  .clampScalar(0, S32, S64)
703  .scalarize(0);
704  }
705 
706  if (ST.hasVOP3PInsts())
707  FPOpActions.clampMaxNumElements(0, S16, 2);
708 
709  FPOpActions
710  .scalarize(0)
711  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
712 
713  TrigActions
714  .scalarize(0)
715  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
716 
717  FDIVActions
718  .scalarize(0)
719  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721  getActionDefinitionsBuilder({G_FNEG, G_FABS})
722  .legalFor(FPTypesPK16)
723  .clampMaxNumElements(0, S16, 2)
724  .scalarize(0)
725  .clampScalar(0, S16, S64);
726 
727  if (ST.has16BitInsts()) {
728  getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
729  .legalFor({S32, S64, S16})
730  .scalarize(0)
731  .clampScalar(0, S16, S64);
732   } else {
733     getActionDefinitionsBuilder(G_FSQRT)
734       .legalFor({S32, S64})
735  .scalarize(0)
736  .clampScalar(0, S32, S64);
737 
738     if (ST.hasFractBug()) {
739       getActionDefinitionsBuilder(G_FFLOOR)
740         .customFor({S64})
741  .legalFor({S32, S64})
742  .scalarize(0)
743  .clampScalar(0, S32, S64);
744     } else {
745       getActionDefinitionsBuilder(G_FFLOOR)
746         .legalFor({S32, S64})
747  .scalarize(0)
748  .clampScalar(0, S32, S64);
749  }
750  }
751 
752  getActionDefinitionsBuilder(G_FPTRUNC)
753  .legalFor({{S32, S64}, {S16, S32}})
754  .scalarize(0)
755  .lower();
756 
757   getActionDefinitionsBuilder(G_FPEXT)
758     .legalFor({{S64, S32}, {S32, S16}})
759  .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
760  .scalarize(0);
761 
762   getActionDefinitionsBuilder(G_FSUB)
763     // Use actual fsub instruction
764  .legalFor({S32})
765  // Must use fadd + fneg
766  .lowerFor({S64, S16, V2S16})
767  .scalarize(0)
768  .clampScalar(0, S32, S64);
769 
770  // Whether this is legal depends on the floating point mode for the function.
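  // (G_FMAD maps to the mad/mac instructions, which are presumably only usable
  // when the relevant denormal mode flushes denormals; that check happens in the
  // custom legalization, hence customFor rather than legalFor here.)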
771  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
772  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
773  FMad.customFor({S32, S16});
774  else if (ST.hasMadMacF32Insts())
775  FMad.customFor({S32});
776  else if (ST.hasMadF16())
777  FMad.customFor({S16});
778  FMad.scalarize(0)
779  .lower();
780 
781  auto &FRem = getActionDefinitionsBuilder(G_FREM);
782  if (ST.has16BitInsts()) {
783  FRem.customFor({S16, S32, S64});
784  } else {
785  FRem.minScalar(0, S32)
786  .customFor({S32, S64});
787  }
788  FRem.scalarize(0);
789 
790   // TODO: Do we need to clamp maximum bitwidth?
791   getActionDefinitionsBuilder(G_TRUNC)
792     .legalIf(isScalar(0))
793  .legalFor({{V2S16, V2S32}})
794  .clampMaxNumElements(0, S16, 2)
795  // Avoid scalarizing in cases that should be truly illegal. In unresolvable
796  // situations (like an invalid implicit use), we don't want to infinite loop
797     // in the legalizer.
798     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
799     .alwaysLegal();
800 
801  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
802  .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
803  {S32, S1}, {S64, S1}, {S16, S1}})
804  .scalarize(0)
805  .clampScalar(0, S32, S64)
806  .widenScalarToNextPow2(1, 32);
807 
808  // TODO: Split s1->s64 during regbankselect for VALU.
809  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
810  .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
811  .lowerFor({{S32, S64}})
812  .lowerIf(typeIs(1, S1))
813  .customFor({{S64, S64}});
814  if (ST.has16BitInsts())
815  IToFP.legalFor({{S16, S16}});
816  IToFP.clampScalar(1, S32, S64)
817  .minScalar(0, S32)
818     .scalarize(0)
819     .widenScalarToNextPow2(1);
820 
821  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
822  .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
823  .customFor({{S64, S64}})
824  .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
825  if (ST.has16BitInsts())
826  FPToI.legalFor({{S16, S16}});
827  else
828  FPToI.minScalar(1, S32);
829 
830  FPToI.minScalar(0, S32)
831  .widenScalarToNextPow2(0, 32)
832  .scalarize(0)
833  .lower();
834 
835  // Lower roundeven into G_FRINT
836  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
837  .scalarize(0)
838  .lower();
839 
840  if (ST.has16BitInsts()) {
841  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
842  .legalFor({S16, S32, S64})
843  .clampScalar(0, S16, S64)
844  .scalarize(0);
845  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
846  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
847  .legalFor({S32, S64})
848  .clampScalar(0, S32, S64)
849  .scalarize(0);
850  } else {
851  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
852  .legalFor({S32})
853  .customFor({S64})
854  .clampScalar(0, S32, S64)
855  .scalarize(0);
856  }
857 
858  getActionDefinitionsBuilder(G_PTR_ADD)
859  .legalIf(all(isPointer(0), sameSize(0, 1)))
860  .scalarize(0)
861  .scalarSameSizeAs(1, 0);
862 
863  getActionDefinitionsBuilder(G_PTRMASK)
864  .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
865  .scalarSameSizeAs(1, 0)
866  .scalarize(0);
867 
868  auto &CmpBuilder =
869  getActionDefinitionsBuilder(G_ICMP)
870  // The compare output type differs based on the register bank of the output,
871  // so make both s1 and s32 legal.
872  //
873  // Scalar compares producing output in scc will be promoted to s32, as that
874  // is the allocatable register type that will be needed for the copy from
875  // scc. This will be promoted during RegBankSelect, and we assume something
876  // before that won't try to use s32 result types.
877  //
878  // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
879  // bank.
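  // For example, a compare selected to the scalar S_CMP_* form defines SCC and
  // is modeled with an s32 result, while one selected to V_CMP_* defines a VCC
  // mask modeled as s1.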
880  .legalForCartesianProduct(
881  {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
882  .legalForCartesianProduct(
883  {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
884  if (ST.has16BitInsts()) {
885  CmpBuilder.legalFor({{S1, S16}});
886  }
887 
888  CmpBuilder
889  .widenScalarToNextPow2(1)
890  .clampScalar(1, S32, S64)
891  .scalarize(0)
892  .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
893 
894  getActionDefinitionsBuilder(G_FCMP)
895  .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
896  .widenScalarToNextPow2(1)
897  .clampScalar(1, S32, S64)
898  .scalarize(0);
899 
900  // FIXME: fpow has a selection pattern that should move to custom lowering.
901  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
902  if (ST.has16BitInsts())
903  Exp2Ops.legalFor({S32, S16});
904  else
905  Exp2Ops.legalFor({S32});
906  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
907  Exp2Ops.scalarize(0);
908 
909  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
910  if (ST.has16BitInsts())
911  ExpOps.customFor({{S32}, {S16}});
912  else
913  ExpOps.customFor({S32});
914  ExpOps.clampScalar(0, MinScalarFPTy, S32)
915  .scalarize(0);
916 
917  getActionDefinitionsBuilder(G_FPOWI)
918  .clampScalar(0, MinScalarFPTy, S32)
919  .lower();
920 
921  // The 64-bit versions produce 32-bit results, but only on the SALU.
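  // (e.g. S_BCNT1_I32_B64 counts a 64-bit source into a 32-bit result; the VALU
  // only has a 32-bit popcount, so divergent 64-bit counts are split up later.)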
922  getActionDefinitionsBuilder(G_CTPOP)
923  .legalFor({{S32, S32}, {S32, S64}})
924  .clampScalar(0, S32, S32)
925  .clampScalar(1, S32, S64)
926  .scalarize(0)
927  .widenScalarToNextPow2(0, 32)
928  .widenScalarToNextPow2(1, 32);
929 
930  // The hardware instructions return a different result on 0 than the generic
931  // instructions expect. The hardware produces -1, but these produce the
932  // bitwidth.
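  // e.g. ctlz(0:s32) must return 32, but the ffbh/ffbl instructions return -1,
  // so the generic forms are lowered (typically to the *_ZERO_UNDEF variants
  // plus a select against the zero case).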
933  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
934  .scalarize(0)
935  .clampScalar(0, S32, S32)
936  .clampScalar(1, S32, S64)
937  .widenScalarToNextPow2(0, 32)
938  .widenScalarToNextPow2(1, 32)
939  .lower();
940 
941  // The 64-bit versions produce 32-bit results, but only on the SALU.
942  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
943  .legalFor({{S32, S32}, {S32, S64}})
944  .clampScalar(0, S32, S32)
945  .clampScalar(1, S32, S64)
946  .scalarize(0)
947  .widenScalarToNextPow2(0, 32)
948  .widenScalarToNextPow2(1, 32);
949 
950  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
951  // RegBankSelect.
952  getActionDefinitionsBuilder(G_BITREVERSE)
953  .legalFor({S32, S64})
954  .clampScalar(0, S32, S64)
955  .scalarize(0)
956  .widenScalarToNextPow2(0);
957 
958  if (ST.has16BitInsts()) {
959  getActionDefinitionsBuilder(G_BSWAP)
960  .legalFor({S16, S32, V2S16})
961  .clampMaxNumElements(0, S16, 2)
962  // FIXME: Fixing non-power-of-2 before clamp is workaround for
963  // narrowScalar limitation.
964  .widenScalarToNextPow2(0)
965  .clampScalar(0, S16, S32)
966  .scalarize(0);
967 
968  if (ST.hasVOP3PInsts()) {
969  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
970  .legalFor({S32, S16, V2S16})
971  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
972  .clampMaxNumElements(0, S16, 2)
973  .minScalar(0, S16)
974  .widenScalarToNextPow2(0)
975  .scalarize(0)
976  .lower();
977  } else {
978  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
979  .legalFor({S32, S16})
980  .widenScalarToNextPow2(0)
981  .minScalar(0, S16)
982  .scalarize(0)
983  .lower();
984  }
985  } else {
986  // TODO: Should have same legality without v_perm_b32
987  getActionDefinitionsBuilder(G_BSWAP)
988  .legalFor({S32})
989  .lowerIf(scalarNarrowerThan(0, 32))
990  // FIXME: Fixing non-power-of-2 before clamp is workaround for
991  // narrowScalar limitation.
992  .widenScalarToNextPow2(0)
993  .maxScalar(0, S32)
994  .scalarize(0)
995  .lower();
996 
997  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
998  .legalFor({S32})
999  .minScalar(0, S32)
1000  .widenScalarToNextPow2(0)
1001  .scalarize(0)
1002  .lower();
1003  }
1004 
1005  getActionDefinitionsBuilder(G_INTTOPTR)
1006  // List the common cases
1007  .legalForCartesianProduct(AddrSpaces64, {S64})
1008  .legalForCartesianProduct(AddrSpaces32, {S32})
1009  .scalarize(0)
1010  // Accept any address space as long as the size matches
1011  .legalIf(sameSize(0, 1))
1012  .widenScalarIf(smallerThan(1, 0),
1013  [](const LegalityQuery &Query) {
1014  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1015  })
1016  .narrowScalarIf(largerThan(1, 0),
1017  [](const LegalityQuery &Query) {
1018  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1019  });
1020 
1021  getActionDefinitionsBuilder(G_PTRTOINT)
1022  // List the common cases
1023  .legalForCartesianProduct(AddrSpaces64, {S64})
1024  .legalForCartesianProduct(AddrSpaces32, {S32})
1025  .scalarize(0)
1026  // Accept any address space as long as the size matches
1027  .legalIf(sameSize(0, 1))
1028  .widenScalarIf(smallerThan(0, 1),
1029  [](const LegalityQuery &Query) {
1030  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1031  })
1032  .narrowScalarIf(
1033  largerThan(0, 1),
1034  [](const LegalityQuery &Query) {
1035  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1036  });
1037 
1038  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1039  .scalarize(0)
1040  .custom();
1041 
1042  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1043  bool IsLoad) -> bool {
1044  const LLT DstTy = Query.Types[0];
1045 
1046  // Split vector extloads.
1047  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1048  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1049 
1050  if (MemSize < DstTy.getSizeInBits())
1051  MemSize = std::max(MemSize, AlignBits);
1052 
1053  if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1054  return true;
1055 
1056  const LLT PtrTy = Query.Types[1];
1057  unsigned AS = PtrTy.getAddressSpace();
1058  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1059  return true;
1060 
1061  // Catch weird sized loads that don't evenly divide into the access sizes
1062  // TODO: May be able to widen depending on alignment etc.
1063  unsigned NumRegs = (MemSize + 31) / 32;
1064  if (NumRegs == 3) {
1065  if (!ST.hasDwordx3LoadStores())
1066  return true;
1067  } else {
1068  // If the alignment allows, these should have been widened.
1069  if (!isPowerOf2_32(NumRegs))
1070  return true;
1071  }
1072 
1073  if (AlignBits < MemSize) {
1074  const SITargetLowering *TLI = ST.getTargetLowering();
1075  return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1076  Align(AlignBits / 8));
1077  }
1078 
1079  return false;
1080  };
1081 
1082  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1083  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1084  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1085 
1086  // TODO: Refine based on subtargets which support unaligned access or 128-bit
1087  // LDS
1088  // TODO: Unsupported flat for SI.
1089 
1090  for (unsigned Op : {G_LOAD, G_STORE}) {
1091  const bool IsStore = Op == G_STORE;
1092 
1093  auto &Actions = getActionDefinitionsBuilder(Op);
1094  // Explicitly list some common cases.
1095  // TODO: Does this help compile time at all?
1096  Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1097  {V2S32, GlobalPtr, 64, GlobalAlign32},
1098  {V4S32, GlobalPtr, 128, GlobalAlign32},
1099  {S64, GlobalPtr, 64, GlobalAlign32},
1100  {V2S64, GlobalPtr, 128, GlobalAlign32},
1101  {V2S16, GlobalPtr, 32, GlobalAlign32},
1102  {S32, GlobalPtr, 8, GlobalAlign8},
1103  {S32, GlobalPtr, 16, GlobalAlign16},
1104 
1105  {S32, LocalPtr, 32, 32},
1106  {S64, LocalPtr, 64, 32},
1107  {V2S32, LocalPtr, 64, 32},
1108  {S32, LocalPtr, 8, 8},
1109  {S32, LocalPtr, 16, 16},
1110  {V2S16, LocalPtr, 32, 32},
1111 
1112  {S32, PrivatePtr, 32, 32},
1113  {S32, PrivatePtr, 8, 8},
1114  {S32, PrivatePtr, 16, 16},
1115  {V2S16, PrivatePtr, 32, 32},
1116 
1117  {S32, ConstantPtr, 32, GlobalAlign32},
1118  {V2S32, ConstantPtr, 64, GlobalAlign32},
1119  {V4S32, ConstantPtr, 128, GlobalAlign32},
1120  {S64, ConstantPtr, 64, GlobalAlign32},
1121  {V2S32, ConstantPtr, 32, GlobalAlign32}});
1122  Actions.legalIf(
1123  [=](const LegalityQuery &Query) -> bool {
1124  return isLoadStoreLegal(ST, Query);
1125  });
1126 
1127  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1128  // 64-bits.
1129  //
1130  // TODO: Should generalize bitcast action into coerce, which will also cover
1131  // inserting addrspacecasts.
1132  Actions.customIf(typeIs(1, Constant32Ptr));
1133 
1134  // Turn any illegal element vectors into something easier to deal
1135  // with. These will ultimately produce 32-bit scalar shifts to extract the
1136  // parts anyway.
1137  //
1138  // For odd 16-bit element vectors, prefer to split those into pieces with
1139  // 16-bit vector parts.
1140  Actions.bitcastIf(
1141  [=](const LegalityQuery &Query) -> bool {
1142  return shouldBitcastLoadStoreType(ST, Query.Types[0],
1143  Query.MMODescrs[0].SizeInBits);
1144  }, bitcastToRegisterType(0));
1145 
1146  if (!IsStore) {
1147  // Widen suitably aligned loads by loading extra bytes. The standard
1148  // legalization actions can't properly express widening memory operands.
1149  Actions.customIf([=](const LegalityQuery &Query) -> bool {
1150  return shouldWidenLoad(ST, Query, G_LOAD);
1151  });
1152  }
1153 
1154  // FIXME: load/store narrowing should be moved to lower action
1155  Actions
1156  .narrowScalarIf(
1157  [=](const LegalityQuery &Query) -> bool {
1158  return !Query.Types[0].isVector() &&
1159  needToSplitMemOp(Query, Op == G_LOAD);
1160  },
1161  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1162  const LLT DstTy = Query.Types[0];
1163  const LLT PtrTy = Query.Types[1];
1164 
1165  const unsigned DstSize = DstTy.getSizeInBits();
1166  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1167 
1168  // Split extloads.
1169  if (DstSize > MemSize)
1170  return std::make_pair(0, LLT::scalar(MemSize));
1171 
1172  if (!isPowerOf2_32(DstSize)) {
1173  // We're probably decomposing an odd sized store. Try to split
1174  // to the widest type. TODO: Account for alignment. As-is it
1175  // should be OK, since the new parts will be further legalized.
1176  unsigned FloorSize = PowerOf2Floor(DstSize);
1177  return std::make_pair(0, LLT::scalar(FloorSize));
1178  }
1179 
1180  if (DstSize > 32 && (DstSize % 32 != 0)) {
1181  // FIXME: Need a way to specify non-extload of larger size if
1182  // suitably aligned.
1183  return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1184  }
1185 
1186  unsigned MaxSize = maxSizeForAddrSpace(ST,
1187  PtrTy.getAddressSpace(),
1188  Op == G_LOAD);
1189  if (MemSize > MaxSize)
1190  return std::make_pair(0, LLT::scalar(MaxSize));
1191 
1192  unsigned Align = Query.MMODescrs[0].AlignInBits;
1193  return std::make_pair(0, LLT::scalar(Align));
1194  })
1195  .fewerElementsIf(
1196  [=](const LegalityQuery &Query) -> bool {
1197  return Query.Types[0].isVector() &&
1198  needToSplitMemOp(Query, Op == G_LOAD);
1199  },
1200  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1201  const LLT DstTy = Query.Types[0];
1202  const LLT PtrTy = Query.Types[1];
1203 
1204  LLT EltTy = DstTy.getElementType();
1205  unsigned MaxSize = maxSizeForAddrSpace(ST,
1206  PtrTy.getAddressSpace(),
1207  Op == G_LOAD);
1208 
1209  // FIXME: Handle widened to power of 2 results better. This ends
1210  // up scalarizing.
1211  // FIXME: 3 element stores scalarized on SI
1212 
1213  // Split if it's too large for the address space.
1214  if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1215  unsigned NumElts = DstTy.getNumElements();
1216  unsigned EltSize = EltTy.getSizeInBits();
1217 
1218  if (MaxSize % EltSize == 0) {
1219  return std::make_pair(
1220  0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1221  }
1222 
1223  unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1224 
1225  // FIXME: Refine when odd breakdowns handled
1226  // The scalars will need to be re-legalized.
1227  if (NumPieces == 1 || NumPieces >= NumElts ||
1228  NumElts % NumPieces != 0)
1229  return std::make_pair(0, EltTy);
1230 
1231  return std::make_pair(0,
1232  LLT::vector(NumElts / NumPieces, EltTy));
1233  }
1234 
1235  // FIXME: We could probably handle weird extending loads better.
1236  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1237  if (DstTy.getSizeInBits() > MemSize)
1238  return std::make_pair(0, EltTy);
1239 
1240  unsigned EltSize = EltTy.getSizeInBits();
1241  unsigned DstSize = DstTy.getSizeInBits();
1242  if (!isPowerOf2_32(DstSize)) {
1243  // We're probably decomposing an odd sized store. Try to split
1244  // to the widest type. TODO: Account for alignment. As-is it
1245  // should be OK, since the new parts will be further legalized.
1246  unsigned FloorSize = PowerOf2Floor(DstSize);
1247  return std::make_pair(
1248  0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1249  }
1250 
1251  // Need to split because of alignment.
1252  unsigned Align = Query.MMODescrs[0].AlignInBits;
1253  if (EltSize > Align &&
1254  (EltSize / Align < DstTy.getNumElements())) {
1255  return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1256  }
1257 
1258  // May need relegalization for the scalars.
1259  return std::make_pair(0, EltTy);
1260  })
1261  .lowerIfMemSizeNotPow2()
1262  .minScalar(0, S32)
1263  .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1264  .widenScalarToNextPow2(0)
1265  .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1266  .lower();
1267  }
1268 
1269  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1270  .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1271  {S32, GlobalPtr, 16, 2 * 8},
1272  {S32, LocalPtr, 8, 8},
1273  {S32, LocalPtr, 16, 16},
1274  {S32, PrivatePtr, 8, 8},
1275  {S32, PrivatePtr, 16, 16},
1276  {S32, ConstantPtr, 8, 8},
1277  {S32, ConstantPtr, 16, 2 * 8}})
1278  .legalIf(
1279  [=](const LegalityQuery &Query) -> bool {
1280  return isLoadStoreLegal(ST, Query);
1281  });
1282 
1283  if (ST.hasFlatAddressSpace()) {
1284  ExtLoads.legalForTypesWithMemDesc(
1285  {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1286  }
1287 
1288  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1289  // 64-bits.
1290  //
1291  // TODO: Should generalize bitcast action into coerce, which will also cover
1292  // inserting addrspacecasts.
1293  ExtLoads.customIf(typeIs(1, Constant32Ptr));
1294 
1295  ExtLoads.clampScalar(0, S32, S32)
1296  .widenScalarToNextPow2(0)
1297  .unsupportedIfMemSizeNotPow2()
1298  .lower();
1299 
1300  auto &Atomics = getActionDefinitionsBuilder(
1301  {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1302  G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1303  G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1304  G_ATOMICRMW_UMIN})
1305  .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1306  {S64, GlobalPtr}, {S64, LocalPtr},
1307  {S32, RegionPtr}, {S64, RegionPtr}});
1308  if (ST.hasFlatAddressSpace()) {
1309  Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1310  }
1311 
1312  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1313  if (ST.hasLDSFPAtomics()) {
1314  Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1315  if (ST.hasGFX90AInsts())
1316  Atomic.legalFor({{S64, LocalPtr}});
1317  }
1318  if (ST.hasAtomicFaddInsts())
1319  Atomic.legalFor({{S32, GlobalPtr}});
1320 
1321  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1322  // demarshalling
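  // (Roughly: the compare and new values are packed into adjacent registers for
  // the instruction, and only the low half of the returned pair is the loaded
  // value, which is what the custom lowering sorts out.)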
1323  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1324  .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1325  {S32, FlatPtr}, {S64, FlatPtr}})
1326  .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1327  {S32, RegionPtr}, {S64, RegionPtr}});
1328  // TODO: Pointer types, any 32-bit or 64-bit vector
1329 
1330  // Condition should be s32 for scalar, s1 for vector.
1331  getActionDefinitionsBuilder(G_SELECT)
1332  .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1333  GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1334  LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1335  .clampScalar(0, S16, S64)
1336  .scalarize(1)
1337  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1338  .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1339  .clampMaxNumElements(0, S32, 2)
1340  .clampMaxNumElements(0, LocalPtr, 2)
1341  .clampMaxNumElements(0, PrivatePtr, 2)
1342  .scalarize(0)
1343  .widenScalarToNextPow2(0)
1344  .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1345 
1346  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1347  // be more flexible with the shift amount type.
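  // (A 64-bit shift only reads the low 6 bits of the amount and a 32-bit shift
  // the low 5, so a 32-bit amount register is always sufficient.)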
1348  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1349  .legalFor({{S32, S32}, {S64, S32}});
1350  if (ST.has16BitInsts()) {
1351  if (ST.hasVOP3PInsts()) {
1352  Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1353  .clampMaxNumElements(0, S16, 2);
1354  } else
1355  Shifts.legalFor({{S16, S16}});
1356 
1357  // TODO: Support 16-bit shift amounts for all types
1358  Shifts.widenScalarIf(
1359  [=](const LegalityQuery &Query) {
1360  // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1361  // 32-bit amount.
1362  const LLT ValTy = Query.Types[0];
1363  const LLT AmountTy = Query.Types[1];
1364  return ValTy.getSizeInBits() <= 16 &&
1365  AmountTy.getSizeInBits() < 16;
1366  }, changeTo(1, S16));
1367  Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1368  Shifts.clampScalar(1, S32, S32);
1369  Shifts.clampScalar(0, S16, S64);
1370  Shifts.widenScalarToNextPow2(0, 16);
1371 
1372  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1373  .minScalar(0, S16)
1374  .scalarize(0)
1375  .lower();
1376  } else {
1377  // Make sure we legalize the shift amount type first, as the general
1378  // expansion for the shifted type will produce much worse code if it hasn't
1379  // been truncated already.
1380  Shifts.clampScalar(1, S32, S32);
1381  Shifts.clampScalar(0, S32, S64);
1382  Shifts.widenScalarToNextPow2(0, 32);
1383 
1384  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1385  .minScalar(0, S32)
1386  .scalarize(0)
1387  .lower();
1388  }
1389  Shifts.scalarize(0);
1390 
1391  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1392  unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1393  unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1394  unsigned IdxTypeIdx = 2;
1395 
1396  getActionDefinitionsBuilder(Op)
1397  .customIf([=](const LegalityQuery &Query) {
1398  const LLT EltTy = Query.Types[EltTypeIdx];
1399  const LLT VecTy = Query.Types[VecTypeIdx];
1400  const LLT IdxTy = Query.Types[IdxTypeIdx];
1401  const unsigned EltSize = EltTy.getSizeInBits();
1402  return (EltSize == 32 || EltSize == 64) &&
1403  VecTy.getSizeInBits() % 32 == 0 &&
1404  VecTy.getSizeInBits() <= MaxRegisterSize &&
1405  IdxTy.getSizeInBits() == 32;
1406  })
1407  .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1408  bitcastToVectorElement32(VecTypeIdx))
1409  //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1410  .bitcastIf(
1411  all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1412  [=](const LegalityQuery &Query) {
1413  // For > 64-bit element types, try to turn this into a 64-bit
1414  // element vector since we may be able to do better indexing
1415  // if this is scalar. If not, fall back to 32.
1416  const LLT EltTy = Query.Types[EltTypeIdx];
1417  const LLT VecTy = Query.Types[VecTypeIdx];
1418  const unsigned DstEltSize = EltTy.getSizeInBits();
1419  const unsigned VecSize = VecTy.getSizeInBits();
1420 
1421  const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1422  return std::make_pair(
1423  VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1424  })
1425  .clampScalar(EltTypeIdx, S32, S64)
1426  .clampScalar(VecTypeIdx, S32, S64)
1427  .clampScalar(IdxTypeIdx, S32, S32)
1428  .clampMaxNumElements(VecTypeIdx, S32, 32)
1429  // TODO: Clamp elements for 64-bit vectors?
1430  // It should only be necessary with variable indexes.
1431  // As a last resort, lower to the stack
1432  .lower();
1433  }
1434 
1435  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1436  .unsupportedIf([=](const LegalityQuery &Query) {
1437  const LLT &EltTy = Query.Types[1].getElementType();
1438  return Query.Types[0] != EltTy;
1439  });
1440 
1441  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1442  unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1443  unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1444 
1445  // FIXME: Doesn't handle extract of illegal sizes.
1446  getActionDefinitionsBuilder(Op)
1447  .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1448  // FIXME: Multiples of 16 should not be legal.
1449  .legalIf([=](const LegalityQuery &Query) {
1450  const LLT BigTy = Query.Types[BigTyIdx];
1451  const LLT LitTy = Query.Types[LitTyIdx];
1452  return (BigTy.getSizeInBits() % 32 == 0) &&
1453  (LitTy.getSizeInBits() % 16 == 0);
1454  })
1455  .widenScalarIf(
1456  [=](const LegalityQuery &Query) {
1457  const LLT BigTy = Query.Types[BigTyIdx];
1458  return (BigTy.getScalarSizeInBits() < 16);
1459          },
1460          LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1461       .widenScalarIf(
1462  [=](const LegalityQuery &Query) {
1463  const LLT LitTy = Query.Types[LitTyIdx];
1464  return (LitTy.getScalarSizeInBits() < 16);
1465          },
1466          LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1467       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1468  .widenScalarToNextPow2(BigTyIdx, 32);
1469 
1470  }
1471 
1472  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1473  .legalForCartesianProduct(AllS32Vectors, {S32})
1474  .legalForCartesianProduct(AllS64Vectors, {S64})
1475  .clampNumElements(0, V16S32, V32S32)
1476  .clampNumElements(0, V2S64, V16S64)
1477  .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1478 
1479  if (ST.hasScalarPackInsts()) {
1480  BuildVector
1481  // FIXME: Should probably widen s1 vectors straight to s32
1482  .minScalarOrElt(0, S16)
1483  // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1484  .minScalar(1, S32);
1485 
1486  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1487  .legalFor({V2S16, S32})
1488  .lower();
1489  BuildVector.minScalarOrElt(0, S32);
1490  } else {
1491  BuildVector.customFor({V2S16, S16});
1492  BuildVector.minScalarOrElt(0, S32);
1493 
1494  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1495  .customFor({V2S16, S32})
1496  .lower();
1497  }
1498 
1499  BuildVector.legalIf(isRegisterType(0));
1500 
1501  // FIXME: Clamp maximum size
1502  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1503  .legalIf(all(isRegisterType(0), isRegisterType(1)))
1504  .clampMaxNumElements(0, S32, 32)
1505  .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1506  .clampMaxNumElements(0, S16, 64);
1507 
1508 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1509  // pre-legalize.
1510  if (ST.hasVOP3PInsts()) {
1511  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1512  .customFor({V2S16, V2S16})
1513  .lower();
1514  } else
1515  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1516 
1517  // Merge/Unmerge
1518  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1519  unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1520  unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1521 
1522  auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1523  const LLT Ty = Query.Types[TypeIdx];
1524  if (Ty.isVector()) {
1525  const LLT &EltTy = Ty.getElementType();
1526  if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1527  return true;
1528  if (!isPowerOf2_32(EltTy.getSizeInBits()))
1529  return true;
1530  }
1531  return false;
1532  };
1533 
1534  auto &Builder = getActionDefinitionsBuilder(Op)
1535  .legalIf(all(isRegisterType(0), isRegisterType(1)))
1536  .lowerFor({{S16, V2S16}})
1537  .lowerIf([=](const LegalityQuery &Query) {
1538  const LLT BigTy = Query.Types[BigTyIdx];
1539  return BigTy.getSizeInBits() == 32;
1540  })
1541  // Try to widen to s16 first for small types.
1542  // TODO: Only do this on targets with legal s16 shifts
1543  .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1544  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1545  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1546  .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1547  elementTypeIs(1, S16)),
1548  changeTo(1, V2S16))
1549  // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1550  // worth considering the multiples of 64 since 2*192 and 2*384 are not
1551  // valid.
1552  .clampScalar(LitTyIdx, S32, S512)
1553  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1554  // Break up vectors with weird elements into scalars
1555  .fewerElementsIf(
1556  [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1557  scalarize(0))
1558  .fewerElementsIf(
1559  [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1560  scalarize(1))
1561  .clampScalar(BigTyIdx, S32, MaxScalar);
1562 
1563  if (Op == G_MERGE_VALUES) {
1564  Builder.widenScalarIf(
1565  // TODO: Use 16-bit shifts if legal for 8-bit values?
1566  [=](const LegalityQuery &Query) {
1567  const LLT Ty = Query.Types[LitTyIdx];
1568  return Ty.getSizeInBits() < 32;
1569  },
1570  changeTo(LitTyIdx, S32));
1571  }
1572 
1573  Builder.widenScalarIf(
1574  [=](const LegalityQuery &Query) {
1575  const LLT Ty = Query.Types[BigTyIdx];
1576  return !isPowerOf2_32(Ty.getSizeInBits()) &&
1577  Ty.getSizeInBits() % 16 != 0;
1578  },
1579  [=](const LegalityQuery &Query) {
1580  // Pick the next power of 2, or a multiple of 64 over 128.
1581  // Whichever is smaller.
1582  const LLT &Ty = Query.Types[BigTyIdx];
1583  unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1584  if (NewSizeInBits >= 256) {
1585  unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1586  if (RoundedTo < NewSizeInBits)
1587  NewSizeInBits = RoundedTo;
1588  }
1589  return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1590  })
1591  // Any vectors left are the wrong size. Scalarize them.
1592  .scalarize(0)
1593  .scalarize(1);
1594  }
1595 
1596  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1597  // RegBankSelect.
1598  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1599  .legalFor({{S32}, {S64}});
1600 
1601  if (ST.hasVOP3PInsts()) {
1602  SextInReg.lowerFor({{V2S16}})
1603  // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1604  // get more vector shift opportunities, since we'll get those when
1605  // expanded.
1606  .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1607  } else if (ST.has16BitInsts()) {
1608  SextInReg.lowerFor({{S32}, {S64}, {S16}});
1609  } else {
1610  // Prefer to promote to s32 before lowering if we don't have 16-bit
1611   // shifts. This avoids a lot of intermediate truncate and extend operations.
1612  SextInReg.lowerFor({{S32}, {S64}});
1613  }
1614 
1615  SextInReg
1616  .scalarize(0)
1617  .clampScalar(0, S32, S64)
1618  .lower();
1619 
1620   // TODO: Only try to form v2s16 with legal packed instructions.
1621  getActionDefinitionsBuilder(G_FSHR)
1622  .legalFor({{S32, S32}})
1623  .lowerFor({{V2S16, V2S16}})
1624  .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1625  .scalarize(0)
1626  .lower();
1627 
1628  if (ST.hasVOP3PInsts()) {
1629  getActionDefinitionsBuilder(G_FSHL)
1630  .lowerFor({{V2S16, V2S16}})
1631  .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1632  .scalarize(0)
1633  .lower();
1634  } else {
1635  getActionDefinitionsBuilder(G_FSHL)
1636  .scalarize(0)
1637  .lower();
1638  }
1639 
1640  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1641  .legalFor({S64});
1642 
1643  getActionDefinitionsBuilder(G_FENCE)
1644  .alwaysLegal();
1645 
1646  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1647  .scalarize(0)
1648  .minScalar(0, S32)
1649  .lower();
1650 
1651  getActionDefinitionsBuilder({
1652  // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1653  G_FCOPYSIGN,
1654 
1655  G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1656  G_ATOMICRMW_NAND,
1657  G_ATOMICRMW_FSUB,
1658  G_READ_REGISTER,
1659  G_WRITE_REGISTER,
1660 
1661  G_SADDO, G_SSUBO,
1662 
1663  // TODO: Implement
1664  G_FMINIMUM, G_FMAXIMUM}).lower();
1665 
1666  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1667  G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1668  G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1669  .unsupported();
1670 
1671  getLegacyLegalizerInfo().computeTables();
1672  verify(*ST.getInstrInfo());
1673 }
1674 
1675 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1676  MachineInstr &MI) const {
1677  MachineIRBuilder &B = Helper.MIRBuilder;
1678  MachineRegisterInfo &MRI = *B.getMRI();
1679 
1680  switch (MI.getOpcode()) {
1681  case TargetOpcode::G_ADDRSPACE_CAST:
1682  return legalizeAddrSpaceCast(MI, MRI, B);
1683  case TargetOpcode::G_FRINT:
1684  return legalizeFrint(MI, MRI, B);
1685  case TargetOpcode::G_FCEIL:
1686  return legalizeFceil(MI, MRI, B);
1687  case TargetOpcode::G_FREM:
1688  return legalizeFrem(MI, MRI, B);
1689  case TargetOpcode::G_INTRINSIC_TRUNC:
1690  return legalizeIntrinsicTrunc(MI, MRI, B);
1691  case TargetOpcode::G_SITOFP:
1692  return legalizeITOFP(MI, MRI, B, true);
1693  case TargetOpcode::G_UITOFP:
1694  return legalizeITOFP(MI, MRI, B, false);
1695  case TargetOpcode::G_FPTOSI:
1696  return legalizeFPTOI(MI, MRI, B, true);
1697  case TargetOpcode::G_FPTOUI:
1698  return legalizeFPTOI(MI, MRI, B, false);
1699  case TargetOpcode::G_FMINNUM:
1700  case TargetOpcode::G_FMAXNUM:
1701  case TargetOpcode::G_FMINNUM_IEEE:
1702  case TargetOpcode::G_FMAXNUM_IEEE:
1703  return legalizeMinNumMaxNum(Helper, MI);
1704  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1705  return legalizeExtractVectorElt(MI, MRI, B);
1706  case TargetOpcode::G_INSERT_VECTOR_ELT:
1707  return legalizeInsertVectorElt(MI, MRI, B);
1708  case TargetOpcode::G_SHUFFLE_VECTOR:
1709  return legalizeShuffleVector(MI, MRI, B);
1710  case TargetOpcode::G_FSIN:
1711  case TargetOpcode::G_FCOS:
1712  return legalizeSinCos(MI, MRI, B);
1713  case TargetOpcode::G_GLOBAL_VALUE:
1714  return legalizeGlobalValue(MI, MRI, B);
1715  case TargetOpcode::G_LOAD:
1716  case TargetOpcode::G_SEXTLOAD:
1717  case TargetOpcode::G_ZEXTLOAD:
1718  return legalizeLoad(Helper, MI);
1719  case TargetOpcode::G_FMAD:
1720  return legalizeFMad(MI, MRI, B);
1721  case TargetOpcode::G_FDIV:
1722  return legalizeFDIV(MI, MRI, B);
1723  case TargetOpcode::G_UDIV:
1724  case TargetOpcode::G_UREM:
1725  case TargetOpcode::G_UDIVREM:
1726  return legalizeUnsignedDIV_REM(MI, MRI, B);
1727  case TargetOpcode::G_SDIV:
1728  case TargetOpcode::G_SREM:
1729  case TargetOpcode::G_SDIVREM:
1730  return legalizeSignedDIV_REM(MI, MRI, B);
1731  case TargetOpcode::G_ATOMIC_CMPXCHG:
1732  return legalizeAtomicCmpXChg(MI, MRI, B);
1733  case TargetOpcode::G_FLOG:
1734  return legalizeFlog(MI, B, numbers::ln2f);
1735  case TargetOpcode::G_FLOG10:
1736  return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1737  case TargetOpcode::G_FEXP:
1738  return legalizeFExp(MI, B);
1739  case TargetOpcode::G_FPOW:
1740  return legalizeFPow(MI, B);
1741  case TargetOpcode::G_FFLOOR:
1742  return legalizeFFloor(MI, MRI, B);
1743  case TargetOpcode::G_BUILD_VECTOR:
1744  return legalizeBuildVector(MI, MRI, B);
1745  default:
1746  return false;
1747  }
1748 
1749  llvm_unreachable("expected switch to return");
1750 }
1751 
1752 Register AMDGPULegalizerInfo::getSegmentAperture(
1753  unsigned AS,
1754  MachineRegisterInfo &MRI,
1755  MachineIRBuilder &B) const {
1756  MachineFunction &MF = B.getMF();
1757  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1758  const LLT S32 = LLT::scalar(32);
1759 
1760  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1761 
1762  if (ST.hasApertureRegs()) {
1763  // FIXME: Use inline constants (src_{shared, private}_base) instead of
1764  // getreg.
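// Reading of the getreg path below: S_GETREG_B32 pulls the 16-bit aperture
// base field out of the hardware register selected by Encoding, and the left
// shift by WidthM1 + 1 (i.e. 16) rebuilds the high 32 bits of the 64-bit
// segment aperture address, which the address space cast later merges above
// the 32-bit segment pointer.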
1765  unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1768  unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1771  unsigned Encoding =
1774  WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1775 
1776  Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1777 
1778  B.buildInstr(AMDGPU::S_GETREG_B32)
1779  .addDef(GetReg)
1780  .addImm(Encoding);
1781  MRI.setType(GetReg, S32);
1782 
1783  auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1784  return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1785  }
1786 
1787  Register QueuePtr = MRI.createGenericVirtualRegister(
1788  LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1789 
1790  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1791  return Register();
1792 
1793  // Offset into amd_queue_t for group_segment_aperture_base_hi /
1794  // private_segment_aperture_base_hi.
1795  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1796 
1797  // TODO: can we be smarter about machine pointer info?
1798  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1799  MachineMemOperand *MMO = MF.getMachineMemOperand(
1800  PtrInfo,
1801  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1802  MachineMemOperand::MOInvariant,
1803  4, commonAlignment(Align(64), StructOffset));
1804 
1805  Register LoadAddr;
1806 
1807  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1808  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1809 }
1810 
1813  MachineIRBuilder &B) const {
1814  MachineFunction &MF = B.getMF();
1815 
1816  const LLT S32 = LLT::scalar(32);
1817  Register Dst = MI.getOperand(0).getReg();
1818  Register Src = MI.getOperand(1).getReg();
1819 
1820  LLT DstTy = MRI.getType(Dst);
1821  LLT SrcTy = MRI.getType(Src);
1822  unsigned DestAS = DstTy.getAddressSpace();
1823  unsigned SrcAS = SrcTy.getAddressSpace();
1824 
1825  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1826  // vector element.
1827  assert(!DstTy.isVector());
1828 
1829  const AMDGPUTargetMachine &TM
1830  = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1831 
1832  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1833  MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1834  return true;
1835  }
1836 
1837  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1838  // Truncate.
1839  B.buildExtract(Dst, Src, 0);
1840  MI.eraseFromParent();
1841  return true;
1842  }
1843 
1844  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1845  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1846  uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1847 
1848  // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1849  // another. Merge operands are required to be the same type, but creating an
1850  // extra ptrtoint would be kind of pointless.
1851  auto HighAddr = B.buildConstant(
1852  LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1853  B.buildMerge(Dst, {Src, HighAddr});
1854  MI.eraseFromParent();
1855  return true;
1856  }
1857 
1858  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1859  assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1860  DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1861  unsigned NullVal = TM.getNullPointerValue(DestAS);
1862 
1863  auto SegmentNull = B.buildConstant(DstTy, NullVal);
1864  auto FlatNull = B.buildConstant(SrcTy, 0);
1865 
1866  // Extract low 32-bits of the pointer.
1867  auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1868 
1869  auto CmpRes =
1870  B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1871  B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1872 
1873  MI.eraseFromParent();
1874  return true;
1875  }
1876 
1877  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1878  return false;
1879 
1880  if (!ST.hasFlatAddressSpace())
1881  return false;
1882 
1883  auto SegmentNull =
1884  B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1885  auto FlatNull =
1886  B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1887 
1888  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1889  if (!ApertureReg.isValid())
1890  return false;
1891 
1892  auto CmpRes =
1893  B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1894 
1895  // Coerce the type of the low half of the result so we can use merge_values.
1896  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1897 
1898  // TODO: Should we allow mismatched types but matching sizes in merges to
1899  // avoid the ptrtoint?
1900  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1901  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1902 
1903  MI.eraseFromParent();
1904  return true;
1905 }
1906 
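// How the G_FRINT expansion below works: adding copysign(2^52, x) to x pushes
// all fraction bits out of the f64 mantissa, so the add rounds x to an
// integer under the prevailing round-to-nearest-even mode, and subtracting
// the same constant recovers the rounded value. Inputs with
// |x| > 0x1.fffffffffffffp+51 are already integral and are passed through
// unchanged by the final select.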
1909  MachineIRBuilder &B) const {
1910  Register Src = MI.getOperand(1).getReg();
1911  LLT Ty = MRI.getType(Src);
1912  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1913 
1914  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1915  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1916 
1917  auto C1 = B.buildFConstant(Ty, C1Val);
1918  auto CopySign = B.buildFCopysign(Ty, C1, Src);
1919 
1920  // TODO: Should this propagate fast-math-flags?
1921  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1922  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1923 
1924  auto C2 = B.buildFConstant(Ty, C2Val);
1925  auto Fabs = B.buildFAbs(Ty, Src);
1926 
1927  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1928  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1929  MI.eraseFromParent();
1930  return true;
1931 }
1932 
1935  MachineIRBuilder &B) const {
1936 
1937  const LLT S1 = LLT::scalar(1);
1938  const LLT S64 = LLT::scalar(64);
1939 
1940  Register Src = MI.getOperand(1).getReg();
1941  assert(MRI.getType(Src) == S64);
1942 
1943  // result = trunc(src)
1944  // if (src > 0.0 && src != result)
1945  // result += 1.0
1946 
1947  auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1948 
1949  const auto Zero = B.buildFConstant(S64, 0.0);
1950  const auto One = B.buildFConstant(S64, 1.0);
1951  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1952  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1953  auto And = B.buildAnd(S1, Lt0, NeTrunc);
1954  auto Add = B.buildSelect(S64, And, One, Zero);
1955 
1956  // TODO: Should this propagate fast-math-flags?
1957  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1958  return true;
1959 }
1960 
1963  MachineIRBuilder &B) const {
1964  Register DstReg = MI.getOperand(0).getReg();
1965  Register Src0Reg = MI.getOperand(1).getReg();
1966  Register Src1Reg = MI.getOperand(2).getReg();
1967  auto Flags = MI.getFlags();
1968  LLT Ty = MRI.getType(DstReg);
1969 
1970  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1971  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1972  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1973  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1974  MI.eraseFromParent();
1975  return true;
1976 }
1977 
1979  MachineIRBuilder &B) {
1980  const unsigned FractBits = 52;
1981  const unsigned ExpBits = 11;
1982  LLT S32 = LLT::scalar(32);
1983 
1984  auto Const0 = B.buildConstant(S32, FractBits - 32);
1985  auto Const1 = B.buildConstant(S32, ExpBits);
1986 
1987  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1988  .addUse(Hi)
1989  .addUse(Const0.getReg(0))
1990  .addUse(Const1.getReg(0));
1991 
1992  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1993 }
1994 
1997  MachineIRBuilder &B) const {
1998  const LLT S1 = LLT::scalar(1);
1999  const LLT S32 = LLT::scalar(32);
2000  const LLT S64 = LLT::scalar(64);
2001 
2002  Register Src = MI.getOperand(1).getReg();
2003  assert(MRI.getType(Src) == S64);
2004 
2005  // TODO: Should this use extract since the low half is unused?
2006  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2007  Register Hi = Unmerge.getReg(1);
2008 
2009  // Extract the upper half, since this is where we will find the sign and
2010  // exponent.
2011  auto Exp = extractF64Exponent(Hi, B);
2012 
2013  const unsigned FractBits = 52;
2014 
2015  // Extract the sign bit.
2016  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2017  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2018 
2019  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2020 
2021  const auto Zero32 = B.buildConstant(S32, 0);
2022 
2023  // Extend back to 64-bits.
2024  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2025 
2026  auto Shr = B.buildAShr(S64, FractMask, Exp);
2027  auto Not = B.buildNot(S64, Shr);
2028  auto Tmp0 = B.buildAnd(S64, Src, Not);
2029  auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2030 
2031  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2032  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2033 
2034  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2035  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2036  MI.eraseFromParent();
2037  return true;
2038 }
2039 
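// How the [SU]ITOFP expansion below works: the 64-bit source is split into
// 32-bit halves and converted as
//   fp(x) = ldexp(fp(hi), 32) + fp(lo)   i.e. fp(hi) * 2^32 + fp(lo)
// Only the high half carries the sign, so it uses a signed conversion for
// G_SITOFP while the low half is always converted as unsigned.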
2040 bool AMDGPULegalizerInfo::legalizeITOFP(MachineInstr &MI,
2041  MachineRegisterInfo &MRI,
2042  MachineIRBuilder &B, bool Signed) const {
2043 
2044  Register Dst = MI.getOperand(0).getReg();
2045  Register Src = MI.getOperand(1).getReg();
2046 
2047  const LLT S64 = LLT::scalar(64);
2048  const LLT S32 = LLT::scalar(32);
2049 
2050  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2051 
2052  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2053 
2054  auto CvtHi = Signed ?
2055  B.buildSITOFP(S64, Unmerge.getReg(1)) :
2056  B.buildUITOFP(S64, Unmerge.getReg(1));
2057 
2058  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2059 
2060  auto ThirtyTwo = B.buildConstant(S32, 32);
2061  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2062  .addUse(CvtHi.getReg(0))
2063  .addUse(ThirtyTwo.getReg(0));
2064 
2065  // TODO: Should this propagate fast-math-flags?
2066  B.buildFAdd(Dst, LdExp, CvtLo);
2067  MI.eraseFromParent();
2068  return true;
2069 }
2070 
2071 // TODO: Copied from DAG implementation. Verify logic and document how this
2072 // actually works.
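// Sketch of the math, as a reading of the constants below: K0
// (0x3df0000000000000) is 2^-32 and K1 (0xc1f0000000000000) is -(2^32).
// With T = trunc(x):
//   Hi = fpto[su]i(floor(T * 2^-32))                 // upper 32 bits
//   Lo = fptoui(fma(floor(T * 2^-32), -(2^32), T))   // remainder after Hi*2^32
// and the two halves are merged into the 64-bit result.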
2073 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2074  MachineRegisterInfo &MRI,
2075  MachineIRBuilder &B, bool Signed) const {
2076 
2077  Register Dst = MI.getOperand(0).getReg();
2078  Register Src = MI.getOperand(1).getReg();
2079 
2080  const LLT S64 = LLT::scalar(64);
2081  const LLT S32 = LLT::scalar(32);
2082 
2083  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2084 
2085  unsigned Flags = MI.getFlags();
2086 
2087  auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2088  auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2089  auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2090 
2091  auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2092  auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2093  auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2094 
2095  auto Hi = Signed ?
2096  B.buildFPTOSI(S32, FloorMul) :
2097  B.buildFPTOUI(S32, FloorMul);
2098  auto Lo = B.buildFPTOUI(S32, Fma);
2099 
2100  B.buildMerge(Dst, { Lo, Hi });
2101  MI.eraseFromParent();
2102 
2103  return true;
2104 }
2105 
2107  MachineInstr &MI) const {
2108  MachineFunction &MF = Helper.MIRBuilder.getMF();
2109  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2110 
2111  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2112  MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2113 
2114  // With ieee_mode disabled, the instructions have the correct behavior
2115  // already for G_FMINNUM/G_FMAXNUM
2116  if (!MFI->getMode().IEEE)
2117  return !IsIEEEOp;
2118 
2119  if (IsIEEEOp)
2120  return true;
2121 
2122  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2123 }
2124 
2127  MachineIRBuilder &B) const {
2128  // TODO: Should move some of this into LegalizerHelper.
2129 
2130  // TODO: Promote dynamic indexing of s16 to s32
2131 
2132  // FIXME: Artifact combiner probably should have replaced the truncated
2133  // constant before this, so we shouldn't need
2134  // getConstantVRegValWithLookThrough.
2135  Optional<ValueAndVReg> MaybeIdxVal =
2136  getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2137  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2138  return true;
2139  const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2140 
2141  Register Dst = MI.getOperand(0).getReg();
2142  Register Vec = MI.getOperand(1).getReg();
2143 
2144  LLT VecTy = MRI.getType(Vec);
2145  LLT EltTy = VecTy.getElementType();
2146  assert(EltTy == MRI.getType(Dst));
2147 
2148  if (IdxVal < VecTy.getNumElements())
2149  B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2150  else
2151  B.buildUndef(Dst);
2152 
2153  MI.eraseFromParent();
2154  return true;
2155 }
2156 
2159  MachineIRBuilder &B) const {
2160  // TODO: Should move some of this into LegalizerHelper.
2161 
2162  // TODO: Promote dynamic indexing of s16 to s32
2163 
2164  // FIXME: Artifact combiner probably should have replaced the truncated
2165  // constant before this, so we shouldn't need
2166  // getConstantVRegValWithLookThrough.
2167  Optional<ValueAndVReg> MaybeIdxVal =
2168  getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2169  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2170  return true;
2171 
2172  int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2173  Register Dst = MI.getOperand(0).getReg();
2174  Register Vec = MI.getOperand(1).getReg();
2175  Register Ins = MI.getOperand(2).getReg();
2176 
2177  LLT VecTy = MRI.getType(Vec);
2178  LLT EltTy = VecTy.getElementType();
2179  assert(EltTy == MRI.getType(Ins));
2180 
2181  if (IdxVal < VecTy.getNumElements())
2182  B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2183  else
2184  B.buildUndef(Dst);
2185 
2186  MI.eraseFromParent();
2187  return true;
2188 }
2189 
2192  MachineIRBuilder &B) const {
2193  const LLT V2S16 = LLT::vector(2, 16);
2194 
2195  Register Dst = MI.getOperand(0).getReg();
2196  Register Src0 = MI.getOperand(1).getReg();
2197  LLT DstTy = MRI.getType(Dst);
2198  LLT SrcTy = MRI.getType(Src0);
2199 
2200  if (SrcTy == V2S16 && DstTy == V2S16 &&
2201  AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2202  return true;
2203 
2204  MachineIRBuilder HelperBuilder(MI);
2205  GISelObserverWrapper DummyObserver;
2206  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2208 }
2209 
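// How the G_FSIN/G_FCOS lowering below works: the hardware sin/cos intrinsics
// take their input as a fraction of a full revolution, so the source is
// pre-multiplied by 1/(2*pi). On subtargets where the hardware only accepts a
// reduced input range, amdgcn.fract is applied to the scaled value first to
// bring it into [0, 1).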
2212  MachineIRBuilder &B) const {
2213 
2214  Register DstReg = MI.getOperand(0).getReg();
2215  Register SrcReg = MI.getOperand(1).getReg();
2216  LLT Ty = MRI.getType(DstReg);
2217  unsigned Flags = MI.getFlags();
2218 
2219  Register TrigVal;
2220  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2221  if (ST.hasTrigReducedRange()) {
2222  auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2223  TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2224  .addUse(MulVal.getReg(0))
2225  .setMIFlags(Flags).getReg(0);
2226  } else
2227  TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2228 
2229  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2230  Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2231  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2232  .addUse(TrigVal)
2233  .setMIFlags(Flags);
2234  MI.eraseFromParent();
2235  return true;
2236 }
2237 
2240  const GlobalValue *GV,
2241  int64_t Offset,
2242  unsigned GAFlags) const {
2243  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2244  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2245  // to the following code sequence:
2246  //
2247  // For constant address space:
2248  // s_getpc_b64 s[0:1]
2249  // s_add_u32 s0, s0, $symbol
2250  // s_addc_u32 s1, s1, 0
2251  //
2252  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2253  // a fixup or relocation is emitted to replace $symbol with a literal
2254  // constant, which is a pc-relative offset from the encoding of the $symbol
2255  // operand to the global variable.
2256  //
2257  // For global address space:
2258  // s_getpc_b64 s[0:1]
2259  // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2260  // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2261  //
2262  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2263  // fixups or relocations are emitted to replace $symbol@*@lo and
2264  // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2265  // which is a 64-bit pc-relative offset from the encoding of the $symbol
2266  // operand to the global variable.
2267  //
2268  // What we want here is an offset from the value returned by s_getpc
2269  // (which is the address of the s_add_u32 instruction) to the global
2270  // variable, but since the encoding of $symbol starts 4 bytes after the start
2271  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2272  // small. This requires us to add 4 to the global variable offset in order to
2273  // compute the correct address. Similarly for the s_addc_u32 instruction, the
2274  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2275  // instruction.
2276 
2277  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2278 
2279  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2280  B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2281 
2282  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2283  .addDef(PCReg);
2284 
2285  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2286  if (GAFlags == SIInstrInfo::MO_NONE)
2287  MIB.addImm(0);
2288  else
2289  MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2290 
2291  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2292 
2293  if (PtrTy.getSizeInBits() == 32)
2294  B.buildExtract(DstReg, PCReg, 0);
2295  return true;
2296  }
2297 
2300  MachineIRBuilder &B) const {
2301  Register DstReg = MI.getOperand(0).getReg();
2302  LLT Ty = MRI.getType(DstReg);
2303  unsigned AS = Ty.getAddressSpace();
2304 
2305  const GlobalValue *GV = MI.getOperand(1).getGlobal();
2306  MachineFunction &MF = B.getMF();
2307  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2308 
2309  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2310  if (!MFI->isModuleEntryFunction() &&
2311  !GV->getName().equals("llvm.amdgcn.module.lds")) {
2312  const Function &Fn = MF.getFunction();
2313  DiagnosticInfoUnsupported BadLDSDecl(
2314  Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2315  DS_Warning);
2316  Fn.getContext().diagnose(BadLDSDecl);
2317 
2318  // We currently don't have a way to correctly allocate LDS objects that
2319  // aren't directly associated with a kernel. We do force inlining of
2320  // functions that use local objects. However, if these dead functions are
2321  // not eliminated, we don't want a compile time error. Just emit a warning
2322  // and a trap, since there should be no callable path here.
2323  B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2324  B.buildUndef(DstReg);
2325  MI.eraseFromParent();
2326  return true;
2327  }
2328 
2329  // TODO: We could emit code to handle the initialization somewhere.
2330  if (!hasDefinedInitializer(GV)) {
2331  const SITargetLowering *TLI = ST.getTargetLowering();
2332  if (!TLI->shouldUseLDSConstAddress(GV)) {
2333  MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2334  return true; // Leave in place;
2335  }
2336 
2337  if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2338  Type *Ty = GV->getValueType();
2339  // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2340  // zero-sized type in other languages to declare dynamic shared
2341  // memory whose size is not known at compile time. Such arrays are
2342  // allocated by the runtime and placed directly after the statically
2343  // allocated ones. They all share the same offset.
2344  if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2345  // Adjust alignment for that dynamic shared memory array.
2346  MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2347  LLT S32 = LLT::scalar(32);
2348  auto Sz =
2349  B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2350  B.buildIntToPtr(DstReg, Sz);
2351  MI.eraseFromParent();
2352  return true;
2353  }
2354  }
2355 
2356  B.buildConstant(
2357  DstReg,
2358  MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2359  MI.eraseFromParent();
2360  return true;
2361  }
2362 
2363  const Function &Fn = MF.getFunction();
2364  DiagnosticInfoUnsupported BadInit(
2365  Fn, "unsupported initializer for address space", MI.getDebugLoc());
2366  Fn.getContext().diagnose(BadInit);
2367  return true;
2368  }
2369 
2370  const SITargetLowering *TLI = ST.getTargetLowering();
2371 
2372  if (TLI->shouldEmitFixup(GV)) {
2373  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2374  MI.eraseFromParent();
2375  return true;
2376  }
2377 
2378  if (TLI->shouldEmitPCReloc(GV)) {
2379  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2380  MI.eraseFromParent();
2381  return true;
2382  }
2383 
2384  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2385  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2386 
2387  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2388  MachinePointerInfo::getGOT(MF),
2389  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2390  MachineMemOperand::MOInvariant,
2391  8 /*Size*/, Align(8));
2392 
2393  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2394 
2395  if (Ty.getSizeInBits() == 32) {
2396  // Truncate if this is a 32-bit constant address.
2397  auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2398  B.buildExtract(DstReg, Load, 0);
2399  } else
2400  B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2401 
2402  MI.eraseFromParent();
2403  return true;
2404 }
2405 
2407  if (Ty.isVector())
2409  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2410 }
2411 
2413  MachineInstr &MI) const {
2414  MachineIRBuilder &B = Helper.MIRBuilder;
2415  MachineRegisterInfo &MRI = *B.getMRI();
2416  GISelChangeObserver &Observer = Helper.Observer;
2417 
2418  Register PtrReg = MI.getOperand(1).getReg();
2419  LLT PtrTy = MRI.getType(PtrReg);
2420  unsigned AddrSpace = PtrTy.getAddressSpace();
2421 
2422  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2423  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2424  auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2425  Observer.changingInstr(MI);
2426  MI.getOperand(1).setReg(Cast.getReg(0));
2427  Observer.changedInstr(MI);
2428  return true;
2429  }
2430 
2431  if (MI.getOpcode() != AMDGPU::G_LOAD)
2432  return false;
2433 
2434  Register ValReg = MI.getOperand(0).getReg();
2435  LLT ValTy = MRI.getType(ValReg);
2436 
2437  MachineMemOperand *MMO = *MI.memoperands_begin();
2438  const unsigned ValSize = ValTy.getSizeInBits();
2439  const unsigned MemSize = 8 * MMO->getSize();
2440  const Align MemAlign = MMO->getAlign();
2441  const unsigned AlignInBits = 8 * MemAlign.value();
2442 
2443  // Widen non-power-of-2 loads to the alignment if needed
2444  if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2445  const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2446 
2447  // This was already the correct extending load result type, so just adjust
2448  // the memory type.
2449  if (WideMemSize == ValSize) {
2450  MachineFunction &MF = B.getMF();
2451 
2452  MachineMemOperand *WideMMO =
2453  MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2454  Observer.changingInstr(MI);
2455  MI.setMemRefs(MF, {WideMMO});
2456  Observer.changedInstr(MI);
2457  return true;
2458  }
2459 
2460  // Don't bother handling edge case that should probably never be produced.
2461  if (ValSize > WideMemSize)
2462  return false;
2463 
2464  LLT WideTy = widenToNextPowerOf2(ValTy);
2465 
2466  Register WideLoad;
2467  if (!WideTy.isVector()) {
2468  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2469  B.buildTrunc(ValReg, WideLoad).getReg(0);
2470  } else {
2471  // Extract the subvector.
2472 
2473  if (isRegisterType(ValTy)) {
2474  // If this a case where G_EXTRACT is legal, use it.
2475  // (e.g. <3 x s32> -> <4 x s32>)
2476  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2477  B.buildExtract(ValReg, WideLoad, 0);
2478  } else {
2479  // For cases where the widened type isn't a nice register value, unmerge
2480  // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2481  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2482  WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2483  B.setInsertPt(B.getMBB(), MI.getIterator());
2484  B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2485  }
2486  }
2487 
2488  MI.eraseFromParent();
2489  return true;
2490  }
2491 
2492  return false;
2493 }
2494 
2497  MachineIRBuilder &B) const {
2498  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2499  assert(Ty.isScalar());
2500 
2501  MachineFunction &MF = B.getMF();
2502  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2503 
2504  // TODO: Always legal with future ftz flag.
2505  // FIXME: Do we need just output?
2506  if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2507  return true;
2508  if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2509  return true;
2510 
2511  MachineIRBuilder HelperBuilder(MI);
2512  GISelObserverWrapper DummyObserver;
2513  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2514  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2515 }
2516 
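// How the G_ATOMIC_CMPXCHG lowering below works: the target's cmpswap
// instructions take the new value and the compare value as one packed data
// operand, so the two values are built into a two-element vector {new, cmp}
// and handed to G_AMDGPU_ATOMIC_CMPXCHG together with the original memory
// operands.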
2519  Register DstReg = MI.getOperand(0).getReg();
2520  Register PtrReg = MI.getOperand(1).getReg();
2521  Register CmpVal = MI.getOperand(2).getReg();
2522  Register NewVal = MI.getOperand(3).getReg();
2523 
2525  "this should not have been custom lowered");
2526 
2527  LLT ValTy = MRI.getType(CmpVal);
2528  LLT VecTy = LLT::vector(2, ValTy);
2529 
2530  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2531 
2532  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2533  .addDef(DstReg)
2534  .addUse(PtrReg)
2535  .addUse(PackedVal)
2536  .setMemRefs(MI.memoperands());
2537 
2538  MI.eraseFromParent();
2539  return true;
2540 }
2541 
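// How legalizeFlog works: log_b(x) is computed as log2(x) * C, where the
// caller passes C = 1/log2(b). G_FLOG passes ln(2), giving the natural log,
// and G_FLOG10 passes the analogous ln(2)/ln(10).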
2543  MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2544  Register Dst = MI.getOperand(0).getReg();
2545  Register Src = MI.getOperand(1).getReg();
2546  LLT Ty = B.getMRI()->getType(Dst);
2547  unsigned Flags = MI.getFlags();
2548 
2549  auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2550  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2551 
2552  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2553  MI.eraseFromParent();
2554  return true;
2555 }
2556 
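// How legalizeFExp works: e^x is computed as exp2(x * log2(e)), since only a
// base-2 exponential is available.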
2558  MachineIRBuilder &B) const {
2559  Register Dst = MI.getOperand(0).getReg();
2560  Register Src = MI.getOperand(1).getReg();
2561  unsigned Flags = MI.getFlags();
2562  LLT Ty = B.getMRI()->getType(Dst);
2563 
2564  auto K = B.buildFConstant(Ty, numbers::log2e);
2565  auto Mul = B.buildFMul(Ty, Src, K, Flags);
2566  B.buildFExp2(Dst, Mul, Flags);
2567  MI.eraseFromParent();
2568  return true;
2569 }
2570 
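// How legalizeFPow works: pow(x, y) is computed as exp2(y * log2(x)). The
// multiply uses amdgcn.fmul.legacy (which treats 0 * anything as 0), and the
// f16 case extends through f32 because there is no 16-bit legacy multiply.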
2572  MachineIRBuilder &B) const {
2573  Register Dst = MI.getOperand(0).getReg();
2574  Register Src0 = MI.getOperand(1).getReg();
2575  Register Src1 = MI.getOperand(2).getReg();
2576  unsigned Flags = MI.getFlags();
2577  LLT Ty = B.getMRI()->getType(Dst);
2578  const LLT S16 = LLT::scalar(16);
2579  const LLT S32 = LLT::scalar(32);
2580 
2581  if (Ty == S32) {
2582  auto Log = B.buildFLog2(S32, Src0, Flags);
2583  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2584  .addUse(Log.getReg(0))
2585  .addUse(Src1)
2586  .setMIFlags(Flags);
2587  B.buildFExp2(Dst, Mul, Flags);
2588  } else if (Ty == S16) {
2589  // There's no f16 fmul_legacy, so we need to convert for it.
2590  auto Log = B.buildFLog2(S16, Src0, Flags);
2591  auto Ext0 = B.buildFPExt(S32, Log, Flags);
2592  auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2593  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2594  .addUse(Ext0.getReg(0))
2595  .addUse(Ext1.getReg(0))
2596  .setMIFlags(Flags);
2597 
2598  B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2599  } else
2600  return false;
2601 
2602  MI.eraseFromParent();
2603  return true;
2604 }
2605 
2606 // Find a source register, ignoring any possible source modifiers.
2608  Register ModSrc = OrigSrc;
2609  if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2610  ModSrc = SrcFNeg->getOperand(1).getReg();
2611  if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2612  ModSrc = SrcFAbs->getOperand(1).getReg();
2613  } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2614  ModSrc = SrcFAbs->getOperand(1).getReg();
2615  return ModSrc;
2616 }
2617 
2620  MachineIRBuilder &B) const {
2621 
2622  const LLT S1 = LLT::scalar(1);
2623  const LLT S64 = LLT::scalar(64);
2624  Register Dst = MI.getOperand(0).getReg();
2625  Register OrigSrc = MI.getOperand(1).getReg();
2626  unsigned Flags = MI.getFlags();
2627  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2628  "this should not have been custom lowered");
2629 
2630  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2631  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2632  // efficient way to implement it is using V_FRACT_F64. The workaround for the
2633  // V_FRACT bug is:
2634  // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2635  //
2636  // Convert floor(x) to (x - fract(x))
2637 
2638  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2639  .addUse(OrigSrc)
2640  .setMIFlags(Flags);
2641 
2642  // Give source modifier matching some assistance before obscuring a foldable
2643  // pattern.
2644 
2645  // TODO: We can avoid the neg on the fract? The input sign to fract
2646  // shouldn't matter?
2647  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2648 
2649  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2650 
2650 
2651  Register Min = MRI.createGenericVirtualRegister(S64);
2652 
2653  // We don't need to concern ourselves with the snan handling difference, so
2654  // use the one which will directly select.
2655  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2656  if (MFI->getMode().IEEE)
2657  B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2658  else
2659  B.buildFMinNum(Min, Fract, Const, Flags);
2660 
2661  Register CorrectedFract = Min;
2662  if (!MI.getFlag(MachineInstr::FmNoNans)) {
2663  auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2664  CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2665  }
2666 
2667  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2668  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2669 
2670  MI.eraseFromParent();
2671  return true;
2672 }
2673 
2674 // Turn an illegal packed v2s16 build vector into bit operations.
2675 // TODO: This should probably be a bitcast action in LegalizerHelper.
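// The expansion below turns (G_BUILD_VECTOR x:s16, y:s16) into
// bitcast(G_MERGE_VALUES x, y), so x lands in the low 16 bits and y in the
// high 16 bits of the packed 32-bit value.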
2678  Register Dst = MI.getOperand(0).getReg();
2679  const LLT S32 = LLT::scalar(32);
2680  assert(MRI.getType(Dst) == LLT::vector(2, 16));
2681 
2682  Register Src0 = MI.getOperand(1).getReg();
2683  Register Src1 = MI.getOperand(2).getReg();
2684  assert(MRI.getType(Src0) == LLT::scalar(16));
2685 
2686  auto Merge = B.buildMerge(S32, {Src0, Src1});
2687  B.buildBitcast(Dst, Merge);
2688 
2689  MI.eraseFromParent();
2690  return true;
2691 }
2692 
2693 // Check that this is a G_XOR x, -1
2694 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2695  if (MI.getOpcode() != TargetOpcode::G_XOR)
2696  return false;
2697  auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2698  return ConstVal && *ConstVal == -1;
2699 }
2700 
2701 // Return the use branch instruction, otherwise null if the usage is invalid.
2702 static MachineInstr *
2704  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2705  Register CondDef = MI.getOperand(0).getReg();
2706  if (!MRI.hasOneNonDBGUse(CondDef))
2707  return nullptr;
2708 
2709  MachineBasicBlock *Parent = MI.getParent();
2710  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2711 
2712  if (isNot(MRI, *UseMI)) {
2713  Register NegatedCond = UseMI->getOperand(0).getReg();
2714  if (!MRI.hasOneNonDBGUse(NegatedCond))
2715  return nullptr;
2716 
2717  // We're deleting the def of this value, so we need to remove it.
2719 
2720  UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2721  Negated = true;
2722  }
2723 
2724  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2725  return nullptr;
2726 
2727  // Make sure the cond br is followed by a G_BR, or is the last instruction.
2728  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2729  if (Next == Parent->end()) {
2730  MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2731  if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2732  return nullptr;
2733  UncondBrTarget = &*NextMBB;
2734  } else {
2735  if (Next->getOpcode() != AMDGPU::G_BR)
2736  return nullptr;
2737  Br = &*Next;
2738  UncondBrTarget = Br->getOperand(0).getMBB();
2739  }
2740 
2741  return UseMI;
2742 }
2743 
2745  const ArgDescriptor *Arg,
2746  const TargetRegisterClass *ArgRC,
2747  LLT ArgTy) const {
2748  MCRegister SrcReg = Arg->getRegister();
2749  assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2750  assert(DstReg.isVirtual() && "Virtual register expected");
2751 
2752  Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2753  ArgTy);
2754  if (Arg->isMasked()) {
2755  // TODO: Should we try to emit this once in the entry block?
2756  const LLT S32 = LLT::scalar(32);
2757  const unsigned Mask = Arg->getMask();
2758  const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2759 
2760  Register AndMaskSrc = LiveIn;
2761 
2762  if (Shift != 0) {
2763  auto ShiftAmt = B.buildConstant(S32, Shift);
2764  AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2765  }
2766 
2767  B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2768  } else {
2769  B.buildCopy(DstReg, LiveIn);
2770  }
2771 
2772  return true;
2773 }
2774 
2776  Register DstReg, MachineIRBuilder &B,
2777  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2778  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2779  const ArgDescriptor *Arg;
2780  const TargetRegisterClass *ArgRC;
2781  LLT ArgTy;
2782  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2783 
2784  if (!Arg->isRegister() || !Arg->getRegister().isValid())
2785  return false; // TODO: Handle these
2786  return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2787 }
2788 
2791  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2792  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2793  return false;
2794 
2795  MI.eraseFromParent();
2796  return true;
2797 }
2798 
2801  MachineIRBuilder &B) const {
2802  Register Dst = MI.getOperand(0).getReg();
2803  LLT DstTy = MRI.getType(Dst);
2804  LLT S16 = LLT::scalar(16);
2805  LLT S32 = LLT::scalar(32);
2806  LLT S64 = LLT::scalar(64);
2807 
2808  if (DstTy == S16)
2809  return legalizeFDIV16(MI, MRI, B);
2810  if (DstTy == S32)
2811  return legalizeFDIV32(MI, MRI, B);
2812  if (DstTy == S64)
2813  return legalizeFDIV64(MI, MRI, B);
2814 
2815  return false;
2816 }
2817 
2819  Register DstDivReg,
2820  Register DstRemReg,
2821  Register X,
2822  Register Y) const {
2823  const LLT S1 = LLT::scalar(1);
2824  const LLT S32 = LLT::scalar(32);
2825 
2826  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2827  // algorithm used here.
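// Brief summary of the sequence below (see the reference above for the full
// derivation): an estimate z ~= 2^32 / y is formed from the float reciprocal
// of y scaled by a constant just under 2^32, and refined with one
// Newton-Raphson style step in integer arithmetic (z += umulh(z, -y * z)).
// The quotient estimate is then q = umulh(x, z) with remainder r = x - q * y;
// the estimate can be short by a small amount, so two conditional
// "r -= y, q += 1" corrections follow.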
2828 
2829  // Initial estimate of inv(y).
2830  auto FloatY = B.buildUITOFP(S32, Y);
2831  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2832  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2833  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2834  auto Z = B.buildFPTOUI(S32, ScaledY);
2835 
2836  // One round of UNR.
2837  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2838  auto NegYZ = B.buildMul(S32, NegY, Z);
2839  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2840 
2841  // Quotient/remainder estimate.
2842  auto Q = B.buildUMulH(S32, X, Z);
2843  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2844 
2845  // First quotient/remainder refinement.
2846  auto One = B.buildConstant(S32, 1);
2847  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2848  if (DstDivReg)
2849  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2850  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2851 
2852  // Second quotient/remainder refinement.
2853  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2854  if (DstDivReg)
2855  B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
2856 
2857  if (DstRemReg)
2858  B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
2859 }
2860 
2861 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2862 //
2863 // Return lo, hi of result
2864 //
2865 // %cvt.lo = G_UITOFP Val.lo
2866 // %cvt.hi = G_UITOFP Val.hi
2867 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2868 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2869 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2870 // %mul2 = G_FMUL %mul1, 2**(-32)
2871 // %trunc = G_INTRINSIC_TRUNC %mul2
2872 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2873 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2874 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2875  Register Val) {
2876  const LLT S32 = LLT::scalar(32);
2877  auto Unmerge = B.buildUnmerge(S32, Val);
2878 
2879  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2880  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2881 
2882  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2883  B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2884 
2885  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2886  auto Mul1 =
2887  B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2888 
2889  // 2**(-32)
2890  auto Mul2 =
2891  B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2892  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2893 
2894  // -(2**32)
2895  auto Mad2 = B.buildFMAD(S32, Trunc,
2896  B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2897 
2898  auto ResultLo = B.buildFPTOUI(S32, Mad2);
2899  auto ResultHi = B.buildFPTOUI(S32, Trunc);
2900 
2901  return {ResultLo.getReg(0), ResultHi.getReg(0)};
2902 }
2903 
2905  Register DstDivReg,
2906  Register DstRemReg,
2907  Register Numer,
2908  Register Denom) const {
2909  const LLT S32 = LLT::scalar(32);
2910  const LLT S64 = LLT::scalar(64);
2911  const LLT S1 = LLT::scalar(1);
2912  Register RcpLo, RcpHi;
2913 
2914  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2915 
2916  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2917 
2918  auto Zero64 = B.buildConstant(S64, 0);
2919  auto NegDenom = B.buildSub(S64, Zero64, Denom);
2920 
2921  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2922  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2923 
2924  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2925  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2926  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2927 
2928  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2929  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2930  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2931  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2932 
2933  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2934  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2935  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2936  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2937  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2938 
2939  auto Zero32 = B.buildConstant(S32, 0);
2940  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2941  auto Add2_HiC =
2942  B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2943  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2944  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2945 
2946  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2947  Register NumerLo = UnmergeNumer.getReg(0);
2948  Register NumerHi = UnmergeNumer.getReg(1);
2949 
2950  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2951  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2952  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2953  Register Mul3_Lo = UnmergeMul3.getReg(0);
2954  Register Mul3_Hi = UnmergeMul3.getReg(1);
2955  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2956  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2957  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2958  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2959 
2960  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2961  Register DenomLo = UnmergeDenom.getReg(0);
2962  Register DenomHi = UnmergeDenom.getReg(1);
2963 
2964  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2965  auto C1 = B.buildSExt(S32, CmpHi);
2966 
2967  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2968  auto C2 = B.buildSExt(S32, CmpLo);
2969 
2970  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2971  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2972 
2973  // TODO: Here and below, portions of the code can be enclosed in if/endif.
2974  // Currently control flow is unconditional and we have 4 selects after the
2975  // potential endif to substitute for PHIs.
2976 
2977  // if C3 != 0 ...
2978  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2979  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2980  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2981  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2982 
2983  auto One64 = B.buildConstant(S64, 1);
2984  auto Add3 = B.buildAdd(S64, MulHi3, One64);
2985 
2986  auto C4 =
2987  B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2988  auto C5 =
2989  B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2990  auto C6 = B.buildSelect(
2991  S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2992 
2993  // if (C6 != 0)
2994  auto Add4 = B.buildAdd(S64, Add3, One64);
2995  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2996 
2997  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2998  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2999  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3000 
3001  // endif C6
3002  // endif C3
3003 
3004  if (DstDivReg) {
3005  auto Sel1 = B.buildSelect(
3006  S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3007  B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3008  Sel1, MulHi3);
3009  }
3010 
3011  if (DstRemReg) {
3012  auto Sel2 = B.buildSelect(
3013  S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3014  B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3015  Sel2, Sub1);
3016  }
3017 }
3018 
3021  MachineIRBuilder &B) const {
3022  Register DstDivReg, DstRemReg;
3023  switch (MI.getOpcode()) {
3024  default:
3025  llvm_unreachable("Unexpected opcode!");
3026  case AMDGPU::G_UDIV: {
3027  DstDivReg = MI.getOperand(0).getReg();
3028  break;
3029  }
3030  case AMDGPU::G_UREM: {
3031  DstRemReg = MI.getOperand(0).getReg();
3032  break;
3033  }
3034  case AMDGPU::G_UDIVREM: {
3035  DstDivReg = MI.getOperand(0).getReg();
3036  DstRemReg = MI.getOperand(1).getReg();
3037  break;
3038  }
3039  }
3040 
3041  const LLT S64 = LLT::scalar(64);
3042  const LLT S32 = LLT::scalar(32);
3043  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3044  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3045  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3046  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3047 
3048  if (Ty == S32)
3049  legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3050  else if (Ty == S64)
3051  legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3052  else
3053  return false;
3054 
3055  MI.eraseFromParent();
3056  return true;
3057 }
3058 
3061  MachineIRBuilder &B) const {
3062  const LLT S64 = LLT::scalar(64);
3063  const LLT S32 = LLT::scalar(32);
3064 
3065  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3066  if (Ty != S32 && Ty != S64)
3067  return false;
3068 
3069  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3070  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3071  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3072 
3073  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3074  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3075  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3076 
3077  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3078  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3079 
3080  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3081  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3082 
3083  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3084  switch (MI.getOpcode()) {
3085  default:
3086  llvm_unreachable("Unexpected opcode!");
3087  case AMDGPU::G_SDIV: {
3088  DstDivReg = MI.getOperand(0).getReg();
3089  TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3090  break;
3091  }
3092  case AMDGPU::G_SREM: {
3093  DstRemReg = MI.getOperand(0).getReg();
3094  TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3095  break;
3096  }
3097  case AMDGPU::G_SDIVREM: {
3098  DstDivReg = MI.getOperand(0).getReg();
3099  DstRemReg = MI.getOperand(1).getReg();
3100  TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3101  TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3102  break;
3103  }
3104  }
3105 
3106  if (Ty == S32)
3107  legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3108  else
3109  legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3110 
3111  if (DstDivReg) {
3112  auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3113  auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3114  B.buildSub(DstDivReg, SignXor, Sign);
3115  }
3116 
3117  if (DstRemReg) {
3118  auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3119  auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3120  B.buildSub(DstRemReg, SignXor, Sign);
3121  }
3122 
3123  MI.eraseFromParent();
3124  return true;
3125 }
3126 
3129  MachineIRBuilder &B) const {
3130  Register Res = MI.getOperand(0).getReg();
3131  Register LHS = MI.getOperand(1).getReg();
3132  Register RHS = MI.getOperand(2).getReg();
3133  uint16_t Flags = MI.getFlags();
3134  LLT ResTy = MRI.getType(Res);
3135 
3136  const MachineFunction &MF = B.getMF();
3137  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3138  MI.getFlag(MachineInstr::FmAfn);
3139 
3140  if (!AllowInaccurateRcp)
3141  return false;
3142 
3143  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3144  // 1 / x -> RCP(x)
3145  if (CLHS->isExactlyValue(1.0)) {
3146  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3147  .addUse(RHS)
3148  .setMIFlags(Flags);
3149 
3150  MI.eraseFromParent();
3151  return true;
3152  }
3153 
3154  // -1 / x -> RCP( FNEG(x) )
3155  if (CLHS->isExactlyValue(-1.0)) {
3156  auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3157  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3158  .addUse(FNeg.getReg(0))
3159  .setMIFlags(Flags);
3160 
3161  MI.eraseFromParent();
3162  return true;
3163  }
3164  }
3165 
3166  // x / y -> x * (1.0 / y)
3167  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3168  .addUse(RHS)
3169  .setMIFlags(Flags);
3170  B.buildFMul(Res, LHS, RCP, Flags);
3171 
3172  MI.eraseFromParent();
3173  return true;
3174 }
3175 
3178  MachineIRBuilder &B) const {
3179  Register Res = MI.getOperand(0).getReg();
3180  Register X = MI.getOperand(1).getReg();
3181  Register Y = MI.getOperand(2).getReg();
3182  uint16_t Flags = MI.getFlags();
3183  LLT ResTy = MRI.getType(Res);
3184 
3185  const MachineFunction &MF = B.getMF();
3186  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3187  MI.getFlag(MachineInstr::FmAfn);
3188 
3189  if (!AllowInaccurateRcp)
3190  return false;
3191 
3192  auto NegY = B.buildFNeg(ResTy, Y);
3193  auto One = B.buildFConstant(ResTy, 1.0);
3194 
3195  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3196  .addUse(Y)
3197  .setMIFlags(Flags);
3198 
3199  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3200  R = B.buildFMA(ResTy, Tmp0, R, R);
3201 
3202  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3203  R = B.buildFMA(ResTy, Tmp1, R, R);
3204 
3205  auto Ret = B.buildFMul(ResTy, X, R);
3206  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3207 
3208  B.buildFMA(Res, Tmp2, R, Ret);
3209  MI.eraseFromParent();
3210  return true;
3211 }
3212 
3215  MachineIRBuilder &B) const {
3216  if (legalizeFastUnsafeFDIV(MI, MRI, B))
3217  return true;
3218 
3219  Register Res = MI.getOperand(0).getReg();
3220  Register LHS = MI.getOperand(1).getReg();
3221  Register RHS = MI.getOperand(2).getReg();
3222 
3223  uint16_t Flags = MI.getFlags();
3224 
3225  LLT S16 = LLT::scalar(16);
3226  LLT S32 = LLT::scalar(32);
3227 
3228  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3229  auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3230 
3231  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3232  .addUse(RHSExt.getReg(0))
3233  .setMIFlags(Flags);
3234 
3235  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3236  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3237 
3238  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3239  .addUse(RDst.getReg(0))
3240  .addUse(RHS)
3241  .addUse(LHS)
3242  .setMIFlags(Flags);
3243 
3244  MI.eraseFromParent();
3245  return true;
3246 }
3247 
3248 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3249 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
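// Note on the S_DENORM_MODE immediate built below: the FP32 (SP) denorm
// setting occupies the low two bits and the FP64/FP16 (DP) setting the next
// two, hence SPDenormMode | (DPDenormModeDefault << 2).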
3250 static void toggleSPDenormMode(bool Enable,
3252  const GCNSubtarget &ST,
3254  // Set SP denorm mode to this value.
3255  unsigned SPDenormMode =
3256  Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3257 
3258  if (ST.hasDenormModeInst()) {
3259  // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3260  uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3261 
3262  uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3263  B.buildInstr(AMDGPU::S_DENORM_MODE)
3264  .addImm(NewDenormModeValue);
3265 
3266  } else {
3267  // Select FP32 bit field in mode register.
3268  unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3271 
3272  B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3273  .addImm(SPDenormMode)
3274  .addImm(SPDenormModeBitField);
3275  }
3276 }
3277 
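// Outline of the f32 division expansion below: div_scale pre-scales the
// numerator and denominator into a safe range, rcp gives an approximate 1/d
// which one FMA-based Newton-Raphson step refines, the scaled quotient n*r is
// formed and corrected with another FMA round, and div_fmas plus div_fixup
// apply the final correction, undo the scaling and patch up infinities, NaNs
// and denormal inputs. FP32 denormal support is switched on around the
// refinement when the function's FP mode has it off, since the intermediate
// values must not be flushed.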
3280  MachineIRBuilder &B) const {
3281  if (legalizeFastUnsafeFDIV(MI, MRI, B))
3282  return true;
3283 
3284  Register Res = MI.getOperand(0).getReg();
3285  Register LHS = MI.getOperand(1).getReg();
3286  Register RHS = MI.getOperand(2).getReg();
3287  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3288  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3289 
3290  uint16_t Flags = MI.getFlags();
3291 
3292  LLT S32 = LLT::scalar(32);
3293  LLT S1 = LLT::scalar(1);
3294 
3295  auto One = B.buildFConstant(S32, 1.0f);
3296 
3297  auto DenominatorScaled =
3298  B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3299  .addUse(LHS)
3300  .addUse(RHS)
3301  .addImm(0)
3302  .setMIFlags(Flags);
3303  auto NumeratorScaled =
3304  B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3305  .addUse(LHS)
3306  .addUse(RHS)
3307  .addImm(1)
3308  .setMIFlags(Flags);
3309 
3310  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3311  .addUse(DenominatorScaled.getReg(0))
3312  .setMIFlags(Flags);
3313  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3314 
3315  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3316  // aren't modeled as reading it.
3317  if (!Mode.allFP32Denormals())
3318  toggleSPDenormMode(true, B, ST, Mode);
3319 
3320  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3321  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3322  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3323  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3324  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3325  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3326 
3327  if (!Mode.allFP32Denormals())
3328  toggleSPDenormMode(false, B, ST, Mode);
3329 
3330  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3331  .addUse(Fma4.getReg(0))
3332  .addUse(Fma1.getReg(0))
3333  .addUse(Fma3.getReg(0))
3334  .addUse(NumeratorScaled.getReg(1))
3335  .setMIFlags(Flags);
3336 
3337  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3338  .addUse(Fmas.getReg(0))
3339  .addUse(RHS)
3340  .addUse(LHS)
3341  .setMIFlags(Flags);
3342 
3343  MI.eraseFromParent();
3344  return true;
3345 }
3346 
3347 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3348  MachineRegisterInfo &MRI,
3349  MachineIRBuilder &B) const {
3350  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3351  return true;
3352 
3353  Register Res = MI.getOperand(0).getReg();
3354  Register LHS = MI.getOperand(1).getReg();
3355  Register RHS = MI.getOperand(2).getReg();
3356 
3357  uint16_t Flags = MI.getFlags();
3358 
3359  LLT S64 = LLT::scalar(64);
3360  LLT S1 = LLT::scalar(1);
3361 
3362  auto One = B.buildFConstant(S64, 1.0);
3363 
3364  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3365  .addUse(LHS)
3366  .addUse(RHS)
3367  .addImm(0)
3368  .setMIFlags(Flags);
3369 
3370  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3371 
3372  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3373  .addUse(DivScale0.getReg(0))
3374  .setMIFlags(Flags);
3375 
3376  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3377  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3378  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3379 
3380  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3381  .addUse(LHS)
3382  .addUse(RHS)
3383  .addImm(1)
3384  .setMIFlags(Flags);
3385 
3386  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3387  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3388  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3389 
3390  Register Scale;
3392  // Workaround a hardware bug on SI where the condition output from div_scale
3393  // is not usable.
3394 
3395  LLT S32 = LLT::scalar(32);
3396 
3397  auto NumUnmerge = B.buildUnmerge(S32, LHS);
3398  auto DenUnmerge = B.buildUnmerge(S32, RHS);
3399  auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3400  auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3401 
3402  auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3403  Scale1Unmerge.getReg(1));
3404  auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3405  Scale0Unmerge.getReg(1));
3406  Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3407  } else {
3408  Scale = DivScale1.getReg(1);
3409  }
3410 
3411  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3412  .addUse(Fma4.getReg(0))
3413  .addUse(Fma3.getReg(0))
3414  .addUse(Mul.getReg(0))
3415  .addUse(Scale)
3416  .setMIFlags(Flags);
3417 
3418  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3419  .addUse(Fmas.getReg(0))
3420  .addUse(RHS)
3421  .addUse(LHS)
3422  .setMIFlags(Flags);
3423 
3424  MI.eraseFromParent();
3425  return true;
3426 }
3427 
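// On subtargets where the div_scale condition output is unusable (the SI
// workaround above), the select bit fed to div_fmas is recomputed from the
// high dwords of the operands; as a rough restatement (hi32() is a
// hypothetical shorthand for the high half of the G_UNMERGE_VALUES):
//
//   bool CmpNum = (hi32(LHS) == hi32(DivScale1));
//   bool CmpDen = (hi32(RHS) == hi32(DivScale0));
//   bool Scale  = CmpNum ^ CmpDen;   // condition operand of div_fmas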
3430  MachineIRBuilder &B) const {
3431  Register Res = MI.getOperand(0).getReg();
3432  Register LHS = MI.getOperand(2).getReg();
3433  Register RHS = MI.getOperand(3).getReg();
3434  uint16_t Flags = MI.getFlags();
3435 
3436  LLT S32 = LLT::scalar(32);
3437  LLT S1 = LLT::scalar(1);
3438 
3439  auto Abs = B.buildFAbs(S32, RHS, Flags);
3440  const APFloat C0Val(1.0f);
3441 
3442  auto C0 = B.buildConstant(S32, 0x6f800000);
3443  auto C1 = B.buildConstant(S32, 0x2f800000);
3444  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3445 
3446  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3447  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3448 
3449  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3450 
3451  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3452  .addUse(Mul0.getReg(0))
3453  .setMIFlags(Flags);
3454 
3455  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3456 
3457  B.buildFMul(Res, Sel, Mul1, Flags);
3458 
3459  MI.eraseFromParent();
3460  return true;
3461 }
3462 
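// The two magic constants above are plain powers of two: 0x6f800000 is
// 0x1.0p+96f and 0x2f800000 is 0x1.0p-32f. A rough model of the fast path
// (fdiv_fast_model is a hypothetical illustration): scale huge denominators
// down so the reciprocal does not flush to zero, then fold the same scale
// back into the result.
//
//   float fdiv_fast_model(float a, float b) {
//     float s = (std::fabs(b) > 0x1.0p+96f) ? 0x1.0p-32f : 1.0f;  // Sel
//     return s * (a * (1.0f / (b * s)));  // Mul0, RCP, Mul1, final FMul
//   }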
3463 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3464 // FIXME: Why do we handle this one but not other removed instructions?
3465 //
3466 // Reciprocal square root. The clamp prevents infinite results, clamping
3467 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3468 // +-max_float.
3471  MachineIRBuilder &B) const {
3473  return true;
3474 
3475  Register Dst = MI.getOperand(0).getReg();
3476  Register Src = MI.getOperand(2).getReg();
3477  auto Flags = MI.getFlags();
3478 
3479  LLT Ty = MRI.getType(Dst);
3480 
3481  const fltSemantics *FltSemantics;
3482  if (Ty == LLT::scalar(32))
3483  FltSemantics = &APFloat::IEEEsingle();
3484  else if (Ty == LLT::scalar(64))
3485  FltSemantics = &APFloat::IEEEdouble();
3486  else
3487  return false;
3488 
3489  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3490  .addUse(Src)
3491  .setMIFlags(Flags);
3492 
3493  // We don't need to concern ourselves with the snan handling difference,
3494  // since the rsq is already quieted (or not); use the one that will directly select.
3495  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3496  const bool UseIEEE = MFI->getMode().IEEE;
3497 
3498  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3499  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3500  B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3501 
3502  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3503 
3504  if (UseIEEE)
3505  B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3506  else
3507  B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3508  MI.eraseFromParent();
3509  return true;
3510 }
3511 
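// In short, the expansion above computes (using the IEEE or non-IEEE min/max
// flavour depending on the function's FP mode):
//
//   result = fmax(fmin(rsq(x), +max_float), -max_float)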
3512 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3513  switch (IID) {
3514  case Intrinsic::amdgcn_ds_fadd:
3515  return AMDGPU::G_ATOMICRMW_FADD;
3516  case Intrinsic::amdgcn_ds_fmin:
3517  return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3518  case Intrinsic::amdgcn_ds_fmax:
3519  return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3520  default:
3521  llvm_unreachable("not a DS FP intrinsic");
3522  }
3523 }
3524 
3526  MachineInstr &MI,
3527  Intrinsic::ID IID) const {
3528  GISelChangeObserver &Observer = Helper.Observer;
3529  Observer.changingInstr(MI);
3530 
3531  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3532 
3533  // The remaining operands were used to set fields in the MemOperand on
3534  // construction.
3535  for (int I = 6; I > 3; --I)
3536  MI.RemoveOperand(I);
3537 
3538  MI.RemoveOperand(1); // Remove the intrinsic ID.
3539  Observer.changedInstr(MI);
3540  return true;
3541 }
3542 
3545  MachineIRBuilder &B) const {
3546  uint64_t Offset =
3549  LLT DstTy = MRI.getType(DstReg);
3550  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3551 
3552  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3553  if (!loadInputValue(KernargPtrReg, B,
3555  return false;
3556 
3557  // FIXME: This should be nuw
3558  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3559  return true;
3560 }
3561 
3564  MachineIRBuilder &B) const {
3565  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3566  if (!MFI->isEntryFunction()) {
3569  }
3570 
3571  Register DstReg = MI.getOperand(0).getReg();
3572  if (!getImplicitArgPtr(DstReg, MRI, B))
3573  return false;
3574 
3575  MI.eraseFromParent();
3576  return true;
3577 }
3578 
3582  unsigned AddrSpace) const {
3583  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3584  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3585  Register Hi32 = Unmerge.getReg(1);
3586 
3587  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3588  MI.eraseFromParent();
3589  return true;
3590 }
3591 
3592 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3593 // offset (the offset that is included in bounds checking and swizzling, to be
3594 // split between the instruction's voffset and immoffset fields) and soffset
3595 // (the offset that is excluded from bounds checking and swizzling, to go in
3596 // the instruction's soffset field). This function takes the first kind of
3597 // offset and figures out how to split it between voffset and immoffset.
3598 std::tuple<Register, unsigned, unsigned>
3600  Register OrigOffset) const {
3601  const unsigned MaxImm = 4095;
3602  Register BaseReg;
3603  unsigned TotalConstOffset;
3604  const LLT S32 = LLT::scalar(32);
3605  MachineRegisterInfo &MRI = *B.getMRI();
3606 
3607  std::tie(BaseReg, TotalConstOffset) =
3609 
3610  unsigned ImmOffset = TotalConstOffset;
3611 
3612  // If BaseReg is a pointer, convert it to int.
3613  if (MRI.getType(BaseReg).isPointer())
3614  BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3615 
3616  // If the immediate value is too big for the immoffset field, put the value
3617  // and -4096 into the immoffset field so that the value that is copied/added
3618  // for the voffset field is a multiple of 4096, and it stands more chance
3619  // of being CSEd with the copy/add for another similar load/store.
3620  // However, do not do that rounding down to a multiple of 4096 if that is a
3621  // negative number, as it appears to be illegal to have a negative offset
3622  // in the vgpr, even if adding the immediate offset makes it positive.
3623  unsigned Overflow = ImmOffset & ~MaxImm;
3624  ImmOffset -= Overflow;
3625  if ((int32_t)Overflow < 0) {
3626  Overflow += ImmOffset;
3627  ImmOffset = 0;
3628  }
3629 
3630  if (Overflow != 0) {
3631  if (!BaseReg) {
3632  BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3633  } else {
3634  auto OverflowVal = B.buildConstant(S32, Overflow);
3635  BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3636  }
3637  }
3638 
3639  if (!BaseReg)
3640  BaseReg = B.buildConstant(S32, 0).getReg(0);
3641 
3642  return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3643 }
3644 
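// Worked example for the split above, with MaxImm = 4095: a constant offset
// of 8197 gives Overflow = 8197 & ~4095 = 8192 and ImmOffset = 5, so 8192 is
// added to the voffset register (a 4096-aligned value that CSEs well with
// neighbouring accesses) and 5 goes into the instruction's immoffset field.
// If that rounding would leave a negative value for the register, the whole
// constant is moved into the register instead and ImmOffset becomes 0.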
3645 /// Handle register layout difference for f16 images for some subtargets.
3648  Register Reg,
3649  bool ImageStore) const {
3650  const LLT S16 = LLT::scalar(16);
3651  const LLT S32 = LLT::scalar(32);
3652  LLT StoreVT = MRI.getType(Reg);
3653  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3654 
3655  if (ST.hasUnpackedD16VMem()) {
3656  auto Unmerge = B.buildUnmerge(S16, Reg);
3657 
3658  SmallVector<Register, 4> WideRegs;
3659  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3660  WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3661 
3662  int NumElts = StoreVT.getNumElements();
3663 
3664  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3665  }
3666 
3667  if (ImageStore && ST.hasImageStoreD16Bug()) {
3668  if (StoreVT.getNumElements() == 2) {
3669  SmallVector<Register, 4> PackedRegs;
3670  Reg = B.buildBitcast(S32, Reg).getReg(0);
3671  PackedRegs.push_back(Reg);
3672  PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3673  return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3674  }
3675 
3676  if (StoreVT.getNumElements() == 3) {
3677  SmallVector<Register, 4> PackedRegs;
3678  auto Unmerge = B.buildUnmerge(S16, Reg);
3679  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3680  PackedRegs.push_back(Unmerge.getReg(I));
3681  PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3682  Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3683  return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3684  }
3685 
3686  if (StoreVT.getNumElements() == 4) {
3687  SmallVector<Register, 4> PackedRegs;
3688  Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3689  auto Unmerge = B.buildUnmerge(S32, Reg);
3690  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3691  PackedRegs.push_back(Unmerge.getReg(I));
3692  PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3693  return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3694  }
3695 
3696  llvm_unreachable("invalid data type");
3697  }
3698 
3699  return Reg;
3700 }
3701 
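// For example, with unpacked d16 VMEM a <4 x s16> payload becomes four
// G_ANYEXTs collected into a <4 x s32>, while the image-store d16 bug path
// pads a <3 x s16> payload with undef halves and bitcasts it, roughly:
//
//   %u:_(s16)          = G_IMPLICIT_DEF
//   %v6:_(<6 x s16>)   = G_BUILD_VECTOR %x, %y, %z, %u, %u, %u
//   %data:_(<3 x s32>) = G_BITCAST %v6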
3703  MachineIRBuilder &B, Register VData, bool IsFormat) const {
3704  MachineRegisterInfo *MRI = B.getMRI();
3705  LLT Ty = MRI->getType(VData);
3706 
3707  const LLT S16 = LLT::scalar(16);
3708 
3709  // Fixup illegal register types for i8 stores.
3710  if (Ty == LLT::scalar(8) || Ty == S16) {
3711  Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3712  return AnyExt;
3713  }
3714 
3715  if (Ty.isVector()) {
3716  if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3717  if (IsFormat)
3718  return handleD16VData(B, *MRI, VData);
3719  }
3720  }
3721 
3722  return VData;
3723 }
3724 
3728  bool IsTyped,
3729  bool IsFormat) const {
3730  Register VData = MI.getOperand(1).getReg();
3731  LLT Ty = MRI.getType(VData);
3732  LLT EltTy = Ty.getScalarType();
3733  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3734  const LLT S32 = LLT::scalar(32);
3735 
3736  VData = fixStoreSourceType(B, VData, IsFormat);
3737  Register RSrc = MI.getOperand(2).getReg();
3738 
3739  MachineMemOperand *MMO = *MI.memoperands_begin();
3740  const int MemSize = MMO->getSize();
3741 
3742  unsigned ImmOffset;
3743  unsigned TotalOffset;
3744 
3745  // The typed intrinsics add an immediate after the registers.
3746  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3747 
3748  // The struct intrinsic variants add one additional operand over raw.
3749  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3750  Register VIndex;
3751  int OpOffset = 0;
3752  if (HasVIndex) {
3753  VIndex = MI.getOperand(3).getReg();
3754  OpOffset = 1;
3755  }
3756 
3757  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3758  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3759 
3760  unsigned Format = 0;
3761  if (IsTyped) {
3762  Format = MI.getOperand(5 + OpOffset).getImm();
3763  ++OpOffset;
3764  }
3765 
3766  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3767 
3768  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3769  if (TotalOffset != 0)
3770  MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3771 
3772  unsigned Opc;
3773  if (IsTyped) {
3774  Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3775  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3776  } else if (IsFormat) {
3777  Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3778  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3779  } else {
3780  switch (MemSize) {
3781  case 1:
3782  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3783  break;
3784  case 2:
3785  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3786  break;
3787  default:
3788  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3789  break;
3790  }
3791  }
3792 
3793  if (!VIndex)
3794  VIndex = B.buildConstant(S32, 0).getReg(0);
3795 
3796  auto MIB = B.buildInstr(Opc)
3797  .addUse(VData) // vdata
3798  .addUse(RSrc) // rsrc
3799  .addUse(VIndex) // vindex
3800  .addUse(VOffset) // voffset
3801  .addUse(SOffset) // soffset
3802  .addImm(ImmOffset); // offset(imm)
3803 
3804  if (IsTyped)
3805  MIB.addImm(Format);
3806 
3807  MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3808  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3809  .addMemOperand(MMO);
3810 
3811  MI.eraseFromParent();
3812  return true;
3813 }
3814 
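// After the rewrite above, a raw (non-format) byte store ends up as a target
// pseudo with a fixed operand order, roughly:
//
//   G_AMDGPU_BUFFER_STORE_BYTE %vdata, %rsrc, %vindex(=0 for raw), %voffset,
//                              %soffset, immoffset, aux, idxen(0) :: (MMO)
//
// The struct variants differ only in carrying a real vindex and idxen = -1.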
3818  bool IsFormat,
3819  bool IsTyped) const {
3820  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3821  MachineMemOperand *MMO = *MI.memoperands_begin();
3822  const int MemSize = MMO->getSize();
3823  const LLT S32 = LLT::scalar(32);
3824 
3825  Register Dst = MI.getOperand(0).getReg();
3826  Register RSrc = MI.getOperand(2).getReg();
3827 
3828  // The typed intrinsics add an immediate after the registers.
3829  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3830 
3831  // The struct intrinsic variants add one additional operand over raw.
3832  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3833  Register VIndex;
3834  int OpOffset = 0;
3835  if (HasVIndex) {
3836  VIndex = MI.getOperand(3).getReg();
3837  OpOffset = 1;
3838  }
3839 
3840  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3841  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3842 
3843  unsigned Format = 0;
3844  if (IsTyped) {
3845  Format = MI.getOperand(5 + OpOffset).getImm();
3846  ++OpOffset;
3847  }
3848 
3849  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3850  unsigned ImmOffset;
3851  unsigned TotalOffset;
3852 
3853  LLT Ty = MRI.getType(Dst);
3854  LLT EltTy = Ty.getScalarType();
3855  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3856  const bool Unpacked = ST.hasUnpackedD16VMem();
3857 
3858  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3859  if (TotalOffset != 0)
3860  MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3861 
3862  unsigned Opc;
3863 
3864  if (IsTyped) {
3865  Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3866  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3867  } else if (IsFormat) {
3868  Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3869  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3870  } else {
3871  switch (MemSize) {
3872  case 1:
3873  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3874  break;
3875  case 2:
3876  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3877  break;
3878  default:
3879  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3880  break;
3881  }
3882  }
3883 
3884  Register LoadDstReg;
3885 
3886  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3887  LLT UnpackedTy = Ty.changeElementSize(32);
3888 
3889  if (IsExtLoad)
3890  LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3891  else if (Unpacked && IsD16 && Ty.isVector())
3892  LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3893  else
3894  LoadDstReg = Dst;
3895 
3896  if (!VIndex)
3897  VIndex = B.buildConstant(S32, 0).getReg(0);
3898 
3899  auto MIB = B.buildInstr(Opc)
3900  .addDef(LoadDstReg) // vdata
3901  .addUse(RSrc) // rsrc
3902  .addUse(VIndex) // vindex
3903  .addUse(VOffset) // voffset
3904  .addUse(SOffset) // soffset
3905  .addImm(ImmOffset); // offset(imm)
3906 
3907  if (IsTyped)
3908  MIB.addImm(Format);
3909 
3910  MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3911  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3912  .addMemOperand(MMO);
3913 
3914  if (LoadDstReg != Dst) {
3915  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3916 
3917  // The result for extending loads was widened; truncate it back down.
3918  if (IsExtLoad)
3919  B.buildTrunc(Dst, LoadDstReg);
3920  else {
3921  // Repack to original 16-bit vector result
3922  // FIXME: G_TRUNC should work, but legalization currently fails
3923  auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3924  SmallVector<Register, 4> Repack;
3925  for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3926  Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3927  B.buildMerge(Dst, Repack);
3928  }
3929  }
3930 
3931  MI.eraseFromParent();
3932  return true;
3933 }
3934 
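// The load path mirrors the store path but widens sub-dword and scalar d16
// results and narrows them back after the pseudo, so e.g. an s8 raw buffer
// load is built roughly as:
//
//   %wide:_(s32) = G_AMDGPU_BUFFER_LOAD_UBYTE ...   // same operand order as
//                                                   // the store pseudo
//   %dst:_(s8)   = G_TRUNC %wide
//
// while an unpacked d16 vector result is unmerged per dword, truncated, and
// re-merged into the original 16-bit vector type.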
3937  bool IsInc) const {
3938  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3939  AMDGPU::G_AMDGPU_ATOMIC_DEC;
3940  B.buildInstr(Opc)
3941  .addDef(MI.getOperand(0).getReg())
3942  .addUse(MI.getOperand(2).getReg())
3943  .addUse(MI.getOperand(3).getReg())
3944  .cloneMemRefs(MI);
3945  MI.eraseFromParent();
3946  return true;
3947 }
3948 
3949 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3950  switch (IntrID) {
3951  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3952  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3953  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3954  case Intrinsic::amdgcn_raw_buffer_atomic_add:
3955  case Intrinsic::amdgcn_struct_buffer_atomic_add:
3956  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3957  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3958  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3959  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3960  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3961  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3962  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3963  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3964  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3965  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3966  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3967  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3968  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3969  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3970  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3971  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3972  case Intrinsic::amdgcn_raw_buffer_atomic_and:
3973  case Intrinsic::amdgcn_struct_buffer_atomic_and:
3974  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3975  case Intrinsic::amdgcn_raw_buffer_atomic_or:
3976  case Intrinsic::amdgcn_struct_buffer_atomic_or:
3977  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3978  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3979  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3980  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3981  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3982  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3983  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3984  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3985  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3986  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3987  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3988  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3989  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3990  case Intrinsic::amdgcn_buffer_atomic_fadd:
3991  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3992  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3993  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3994  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
3995  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
3996  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
3997  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
3998  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
3999  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4000  default:
4001  llvm_unreachable("unhandled atomic opcode");
4002  }
4003 }
4004 
4007  Intrinsic::ID IID) const {
4008  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4009  IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4010  const bool HasReturn = MI.getNumExplicitDefs() != 0;
4011 
4012  Register Dst;
4013 
4014  int OpOffset = 0;
4015  if (HasReturn) {
4016  // A few FP atomics do not support return values.
4017  Dst = MI.getOperand(0).getReg();
4018  } else {
4019  OpOffset = -1;
4020  }
4021 
4022  Register VData = MI.getOperand(2 + OpOffset).getReg();
4023  Register CmpVal;
4024 
4025  if (IsCmpSwap) {
4026  CmpVal = MI.getOperand(3 + OpOffset).getReg();
4027  ++OpOffset;
4028  }
4029 
4030  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4031  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4032 
4033  // The struct intrinsic variants add one additional operand over raw.
4034  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4035  Register VIndex;
4036  if (HasVIndex) {
4037  VIndex = MI.getOperand(4 + OpOffset).getReg();
4038  ++OpOffset;
4039  }
4040 
4041  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4042  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4043  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4044 
4045  MachineMemOperand *MMO = *MI.memoperands_begin();
4046 
4047  unsigned ImmOffset;
4048  unsigned TotalOffset;
4049  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
4050  if (TotalOffset != 0)
4051  MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
4052 
4053  if (!VIndex)
4054  VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4055 
4056  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4057 
4058  if (HasReturn)
4059  MIB.addDef(Dst);
4060 
4061  MIB.addUse(VData); // vdata
4062 
4063  if (IsCmpSwap)
4064  MIB.addReg(CmpVal);
4065 
4066  MIB.addUse(RSrc) // rsrc
4067  .addUse(VIndex) // vindex
4068  .addUse(VOffset) // voffset
4069  .addUse(SOffset) // soffset
4070  .addImm(ImmOffset) // offset(imm)
4071  .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4072  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4073  .addMemOperand(MMO);
4074 
4075  MI.eraseFromParent();
4076  return true;
4077 }
4078 
4079 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4080 /// vector with s16 typed elements.
4082  SmallVectorImpl<Register> &PackedAddrs,
4083  unsigned ArgOffset,
4085  bool IsA16, bool IsG16) {
4086  const LLT S16 = LLT::scalar(16);
4087  const LLT V2S16 = LLT::vector(2, 16);
4088  auto EndIdx = Intr->VAddrEnd;
4089 
4090  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4091  MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4092  if (!SrcOp.isReg())
4093  continue; // _L to _LZ may have eliminated this.
4094 
4095  Register AddrReg = SrcOp.getReg();
4096 
4097  if (I < Intr->GradientStart) {
4098  AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4099  PackedAddrs.push_back(AddrReg);
4100  } else if ((I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4101  (I >= Intr->CoordStart && !IsA16)) {
4102  // Handle any gradient or coordinate operands that should not be packed
4103  PackedAddrs.push_back(AddrReg);
4104  } else {
4105  // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4106  // derivatives dx/dh and dx/dv are packed with undef.
4107  if (((I + 1) >= EndIdx) ||
4108  ((Intr->NumGradients / 2) % 2 == 1 &&
4109  (I == static_cast<unsigned>(Intr->GradientStart +
4110  (Intr->NumGradients / 2) - 1) ||
4111  I == static_cast<unsigned>(Intr->GradientStart +
4112  Intr->NumGradients - 1))) ||
4113  // Check for _L to _LZ optimization
4114  !MI.getOperand(ArgOffset + I + 1).isReg()) {
4115  PackedAddrs.push_back(
4116  B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4117  .getReg(0));
4118  } else {
4119  PackedAddrs.push_back(
4120  B.buildBuildVector(
4121  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4122  .getReg(0));
4123  ++I;
4124  }
4125  }
4126  }
4127 }
4128 
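// For example, an a16 2-D sample has its separate s16 coordinates (u, v)
// packed into a single v2s16 vaddr dword; an odd trailing coordinate, or the
// dangling half of an odd gradient pair, is paired with an undef s16 instead:
//
//   %uv:_(<2 x s16>) = G_BUILD_VECTOR %u:_(s16), %v:_(s16)        // full pair
//   %r:_(<2 x s16>)  = G_BUILD_VECTOR %last:_(s16), %undef:_(s16) // odd tail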
4129 /// Convert from separate vaddr components to a single vector address register,
4130 /// and replace the remaining operands with $noreg.
4132  int DimIdx, int NumVAddrs) {
4133  const LLT S32 = LLT::scalar(32);
4134 
4135  SmallVector<Register, 8> AddrRegs;
4136  for (int I = 0; I != NumVAddrs; ++I) {
4137  MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4138  if (SrcOp.isReg()) {
4139  AddrRegs.push_back(SrcOp.getReg());
4140  assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4141  }
4142  }
4143 
4144  int NumAddrRegs = AddrRegs.size();
4145  if (NumAddrRegs != 1) {
4146  // Round up to 8 elements for v5-v7
4147  // FIXME: Missing intermediate sized register classes and instructions.
4148  if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4149  const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4150  auto Undef = B.buildUndef(S32);
4151  AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4152  NumAddrRegs = RoundedNumRegs;
4153  }
4154 
4155  auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4156  MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4157  }
4158 
4159  for (int I = 1; I != NumVAddrs; ++I) {
4160  MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4161  if (SrcOp.isReg())
4162  MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4163  }
4164 }
4165 
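// For example, with NSA disabled (or too few addresses to use it), five s32
// address operands are padded with undefs up to the next power of two, since
// there is no v5/v6/v7 register class, and folded into one vector operand:
//
//   %vaddr:_(<8 x s32>) = G_BUILD_VECTOR %a0, %a1, %a2, %a3, %a4, %u, %u, %u
//
// The remaining vaddr operands of the intrinsic are then set to $noreg.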
4166 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4167 ///
4168 /// Depending on the subtarget, load/store with 16-bit element data need to be
4169 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4170 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4171 /// registers.
4172 ///
4173 /// We don't want to directly select image instructions just yet, but also want
4174  /// to expose all register repacking to the legalizer/combiners. We also don't
4175  /// want a selected instruction entering RegBankSelect. In order to avoid
4176  /// defining a multitude of intermediate image instructions, directly hack on
4177  /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4178 /// now unnecessary arguments with $noreg.
4181  const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4182 
4183  const unsigned NumDefs = MI.getNumExplicitDefs();
4184  const unsigned ArgOffset = NumDefs + 1;
4185  bool IsTFE = NumDefs == 2;
4186  // We are only processing the operands of d16 image operations on subtargets
4187  // that use the unpacked register layout, or need to repack the TFE result.
4188 
4189  // TODO: Do we need to guard against already legalized intrinsics?
4190  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4191  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4192 
4193  MachineRegisterInfo *MRI = B.getMRI();
4194  const LLT S32 = LLT::scalar(32);
4195  const LLT S16 = LLT::scalar(16);
4196  const LLT V2S16 = LLT::vector(2, 16);
4197 
4198  unsigned DMask = 0;
4199 
4200  // Check for 16 bit addresses and pack if true.
4201  LLT GradTy =
4202  MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4203  LLT AddrTy =
4204  MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4205  const bool IsG16 = GradTy == S16;
4206  const bool IsA16 = AddrTy == S16;
4207 
4208  int DMaskLanes = 0;
4209  if (!BaseOpcode->Atomic) {
4210  DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4211  if (BaseOpcode->Gather4) {
4212  DMaskLanes = 4;
4213  } else if (DMask != 0) {
4214  DMaskLanes = countPopulation(DMask);
4215  } else if (!IsTFE && !BaseOpcode->Store) {
4216  // If dmask is 0, this is a no-op load. This can be eliminated.
4217  B.buildUndef(MI.getOperand(0));
4218  MI.eraseFromParent();
4219  return true;
4220  }
4221  }
4222 
4223  Observer.changingInstr(MI);
4224  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4225 
4226  unsigned NewOpcode = NumDefs == 0 ?
4227  AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4228 
4229  // Track that we legalized this
4230  MI.setDesc(B.getTII().get(NewOpcode));
4231 
4232  // Expecting to get an error flag since TFC is on and dmask is 0. Force
4233  // dmask to be at least 1, otherwise the instruction will fail.
4234  if (IsTFE && DMask == 0) {
4235  DMask = 0x1;
4236  DMaskLanes = 1;
4237  MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4238  }
4239 
4240  if (BaseOpcode->Atomic) {
4241  Register VData0 = MI.getOperand(2).getReg();
4242  LLT Ty = MRI->getType(VData0);
4243 
4244  // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4245  if (Ty.isVector())
4246  return false;
4247 
4248  if (BaseOpcode->AtomicX2) {
4249  Register VData1 = MI.getOperand(3).getReg();
4250  // The two values are packed in one register.
4251  LLT PackedTy = LLT::vector(2, Ty);
4252  auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4253  MI.getOperand(2).setReg(Concat.getReg(0));
4254  MI.getOperand(3).setReg(AMDGPU::NoRegister);
4255  }
4256  }
4257 
4258  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4259 
4260  // Optimize _L to _LZ when _L is zero
4261  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4262  AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4263  const ConstantFP *ConstantLod;
4264 
4265  if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4266  m_GFCst(ConstantLod))) {
4267  if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4268  // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4269  const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4271  Intr->Dim);
4272 
4273  // The starting indexes should remain in the same place.
4274  --CorrectedNumVAddrs;
4275 
4276  MI.getOperand(MI.getNumExplicitDefs())
4277  .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4278  MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4279  Intr = NewImageDimIntr;
4280  }
4281  }
4282  }
4283 
4284  // Optimize _mip away, when 'lod' is zero
4285  if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4286  int64_t ConstantLod;
4287  if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4288  m_ICst(ConstantLod))) {
4289  if (ConstantLod == 0) {
4290  // TODO: Change intrinsic opcode and remove the operand instead of replacing
4291  // it with 0, as the _L to _LZ handling is done above.
4292  MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4293  --CorrectedNumVAddrs;
4294  }
4295  }
4296  }
4297 
4298  // Rewrite the addressing register layout before doing anything else.
4299  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4300  // 16 bit gradients are supported, but are tied to the A16 control
4301  // so both gradients and addresses must be 16 bit
4302  return false;
4303  }
4304 
4305  if (IsA16 && !ST.hasA16()) {
4306  // A16 not supported
4307  return false;
4308  }
4309 
4310  if (IsA16 || IsG16) {
4311  if (Intr->NumVAddrs > 1) {
4312  SmallVector<Register, 4> PackedRegs;
4313 
4314  packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4315  IsG16);
4316 
4317  // See also below in the non-a16 branch
4318  const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4319 
4320  if (!UseNSA && PackedRegs.size() > 1) {
4321  LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4322  auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4323  PackedRegs[0] = Concat.getReg(0);
4324  PackedRegs.resize(1);
4325  }
4326 
4327  const unsigned NumPacked = PackedRegs.size();
4328  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4329  MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4330  if (!SrcOp.isReg()) {
4331  assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4332  continue;
4333  }
4334 
4335  assert(SrcOp.getReg() != AMDGPU::NoRegister);
4336 
4337  if (I - Intr->VAddrStart < NumPacked)
4338  SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4339  else
4340  SrcOp.setReg(AMDGPU::NoRegister);
4341  }
4342  }
4343  } else {
4344  // If the register allocator cannot place the address registers contiguously
4345  // without introducing moves, then using the non-sequential address encoding
4346  // is always preferable, since it saves VALU instructions and is usually a
4347  // wash in terms of code size or even better.
4348  //
4349  // However, we currently have no way of hinting to the register allocator
4350  // that MIMG addresses should be placed contiguously when it is possible to
4351  // do so, so force non-NSA for the common 2-address case as a heuristic.
4352  //
4353  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4354  // allocation when possible.
4355  const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4356 
4357  if (!UseNSA && Intr->NumVAddrs > 1)
4358  convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4359  Intr->NumVAddrs);
4360  }
4361 
4362  int Flags = 0;
4363  if (IsA16)
4364  Flags |= 1;
4365  if (IsG16)
4366  Flags |= 2;
4367  MI.addOperand(MachineOperand::CreateImm(Flags));
4368 
4369  if (BaseOpcode->Store) { // No TFE for stores?
4370  // TODO: Handle dmask trim
4371  Register VData = MI.getOperand(1).getReg();
4372  LLT Ty = MRI->getType(VData);
4373  if (!Ty.isVector() || Ty.getElementType() != S16)
4374  return true;
4375 
4376  Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4377  if (RepackedReg != VData) {
4378  MI.getOperand(1).setReg(RepackedReg);
4379  }
4380 
4381  return true;
4382  }
4383 
4384  Register DstReg = MI.getOperand(0).getReg();
4385  LLT Ty = MRI->getType(DstReg);
4386  const LLT EltTy = Ty.getScalarType();
4387  const bool IsD16 = Ty.getScalarType() == S16;
4388  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4389 
4390  // Confirm that the return type is large enough for the dmask specified
4391  if (NumElts < DMaskLanes)
4392  return false;
4393 
4394  if (NumElts > 4 || DMaskLanes > 4)
4395  return false;
4396 
4397  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4398  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4399 
4400  // The raw dword aligned data component of the load. The only legal cases
4401  // where this matters should be when using the packed D16 format, for
4402  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4403  LLT RoundedTy;
4404 
4405  // S32 vector to cover all data, plus the TFE result element.
4406  LLT TFETy;
4407 
4408  // Register type to use for each loaded component. Will be S32 or V2S16.
4409  LLT RegTy;
4410 
4411  if (IsD16 && ST.hasUnpackedD16VMem()) {
4412  RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4413  TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4414  RegTy = S32;
4415  } else {
4416  unsigned EltSize = EltTy.getSizeInBits();
4417  unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4418  unsigned RoundedSize = 32 * RoundedElts;
4419  RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4420  TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4421  RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4422  }
4423 
4424  // The return type does not need adjustment.
4425  // TODO: Should we change s16 case to s32 or <2 x s16>?
4426  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4427  return true;
4428 
4429  Register Dst1Reg;
4430 
4431  // Insert after the instruction.
4432  B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4433 
4434  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4435  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4436  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4437  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4438 
4439  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4440 
4441  MI.getOperand(0).setReg(NewResultReg);
4442 
4443  // In the IR, TFE is supposed to be used with a 2 element struct return
4444  // type. The instruction really returns these two values in one contiguous
4445  // register, with one additional dword beyond the loaded data. Rewrite the
4446  // return type to use a single register result.
4447 
4448  if (IsTFE) {
4449  Dst1Reg = MI.getOperand(1).getReg();
4450  if (MRI->getType(Dst1Reg) != S32)
4451  return false;
4452 
4453  // TODO: Make sure the TFE operand bit is set.
4454  MI.RemoveOperand(1);
4455 
4456  // Handle the easy case that requires no repack instructions.
4457  if (Ty == S32) {
4458  B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4459  return true;
4460  }
4461  }
4462 
4463  // Now figure out how to copy the new result register back into the old
4464  // result.
4465  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4466 
4467  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4468 
4469  if (ResultNumRegs == 1) {
4470  assert(!IsTFE);
4471  ResultRegs[0] = NewResultReg;
4472  } else {
4473  // We have to repack into a new vector of some kind.
4474  for (int I = 0; I != NumDataRegs; ++I)
4475  ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4476  B.buildUnmerge(ResultRegs, NewResultReg);
4477 
4478  // Drop the final TFE element to get the data part. The TFE result is
4479  // directly written to the right place already.
4480  if (IsTFE)
4481  ResultRegs.resize(NumDataRegs);
4482  }
4483 
4484  // For an s16 scalar result, we form an s32 result with a truncate regardless
4485  // of packed vs. unpacked.
4486  if (IsD16 && !Ty.isVector()) {
4487  B.buildTrunc(DstReg, ResultRegs[0]);
4488  return true;
4489  }
4490 
4491  // Avoid a build/concat_vector of 1 entry.
4492  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4493  B.buildBitcast(DstReg, ResultRegs[0]);
4494  return true;
4495  }
4496 
4497  assert(Ty.isVector());
4498 
4499  if (IsD16) {
4500  // For packed D16 results with TFE enabled, all the data components are
4501  // S32. Cast back to the expected type.
4502  //
4503  // TODO: We don't really need to load s32 elements. We would only need one
4504  // cast for the TFE result if a multiple of v2s16 was used.
4505  if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4506  for (Register &Reg : ResultRegs)
4507  Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4508  } else if (ST.hasUnpackedD16VMem()) {
4509  for (Register &Reg : ResultRegs)
4510  Reg = B.buildTrunc(S16, Reg).getReg(0);
4511  }
4512  }
4513 
4514  auto padWithUndef = [&](LLT Ty, int NumElts) {
4515  if (NumElts == 0)
4516  return;
4517  Register Undef = B.buildUndef(Ty).getReg(0);
4518  for (int I = 0; I != NumElts; ++I)
4519  ResultRegs.push_back(Undef);
4520  };
4521 
4522  // Pad out any elements eliminated due to the dmask.
4523  LLT ResTy = MRI->getType(ResultRegs[0]);
4524  if (!ResTy.isVector()) {
4525  padWithUndef(ResTy, NumElts - ResultRegs.size());
4526  B.buildBuildVector(DstReg, ResultRegs);
4527  return true;
4528  }
4529 
4530  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4531  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4532 
4533  // Deal with the one annoying legal case.
4534  const LLT V3S16 = LLT::vector(3, 16);
4535  if (Ty == V3S16) {
4536  padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4537  auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4538  B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4539  return true;
4540  }
4541 
4542  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4543  B.buildConcatVectors(DstReg, ResultRegs);
4544  return true;
4545 }
4546 
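// For example, a TFE load with dmask = 1 and an s32 result is rewritten to
// return one contiguous register that is split after the instruction:
//
//   %wide:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD ...
//   %data:_(s32), %tfe:_(s32) = G_UNMERGE_VALUES %wide
//
// Packed d16 and dmask-trimmed results are similarly unmerged, bitcast or
// truncated, padded with undef, and rebuilt to the original return type.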
4548  LegalizerHelper &Helper, MachineInstr &MI) const {
4549  MachineIRBuilder &B = Helper.MIRBuilder;
4550  GISelChangeObserver &Observer = Helper.Observer;
4551 
4552  Register Dst = MI.getOperand(0).getReg();
4553  LLT Ty = B.getMRI()->getType(Dst);
4554  unsigned Size = Ty.getSizeInBits();
4555  MachineFunction &MF = B.getMF();
4556 
4557  Observer.changingInstr(MI);
4558 
4559  if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4560  Ty = getBitcastRegisterType(Ty);
4561  Helper.bitcastDst(MI, Ty, 0);
4562  Dst = MI.getOperand(0).getReg();
4563  B.setInsertPt(B.getMBB(), MI);
4564  }
4565 
4566  // FIXME: We don't really need this intermediate instruction. The intrinsic
4567  // should be fixed to have a memory operand. Since it's readnone, we're not
4568  // allowed to add one.
4569  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4570  MI.RemoveOperand(1); // Remove intrinsic ID
4571 
4572  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4573  // TODO: Should this use datalayout alignment?
4574  const unsigned MemSize = (Size + 7) / 8;
4575  const Align MemAlign(4);
4580  MemSize, MemAlign);
4581  MI.addMemOperand(MF, MMO);
4582 
4583  // There are no 96-bit result scalar loads, but widening to 128-bit should
4584  // always be legal. We may need to restore this to a 96-bit result if it turns
4585  // out this needs to be converted to a vector load during RegBankSelect.
4586  if (!isPowerOf2_32(Size)) {
4587  if (Ty.isVector())
4588  Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4589  else
4590  Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4591  }
4592 
4593  Observer.changedInstr(MI);
4594  return true;
4595 }
4596 
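// For example, a 96-bit s_buffer_load result has no scalar load encoding, so
// the pseudo is widened to 128 bits while the memory operand keeps the
// original 12-byte size, roughly:
//
//   %wide:_(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD %rsrc, %offset, 0 :: (load 12)
//
// and the original <3 x s32> value is recovered from the widened result by
// the legalizer helper.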
4597 // TODO: Move to selection
4600  MachineIRBuilder &B) const {
4601  if (!ST.isTrapHandlerEnabled() ||
4603  return legalizeTrapEndpgm(MI, MRI, B);
4604 
4605  if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4606  switch (*HsaAbiVer) {
4609  return legalizeTrapHsaQueuePtr(MI, MRI, B);
4611  return ST.supportsGetDoorbellID() ?
4612  legalizeTrapHsa(MI, MRI, B) :
4614  }
4615  }
4616 
4617  llvm_unreachable("Unknown trap handler");
4618 }
4619 
4622  B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4623  MI.eraseFromParent();
4624  return true;
4625 }
4626 
4629  // Pass queue pointer to trap handler as input, and insert trap instruction
4630  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4631  Register LiveIn =
4634  return false;
4635 
4636  Register SGPR01(AMDGPU::SGPR0_SGPR1);
4637  B.buildCopy(SGPR01, LiveIn);
4638  B.buildInstr(AMDGPU::S_TRAP)
4639  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4640  .addReg(SGPR01, RegState::Implicit);
4641 
4642  MI.eraseFromParent();
4643  return true;
4644 }
4645 
4648  B.buildInstr(AMDGPU::S_TRAP)
4649  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4650  MI.eraseFromParent();
4651  return true;
4652 }
4653 
4656  // If this is a non-HSA path or the trap handler is disabled, report a
4657  // warning accordingly.
4658  if (!ST.isTrapHandlerEnabled() ||
4660  DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4661  "debugtrap handler not supported",
4662  MI.getDebugLoc(), DS_Warning);
4663  LLVMContext &Ctx = B.getMF().getFunction().getContext();
4664  Ctx.diagnose(NoTrap);
4665  } else {
4666  // Insert debug-trap instruction
4667  B.buildInstr(AMDGPU::S_TRAP)
4668  .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4669  }
4670 
4671  MI.eraseFromParent();
4672  return true;
4673 }
4674 
4676  MachineIRBuilder &B) const {
4677  MachineRegisterInfo &MRI = *B.getMRI();
4678  const LLT S16 = LLT::scalar(16);
4679  const LLT S32 = LLT::scalar(32);
4680 
4681  Register DstReg = MI.getOperand(0).getReg();
4682  Register NodePtr = MI.getOperand(2).getReg();
4683  Register RayExtent = MI.getOperand(3).getReg();
4684  Register RayOrigin = MI.getOperand(4).getReg();
4685  Register RayDir = MI.getOperand(5).getReg();
4686  Register RayInvDir = MI.getOperand(6).getReg();
4687  Register TDescr = MI.getOperand(7).getReg();
4688 
4689  bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4690  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4691  unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4692  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4693  : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4694  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4695 
4697  if (Is64) {
4698  auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4699  Ops.push_back(Unmerge.getReg(0));
4700  Ops.push_back(Unmerge.getReg(1));
4701  } else {
4702  Ops.push_back(NodePtr);
4703  }
4704  Ops.push_back(RayExtent);
4705 
4706  auto packLanes = [&Ops, &S32, &B] (Register Src) {
4707  auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4708  Ops.push_back(Unmerge.getReg(0));
4709  Ops.push_back(Unmerge.getReg(1));
4710  Ops.push_back(Unmerge.getReg(2));
4711  };
4712 
4713  packLanes(RayOrigin);
4714  if (IsA16) {
4715  auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4716  auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4720  B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4721  B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4722  B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4723  Ops.push_back(R1);
4724  Ops.push_back(R2);
4725  Ops.push_back(R3);
4726  } else {
4727  packLanes(RayDir);
4728  packLanes(RayInvDir);
4729  }
4730 
4731  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4732  .addDef(DstReg)
4733  .addImm(Opcode);
4734 
4735  for (Register R : Ops) {
4736  MIB.addUse(R);
4737  }
4738 
4739  MIB.addUse(TDescr)
4740  .addImm(IsA16 ? 1 : 0)
4741  .cloneMemRefs(MI);
4742 
4743  MI.eraseFromParent();
4744  return true;
4745 }
4746 
4748  MachineInstr &MI) const {
4749  MachineIRBuilder &B = Helper.MIRBuilder;
4750  MachineRegisterInfo &MRI = *B.getMRI();
4751 
4752  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4753  auto IntrID = MI.getIntrinsicID();
4754  switch (IntrID) {
4755  case Intrinsic::amdgcn_if:
4756  case Intrinsic::amdgcn_else: {
4757  MachineInstr *Br = nullptr;
4758  MachineBasicBlock *UncondBrTarget = nullptr;
4759  bool Negated = false;
4760  if (MachineInstr *BrCond =
4761  verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4762  const SIRegisterInfo *TRI
4763  = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4764 
4765  Register Def = MI.getOperand(1).getReg();
4766  Register Use = MI.getOperand(3).getReg();
4767 
4768  MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4769 
4770  if (Negated)
4771  std::swap(CondBrTarget, UncondBrTarget);
4772 
4773  B.setInsertPt(B.getMBB(), BrCond->getIterator());
4774  if (IntrID == Intrinsic::amdgcn_if) {
4775  B.buildInstr(AMDGPU::SI_IF)
4776  .addDef(Def)
4777  .addUse(Use)
4778  .addMBB(UncondBrTarget);
4779  } else {
4780  B.buildInstr(AMDGPU::SI_ELSE)
4781  .addDef(Def)
4782  .addUse(Use)
4783  .addMBB(UncondBrTarget);
4784  }
4785 
4786  if (Br) {
4787  Br->getOperand(0).setMBB(CondBrTarget);
4788  } else {
4789  // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4790  // since we're swapping branch targets it needs to be reinserted.
4791  // FIXME: IRTranslator should probably not do this
4792  B.buildBr(*CondBrTarget);
4793  }
4794 
4795  MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4796  MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4797  MI.eraseFromParent();
4798  BrCond->eraseFromParent();
4799  return true;
4800  }
4801 
4802  return false;
4803  }
4804  case Intrinsic::amdgcn_loop: {
4805  MachineInstr *Br = nullptr;
4806  MachineBasicBlock *UncondBrTarget = nullptr;
4807  bool Negated = false;
4808  if (MachineInstr *BrCond =
4809  verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4810  const SIRegisterInfo *TRI
4811  = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4812 
4813  MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4814  Register Reg = MI.getOperand(2).getReg();
4815 
4816  if (Negated)
4817  std::swap(CondBrTarget, UncondBrTarget);
4818 
4819  B.setInsertPt(B.getMBB(), BrCond->getIterator());
4820  B.buildInstr(AMDGPU::SI_LOOP)
4821  .addUse(Reg)
4822  .addMBB(UncondBrTarget);
4823 
4824  if (Br)
4825  Br->getOperand(0).setMBB(CondBrTarget);
4826  else
4827  B.buildBr(*CondBrTarget);
4828 
4829  MI.eraseFromParent();
4830  BrCond->eraseFromParent();
4831  MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4832  return true;
4833  }
4834 
4835  return false;
4836  }
4837  case Intrinsic::amdgcn_kernarg_segment_ptr:
4838  if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4839  // This only makes sense to call in a kernel, so just lower to null.
4840  B.buildConstant(MI.getOperand(0).getReg(), 0);
4841  MI.eraseFromParent();
4842  return true;
4843  }
4844 
4847  case Intrinsic::amdgcn_implicitarg_ptr:
4848  return legalizeImplicitArgPtr(MI, MRI, B);
4849  case Intrinsic::amdgcn_workitem_id_x:
4852  case Intrinsic::amdgcn_workitem_id_y:
4855  case Intrinsic::amdgcn_workitem_id_z:
4858  case Intrinsic::amdgcn_workgroup_id_x:
4861  case Intrinsic::amdgcn_workgroup_id_y:
4864  case Intrinsic::amdgcn_workgroup_id_z:
4867  case Intrinsic::amdgcn_dispatch_ptr:
4870  case Intrinsic::amdgcn_queue_ptr:
4873  case Intrinsic::amdgcn_implicit_buffer_ptr:
4876  case Intrinsic::amdgcn_dispatch_id:
4879  case Intrinsic::amdgcn_fdiv_fast:
4880  return legalizeFDIVFastIntrin(MI, MRI, B);
4881  case Intrinsic::amdgcn_is_shared:
4883  case Intrinsic::amdgcn_is_private:
4885  case Intrinsic::amdgcn_wavefrontsize: {
4886  B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4887  MI.eraseFromParent();
4888  return true;
4889  }
4890  case Intrinsic::amdgcn_s_buffer_load:
4891  return legalizeSBufferLoad(Helper, MI);
4892  case Intrinsic::amdgcn_raw_buffer_store:
4893  case Intrinsic::amdgcn_struct_buffer_store:
4894  return legalizeBufferStore(MI, MRI, B, false, false);
4895  case Intrinsic::amdgcn_raw_buffer_store_format:
4896  case Intrinsic::amdgcn_struct_buffer_store_format:
4897  return legalizeBufferStore(MI, MRI, B, false, true);
4898  case Intrinsic::amdgcn_raw_tbuffer_store:
4899  case Intrinsic::amdgcn_struct_tbuffer_store:
4900  return legalizeBufferStore(MI, MRI, B, true, true);
4901  case Intrinsic::amdgcn_raw_buffer_load:
4902  case Intrinsic::amdgcn_struct_buffer_load:
4903  return legalizeBufferLoad(MI, MRI, B, false, false);
4904  case Intrinsic::amdgcn_raw_buffer_load_format:
4905  case Intrinsic::amdgcn_struct_buffer_load_format:
4906  return legalizeBufferLoad(MI, MRI, B, true, false);
4907  case Intrinsic::amdgcn_raw_tbuffer_load:
4908  case Intrinsic::amdgcn_struct_tbuffer_load:
4909  return legalizeBufferLoad(MI, MRI, B, true, true);
4910  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4911  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4912  case Intrinsic::amdgcn_raw_buffer_atomic_add:
4913  case Intrinsic::amdgcn_struct_buffer_atomic_add:
4914  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4915  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4916  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4917  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4918  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4919  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4920  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4921  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4922  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4923  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4924  case Intrinsic::amdgcn_raw_buffer_atomic_and:
4925  case Intrinsic::amdgcn_struct_buffer_atomic_and:
4926  case Intrinsic::amdgcn_raw_buffer_atomic_or:
4927  case Intrinsic::amdgcn_struct_buffer_atomic_or:
4928  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4929  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4930  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4931  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4932  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4933  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4934  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4935  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4936  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4937  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4938  case Intrinsic::amdgcn_buffer_atomic_fadd:
4939  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4940  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4941  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4942  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4943  return legalizeBufferAtomic(MI, B, IntrID);
4944  case Intrinsic::amdgcn_atomic_inc:
4945  return legalizeAtomicIncDec(MI, B, true);
4946  case Intrinsic::amdgcn_atomic_dec:
4947  return legalizeAtomicIncDec(MI, B, false);
4948  case Intrinsic::trap:
4949  return legalizeTrapIntrinsic(MI, MRI, B);
4950  case Intrinsic::debugtrap:
4951  return legalizeDebugTrapIntrinsic(MI, MRI, B);
4952  case Intrinsic::amdgcn_rsq_clamp:
4953  return legalizeRsqClampIntrinsic(MI, MRI, B);
4954  case Intrinsic::amdgcn_ds_fadd:
4955  case Intrinsic::amdgcn_ds_fmin:
4956  case Intrinsic::amdgcn_ds_fmax:
4957  return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4958  case Intrinsic::amdgcn_image_bvh_intersect_ray:
4959  return legalizeBVHIntrinsic(MI, B);
4960  default: {
4961  if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4963  return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4964  return true;
4965  }
4966  }
4967 
4968  return true;
4969 }
const TargetRegisterInfo * getTargetRegisterInfo() const
Definition: MachineRegisterInfo.h:153
llvm::MachineMemOperand::MOInvariant
@ MOInvariant
The memory access always returns the same value (or traps).
Definition: MachineMemOperand.h:144
llvm::CmpInst::ICMP_NE
@ ICMP_NE
not equal
Definition: InstrTypes.h:744
llvm::AMDGPU::getHsaAbiVersion
Optional< uint8_t > getHsaAbiVersion(const MCSubtargetInfo *STI)
Definition: AMDGPUBaseInfo.cpp:91
llvm::LegalityPredicates::typeIsNot
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Definition: LegalizerInfo.h:250
llvm::AMDGPUSubtarget::SEA_ISLANDS
@ SEA_ISLANDS
Definition: AMDGPUSubtarget.h:38
llvm::AMDGPULegalizerInfo::loadInputValue
bool loadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
Definition: AMDGPULegalizerInfo.cpp:2744
llvm::MachineRegisterInfo::use_instr_nodbg_begin
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
Definition: MachineRegisterInfo.h:535
loadStoreBitcastWorkaround
static bool loadStoreBitcastWorkaround(const LLT Ty)
Definition: AMDGPULegalizerInfo.cpp:333
llvm::AMDGPULegalizerInfo::legalizeShuffleVector
bool legalizeShuffleVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Definition: AMDGPULegalizerInfo.cpp:2190
shouldWidenLoad
static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, unsigned AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
Definition: AMDGPULegalizerInfo.cpp:374
sizeIsMultipleOf32
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
Definition: AMDGPULegalizerInfo.cpp:79
llvm::CmpInst::ICMP_SGT
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:749
llvm::LLT::getScalarType
LLT getScalarType() const
Definition: LowLevelTypeImpl.h:121
llvm::Function::getContext
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:327
llvm::AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE
@ WIDTH_M1_SRC_PRIVATE_BASE
Definition: SIDefines.h:399
Shift
bool Shift
Definition: README.txt:468
llvm::AMDGPULegalizerInfo::legalizeFrem
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Definition: AMDGPULegalizerInfo.cpp:1961
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::ConstantFP::isZero
bool isZero() const
Return true if the value is positive or negative zero.
Definition: Constants.h:299
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:46
llvm::LegalizerHelper::lowerFMad
LegalizeResult lowerFMad(MachineInstr &MI)
Definition: LegalizerHelper.cpp:6227
llvm::LegalityPredicates::largerThan
LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
Definition: LegalityPredicates.cpp:125
llvm::getFunctionLiveInPhysReg
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition: Utils.cpp:625
llvm::tgtok::Bits
@ Bits
Definition: TGLexer.h:50
llvm::FloatToBits
uint32_t FloatToBits(float Float)
This function takes a float and returns the bit equivalent 32-bit integer.
Definition: MathExtras.h:663
llvm::LegalityPredicates::scalarNarrowerThan
LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
Definition: LegalityPredicates.cpp:101
llvm::GCNSubtarget::isTrapHandlerEnabled
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:528
llvm::MachineInstr::FmAfn
@ FmAfn
Definition: MachineInstr.h:98
llvm::MachineMemOperand
A description of a memory reference used in the backend.
Definition: MachineMemOperand.h:127
llvm::MachineMemOperand::MODereferenceable
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
Definition: MachineMemOperand.h:142
llvm::LegalityPredicates::elementTypeIs
LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
Definition: LegalityPredicates.cpp:93
llvm::SITargetLowering::shouldEmitFixup
bool shouldEmitFixup(const GlobalValue *GV) const
Definition: SIISelLowering.cpp:4968
llvm::GCNSubtarget::hasFractBug
bool hasFractBug() const
Definition: GCNSubtarget.h:330
llvm::GCNSubtarget::hasNSAEncoding
bool hasNSAEncoding() const
Definition: GCNSubtarget.h:873
llvm::GCNSubtarget::TrapID::LLVMAMDHSADebugTrap
@ LLVMAMDHSADebugTrap
llvm::Optional
Definition: APInt.h:33
llvm::AMDGPU::SIModeRegisterDefaults
Definition: AMDGPUBaseInfo.h:902
llvm::AMDGPULegalizerInfo::legalizeInsertVectorElt
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Definition: AMDGPULegalizerInfo.cpp:2157
llvm::LLT::scalarOrVector
static LLT scalarOrVector(uint16_t NumElements, LLT ScalarTy)
Definition: LowLevelTypeImpl.h:74
Offset
uint64_t Offset
Definition: ELFObjHandler.cpp:81
getDSFPAtomicOpcode
static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID)
Definition: AMDGPULegalizerInfo.cpp:3512
llvm::GCNSubtarget
Definition: GCNSubtarget.h:38
llvm::AMDGPULegalizerInfo::legalizeSBufferLoad
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
Definition: AMDGPULegalizerInfo.cpp:4547
llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: