LLVM  12.0.0git
AMDGPULegalizerInfo.cpp
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/ADT/ScopeExit.h"
25 #include "llvm/IR/DiagnosticInfo.h"
26 #include "llvm/IR/IntrinsicsAMDGPU.h"
27 
28 #define DEBUG_TYPE "amdgpu-legalinfo"
29 
30 using namespace llvm;
31 using namespace LegalizeActions;
32 using namespace LegalizeMutations;
33 using namespace LegalityPredicates;
34 using namespace MIPatternMatch;
35 
36 // Hack until load/store selection patterns support any tuple of legal types.
37 static cl::opt<bool> EnableNewLegality(
38  "amdgpu-global-isel-new-legality",
39  cl::desc("Use GlobalISel desired legality, rather than try to use "
40  "rules compatible with selection patterns"),
41  cl::init(false),
42  cl::ReallyHidden);
43 
44 static constexpr unsigned MaxRegisterSize = 1024;
45 
46 // Round the number of elements to the next power of two elements
47 static LLT getPow2VectorType(const LLT Ty) {
48  unsigned NElts = Ty.getNumElements();
49  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
50  return Ty.changeNumElements(Pow2NElts);
51 }
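// Worked example (illustrative, derived from the function above): a <3 x s16>
// input has Log2_32_Ceil(3) == 2, so it rounds up to <4 x s16>; a <5 x s32>
// input rounds up to <8 x s32>.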
52 
53 // Round the number of bits to the next power of two bits
54 static LLT getPow2ScalarType(const LLT Ty) {
55  unsigned Bits = Ty.getSizeInBits();
56  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
57  return LLT::scalar(Pow2Bits);
58 }
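// Worked example: an s48 input rounds up to s64, and an s20 input rounds up
// to s32.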
59 
60 /// \returns true if this is an odd sized vector which should be widened by adding an
61 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
62 /// excludes s1 vectors, which should always be scalarized.
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64  return [=](const LegalityQuery &Query) {
65  const LLT Ty = Query.Types[TypeIdx];
66  if (!Ty.isVector())
67  return false;
68 
69  const LLT EltTy = Ty.getElementType();
70  const unsigned EltSize = EltTy.getSizeInBits();
71  return Ty.getNumElements() % 2 != 0 &&
72  EltSize > 1 && EltSize < 32 &&
73  Ty.getSizeInBits() % 32 != 0;
74  };
75 }
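// For illustration: <3 x s16> (odd element count, 48 bits) matches and is
// widened to <4 x s16>, while <3 x s32> does not match (its elements are
// already 32 bits) and <5 x s1> does not (s1 vectors are scalarized instead).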
76 
77 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
78  return [=](const LegalityQuery &Query) {
79  const LLT Ty = Query.Types[TypeIdx];
80  return Ty.getSizeInBits() % 32 == 0;
81  };
82 }
83 
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85  return [=](const LegalityQuery &Query) {
86  const LLT Ty = Query.Types[TypeIdx];
87  const LLT EltTy = Ty.getScalarType();
88  return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89  };
90 }
91 
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93  return [=](const LegalityQuery &Query) {
94  const LLT Ty = Query.Types[TypeIdx];
95  const LLT EltTy = Ty.getElementType();
96  return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97  };
98 }
99 
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101  return [=](const LegalityQuery &Query) {
102  const LLT Ty = Query.Types[TypeIdx];
103  const LLT EltTy = Ty.getElementType();
104  unsigned Size = Ty.getSizeInBits();
105  unsigned Pieces = (Size + 63) / 64;
106  unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107  return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108  };
109 }
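// Worked example: <16 x s16> is 256 bits, so Pieces == 4 and the result is
// <4 x s16> (64 bits); <3 x s32> (96 bits) gives Pieces == 2 and <2 x s32>.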
110 
111 // Increase the number of vector elements to reach the next multiple of 32-bit
112 // type.
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114  return [=](const LegalityQuery &Query) {
115  const LLT Ty = Query.Types[TypeIdx];
116 
117  const LLT EltTy = Ty.getElementType();
118  const int Size = Ty.getSizeInBits();
119  const int EltSize = EltTy.getSizeInBits();
120  const int NextMul32 = (Size + 31) / 32;
121 
122  assert(EltSize < 32);
123 
124  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125  return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126  };
127 }
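// Worked example: <3 x s8> is 24 bits, so NextMul32 == 1 and the result is
// <4 x s8> (32 bits); <5 x s16> (80 bits) widens to <6 x s16> (96 bits).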
128 
129 static LLT getBitcastRegisterType(const LLT Ty) {
130  const unsigned Size = Ty.getSizeInBits();
131 
132  LLT CoercedTy;
133  if (Size <= 32) {
134  // <2 x s8> -> s16
135  // <4 x s8> -> s32
136  return LLT::scalar(Size);
137  }
138 
139  return LLT::scalarOrVector(Size / 32, 32);
140 }
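// For illustration: <8 x s16> (128 bits) is coerced to <4 x s32> and
// <6 x s16> (96 bits) to <3 x s32>, while <2 x s8> and <4 x s8> hit the
// scalar cases shown in the comment above.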
141 
142 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
143  return [=](const LegalityQuery &Query) {
144  const LLT Ty = Query.Types[TypeIdx];
145  return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
146  };
147 }
148 
149 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
150  return [=](const LegalityQuery &Query) {
151  const LLT Ty = Query.Types[TypeIdx];
152  unsigned Size = Ty.getSizeInBits();
153  assert(Size % 32 == 0);
154  return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
155  };
156 }
157 
158 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
159  return [=](const LegalityQuery &Query) {
160  const LLT QueryTy = Query.Types[TypeIdx];
161  return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
162  };
163 }
164 
165 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
166  return [=](const LegalityQuery &Query) {
167  const LLT QueryTy = Query.Types[TypeIdx];
168  return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
169  };
170 }
171 
172 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
173  return [=](const LegalityQuery &Query) {
174  const LLT QueryTy = Query.Types[TypeIdx];
175  return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
176  };
177 }
178 
179 static bool isRegisterSize(unsigned Size) {
180  return Size % 32 == 0 && Size <= MaxRegisterSize;
181 }
182 
183 static bool isRegisterVectorElementType(LLT EltTy) {
184  const int EltSize = EltTy.getSizeInBits();
185  return EltSize == 16 || EltSize % 32 == 0;
186 }
187 
188 static bool isRegisterVectorType(LLT Ty) {
189  const int EltSize = Ty.getElementType().getSizeInBits();
190  return EltSize == 32 || EltSize == 64 ||
191  (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
192  EltSize == 128 || EltSize == 256;
193 }
194 
195 static bool isRegisterType(LLT Ty) {
196  if (!isRegisterSize(Ty.getSizeInBits()))
197  return false;
198 
199  if (Ty.isVector())
200  return isRegisterVectorType(Ty);
201 
202  return true;
203 }
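// For illustration: s32, s96, v2s16 and v4s32 are register types, while
// v3s16 (48 bits) and v2s8 (16 bits) are not, since their total size is not
// a multiple of 32 bits.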
204 
205 // Any combination of 32 or 64-bit elements up to the maximum register size, and
206 // multiples of v2s16.
207 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
208  return [=](const LegalityQuery &Query) {
209  return isRegisterType(Query.Types[TypeIdx]);
210  };
211 }
212 
213 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
214  return [=](const LegalityQuery &Query) {
215  const LLT QueryTy = Query.Types[TypeIdx];
216  if (!QueryTy.isVector())
217  return false;
218  const LLT EltTy = QueryTy.getElementType();
219  return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
220  };
221 }
222 
223 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
224  return [=](const LegalityQuery &Query) {
225  const LLT Ty = Query.Types[TypeIdx];
226  return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
227  Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
228  };
229 }
230 
231 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
232 // handle some operations by just promoting the register during
233 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
234 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
235  bool IsLoad) {
236  switch (AS) {
237  case AMDGPUAS::PRIVATE_ADDRESS:
238  // FIXME: Private element size.
239  return ST.enableFlatScratch() ? 128 : 32;
240  case AMDGPUAS::LOCAL_ADDRESS:
241  return ST.useDS128() ? 128 : 64;
242  case AMDGPUAS::GLOBAL_ADDRESS:
243  case AMDGPUAS::CONSTANT_ADDRESS:
244  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
245  // Treat constant and global as identical. SMRD loads are sometimes usable for
246  // global loads (ideally constant address space should be eliminated)
247  // depending on the context. Legality cannot be context dependent, but
248  // RegBankSelect can split the load as necessary depending on the pointer
249  // register bank/uniformity and if the memory is invariant or not written in a
250  // kernel.
251  return IsLoad ? 512 : 128;
252  default:
253  // Flat addresses may contextually need to be split to 32-bit parts if they
254  // may alias scratch depending on the subtarget.
255  return 128;
256  }
257 }
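// For illustration: with flat scratch enabled a private access may be up to
// 128 bits, an LDS access is capped at 64 or 128 bits depending on
// useDS128(), and a global or constant load may be up to 512 bits while a
// store there is capped at 128 bits.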
258 
259 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
260  const LegalityQuery &Query,
261  unsigned Opcode) {
262  const LLT Ty = Query.Types[0];
263 
264  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
265  const bool IsLoad = Opcode != AMDGPU::G_STORE;
266 
267  unsigned RegSize = Ty.getSizeInBits();
268  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
269  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
270  unsigned AS = Query.Types[1].getAddressSpace();
271 
272  // All of these need to be custom lowered to cast the pointer operand.
273  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
274  return false;
275 
276  // TODO: We should be able to widen loads if the alignment is high enough, but
277  // we also need to modify the memory access size.
278 #if 0
279  // Accept widening loads based on alignment.
280  if (IsLoad && MemSize < Size)
281  MemSize = std::max(MemSize, Align);
282 #endif
283 
284  // Only 1-byte and 2-byte to 32-bit extloads are valid.
285  if (MemSize != RegSize && RegSize != 32)
286  return false;
287 
288  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
289  return false;
290 
291  switch (MemSize) {
292  case 8:
293  case 16:
294  case 32:
295  case 64:
296  case 128:
297  break;
298  case 96:
299  if (!ST.hasDwordx3LoadStores())
300  return false;
301  break;
302  case 256:
303  case 512:
304  // These may contextually need to be broken down.
305  break;
306  default:
307  return false;
308  }
309 
310  assert(RegSize >= MemSize);
311 
312  if (AlignBits < MemSize) {
313  const SITargetLowering *TLI = ST.getTargetLowering();
314  if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
315  Align(AlignBits / 8)))
316  return false;
317  }
318 
319  return true;
320 }
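// For illustration (assuming sufficient alignment): an extending load of 8 or
// 16 bits into an s32 result is accepted, but an 8-bit load into s64 is
// rejected (only extloads to 32-bit results are allowed), as is a 96-bit
// access on subtargets without dwordx3 load/stores.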
321 
322 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
323 // workaround this. Eventually it should ignore the type for loads and only care
324 // about the size. Return true in cases where we will workaround this for now by
325 // bitcasting.
326 static bool loadStoreBitcastWorkaround(const LLT Ty) {
327  if (EnableNewLegality)
328  return false;
329 
330  const unsigned Size = Ty.getSizeInBits();
331  if (Size <= 64)
332  return false;
333  if (!Ty.isVector())
334  return true;
335 
336  LLT EltTy = Ty.getElementType();
337  if (EltTy.isPointer())
338  return true;
339 
340  unsigned EltSize = EltTy.getSizeInBits();
341  return EltSize != 32 && EltSize != 64;
342 }
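// For illustration: s96 and <6 x s16> (96 bits) both return true and are
// typically handled by bitcasting to a 32-bit element vector such as
// <3 x s32>, while <2 x s64> and <4 x s32> return false and are handled
// directly.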
343 
344 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
345  unsigned Opcode) {
346  const LLT Ty = Query.Types[0];
347  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
348  !loadStoreBitcastWorkaround(Ty);
349 }
350 
351 /// Return true if a load or store of the type should be lowered with a bitcast
352 /// to a different type.
353 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
354  const unsigned MemSizeInBits) {
355  const unsigned Size = Ty.getSizeInBits();
356  if (Size != MemSizeInBits)
357  return Size <= 32 && Ty.isVector();
358 
359  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
360  return true;
361  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
362  !isRegisterVectorElementType(Ty.getElementType());
363 }
364 
365 /// Return true if we should legalize a load by widening an odd sized memory
366 /// access up to the alignment. Note this is the case when the memory access itself
367 /// changes, not the size of the result register.
368 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
369  unsigned AlignInBits, unsigned AddrSpace,
370  unsigned Opcode) {
371  // We don't want to widen cases that are naturally legal.
372  if (isPowerOf2_32(SizeInBits))
373  return false;
374 
375  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
376  // end up widening these for a scalar load during RegBankSelect, since there
377  // aren't 96-bit scalar loads.
378  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
379  return false;
380 
381  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
382  return false;
383 
384  // A load is known dereferenceable up to the alignment, so it's legal to widen
385  // to it.
386  //
387  // TODO: Could check dereferenceable for less aligned cases.
388  unsigned RoundedSize = NextPowerOf2(SizeInBits);
389  if (AlignInBits < RoundedSize)
390  return false;
391 
392  // Do not widen if it would introduce a slow unaligned load.
393  const SITargetLowering *TLI = ST.getTargetLowering();
394  bool Fast = false;
395  return TLI->allowsMisalignedMemoryAccessesImpl(
396  RoundedSize, AddrSpace, Align(AlignInBits / 8),
397  MachineMemOperand::MOLoad, &Fast) &&
398  Fast;
399 }
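// For illustration: a dereferenceable 96-bit global load aligned to 128 bits
// may be widened to a 128-bit load on subtargets without dwordx3 load/stores,
// provided the subtarget reports the resulting access as fast.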
400 
401 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
402  unsigned Opcode) {
403  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
404  return false;
405 
406  return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
407  Query.MMODescrs[0].AlignInBits,
408  Query.Types[1].getAddressSpace(), Opcode);
409 }
410 
411 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
412  const GCNTargetMachine &TM)
413  : ST(ST_) {
414  using namespace TargetOpcode;
415 
416  auto GetAddrSpacePtr = [&TM](unsigned AS) {
417  return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
418  };
419 
420  const LLT S1 = LLT::scalar(1);
421  const LLT S8 = LLT::scalar(8);
422  const LLT S16 = LLT::scalar(16);
423  const LLT S32 = LLT::scalar(32);
424  const LLT S64 = LLT::scalar(64);
425  const LLT S128 = LLT::scalar(128);
426  const LLT S256 = LLT::scalar(256);
427  const LLT S512 = LLT::scalar(512);
428  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
429 
430  const LLT V2S8 = LLT::vector(2, 8);
431  const LLT V2S16 = LLT::vector(2, 16);
432  const LLT V4S16 = LLT::vector(4, 16);
433 
434  const LLT V2S32 = LLT::vector(2, 32);
435  const LLT V3S32 = LLT::vector(3, 32);
436  const LLT V4S32 = LLT::vector(4, 32);
437  const LLT V5S32 = LLT::vector(5, 32);
438  const LLT V6S32 = LLT::vector(6, 32);
439  const LLT V7S32 = LLT::vector(7, 32);
440  const LLT V8S32 = LLT::vector(8, 32);
441  const LLT V9S32 = LLT::vector(9, 32);
442  const LLT V10S32 = LLT::vector(10, 32);
443  const LLT V11S32 = LLT::vector(11, 32);
444  const LLT V12S32 = LLT::vector(12, 32);
445  const LLT V13S32 = LLT::vector(13, 32);
446  const LLT V14S32 = LLT::vector(14, 32);
447  const LLT V15S32 = LLT::vector(15, 32);
448  const LLT V16S32 = LLT::vector(16, 32);
449  const LLT V32S32 = LLT::vector(32, 32);
450 
451  const LLT V2S64 = LLT::vector(2, 64);
452  const LLT V3S64 = LLT::vector(3, 64);
453  const LLT V4S64 = LLT::vector(4, 64);
454  const LLT V5S64 = LLT::vector(5, 64);
455  const LLT V6S64 = LLT::vector(6, 64);
456  const LLT V7S64 = LLT::vector(7, 64);
457  const LLT V8S64 = LLT::vector(8, 64);
458  const LLT V16S64 = LLT::vector(16, 64);
459 
460  std::initializer_list<LLT> AllS32Vectors =
461  {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
462  V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
463  std::initializer_list<LLT> AllS64Vectors =
464  {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
465 
466  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
467  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
468  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
469  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
470  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
471  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
472  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
473 
474  const LLT CodePtr = FlatPtr;
475 
476  const std::initializer_list<LLT> AddrSpaces64 = {
477  GlobalPtr, ConstantPtr, FlatPtr
478  };
479 
480  const std::initializer_list<LLT> AddrSpaces32 = {
481  LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
482  };
483 
484  const std::initializer_list<LLT> FPTypesBase = {
485  S32, S64
486  };
487 
488  const std::initializer_list<LLT> FPTypes16 = {
489  S32, S64, S16
490  };
491 
492  const std::initializer_list<LLT> FPTypesPK16 = {
493  S32, S64, S16, V2S16
494  };
495 
496  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
497 
498  setAction({G_BRCOND, S1}, Legal); // VCC branches
499  setAction({G_BRCOND, S32}, Legal); // SCC branches
500 
501  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
502  // elements for v3s16
503  getActionDefinitionsBuilder(G_PHI)
504  .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
505  .legalFor(AllS32Vectors)
506  .legalFor(AllS64Vectors)
507  .legalFor(AddrSpaces64)
508  .legalFor(AddrSpaces32)
509  .legalIf(isPointer(0))
510  .clampScalar(0, S16, S256)
511  .widenScalarToNextPow2(0, 32)
512  .clampMaxNumElements(0, S32, 16)
513  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
514  .scalarize(0);
515 
516  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
517  // Full set of gfx9 features.
518  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
519  .legalFor({S32, S16, V2S16})
520  .clampScalar(0, S16, S32)
521  .clampMaxNumElements(0, S16, 2)
522  .scalarize(0)
523  .widenScalarToNextPow2(0, 32);
524 
525  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
526  .legalFor({S32, S16, V2S16}) // Clamp modifier
527  .minScalarOrElt(0, S16)
528  .clampMaxNumElements(0, S16, 2)
529  .scalarize(0)
530  .widenScalarToNextPow2(0, 32)
531  .lower();
532  } else if (ST.has16BitInsts()) {
533  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534  .legalFor({S32, S16})
535  .clampScalar(0, S16, S32)
536  .scalarize(0)
537  .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
538 
539  // Technically the saturating operations require clamp bit support, but this
540  // was introduced at the same time as 16-bit operations.
541  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
542  .legalFor({S32, S16}) // Clamp modifier
543  .minScalar(0, S16)
544  .scalarize(0)
545  .widenScalarToNextPow2(0, 16)
546  .lower();
547 
548  // We're just lowering this, but it helps get a better result to try to
549  // coerce to the desired type first.
550  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
551  .minScalar(0, S16)
552  .scalarize(0)
553  .lower();
554  } else {
555  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
556  .legalFor({S32})
557  .clampScalar(0, S32, S32)
558  .scalarize(0);
559 
560  if (ST.hasIntClamp()) {
561  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
562  .legalFor({S32}) // Clamp modifier.
563  .scalarize(0)
564  .minScalarOrElt(0, S32)
565  .lower();
566  } else {
567  // Clamp bit support was added in VI, along with 16-bit operations.
568  getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
569  .minScalar(0, S32)
570  .scalarize(0)
571  .lower();
572  }
573 
574  // FIXME: DAG expansion gets better results. The widening uses the smaller
575  // range values and goes for the min/max lowering directly.
576  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
577  .minScalar(0, S32)
578  .scalarize(0)
579  .lower();
580  }
581 
582  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
583  .customFor({S32, S64})
584  .clampScalar(0, S32, S64)
585  .widenScalarToNextPow2(0, 32)
586  .scalarize(0);
587 
588  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
589  .legalFor({S32})
590  .maxScalarOrElt(0, S32);
591 
592  if (ST.hasVOP3PInsts()) {
593  Mulh
594  .clampMaxNumElements(0, S8, 2)
595  .lowerFor({V2S8});
596  }
597 
598  Mulh
599  .scalarize(0)
600  .lower();
601 
602  // Report legal for any types we can handle anywhere. For the cases only legal
603  // on the SALU, RegBankSelect will be able to re-legalize.
604  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
605  .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
606  .clampScalar(0, S32, S64)
607  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
608  .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
609  .widenScalarToNextPow2(0)
610  .scalarize(0);
611 
612  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
613  G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
614  .legalFor({{S32, S1}, {S32, S32}})
615  .minScalar(0, S32)
616  // TODO: .scalarize(0)
617  .lower();
618 
619  getActionDefinitionsBuilder(G_BITCAST)
620  // Don't worry about the size constraint.
621  .legalIf(all(isRegisterType(0), isRegisterType(1)))
622  .lower();
623 
624 
625  getActionDefinitionsBuilder(G_CONSTANT)
626  .legalFor({S1, S32, S64, S16, GlobalPtr,
627  LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
628  .legalIf(isPointer(0))
629  .clampScalar(0, S32, S64)
630  .widenScalarToNextPow2(0);
631 
632  getActionDefinitionsBuilder(G_FCONSTANT)
633  .legalFor({S32, S64, S16})
634  .clampScalar(0, S16, S64);
635 
636  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
637  .legalIf(isRegisterType(0))
638  // s1 and s16 are special cases because they have legal operations on
639  // them, but don't really occupy registers in the normal way.
640  .legalFor({S1, S16})
641  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
642  .clampScalarOrElt(0, S32, MaxScalar)
643  .widenScalarToNextPow2(0, 32)
644  .clampMaxNumElements(0, S32, 16);
645 
646  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
647 
648  // If the amount is divergent, we have to do a wave reduction to get the
649  // maximum value, so this is expanded during RegBankSelect.
650  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
651  .legalFor({{PrivatePtr, S32}});
652 
653  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
654  .customIf(typeIsNot(0, PrivatePtr));
655 
656  setAction({G_BLOCK_ADDR, CodePtr}, Legal);
657 
658  auto &FPOpActions = getActionDefinitionsBuilder(
659  { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
660  .legalFor({S32, S64});
661  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
662  .customFor({S32, S64});
663  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
664  .customFor({S32, S64});
665 
666  if (ST.has16BitInsts()) {
667  if (ST.hasVOP3PInsts())
668  FPOpActions.legalFor({S16, V2S16});
669  else
670  FPOpActions.legalFor({S16});
671 
672  TrigActions.customFor({S16});
673  FDIVActions.customFor({S16});
674  }
675 
676  auto &MinNumMaxNum = getActionDefinitionsBuilder({
677  G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
678 
679  if (ST.hasVOP3PInsts()) {
680  MinNumMaxNum.customFor(FPTypesPK16)
681  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
682  .clampMaxNumElements(0, S16, 2)
683  .clampScalar(0, S16, S64)
684  .scalarize(0);
685  } else if (ST.has16BitInsts()) {
686  MinNumMaxNum.customFor(FPTypes16)
687  .clampScalar(0, S16, S64)
688  .scalarize(0);
689  } else {
690  MinNumMaxNum.customFor(FPTypesBase)
691  .clampScalar(0, S32, S64)
692  .scalarize(0);
693  }
694 
695  if (ST.hasVOP3PInsts())
696  FPOpActions.clampMaxNumElements(0, S16, 2);
697 
698  FPOpActions
699  .scalarize(0)
700  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
701 
702  TrigActions
703  .scalarize(0)
704  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
705 
706  FDIVActions
707  .scalarize(0)
708  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
709 
710  getActionDefinitionsBuilder({G_FNEG, G_FABS})
711  .legalFor(FPTypesPK16)
712  .clampMaxNumElements(0, S16, 2)
713  .scalarize(0)
714  .clampScalar(0, S16, S64);
715 
716  if (ST.has16BitInsts()) {
717  getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
718  .legalFor({S32, S64, S16})
719  .scalarize(0)
720  .clampScalar(0, S16, S64);
721  } else {
722  getActionDefinitionsBuilder(G_FSQRT)
723  .legalFor({S32, S64})
724  .scalarize(0)
725  .clampScalar(0, S32, S64);
726 
727  if (ST.hasFractBug()) {
728  getActionDefinitionsBuilder(G_FFLOOR)
729  .customFor({S64})
730  .legalFor({S32, S64})
731  .scalarize(0)
732  .clampScalar(0, S32, S64);
733  } else {
734  getActionDefinitionsBuilder(G_FFLOOR)
735  .legalFor({S32, S64})
736  .scalarize(0)
737  .clampScalar(0, S32, S64);
738  }
739  }
740 
741  getActionDefinitionsBuilder(G_FPTRUNC)
742  .legalFor({{S32, S64}, {S16, S32}})
743  .scalarize(0)
744  .lower();
745 
746  getActionDefinitionsBuilder(G_FPEXT)
747  .legalFor({{S64, S32}, {S32, S16}})
748  .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
749  .scalarize(0);
750 
751  getActionDefinitionsBuilder(G_FSUB)
752  // Use actual fsub instruction
753  .legalFor({S32})
754  // Must use fadd + fneg
755  .lowerFor({S64, S16, V2S16})
756  .scalarize(0)
757  .clampScalar(0, S32, S64);
758 
759  // Whether this is legal depends on the floating point mode for the function.
760  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
761  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
762  FMad.customFor({S32, S16});
763  else if (ST.hasMadMacF32Insts())
764  FMad.customFor({S32});
765  else if (ST.hasMadF16())
766  FMad.customFor({S16});
767  FMad.scalarize(0)
768  .lower();
769 
770  auto &FRem = getActionDefinitionsBuilder(G_FREM);
771  if (ST.has16BitInsts()) {
772  FRem.customFor({S16, S32, S64});
773  } else {
774  FRem.minScalar(0, S32)
775  .customFor({S32, S64});
776  }
777  FRem.scalarize(0);
778 
779  // TODO: Do we need to clamp maximum bitwidth?
780  getActionDefinitionsBuilder(G_TRUNC)
781  .legalIf(isScalar(0))
782  .legalFor({{V2S16, V2S32}})
783  .clampMaxNumElements(0, S16, 2)
784  // Avoid scalarizing in cases that should be truly illegal. In unresolvable
785  // situations (like an invalid implicit use), we don't want to infinite loop
786  // in the legalizer.
787  .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
788  .alwaysLegal();
789 
790  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
791  .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
792  {S32, S1}, {S64, S1}, {S16, S1}})
793  .scalarize(0)
794  .clampScalar(0, S32, S64)
795  .widenScalarToNextPow2(1, 32);
796 
797  // TODO: Split s1->s64 during regbankselect for VALU.
798  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
799  .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
800  .lowerFor({{S32, S64}})
801  .lowerIf(typeIs(1, S1))
802  .customFor({{S64, S64}});
803  if (ST.has16BitInsts())
804  IToFP.legalFor({{S16, S16}});
805  IToFP.clampScalar(1, S32, S64)
806  .minScalar(0, S32)
807  .scalarize(0)
808  .lower();
809 
810  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
811  .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
812  .customFor({{S64, S64}})
813  .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
814  if (ST.has16BitInsts())
815  FPToI.legalFor({{S16, S16}});
816  else
817  FPToI.minScalar(1, S32);
818 
819  FPToI.minScalar(0, S32)
820  .scalarize(0)
821  .lower();
822 
823  // Lower roundeven into G_FRINT
824  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
825  .scalarize(0)
826  .lower();
827 
828  if (ST.has16BitInsts()) {
829  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
830  .legalFor({S16, S32, S64})
831  .clampScalar(0, S16, S64)
832  .scalarize(0);
833  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
834  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
835  .legalFor({S32, S64})
836  .clampScalar(0, S32, S64)
837  .scalarize(0);
838  } else {
839  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
840  .legalFor({S32})
841  .customFor({S64})
842  .clampScalar(0, S32, S64)
843  .scalarize(0);
844  }
845 
846  getActionDefinitionsBuilder(G_PTR_ADD)
847  .legalIf(all(isPointer(0), sameSize(0, 1)))
848  .scalarize(0)
849  .scalarSameSizeAs(1, 0);
850 
851  getActionDefinitionsBuilder(G_PTRMASK)
852  .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
853  .scalarSameSizeAs(1, 0)
854  .scalarize(0);
855 
856  auto &CmpBuilder =
857  getActionDefinitionsBuilder(G_ICMP)
858  // The compare output type differs based on the register bank of the output,
859  // so make both s1 and s32 legal.
860  //
861  // Scalar compares producing output in scc will be promoted to s32, as that
862  // is the allocatable register type that will be needed for the copy from
863  // scc. This will be promoted during RegBankSelect, and we assume something
864  // before that won't try to use s32 result types.
865  //
866  // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
867  // bank.
868  .legalForCartesianProduct(
869  {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
870  .legalForCartesianProduct(
871  {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
872  if (ST.has16BitInsts()) {
873  CmpBuilder.legalFor({{S1, S16}});
874  }
875 
876  CmpBuilder
877  .widenScalarToNextPow2(1)
878  .clampScalar(1, S32, S64)
879  .scalarize(0)
880  .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
881 
882  getActionDefinitionsBuilder(G_FCMP)
883  .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
884  .widenScalarToNextPow2(1)
885  .clampScalar(1, S32, S64)
886  .scalarize(0);
887 
888  // FIXME: fpow has a selection pattern that should move to custom lowering.
889  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
890  if (ST.has16BitInsts())
891  Exp2Ops.legalFor({S32, S16});
892  else
893  Exp2Ops.legalFor({S32});
894  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
895  Exp2Ops.scalarize(0);
896 
897  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
898  if (ST.has16BitInsts())
899  ExpOps.customFor({{S32}, {S16}});
900  else
901  ExpOps.customFor({S32});
902  ExpOps.clampScalar(0, MinScalarFPTy, S32)
903  .scalarize(0);
904 
905  getActionDefinitionsBuilder(G_FPOWI)
906  .clampScalar(0, MinScalarFPTy, S32)
907  .lower();
908 
909  // The 64-bit versions produce 32-bit results, but only on the SALU.
910  getActionDefinitionsBuilder(G_CTPOP)
911  .legalFor({{S32, S32}, {S32, S64}})
912  .clampScalar(0, S32, S32)
913  .clampScalar(1, S32, S64)
914  .scalarize(0)
915  .widenScalarToNextPow2(0, 32)
916  .widenScalarToNextPow2(1, 32);
917 
918  // The hardware instructions return a different result on 0 than the generic
919  // instructions expect. The hardware produces -1, but these produce the
920  // bitwidth.
921  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
922  .scalarize(0)
923  .clampScalar(0, S32, S32)
924  .clampScalar(1, S32, S64)
925  .widenScalarToNextPow2(0, 32)
926  .widenScalarToNextPow2(1, 32)
927  .lower();
928 
929  // The 64-bit versions produce 32-bit results, but only on the SALU.
930  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
931  .legalFor({{S32, S32}, {S32, S64}})
932  .clampScalar(0, S32, S32)
933  .clampScalar(1, S32, S64)
934  .scalarize(0)
935  .widenScalarToNextPow2(0, 32)
936  .widenScalarToNextPow2(1, 32);
937 
938  getActionDefinitionsBuilder(G_BITREVERSE)
939  .legalFor({S32})
940  .clampScalar(0, S32, S32)
941  .scalarize(0);
942 
943  if (ST.has16BitInsts()) {
944  getActionDefinitionsBuilder(G_BSWAP)
945  .legalFor({S16, S32, V2S16})
946  .clampMaxNumElements(0, S16, 2)
947  // FIXME: Fixing non-power-of-2 before clamp is workaround for
948  // narrowScalar limitation.
949  .widenScalarToNextPow2(0)
950  .clampScalar(0, S16, S32)
951  .scalarize(0);
952 
953  if (ST.hasVOP3PInsts()) {
954  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
955  .legalFor({S32, S16, V2S16})
956  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
957  .clampMaxNumElements(0, S16, 2)
958  .minScalar(0, S16)
959  .widenScalarToNextPow2(0)
960  .scalarize(0)
961  .lower();
962  } else {
963  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
964  .legalFor({S32, S16})
965  .widenScalarToNextPow2(0)
966  .minScalar(0, S16)
967  .scalarize(0)
968  .lower();
969  }
970  } else {
971  // TODO: Should have same legality without v_perm_b32
972  getActionDefinitionsBuilder(G_BSWAP)
973  .legalFor({S32})
974  .lowerIf(scalarNarrowerThan(0, 32))
975  // FIXME: Fixing non-power-of-2 before clamp is workaround for
976  // narrowScalar limitation.
977  .widenScalarToNextPow2(0)
978  .maxScalar(0, S32)
979  .scalarize(0)
980  .lower();
981 
982  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
983  .legalFor({S32})
984  .minScalar(0, S32)
985  .widenScalarToNextPow2(0)
986  .scalarize(0)
987  .lower();
988  }
989 
990  getActionDefinitionsBuilder(G_INTTOPTR)
991  // List the common cases
992  .legalForCartesianProduct(AddrSpaces64, {S64})
993  .legalForCartesianProduct(AddrSpaces32, {S32})
994  .scalarize(0)
995  // Accept any address space as long as the size matches
996  .legalIf(sameSize(0, 1))
997  .widenScalarIf(smallerThan(1, 0),
998  [](const LegalityQuery &Query) {
999  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1000  })
1001  .narrowScalarIf(largerThan(1, 0),
1002  [](const LegalityQuery &Query) {
1003  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1004  });
1005 
1006  getActionDefinitionsBuilder(G_PTRTOINT)
1007  // List the common cases
1008  .legalForCartesianProduct(AddrSpaces64, {S64})
1009  .legalForCartesianProduct(AddrSpaces32, {S32})
1010  .scalarize(0)
1011  // Accept any address space as long as the size matches
1012  .legalIf(sameSize(0, 1))
1013  .widenScalarIf(smallerThan(0, 1),
1014  [](const LegalityQuery &Query) {
1015  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1016  })
1017  .narrowScalarIf(
1018  largerThan(0, 1),
1019  [](const LegalityQuery &Query) {
1020  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1021  });
1022 
1023  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1024  .scalarize(0)
1025  .custom();
1026 
1027  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1028  bool IsLoad) -> bool {
1029  const LLT DstTy = Query.Types[0];
1030 
1031  // Split vector extloads.
1032  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1033  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1034 
1035  if (MemSize < DstTy.getSizeInBits())
1036  MemSize = std::max(MemSize, AlignBits);
1037 
1038  if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1039  return true;
1040 
1041  const LLT PtrTy = Query.Types[1];
1042  unsigned AS = PtrTy.getAddressSpace();
1043  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1044  return true;
1045 
1046  // Catch weird sized loads that don't evenly divide into the access sizes
1047  // TODO: May be able to widen depending on alignment etc.
1048  unsigned NumRegs = (MemSize + 31) / 32;
1049  if (NumRegs == 3) {
1050  if (!ST.hasDwordx3LoadStores())
1051  return true;
1052  } else {
1053  // If the alignment allows, these should have been widened.
1054  if (!isPowerOf2_32(NumRegs))
1055  return true;
1056  }
1057 
1058  if (AlignBits < MemSize) {
1059  const SITargetLowering *TLI = ST.getTargetLowering();
1060  return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1061  Align(AlignBits / 8));
1062  }
1063 
1064  return false;
1065  };
1066 
1067  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1068  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1069  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1070 
1071  // TODO: Refine based on subtargets which support unaligned access or 128-bit
1072  // LDS
1073  // TODO: Unsupported flat for SI.
1074 
1075  for (unsigned Op : {G_LOAD, G_STORE}) {
1076  const bool IsStore = Op == G_STORE;
1077 
1078  auto &Actions = getActionDefinitionsBuilder(Op);
1079  // Explicitly list some common cases.
1080  // TODO: Does this help compile time at all?
1081  Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1082  {V2S32, GlobalPtr, 64, GlobalAlign32},
1083  {V4S32, GlobalPtr, 128, GlobalAlign32},
1084  {S64, GlobalPtr, 64, GlobalAlign32},
1085  {V2S64, GlobalPtr, 128, GlobalAlign32},
1086  {V2S16, GlobalPtr, 32, GlobalAlign32},
1087  {S32, GlobalPtr, 8, GlobalAlign8},
1088  {S32, GlobalPtr, 16, GlobalAlign16},
1089 
1090  {S32, LocalPtr, 32, 32},
1091  {S64, LocalPtr, 64, 32},
1092  {V2S32, LocalPtr, 64, 32},
1093  {S32, LocalPtr, 8, 8},
1094  {S32, LocalPtr, 16, 16},
1095  {V2S16, LocalPtr, 32, 32},
1096 
1097  {S32, PrivatePtr, 32, 32},
1098  {S32, PrivatePtr, 8, 8},
1099  {S32, PrivatePtr, 16, 16},
1100  {V2S16, PrivatePtr, 32, 32},
1101 
1102  {S32, ConstantPtr, 32, GlobalAlign32},
1103  {V2S32, ConstantPtr, 64, GlobalAlign32},
1104  {V4S32, ConstantPtr, 128, GlobalAlign32},
1105  {S64, ConstantPtr, 64, GlobalAlign32},
1106  {V2S32, ConstantPtr, 32, GlobalAlign32}});
1107  Actions.legalIf(
1108  [=](const LegalityQuery &Query) -> bool {
1109  return isLoadStoreLegal(ST, Query, Op);
1110  });
1111 
1112  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1113  // 64-bits.
1114  //
1115  // TODO: Should generalize bitcast action into coerce, which will also cover
1116  // inserting addrspacecasts.
1117  Actions.customIf(typeIs(1, Constant32Ptr));
1118 
1119  // Turn any illegal element vectors into something easier to deal
1120  // with. These will ultimately produce 32-bit scalar shifts to extract the
1121  // parts anyway.
1122  //
1123  // For odd 16-bit element vectors, prefer to split those into pieces with
1124  // 16-bit vector parts.
1125  Actions.bitcastIf(
1126  [=](const LegalityQuery &Query) -> bool {
1127  return shouldBitcastLoadStoreType(ST, Query.Types[0],
1128  Query.MMODescrs[0].SizeInBits);
1129  }, bitcastToRegisterType(0));
1130 
1131  if (!IsStore) {
1132  // Widen suitably aligned loads by loading extra bytes. The standard
1133  // legalization actions can't properly express widening memory operands.
1134  Actions.customIf([=](const LegalityQuery &Query) -> bool {
1135  return shouldWidenLoad(ST, Query, G_LOAD);
1136  });
1137  }
1138 
1139  // FIXME: load/store narrowing should be moved to lower action
1140  Actions
1141  .narrowScalarIf(
1142  [=](const LegalityQuery &Query) -> bool {
1143  return !Query.Types[0].isVector() &&
1144  needToSplitMemOp(Query, Op == G_LOAD);
1145  },
1146  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1147  const LLT DstTy = Query.Types[0];
1148  const LLT PtrTy = Query.Types[1];
1149 
1150  const unsigned DstSize = DstTy.getSizeInBits();
1151  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1152 
1153  // Split extloads.
1154  if (DstSize > MemSize)
1155  return std::make_pair(0, LLT::scalar(MemSize));
1156 
1157  if (!isPowerOf2_32(DstSize)) {
1158  // We're probably decomposing an odd sized store. Try to split
1159  // to the widest type. TODO: Account for alignment. As-is it
1160  // should be OK, since the new parts will be further legalized.
1161  unsigned FloorSize = PowerOf2Floor(DstSize);
1162  return std::make_pair(0, LLT::scalar(FloorSize));
1163  }
1164 
1165  if (DstSize > 32 && (DstSize % 32 != 0)) {
1166  // FIXME: Need a way to specify non-extload of larger size if
1167  // suitably aligned.
1168  return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1169  }
1170 
1171  unsigned MaxSize = maxSizeForAddrSpace(ST,
1172  PtrTy.getAddressSpace(),
1173  Op == G_LOAD);
1174  if (MemSize > MaxSize)
1175  return std::make_pair(0, LLT::scalar(MaxSize));
1176 
1177  unsigned Align = Query.MMODescrs[0].AlignInBits;
1178  return std::make_pair(0, LLT::scalar(Align));
1179  })
1180  .fewerElementsIf(
1181  [=](const LegalityQuery &Query) -> bool {
1182  return Query.Types[0].isVector() &&
1183  needToSplitMemOp(Query, Op == G_LOAD);
1184  },
1185  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1186  const LLT DstTy = Query.Types[0];
1187  const LLT PtrTy = Query.Types[1];
1188 
1189  LLT EltTy = DstTy.getElementType();
1190  unsigned MaxSize = maxSizeForAddrSpace(ST,
1191  PtrTy.getAddressSpace(),
1192  Op == G_LOAD);
1193 
1194  // FIXME: Handle widened to power of 2 results better. This ends
1195  // up scalarizing.
1196  // FIXME: 3 element stores scalarized on SI
1197 
1198  // Split if it's too large for the address space.
1199  if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1200  unsigned NumElts = DstTy.getNumElements();
1201  unsigned EltSize = EltTy.getSizeInBits();
1202 
1203  if (MaxSize % EltSize == 0) {
1204  return std::make_pair(
1205  0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1206  }
1207 
1208  unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1209 
1210  // FIXME: Refine when odd breakdowns handled
1211  // The scalars will need to be re-legalized.
1212  if (NumPieces == 1 || NumPieces >= NumElts ||
1213  NumElts % NumPieces != 0)
1214  return std::make_pair(0, EltTy);
1215 
1216  return std::make_pair(0,
1217  LLT::vector(NumElts / NumPieces, EltTy));
1218  }
1219 
1220  // FIXME: We could probably handle weird extending loads better.
1221  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1222  if (DstTy.getSizeInBits() > MemSize)
1223  return std::make_pair(0, EltTy);
1224 
1225  unsigned EltSize = EltTy.getSizeInBits();
1226  unsigned DstSize = DstTy.getSizeInBits();
1227  if (!isPowerOf2_32(DstSize)) {
1228  // We're probably decomposing an odd sized store. Try to split
1229  // to the widest type. TODO: Account for alignment. As-is it
1230  // should be OK, since the new parts will be further legalized.
1231  unsigned FloorSize = PowerOf2Floor(DstSize);
1232  return std::make_pair(
1233  0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1234  }
1235 
1236  // Need to split because of alignment.
1237  unsigned Align = Query.MMODescrs[0].AlignInBits;
1238  if (EltSize > Align &&
1239  (EltSize / Align < DstTy.getNumElements())) {
1240  return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1241  }
1242 
1243  // May need relegalization for the scalars.
1244  return std::make_pair(0, EltTy);
1245  })
1246  .lowerIfMemSizeNotPow2()
1247  .minScalar(0, S32);
1248 
1249  if (IsStore)
1250  Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1251 
1252  Actions
1253  .widenScalarToNextPow2(0)
1254  .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1255  .lower();
1256  }
1257 
1258  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1259  .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1260  {S32, GlobalPtr, 16, 2 * 8},
1261  {S32, LocalPtr, 8, 8},
1262  {S32, LocalPtr, 16, 16},
1263  {S32, PrivatePtr, 8, 8},
1264  {S32, PrivatePtr, 16, 16},
1265  {S32, ConstantPtr, 8, 8},
1266  {S32, ConstantPtr, 16, 2 * 8}});
1267  if (ST.hasFlatAddressSpace()) {
1268  ExtLoads.legalForTypesWithMemDesc(
1269  {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1270  }
1271 
1272  ExtLoads.clampScalar(0, S32, S32)
1273  .widenScalarToNextPow2(0)
1274  .unsupportedIfMemSizeNotPow2()
1275  .lower();
1276 
1277  auto &Atomics = getActionDefinitionsBuilder(
1278  {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1279  G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1280  G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1281  G_ATOMICRMW_UMIN})
1282  .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1283  {S64, GlobalPtr}, {S64, LocalPtr},
1284  {S32, RegionPtr}, {S64, RegionPtr}});
1285  if (ST.hasFlatAddressSpace()) {
1286  Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1287  }
1288 
1289  if (ST.hasLDSFPAtomics()) {
1290  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1291  .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1292  }
1293 
1294  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1295  // demarshalling
1296  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1297  .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1298  {S32, FlatPtr}, {S64, FlatPtr}})
1299  .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1300  {S32, RegionPtr}, {S64, RegionPtr}});
1301  // TODO: Pointer types, any 32-bit or 64-bit vector
1302 
1303  // Condition should be s32 for scalar, s1 for vector.
1304  getActionDefinitionsBuilder(G_SELECT)
1305  .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1306  GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1307  LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1308  .clampScalar(0, S16, S64)
1309  .scalarize(1)
1310  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1311  .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1312  .clampMaxNumElements(0, S32, 2)
1313  .clampMaxNumElements(0, LocalPtr, 2)
1314  .clampMaxNumElements(0, PrivatePtr, 2)
1315  .scalarize(0)
1316  .widenScalarToNextPow2(0)
1317  .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1318 
1319  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1320  // be more flexible with the shift amount type.
1321  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1322  .legalFor({{S32, S32}, {S64, S32}});
1323  if (ST.has16BitInsts()) {
1324  if (ST.hasVOP3PInsts()) {
1325  Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1326  .clampMaxNumElements(0, S16, 2);
1327  } else
1328  Shifts.legalFor({{S16, S16}});
1329 
1330  // TODO: Support 16-bit shift amounts for all types
1331  Shifts.widenScalarIf(
1332  [=](const LegalityQuery &Query) {
1333  // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1334  // 32-bit amount.
1335  const LLT ValTy = Query.Types[0];
1336  const LLT AmountTy = Query.Types[1];
1337  return ValTy.getSizeInBits() <= 16 &&
1338  AmountTy.getSizeInBits() < 16;
1339  }, changeTo(1, S16));
1340  Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1341  Shifts.clampScalar(1, S32, S32);
1342  Shifts.clampScalar(0, S16, S64);
1343  Shifts.widenScalarToNextPow2(0, 16);
1344 
1345  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1346  .minScalar(0, S16)
1347  .scalarize(0)
1348  .lower();
1349  } else {
1350  // Make sure we legalize the shift amount type first, as the general
1351  // expansion for the shifted type will produce much worse code if it hasn't
1352  // been truncated already.
1353  Shifts.clampScalar(1, S32, S32);
1354  Shifts.clampScalar(0, S32, S64);
1355  Shifts.widenScalarToNextPow2(0, 32);
1356 
1357  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1358  .minScalar(0, S32)
1359  .scalarize(0)
1360  .lower();
1361  }
1362  Shifts.scalarize(0);
1363 
1364  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1365  unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1366  unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1367  unsigned IdxTypeIdx = 2;
1368 
1369  getActionDefinitionsBuilder(Op)
1370  .customIf([=](const LegalityQuery &Query) {
1371  const LLT EltTy = Query.Types[EltTypeIdx];
1372  const LLT VecTy = Query.Types[VecTypeIdx];
1373  const LLT IdxTy = Query.Types[IdxTypeIdx];
1374  const unsigned EltSize = EltTy.getSizeInBits();
1375  return (EltSize == 32 || EltSize == 64) &&
1376  VecTy.getSizeInBits() % 32 == 0 &&
1377  VecTy.getSizeInBits() <= MaxRegisterSize &&
1378  IdxTy.getSizeInBits() == 32;
1379  })
1380  .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1381  bitcastToVectorElement32(VecTypeIdx))
1382  //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1383  .bitcastIf(
1384  all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1385  [=](const LegalityQuery &Query) {
1386  // For > 64-bit element types, try to turn this into a 64-bit
1387  // element vector since we may be able to do better indexing
1388  // if this is scalar. If not, fall back to 32.
1389  const LLT EltTy = Query.Types[EltTypeIdx];
1390  const LLT VecTy = Query.Types[VecTypeIdx];
1391  const unsigned DstEltSize = EltTy.getSizeInBits();
1392  const unsigned VecSize = VecTy.getSizeInBits();
1393 
1394  const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1395  return std::make_pair(
1396  VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1397  })
1398  .clampScalar(EltTypeIdx, S32, S64)
1399  .clampScalar(VecTypeIdx, S32, S64)
1400  .clampScalar(IdxTypeIdx, S32, S32)
1401  .clampMaxNumElements(VecTypeIdx, S32, 32)
1402  // TODO: Clamp elements for 64-bit vectors?
1403  // It should only be necessary with variable indexes.
1404  // As a last resort, lower to the stack
1405  .lower();
1406  }
1407 
1408  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1409  .unsupportedIf([=](const LegalityQuery &Query) {
1410  const LLT &EltTy = Query.Types[1].getElementType();
1411  return Query.Types[0] != EltTy;
1412  });
1413 
1414  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1415  unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1416  unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1417 
1418  // FIXME: Doesn't handle extract of illegal sizes.
1419  getActionDefinitionsBuilder(Op)
1420  .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1421  // FIXME: Multiples of 16 should not be legal.
1422  .legalIf([=](const LegalityQuery &Query) {
1423  const LLT BigTy = Query.Types[BigTyIdx];
1424  const LLT LitTy = Query.Types[LitTyIdx];
1425  return (BigTy.getSizeInBits() % 32 == 0) &&
1426  (LitTy.getSizeInBits() % 16 == 0);
1427  })
1428  .widenScalarIf(
1429  [=](const LegalityQuery &Query) {
1430  const LLT BigTy = Query.Types[BigTyIdx];
1431  return (BigTy.getScalarSizeInBits() < 16);
1432  },
1433  LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1434  .widenScalarIf(
1435  [=](const LegalityQuery &Query) {
1436  const LLT LitTy = Query.Types[LitTyIdx];
1437  return (LitTy.getScalarSizeInBits() < 16);
1438  },
1440  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1441  .widenScalarToNextPow2(BigTyIdx, 32);
1442 
1443  }
1444 
1445  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1446  .legalForCartesianProduct(AllS32Vectors, {S32})
1447  .legalForCartesianProduct(AllS64Vectors, {S64})
1448  .clampNumElements(0, V16S32, V32S32)
1449  .clampNumElements(0, V2S64, V16S64)
1450  .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1451 
1452  if (ST.hasScalarPackInsts()) {
1453  BuildVector
1454  // FIXME: Should probably widen s1 vectors straight to s32
1455  .minScalarOrElt(0, S16)
1456  // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1457  .minScalar(1, S32);
1458 
1459  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1460  .legalFor({V2S16, S32})
1461  .lower();
1462  BuildVector.minScalarOrElt(0, S32);
1463  } else {
1464  BuildVector.customFor({V2S16, S16});
1465  BuildVector.minScalarOrElt(0, S32);
1466 
1467  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1468  .customFor({V2S16, S32})
1469  .lower();
1470  }
1471 
1472  BuildVector.legalIf(isRegisterType(0));
1473 
1474  // FIXME: Clamp maximum size
1475  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1476  .legalIf(all(isRegisterType(0), isRegisterType(1)))
1477  .clampMaxNumElements(0, S32, 32)
1478  .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1479  .clampMaxNumElements(0, S16, 64);
1480 
1481  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1482  // pre-legalize.
1483  if (ST.hasVOP3PInsts()) {
1484  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1485  .customFor({V2S16, V2S16})
1486  .lower();
1487  } else
1488  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1489 
1490  // Merge/Unmerge
1491  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1492  unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1493  unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1494 
1495  auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1496  const LLT Ty = Query.Types[TypeIdx];
1497  if (Ty.isVector()) {
1498  const LLT &EltTy = Ty.getElementType();
1499  if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1500  return true;
1501  if (!isPowerOf2_32(EltTy.getSizeInBits()))
1502  return true;
1503  }
1504  return false;
1505  };
1506 
1507  auto &Builder = getActionDefinitionsBuilder(Op)
1508  .legalIf(all(isRegisterType(0), isRegisterType(1)))
1509  .lowerFor({{S16, V2S16}})
1510  .lowerIf([=](const LegalityQuery &Query) {
1511  const LLT BigTy = Query.Types[BigTyIdx];
1512  return BigTy.getSizeInBits() == 32;
1513  })
1514  // Try to widen to s16 first for small types.
1515  // TODO: Only do this on targets with legal s16 shifts
1516  .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1517  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1518  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1519  .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1520  elementTypeIs(1, S16)),
1521  changeTo(1, V2S16))
1522  // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1523  // worth considering the multiples of 64 since 2*192 and 2*384 are not
1524  // valid.
1525  .clampScalar(LitTyIdx, S32, S512)
1526  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1527  // Break up vectors with weird elements into scalars
1528  .fewerElementsIf(
1529  [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1530  scalarize(0))
1531  .fewerElementsIf(
1532  [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1533  scalarize(1))
1534  .clampScalar(BigTyIdx, S32, MaxScalar);
1535 
1536  if (Op == G_MERGE_VALUES) {
1537  Builder.widenScalarIf(
1538  // TODO: Use 16-bit shifts if legal for 8-bit values?
1539  [=](const LegalityQuery &Query) {
1540  const LLT Ty = Query.Types[LitTyIdx];
1541  return Ty.getSizeInBits() < 32;
1542  },
1543  changeTo(LitTyIdx, S32));
1544  }
1545 
1546  Builder.widenScalarIf(
1547  [=](const LegalityQuery &Query) {
1548  const LLT Ty = Query.Types[BigTyIdx];
1549  return !isPowerOf2_32(Ty.getSizeInBits()) &&
1550  Ty.getSizeInBits() % 16 != 0;
1551  },
1552  [=](const LegalityQuery &Query) {
1553  // Pick the next power of 2, or a multiple of 64 over 128.
1554  // Whichever is smaller.
1555  const LLT &Ty = Query.Types[BigTyIdx];
1556  unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1557  if (NewSizeInBits >= 256) {
1558  unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1559  if (RoundedTo < NewSizeInBits)
1560  NewSizeInBits = RoundedTo;
1561  }
1562  return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1563  })
1564  // Any vectors left are the wrong size. Scalarize them.
1565  .scalarize(0)
1566  .scalarize(1);
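// Worked example for the mutation above: a 40-bit big type is widened to the
// next power of two, s64; a 300-bit type is widened to s320 (the next
// multiple of 64), since the next power of two (512) would be larger.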
1567  }
1568 
1569  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1570  // RegBankSelect.
1571  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1572  .legalFor({{S32}, {S64}});
1573 
1574  if (ST.hasVOP3PInsts()) {
1575  SextInReg.lowerFor({{V2S16}})
1576  // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1577  // get more vector shift opportunities, since we'll get those when
1578  // expanded.
1579  .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1580  } else if (ST.has16BitInsts()) {
1581  SextInReg.lowerFor({{S32}, {S64}, {S16}});
1582  } else {
1583  // Prefer to promote to s32 before lowering if we don't have 16-bit
1584  // shifts. This avoid a lot of intermediate truncate and extend operations.
1585  SextInReg.lowerFor({{S32}, {S64}});
1586  }
1587 
1588  SextInReg
1589  .scalarize(0)
1590  .clampScalar(0, S32, S64)
1591  .lower();
1592 
1593  getActionDefinitionsBuilder(G_FSHR)
1594  .legalFor({{S32, S32}})
1595  .scalarize(0)
1596  .lower();
1597 
1598  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1599  .legalFor({S64});
1600 
1601  getActionDefinitionsBuilder(G_FENCE)
1602  .alwaysLegal();
1603 
1604  getActionDefinitionsBuilder({
1605  // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1606  G_FCOPYSIGN,
1607 
1608  G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1609  G_ATOMICRMW_NAND,
1610  G_ATOMICRMW_FSUB,
1611  G_READ_REGISTER,
1612  G_WRITE_REGISTER,
1613 
1614  G_SADDO, G_SSUBO,
1615 
1616  // TODO: Implement
1617  G_FMINIMUM, G_FMAXIMUM,
1618  G_FSHL
1619  }).lower();
1620 
1621  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1622  G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1623  G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1624  .unsupported();
1625 
1626  computeTables();
1627  verify(*ST.getInstrInfo());
1628 }
1629 
1630 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1631  MachineInstr &MI) const {
1632  MachineIRBuilder &B = Helper.MIRBuilder;
1633  MachineRegisterInfo &MRI = *B.getMRI();
1634 
1635  switch (MI.getOpcode()) {
1636  case TargetOpcode::G_ADDRSPACE_CAST:
1637  return legalizeAddrSpaceCast(MI, MRI, B);
1638  case TargetOpcode::G_FRINT:
1639  return legalizeFrint(MI, MRI, B);
1640  case TargetOpcode::G_FCEIL:
1641  return legalizeFceil(MI, MRI, B);
1642  case TargetOpcode::G_FREM:
1643  return legalizeFrem(MI, MRI, B);
1644  case TargetOpcode::G_INTRINSIC_TRUNC:
1645  return legalizeIntrinsicTrunc(MI, MRI, B);
1646  case TargetOpcode::G_SITOFP:
1647  return legalizeITOFP(MI, MRI, B, true);
1648  case TargetOpcode::G_UITOFP:
1649  return legalizeITOFP(MI, MRI, B, false);
1650  case TargetOpcode::G_FPTOSI:
1651  return legalizeFPTOI(MI, MRI, B, true);
1652  case TargetOpcode::G_FPTOUI:
1653  return legalizeFPTOI(MI, MRI, B, false);
1654  case TargetOpcode::G_FMINNUM:
1655  case TargetOpcode::G_FMAXNUM:
1656  case TargetOpcode::G_FMINNUM_IEEE:
1657  case TargetOpcode::G_FMAXNUM_IEEE:
1658  return legalizeMinNumMaxNum(Helper, MI);
1659  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1660  return legalizeExtractVectorElt(MI, MRI, B);
1661  case TargetOpcode::G_INSERT_VECTOR_ELT:
1662  return legalizeInsertVectorElt(MI, MRI, B);
1663  case TargetOpcode::G_SHUFFLE_VECTOR:
1664  return legalizeShuffleVector(MI, MRI, B);
1665  case TargetOpcode::G_FSIN:
1666  case TargetOpcode::G_FCOS:
1667  return legalizeSinCos(MI, MRI, B);
1668  case TargetOpcode::G_GLOBAL_VALUE:
1669  return legalizeGlobalValue(MI, MRI, B);
1670  case TargetOpcode::G_LOAD:
1671  return legalizeLoad(Helper, MI);
1672  case TargetOpcode::G_FMAD:
1673  return legalizeFMad(MI, MRI, B);
1674  case TargetOpcode::G_FDIV:
1675  return legalizeFDIV(MI, MRI, B);
1676  case TargetOpcode::G_UDIV:
1677  case TargetOpcode::G_UREM:
1678  return legalizeUDIV_UREM(MI, MRI, B);
1679  case TargetOpcode::G_SDIV:
1680  case TargetOpcode::G_SREM:
1681  return legalizeSDIV_SREM(MI, MRI, B);
1682  case TargetOpcode::G_ATOMIC_CMPXCHG:
1683  return legalizeAtomicCmpXChg(MI, MRI, B);
1684  case TargetOpcode::G_FLOG:
1685  return legalizeFlog(MI, B, numbers::ln2f);
1686  case TargetOpcode::G_FLOG10:
1687  return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1688  case TargetOpcode::G_FEXP:
1689  return legalizeFExp(MI, B);
1690  case TargetOpcode::G_FPOW:
1691  return legalizeFPow(MI, B);
1692  case TargetOpcode::G_FFLOOR:
1693  return legalizeFFloor(MI, MRI, B);
1694  case TargetOpcode::G_BUILD_VECTOR:
1695  return legalizeBuildVector(MI, MRI, B);
1696  default:
1697  return false;
1698  }
1699 
1700  llvm_unreachable("expected switch to return");
1701 }
1702 
1703 Register AMDGPULegalizerInfo::getSegmentAperture(
1704  unsigned AS,
1705  MachineRegisterInfo &MRI,
1706  MachineIRBuilder &B) const {
1707  MachineFunction &MF = B.getMF();
1708  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1709  const LLT S32 = LLT::scalar(32);
1710 
1711  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1712 
1713  if (ST.hasApertureRegs()) {
1714  // FIXME: Use inline constants (src_{shared, private}_base) instead of
1715  // getreg.
1716  unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1717  AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1718  AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1719  unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1720  AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1721  AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1722  unsigned Encoding =
1723  AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1724  Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1725  WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1726 
1727  Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1728 
1729  B.buildInstr(AMDGPU::S_GETREG_B32)
1730  .addDef(GetReg)
1731  .addImm(Encoding);
1732  MRI.setType(GetReg, S32);
1733 
1734  auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1735  return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1736  }
1737 
1738  Register QueuePtr = MRI.createGenericVirtualRegister(
1739  LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1740 
1741  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1742  return Register();
1743 
1744  // Offset into amd_queue_t for group_segment_aperture_base_hi /
1745  // private_segment_aperture_base_hi.
1746  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1747 
1748  // TODO: can we be smarter about machine pointer info?
1749  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1750  MachineMemOperand *MMO = MF.getMachineMemOperand(
1751  PtrInfo,
1752  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1753  MachineMemOperand::MOInvariant,
1754  4, commonAlignment(Align(64), StructOffset));
1755 
1756  Register LoadAddr;
1757 
1758  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1759  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1760 }
1761 
1762 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1763  MachineInstr &MI, MachineRegisterInfo &MRI,
1764  MachineIRBuilder &B) const {
1765  MachineFunction &MF = B.getMF();
1766 
1767  const LLT S32 = LLT::scalar(32);
1768  Register Dst = MI.getOperand(0).getReg();
1769  Register Src = MI.getOperand(1).getReg();
1770 
1771  LLT DstTy = MRI.getType(Dst);
1772  LLT SrcTy = MRI.getType(Src);
1773  unsigned DestAS = DstTy.getAddressSpace();
1774  unsigned SrcAS = SrcTy.getAddressSpace();
1775 
1776  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1777  // vector element.
1778  assert(!DstTy.isVector());
1779 
1780  const AMDGPUTargetMachine &TM
1781  = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1782 
1783  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1784  MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1785  return true;
1786  }
1787 
1788  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1789  // Truncate.
1790  B.buildExtract(Dst, Src, 0);
1791  MI.eraseFromParent();
1792  return true;
1793  }
1794 
1795  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1796  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1797  uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1798 
1799  // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1800  // another. Merge operands are required to be the same type, but creating an
1801  // extra ptrtoint would be kind of pointless.
1802  auto HighAddr = B.buildConstant(
1803  LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1804  B.buildMerge(Dst, {Src, HighAddr});
1805  MI.eraseFromParent();
1806  return true;
1807  }
1808 
1809  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1810  assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1811  DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1812  unsigned NullVal = TM.getNullPointerValue(DestAS);
1813 
1814  auto SegmentNull = B.buildConstant(DstTy, NullVal);
1815  auto FlatNull = B.buildConstant(SrcTy, 0);
1816 
1817  // Extract low 32-bits of the pointer.
1818  auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1819 
1820  auto CmpRes =
1821  B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1822  B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1823 
1824  MI.eraseFromParent();
1825  return true;
1826  }
1827 
1828  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1829  return false;
1830 
1831  if (!ST.hasFlatAddressSpace())
1832  return false;
1833 
1834  auto SegmentNull =
1835  B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1836  auto FlatNull =
1837  B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1838 
1839  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1840  if (!ApertureReg.isValid())
1841  return false;
1842 
1843  auto CmpRes =
1844  B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1845 
1846  // Coerce the type of the low half of the result so we can use merge_values.
1847  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1848 
1849  // TODO: Should we allow mismatched types but matching sizes in merges to
1850  // avoid the ptrtoint?
1851  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1852  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1853 
1854  MI.eraseFromParent();
1855  return true;
1856 }
1857 
1858 bool AMDGPULegalizerInfo::legalizeFrint(
1859  MachineInstr &MI, MachineRegisterInfo &MRI,
1860  MachineIRBuilder &B) const {
1861  Register Src = MI.getOperand(1).getReg();
1862  LLT Ty = MRI.getType(Src);
1863  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1864 
1865  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1866  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1867 
1868  auto C1 = B.buildFConstant(Ty, C1Val);
1869  auto CopySign = B.buildFCopysign(Ty, C1, Src);
1870 
1871  // TODO: Should this propagate fast-math-flags?
1872  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1873  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1874 
1875  auto C2 = B.buildFConstant(Ty, C2Val);
1876  auto Fabs = B.buildFAbs(Ty, Src);
1877 
1878  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1879  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1880  MI.eraseFromParent();
1881  return true;
1882 }
1883 
1884 bool AMDGPULegalizerInfo::legalizeFceil(
1885  MachineInstr &MI, MachineRegisterInfo &MRI,
1886  MachineIRBuilder &B) const {
1887 
1888  const LLT S1 = LLT::scalar(1);
1889  const LLT S64 = LLT::scalar(64);
1890 
1891  Register Src = MI.getOperand(1).getReg();
1892  assert(MRI.getType(Src) == S64);
1893 
1894  // result = trunc(src)
1895  // if (src > 0.0 && src != result)
1896  // result += 1.0
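  // For example, with src = 2.3: trunc = 2.0, and since 2.3 > 0.0 and
  // 2.3 != 2.0 the select below adds 1.0, giving 3.0. With src = -2.3:
  // trunc = -2.0 and the sign test fails, so the result stays -2.0, as
  // expected for ceil.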
1897 
1898  auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1899 
1900  const auto Zero = B.buildFConstant(S64, 0.0);
1901  const auto One = B.buildFConstant(S64, 1.0);
1902  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1903  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1904  auto And = B.buildAnd(S1, Lt0, NeTrunc);
1905  auto Add = B.buildSelect(S64, And, One, Zero);
1906 
1907  // TODO: Should this propagate fast-math-flags?
1908  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1909  return true;
1910 }
1911 
1912 bool AMDGPULegalizerInfo::legalizeFrem(
1913  MachineInstr &MI, MachineRegisterInfo &MRI,
1914  MachineIRBuilder &B) const {
1915  Register DstReg = MI.getOperand(0).getReg();
1916  Register Src0Reg = MI.getOperand(1).getReg();
1917  Register Src1Reg = MI.getOperand(2).getReg();
1918  auto Flags = MI.getFlags();
1919  LLT Ty = MRI.getType(DstReg);
1920 
1921  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1922  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1923  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1924  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1925  MI.eraseFromParent();
1926  return true;
1927 }
1928 
1929 static MachineInstrBuilder extractF64Exponent(Register Hi,
1930  MachineIRBuilder &B) {
1931  const unsigned FractBits = 52;
1932  const unsigned ExpBits = 11;
1933  LLT S32 = LLT::scalar(32);
1934 
1935  auto Const0 = B.buildConstant(S32, FractBits - 32);
1936  auto Const1 = B.buildConstant(S32, ExpBits);
1937 
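  // In an IEEE-754 double the 11-bit biased exponent occupies bits 52..62,
  // i.e. bits 20..30 of the high 32-bit word. The ubfe below extracts that
  // field (offset FractBits - 32 = 20, width ExpBits = 11), and the final
  // subtraction of 1023 removes the exponent bias.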
1938  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1939  .addUse(Hi)
1940  .addUse(Const0.getReg(0))
1941  .addUse(Const1.getReg(0));
1942 
1943  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1944 }
1945 
1946 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1947  MachineInstr &MI, MachineRegisterInfo &MRI,
1948  MachineIRBuilder &B) const {
1949  const LLT S1 = LLT::scalar(1);
1950  const LLT S32 = LLT::scalar(32);
1951  const LLT S64 = LLT::scalar(64);
1952 
1953  Register Src = MI.getOperand(1).getReg();
1954  assert(MRI.getType(Src) == S64);
1955 
1956  // TODO: Should this use extract since the low half is unused?
1957  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1958  Register Hi = Unmerge.getReg(1);
1959 
1960  // Extract the upper half, since this is where we will find the sign and
1961  // exponent.
1962  auto Exp = extractF64Exponent(Hi, B);
1963 
1964  const unsigned FractBits = 52;
1965 
1966  // Extract the sign bit.
1967  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1968  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1969 
1970  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1971 
1972  const auto Zero32 = B.buildConstant(S32, 0);
1973 
1974  // Extend back to 64-bits.
1975  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1976 
1977  auto Shr = B.buildAShr(S64, FractMask, Exp);
1978  auto Not = B.buildNot(S64, Shr);
1979  auto Tmp0 = B.buildAnd(S64, Src, Not);
1980  auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1981 
1982  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1983  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1984 
1985  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1986  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1987  MI.eraseFromParent();
1988  return true;
1989 }
1990 
1991 bool AMDGPULegalizerInfo::legalizeITOFP(
1992  MachineInstr &MI, MachineRegisterInfo &MRI,
1993  MachineIRBuilder &B, bool Signed) const {
1994 
1995  Register Dst = MI.getOperand(0).getReg();
1996  Register Src = MI.getOperand(1).getReg();
1997 
1998  const LLT S64 = LLT::scalar(64);
1999  const LLT S32 = LLT::scalar(32);
2000 
2001  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2002 
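  // The i64 source is converted in two 32-bit halves and recombined as
  //   result = ldexp(fp(hi), 32) + uitofp(lo)
  // Only the high half carries the sign, so it uses sitofp in the signed
  // case while the low half is always treated as unsigned.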
2003  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2004 
2005  auto CvtHi = Signed ?
2006  B.buildSITOFP(S64, Unmerge.getReg(1)) :
2007  B.buildUITOFP(S64, Unmerge.getReg(1));
2008 
2009  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2010 
2011  auto ThirtyTwo = B.buildConstant(S32, 32);
2012  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2013  .addUse(CvtHi.getReg(0))
2014  .addUse(ThirtyTwo.getReg(0));
2015 
2016  // TODO: Should this propagate fast-math-flags?
2017  B.buildFAdd(Dst, LdExp, CvtLo);
2018  MI.eraseFromParent();
2019  return true;
2020 }
2021 
2022 // TODO: Copied from DAG implementation. Verify logic and document how this
2023 // actually works.
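// A reading of the sequence below: the input is truncated to an integral
// value; K0 = 0x3df0000000000000 is 2^-32 and K1 = 0xc1f0000000000000 is
// -(2^32). floor(trunc * 2^-32) produces the high 32 bits as a double, and
// fma(hi, -(2^32), trunc) recovers the remaining low 32 bits; the two halves
// are then converted separately (hi signed or unsigned, lo unsigned) and
// merged as {lo, hi}.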
2024 bool AMDGPULegalizerInfo::legalizeFPTOI(
2025  MachineInstr &MI, MachineRegisterInfo &MRI,
2026  MachineIRBuilder &B, bool Signed) const {
2027 
2028  Register Dst = MI.getOperand(0).getReg();
2029  Register Src = MI.getOperand(1).getReg();
2030 
2031  const LLT S64 = LLT::scalar(64);
2032  const LLT S32 = LLT::scalar(32);
2033 
2034  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2035 
2036  unsigned Flags = MI.getFlags();
2037 
2038  auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2039  auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2040  auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2041 
2042  auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2043  auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2044  auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2045 
2046  auto Hi = Signed ?
2047  B.buildFPTOSI(S32, FloorMul) :
2048  B.buildFPTOUI(S32, FloorMul);
2049  auto Lo = B.buildFPTOUI(S32, Fma);
2050 
2051  B.buildMerge(Dst, { Lo, Hi });
2052  MI.eraseFromParent();
2053 
2054  return true;
2055 }
2056 
2057 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2058  MachineInstr &MI) const {
2059  MachineFunction &MF = Helper.MIRBuilder.getMF();
2060  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2061 
2062  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2063  MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2064 
2065  // With ieee_mode disabled, the instructions have the correct behavior
2066  // already for G_FMINNUM/G_FMAXNUM
2067  if (!MFI->getMode().IEEE)
2068  return !IsIEEEOp;
2069 
2070  if (IsIEEEOp)
2071  return true;
2072 
2073  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2074 }
2075 
2076 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2077  MachineInstr &MI, MachineRegisterInfo &MRI,
2078  MachineIRBuilder &B) const {
2079  // TODO: Should move some of this into LegalizerHelper.
2080 
2081  // TODO: Promote dynamic indexing of s16 to s32
2082 
2083  // FIXME: Artifact combiner probably should have replaced the truncated
2084  // constant before this, so we shouldn't need
2085  // getConstantVRegValWithLookThrough.
2086  Optional<ValueAndVReg> MaybeIdxVal =
2087  getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2088  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2089  return true;
2090  const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2091 
2092  Register Dst = MI.getOperand(0).getReg();
2093  Register Vec = MI.getOperand(1).getReg();
2094 
2095  LLT VecTy = MRI.getType(Vec);
2096  LLT EltTy = VecTy.getElementType();
2097  assert(EltTy == MRI.getType(Dst));
2098 
2099  if (IdxVal < VecTy.getNumElements())
2100  B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2101  else
2102  B.buildUndef(Dst);
2103 
2104  MI.eraseFromParent();
2105  return true;
2106 }
2107 
2108 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2109  MachineInstr &MI, MachineRegisterInfo &MRI,
2110  MachineIRBuilder &B) const {
2111  // TODO: Should move some of this into LegalizerHelper.
2112 
2113  // TODO: Promote dynamic indexing of s16 to s32
2114 
2115  // FIXME: Artifact combiner probably should have replaced the truncated
2116  // constant before this, so we shouldn't need
2117  // getConstantVRegValWithLookThrough.
2118  Optional<ValueAndVReg> MaybeIdxVal =
2119  getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2120  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2121  return true;
2122 
2123  int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2124  Register Dst = MI.getOperand(0).getReg();
2125  Register Vec = MI.getOperand(1).getReg();
2126  Register Ins = MI.getOperand(2).getReg();
2127 
2128  LLT VecTy = MRI.getType(Vec);
2129  LLT EltTy = VecTy.getElementType();
2130  assert(EltTy == MRI.getType(Ins));
2131 
2132  if (IdxVal < VecTy.getNumElements())
2133  B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2134  else
2135  B.buildUndef(Dst);
2136 
2137  MI.eraseFromParent();
2138  return true;
2139 }
2140 
2141 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2142  MachineInstr &MI, MachineRegisterInfo &MRI,
2143  MachineIRBuilder &B) const {
2144  const LLT V2S16 = LLT::vector(2, 16);
2145 
2146  Register Dst = MI.getOperand(0).getReg();
2147  Register Src0 = MI.getOperand(1).getReg();
2148  LLT DstTy = MRI.getType(Dst);
2149  LLT SrcTy = MRI.getType(Src0);
2150 
2151  if (SrcTy == V2S16 && DstTy == V2S16 &&
2152  AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2153  return true;
2154 
2155  MachineIRBuilder HelperBuilder(MI);
2156  GISelObserverWrapper DummyObserver;
2157  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2158  return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2159 }
2160 
2161 bool AMDGPULegalizerInfo::legalizeSinCos(
2162  MachineInstr &MI, MachineRegisterInfo &MRI,
2163  MachineIRBuilder &B) const {
2164 
2165  Register DstReg = MI.getOperand(0).getReg();
2166  Register SrcReg = MI.getOperand(1).getReg();
2167  LLT Ty = MRI.getType(DstReg);
2168  unsigned Flags = MI.getFlags();
2169 
2170  Register TrigVal;
2171  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2172  if (ST.hasTrigReducedRange()) {
2173  auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2174  TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2175  .addUse(MulVal.getReg(0))
2176  .setMIFlags(Flags).getReg(0);
2177  } else
2178  TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2179 
2180  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2181  Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2182  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2183  .addUse(TrigVal)
2184  .setMIFlags(Flags);
2185  MI.eraseFromParent();
2186  return true;
2187 }
2188 
2189 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2190  MachineIRBuilder &B,
2191  const GlobalValue *GV,
2192  int64_t Offset,
2193  unsigned GAFlags) const {
2194  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2195  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2196  // to the following code sequence:
2197  //
2198  // For constant address space:
2199  // s_getpc_b64 s[0:1]
2200  // s_add_u32 s0, s0, $symbol
2201  // s_addc_u32 s1, s1, 0
2202  //
2203  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2204  // a fixup or relocation is emitted to replace $symbol with a literal
2205  // constant, which is a pc-relative offset from the encoding of the $symbol
2206  // operand to the global variable.
2207  //
2208  // For global address space:
2209  // s_getpc_b64 s[0:1]
2210  // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2211  // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2212  //
2213  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2214  // fixups or relocations are emitted to replace $symbol@*@lo and
2215  // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2216  // which is a 64-bit pc-relative offset from the encoding of the $symbol
2217  // operand to the global variable.
2218  //
2219  // What we want here is an offset from the value returned by s_getpc
2220  // (which is the address of the s_add_u32 instruction) to the global
2221  // variable, but since the encoding of $symbol starts 4 bytes after the start
2222  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2223  // small. This requires us to add 4 to the global variable offset in order to
2224  // compute the correct address. Similarly for the s_addc_u32 instruction, the
2225  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2226  // instruction.
2227 
2228  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2229 
2230  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2231  B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2232 
2233  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2234  .addDef(PCReg);
2235 
2236  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2237  if (GAFlags == SIInstrInfo::MO_NONE)
2238  MIB.addImm(0);
2239  else
2240  MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2241 
2242  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2243 
2244  if (PtrTy.getSizeInBits() == 32)
2245  B.buildExtract(DstReg, PCReg, 0);
2246  return true;
2247  }
2248 
2249 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2250  MachineInstr &MI, MachineRegisterInfo &MRI,
2251  MachineIRBuilder &B) const {
2252  Register DstReg = MI.getOperand(0).getReg();
2253  LLT Ty = MRI.getType(DstReg);
2254  unsigned AS = Ty.getAddressSpace();
2255 
2256  const GlobalValue *GV = MI.getOperand(1).getGlobal();
2257  MachineFunction &MF = B.getMF();
2258  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2259 
2260  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2261  if (!MFI->isModuleEntryFunction()) {
2262  const Function &Fn = MF.getFunction();
2263  DiagnosticInfoUnsupported BadLDSDecl(
2264  Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2265  DS_Warning);
2266  Fn.getContext().diagnose(BadLDSDecl);
2267 
2268  // We currently don't have a way to correctly allocate LDS objects that
2269  // aren't directly associated with a kernel. We do force inlining of
2270  // functions that use local objects. However, if these dead functions are
2271  // not eliminated, we don't want a compile time error. Just emit a warning
2272  // and a trap, since there should be no callable path here.
2273  B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2274  B.buildUndef(DstReg);
2275  MI.eraseFromParent();
2276  return true;
2277  }
2278 
2279  // TODO: We could emit code to handle the initialization somewhere.
2280  if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2281  const SITargetLowering *TLI = ST.getTargetLowering();
2282  if (!TLI->shouldUseLDSConstAddress(GV)) {
2283  MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2284  return true; // Leave in place;
2285  }
2286 
2287  if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2288  Type *Ty = GV->getValueType();
2289  // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2290  // zero-sized type in other languages to declare dynamic shared
2291  // memory whose size is not known at compile time. Such variables are
2292  // allocated by the runtime and placed directly after the statically
2293  // allocated ones. They all share the same offset.
2294  if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2295  // Adjust alignment for that dynamic shared memory array.
2296  MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2297  LLT S32 = LLT::scalar(32);
2298  auto Sz =
2299  B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2300  B.buildIntToPtr(DstReg, Sz);
2301  MI.eraseFromParent();
2302  return true;
2303  }
2304  }
2305 
2306  B.buildConstant(
2307  DstReg,
2308  MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2309  MI.eraseFromParent();
2310  return true;
2311  }
2312 
2313  const Function &Fn = MF.getFunction();
2314  DiagnosticInfoUnsupported BadInit(
2315  Fn, "unsupported initializer for address space", MI.getDebugLoc());
2316  Fn.getContext().diagnose(BadInit);
2317  return true;
2318  }
2319 
2320  const SITargetLowering *TLI = ST.getTargetLowering();
2321 
2322  if (TLI->shouldEmitFixup(GV)) {
2323  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2324  MI.eraseFromParent();
2325  return true;
2326  }
2327 
2328  if (TLI->shouldEmitPCReloc(GV)) {
2329  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2330  MI.eraseFromParent();
2331  return true;
2332  }
2333 
2334  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2335  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2336 
2337  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2338  MachinePointerInfo::getGOT(MF),
2339  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2340  MachineMemOperand::MOInvariant,
2341  8 /*Size*/, Align(8));
2342 
2343  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2344 
2345  if (Ty.getSizeInBits() == 32) {
2346  // Truncate if this is a 32-bit constant address.
2347  auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2348  B.buildExtract(DstReg, Load, 0);
2349  } else
2350  B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2351 
2352  MI.eraseFromParent();
2353  return true;
2354 }
2355 
2356 static LLT widenToNextPowerOf2(LLT Ty) {
2357  if (Ty.isVector())
2358  return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2359  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2360 }
2361 
2362 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2363  MachineInstr &MI) const {
2364  MachineIRBuilder &B = Helper.MIRBuilder;
2365  MachineRegisterInfo &MRI = *B.getMRI();
2366  GISelChangeObserver &Observer = Helper.Observer;
2367 
2368  Register PtrReg = MI.getOperand(1).getReg();
2369  LLT PtrTy = MRI.getType(PtrReg);
2370  unsigned AddrSpace = PtrTy.getAddressSpace();
2371 
2372  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2373  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2374  auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2375  Observer.changingInstr(MI);
2376  MI.getOperand(1).setReg(Cast.getReg(0));
2377  Observer.changedInstr(MI);
2378  return true;
2379  }
2380 
2381  Register ValReg = MI.getOperand(0).getReg();
2382  LLT ValTy = MRI.getType(ValReg);
2383 
2384  MachineMemOperand *MMO = *MI.memoperands_begin();
2385  const unsigned ValSize = ValTy.getSizeInBits();
2386  const unsigned MemSize = 8 * MMO->getSize();
2387  const Align MemAlign = MMO->getAlign();
2388  const unsigned AlignInBits = 8 * MemAlign.value();
2389 
2390  // Widen non-power-of-2 loads to the alignment if needed
2391  if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2392  const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2393 
2394  // This was already the correct extending load result type, so just adjust
2395  // the memory type.
2396  if (WideMemSize == ValSize) {
2397  MachineFunction &MF = B.getMF();
2398 
2399  MachineMemOperand *WideMMO =
2400  MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2401  Observer.changingInstr(MI);
2402  MI.setMemRefs(MF, {WideMMO});
2403  Observer.changedInstr(MI);
2404  return true;
2405  }
2406 
2407  // Don't bother handling an edge case that should probably never be produced.
2408  if (ValSize > WideMemSize)
2409  return false;
2410 
2411  LLT WideTy = widenToNextPowerOf2(ValTy);
2412 
2413  Register WideLoad;
2414  if (!WideTy.isVector()) {
2415  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2416  B.buildTrunc(ValReg, WideLoad).getReg(0);
2417  } else {
2418  // Extract the subvector.
2419 
2420  if (isRegisterType(ValTy)) {
2421  // If this a case where G_EXTRACT is legal, use it.
2422  // (e.g. <3 x s32> -> <4 x s32>)
2423  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2424  B.buildExtract(ValReg, WideLoad, 0);
2425  } else {
2426  // For cases where the widened type isn't a nice register value, unmerge
2427  // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2428  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2429  WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2430  B.setInsertPt(B.getMBB(), MI.getIterator());
2431  B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2432  }
2433  }
2434 
2435  MI.eraseFromParent();
2436  return true;
2437  }
2438 
2439  return false;
2440 }
2441 
2442 bool AMDGPULegalizerInfo::legalizeFMad(
2443  MachineInstr &MI, MachineRegisterInfo &MRI,
2444  MachineIRBuilder &B) const {
2445  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2446  assert(Ty.isScalar());
2447 
2448  MachineFunction &MF = B.getMF();
2449  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2450 
2451  // TODO: Always legal with future ftz flag.
2452  // FIXME: Do we need just output?
2453  if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2454  return true;
2455  if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2456  return true;
2457 
2458  MachineIRBuilder HelperBuilder(MI);
2459  GISelObserverWrapper DummyObserver;
2460  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2461  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2462 }
2463 
2464 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2465  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2466  Register DstReg = MI.getOperand(0).getReg();
2467  Register PtrReg = MI.getOperand(1).getReg();
2468  Register CmpVal = MI.getOperand(2).getReg();
2469  Register NewVal = MI.getOperand(3).getReg();
2470 
2471  assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2472  "this should not have been custom lowered");
2473 
2474  LLT ValTy = MRI.getType(CmpVal);
2475  LLT VecTy = LLT::vector(2, ValTy);
2476 
2477  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2478 
2479  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2480  .addDef(DstReg)
2481  .addUse(PtrReg)
2482  .addUse(PackedVal)
2483  .setMemRefs(MI.memoperands());
2484 
2485  MI.eraseFromParent();
2486  return true;
2487 }
2488 
2489 bool AMDGPULegalizerInfo::legalizeFlog(
2490  MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2491  Register Dst = MI.getOperand(0).getReg();
2492  Register Src = MI.getOperand(1).getReg();
2493  LLT Ty = B.getMRI()->getType(Dst);
2494  unsigned Flags = MI.getFlags();
2495 
2496  auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2497  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2498 
2499  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2500  MI.eraseFromParent();
2501  return true;
2502 }
2503 
2504 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2505  MachineIRBuilder &B) const {
2506  Register Dst = MI.getOperand(0).getReg();
2507  Register Src = MI.getOperand(1).getReg();
2508  unsigned Flags = MI.getFlags();
2509  LLT Ty = B.getMRI()->getType(Dst);
2510 
2511  auto K = B.buildFConstant(Ty, numbers::log2e);
2512  auto Mul = B.buildFMul(Ty, Src, K, Flags);
2513  B.buildFExp2(Dst, Mul, Flags);
2514  MI.eraseFromParent();
2515  return true;
2516 }
2517 
2518 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2519  MachineIRBuilder &B) const {
2520  Register Dst = MI.getOperand(0).getReg();
2521  Register Src0 = MI.getOperand(1).getReg();
2522  Register Src1 = MI.getOperand(2).getReg();
2523  unsigned Flags = MI.getFlags();
2524  LLT Ty = B.getMRI()->getType(Dst);
2525  const LLT S16 = LLT::scalar(16);
2526  const LLT S32 = LLT::scalar(32);
2527 
2528  if (Ty == S32) {
2529  auto Log = B.buildFLog2(S32, Src0, Flags);
2530  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2531  .addUse(Log.getReg(0))
2532  .addUse(Src1)
2533  .setMIFlags(Flags);
2534  B.buildFExp2(Dst, Mul, Flags);
2535  } else if (Ty == S16) {
2536  // There's no f16 fmul_legacy, so we need to convert for it.
2537  auto Log = B.buildFLog2(S16, Src0, Flags);
2538  auto Ext0 = B.buildFPExt(S32, Log, Flags);
2539  auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2540  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2541  .addUse(Ext0.getReg(0))
2542  .addUse(Ext1.getReg(0))
2543  .setMIFlags(Flags);
2544 
2545  B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2546  } else
2547  return false;
2548 
2549  MI.eraseFromParent();
2550  return true;
2551 }
2552 
2553 // Find a source register, ignoring any possible source modifiers.
2554 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2555  Register ModSrc = OrigSrc;
2556  if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2557  ModSrc = SrcFNeg->getOperand(1).getReg();
2558  if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2559  ModSrc = SrcFAbs->getOperand(1).getReg();
2560  } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2561  ModSrc = SrcFAbs->getOperand(1).getReg();
2562  return ModSrc;
2563 }
2564 
2565 bool AMDGPULegalizerInfo::legalizeFFloor(
2566  MachineInstr &MI, MachineRegisterInfo &MRI,
2567  MachineIRBuilder &B) const {
2568 
2569  const LLT S1 = LLT::scalar(1);
2570  const LLT S64 = LLT::scalar(64);
2571  Register Dst = MI.getOperand(0).getReg();
2572  Register OrigSrc = MI.getOperand(1).getReg();
2573  unsigned Flags = MI.getFlags();
2574  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2575  "this should not have been custom lowered");
2576 
2577  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2578  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2579  // efficient way to implement it is using V_FRACT_F64. The workaround for the
2580  // V_FRACT bug is:
2581  // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2582  //
2583  // Convert floor(x) to (x - fract(x))
2584 
2585  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2586  .addUse(OrigSrc)
2587  .setMIFlags(Flags);
2588 
2589  // Give source modifier matching some assistance before obscuring a foldable
2590  // pattern.
2591 
2592  // TODO: We can avoid the neg on the fract? The input sign to fract
2593  // shouldn't matter?
2594  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2595 
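  // 0x3fefffffffffffff is the largest double that is strictly less than 1.0;
  // clamping to it keeps the (possibly buggy) fract result below 1.0 so that
  // x - fract(x) cannot land on the wrong integer.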
2596  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2597 
2598  Register Min = MRI.createGenericVirtualRegister(S64);
2599 
2600  // We don't need to concern ourselves with the snan handling difference, so
2601  // use the one which will directly select.
2602  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2603  if (MFI->getMode().IEEE)
2604  B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2605  else
2606  B.buildFMinNum(Min, Fract, Const, Flags);
2607 
2608  Register CorrectedFract = Min;
2609  if (!MI.getFlag(MachineInstr::FmNoNans)) {
2610  auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2611  CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2612  }
2613 
2614  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2615  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2616 
2617  MI.eraseFromParent();
2618  return true;
2619 }
2620 
2621 // Turn an illegal packed v2s16 build vector into bit operations.
2622 // TODO: This should probably be a bitcast action in LegalizerHelper.
2623 bool AMDGPULegalizerInfo::legalizeBuildVector(
2624  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2625  Register Dst = MI.getOperand(0).getReg();
2626  const LLT S32 = LLT::scalar(32);
2627  assert(MRI.getType(Dst) == LLT::vector(2, 16));
2628 
2629  Register Src0 = MI.getOperand(1).getReg();
2630  Register Src1 = MI.getOperand(2).getReg();
2631  assert(MRI.getType(Src0) == LLT::scalar(16));
2632 
2633  auto Merge = B.buildMerge(S32, {Src0, Src1});
2634  B.buildBitcast(Dst, Merge);
2635 
2636  MI.eraseFromParent();
2637  return true;
2638 }
2639 
2640 // Check that this is a G_XOR x, -1
2641 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2642  if (MI.getOpcode() != TargetOpcode::G_XOR)
2643  return false;
2644  auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2645  return ConstVal && *ConstVal == -1;
2646 }
2647 
2648 // Return the use branch instruction, otherwise null if the usage is invalid.
2649 static MachineInstr *
2650 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2651  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2652  Register CondDef = MI.getOperand(0).getReg();
2653  if (!MRI.hasOneNonDBGUse(CondDef))
2654  return nullptr;
2655 
2656  MachineBasicBlock *Parent = MI.getParent();
2657  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2658 
2659  if (isNot(MRI, *UseMI)) {
2660  Register NegatedCond = UseMI->getOperand(0).getReg();
2661  if (!MRI.hasOneNonDBGUse(NegatedCond))
2662  return nullptr;
2663 
2664  // We're deleting the def of this value, so we need to remove it.
2665  UseMI->eraseFromParent();
2666 
2667  UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2668  Negated = true;
2669  }
2670 
2671  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2672  return nullptr;
2673 
2674  // Make sure the cond br is followed by a G_BR, or is the last instruction.
2675  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2676  if (Next == Parent->end()) {
2677  MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2678  if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2679  return nullptr;
2680  UncondBrTarget = &*NextMBB;
2681  } else {
2682  if (Next->getOpcode() != AMDGPU::G_BR)
2683  return nullptr;
2684  Br = &*Next;
2685  UncondBrTarget = Br->getOperand(0).getMBB();
2686  }
2687 
2688  return UseMI;
2689 }
2690 
2691 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2692  const ArgDescriptor *Arg,
2693  const TargetRegisterClass *ArgRC,
2694  LLT ArgTy) const {
2695  MCRegister SrcReg = Arg->getRegister();
2696  assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2697  assert(DstReg.isVirtual() && "Virtual register expected");
2698 
2699  Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2700  ArgTy);
2701  if (Arg->isMasked()) {
2702  // TODO: Should we try to emit this once in the entry block?
2703  const LLT S32 = LLT::scalar(32);
2704  const unsigned Mask = Arg->getMask();
2705  const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2706 
2707  Register AndMaskSrc = LiveIn;
2708 
2709  if (Shift != 0) {
2710  auto ShiftAmt = B.buildConstant(S32, Shift);
2711  AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2712  }
2713 
2714  B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2715  } else {
2716  B.buildCopy(DstReg, LiveIn);
2717  }
2718 
2719  return true;
2720 }
2721 
2722 bool AMDGPULegalizerInfo::loadInputValue(
2723  Register DstReg, MachineIRBuilder &B,
2724  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2725  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2726  const ArgDescriptor *Arg;
2727  const TargetRegisterClass *ArgRC;
2728  LLT ArgTy;
2729  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2730 
2731  if (!Arg->isRegister() || !Arg->getRegister().isValid())
2732  return false; // TODO: Handle these
2733  return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2734 }
2735 
2736 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2737  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2738  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2739  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2740  return false;
2741 
2742  MI.eraseFromParent();
2743  return true;
2744 }
2745 
2746 bool AMDGPULegalizerInfo::legalizeFDIV(
2747  MachineInstr &MI, MachineRegisterInfo &MRI,
2748  MachineIRBuilder &B) const {
2749  Register Dst = MI.getOperand(0).getReg();
2750  LLT DstTy = MRI.getType(Dst);
2751  LLT S16 = LLT::scalar(16);
2752  LLT S32 = LLT::scalar(32);
2753  LLT S64 = LLT::scalar(64);
2754 
2755  if (legalizeFastUnsafeFDIV(MI, MRI, B))
2756  return true;
2757 
2758  if (DstTy == S16)
2759  return legalizeFDIV16(MI, MRI, B);
2760  if (DstTy == S32)
2761  return legalizeFDIV32(MI, MRI, B);
2762  if (DstTy == S64)
2763  return legalizeFDIV64(MI, MRI, B);
2764 
2765  return false;
2766 }
2767 
2768 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2769  Register DstReg,
2770  Register X,
2771  Register Y,
2772  bool IsDiv) const {
2773  const LLT S1 = LLT::scalar(1);
2774  const LLT S32 = LLT::scalar(32);
2775 
2776  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2777  // algorithm used here.
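  // Summary of the expansion below: estimate z ~= 2^32 / y from a scaled
  // floating-point reciprocal, refine it with one Newton-Raphson style step
  // (z += umulh(z, z * -y)), form q = umulh(x, z) and r = x - q * y, then
  // apply at most two conditional corrections while r >= y.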
2778 
2779  // Initial estimate of inv(y).
2780  auto FloatY = B.buildUITOFP(S32, Y);
2781  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2782  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2783  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2784  auto Z = B.buildFPTOUI(S32, ScaledY);
2785 
2786  // One round of UNR.
2787  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2788  auto NegYZ = B.buildMul(S32, NegY, Z);
2789  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2790 
2791  // Quotient/remainder estimate.
2792  auto Q = B.buildUMulH(S32, X, Z);
2793  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2794 
2795  // First quotient/remainder refinement.
2796  auto One = B.buildConstant(S32, 1);
2797  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2798  if (IsDiv)
2799  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2800  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2801 
2802  // Second quotient/remainder refinement.
2803  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2804  if (IsDiv)
2805  B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2806  else
2807  B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2808 }
2809 
2810 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(
2811  MachineInstr &MI, MachineRegisterInfo &MRI,
2812  MachineIRBuilder &B) const {
2813  const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2814  Register DstReg = MI.getOperand(0).getReg();
2815  Register Num = MI.getOperand(1).getReg();
2816  Register Den = MI.getOperand(2).getReg();
2817  legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2818  MI.eraseFromParent();
2819  return true;
2820 }
2821 
2822 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2823 //
2824 // Return lo, hi of result
2825 //
2826 // %cvt.lo = G_UITOFP Val.lo
2827 // %cvt.hi = G_UITOFP Val.hi
2828 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2829 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2830 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2831 // %mul2 = G_FMUL %mul1, 2**(-32)
2832 // %trunc = G_INTRINSIC_TRUNC %mul2
2833 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2834 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2835 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2836  Register Val) {
2837  const LLT S32 = LLT::scalar(32);
2838  auto Unmerge = B.buildUnmerge(S32, Val);
2839 
2840  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2841  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2842 
2843  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2844  B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2845 
2846  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
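  // 0x5f7ffffc encodes a float just below 2^64; combined with the 2^-32 scale
  // applied next, this turns the reciprocal estimate into a fixed-point
  // approximation of 2^64 / Val, split into high and low 32-bit halves.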
2847  auto Mul1 =
2848  B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2849 
2850  // 2**(-32)
2851  auto Mul2 =
2852  B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2853  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2854 
2855  // -(2**32)
2856  auto Mad2 = B.buildFMAD(S32, Trunc,
2857  B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2858 
2859  auto ResultLo = B.buildFPTOUI(S32, Mad2);
2860  auto ResultHi = B.buildFPTOUI(S32, Trunc);
2861 
2862  return {ResultLo.getReg(0), ResultHi.getReg(0)};
2863 }
2864 
2865 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2866  Register DstReg,
2867  Register Numer,
2868  Register Denom,
2869  bool IsDiv) const {
2870  const LLT S32 = LLT::scalar(32);
2871  const LLT S64 = LLT::scalar(64);
2872  const LLT S1 = LLT::scalar(1);
2873  Register RcpLo, RcpHi;
2874 
2875  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2876 
2877  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2878 
2879  auto Zero64 = B.buildConstant(S64, 0);
2880  auto NegDenom = B.buildSub(S64, Zero64, Denom);
2881 
2882  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2883  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2884 
2885  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2886  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2887  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2888 
2889  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2890  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2891  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2892  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2893 
2894  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2895  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2896  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2897  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2898  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2899 
2900  auto Zero32 = B.buildConstant(S32, 0);
2901  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2902  auto Add2_HiC =
2903  B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2904  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2905  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2906 
2907  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2908  Register NumerLo = UnmergeNumer.getReg(0);
2909  Register NumerHi = UnmergeNumer.getReg(1);
2910 
2911  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2912  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2913  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2914  Register Mul3_Lo = UnmergeMul3.getReg(0);
2915  Register Mul3_Hi = UnmergeMul3.getReg(1);
2916  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2917  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2918  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2919  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2920 
2921  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2922  Register DenomLo = UnmergeDenom.getReg(0);
2923  Register DenomHi = UnmergeDenom.getReg(1);
2924 
2925  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2926  auto C1 = B.buildSExt(S32, CmpHi);
2927 
2928  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2929  auto C2 = B.buildSExt(S32, CmpLo);
2930 
2931  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2932  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2933 
2934  // TODO: Here and below portions of the code can be enclosed into if/endif.
2935  // Currently control flow is unconditional and we have 4 selects after
2936  // potential endif to substitute PHIs.
2937 
2938  // if C3 != 0 ...
2939  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2940  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2941  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2942  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2943 
2944  auto One64 = B.buildConstant(S64, 1);
2945  auto Add3 = B.buildAdd(S64, MulHi3, One64);
2946 
2947  auto C4 =
2948  B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2949  auto C5 =
2950  B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2951  auto C6 = B.buildSelect(
2952  S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2953 
2954  // if (C6 != 0)
2955  auto Add4 = B.buildAdd(S64, Add3, One64);
2956  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2957 
2958  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2959  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2960  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2961 
2962  // endif C6
2963  // endif C3
2964 
2965  if (IsDiv) {
2966  auto Sel1 = B.buildSelect(
2967  S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2968  B.buildSelect(DstReg,
2969  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2970  } else {
2971  auto Sel2 = B.buildSelect(
2972  S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2973  B.buildSelect(DstReg,
2974  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2975  }
2976 }
2977 
2978 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2979  MachineRegisterInfo &MRI,
2980  MachineIRBuilder &B) const {
2981  const LLT S64 = LLT::scalar(64);
2982  const LLT S32 = LLT::scalar(32);
2983  const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2984  Register DstReg = MI.getOperand(0).getReg();
2985  Register Num = MI.getOperand(1).getReg();
2986  Register Den = MI.getOperand(2).getReg();
2987  LLT Ty = MRI.getType(DstReg);
2988 
2989  if (Ty == S32)
2990  legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2991  else if (Ty == S64)
2992  legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2993  else
2994  return false;
2995 
2996  MI.eraseFromParent();
2997  return true;
2998 
2999 }
3000 
3001 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
3002  MachineRegisterInfo &MRI,
3003  MachineIRBuilder &B) const {
3004  const LLT S64 = LLT::scalar(64);
3005  const LLT S32 = LLT::scalar(32);
3006 
3007  Register DstReg = MI.getOperand(0).getReg();
3008  const LLT Ty = MRI.getType(DstReg);
3009  if (Ty != S32 && Ty != S64)
3010  return false;
3011 
3012  const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3013 
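  // Signed division is reduced to the unsigned case: with s = x >> (bits - 1)
  // (all ones for negative x, zero otherwise), (x + s) ^ s is |x|. The
  // unsigned result is then re-signed the same way, using the XOR of the two
  // input signs for the quotient and the LHS sign for the remainder.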
3014  Register LHS = MI.getOperand(1).getReg();
3015  Register RHS = MI.getOperand(2).getReg();
3016 
3017  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3018  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3019  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3020 
3021  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3022  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3023 
3024  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3025  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3026 
3027  Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3028  if (Ty == S32)
3029  legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3030  else
3031  legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3032 
3033  Register Sign;
3034  if (IsDiv)
3035  Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3036  else
3037  Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3038 
3039  UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3040  B.buildSub(DstReg, UDivRem, Sign);
3041 
3042  MI.eraseFromParent();
3043  return true;
3044 }
3045 
3046 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3047  MachineRegisterInfo &MRI,
3048  MachineIRBuilder &B) const {
3049  Register Res = MI.getOperand(0).getReg();
3050  Register LHS = MI.getOperand(1).getReg();
3051  Register RHS = MI.getOperand(2).getReg();
3052  uint16_t Flags = MI.getFlags();
3053  LLT ResTy = MRI.getType(Res);
3054 
3055  const MachineFunction &MF = B.getMF();
3056  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3057  MI.getFlag(MachineInstr::FmAfn);
3058 
3059  if (!AllowInaccurateRcp)
3060  return false;
3061 
3062  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3063  // 1 / x -> RCP(x)
3064  if (CLHS->isExactlyValue(1.0)) {
3065  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3066  .addUse(RHS)
3067  .setMIFlags(Flags);
3068 
3069  MI.eraseFromParent();
3070  return true;
3071  }
3072 
3073  // -1 / x -> RCP( FNEG(x) )
3074  if (CLHS->isExactlyValue(-1.0)) {
3075  auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3076  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3077  .addUse(FNeg.getReg(0))
3078  .setMIFlags(Flags);
3079 
3080  MI.eraseFromParent();
3081  return true;
3082  }
3083  }
3084 
3085  // x / y -> x * (1.0 / y)
3086  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3087  .addUse(RHS)
3088  .setMIFlags(Flags);
3089  B.buildFMul(Res, LHS, RCP, Flags);
3090 
3091  MI.eraseFromParent();
3092  return true;
3093 }
3094 
3095 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3096  MachineRegisterInfo &MRI,
3097  MachineIRBuilder &B) const {
3098  Register Res = MI.getOperand(0).getReg();
3099  Register LHS = MI.getOperand(1).getReg();
3100  Register RHS = MI.getOperand(2).getReg();
3101 
3102  uint16_t Flags = MI.getFlags();
3103 
3104  LLT S16 = LLT::scalar(16);
3105  LLT S32 = LLT::scalar(32);
3106 
3107  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3108  auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3109 
3110  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3111  .addUse(RHSExt.getReg(0))
3112  .setMIFlags(Flags);
3113 
3114  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3115  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3116 
3117  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3118  .addUse(RDst.getReg(0))
3119  .addUse(RHS)
3120  .addUse(LHS)
3121  .setMIFlags(Flags);
3122 
3123  MI.eraseFromParent();
3124  return true;
3125 }
3126 
3127 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3128 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3129 static void toggleSPDenormMode(bool Enable,
3130  MachineIRBuilder &B,
3131  const GCNSubtarget &ST,
3132  AMDGPU::SIModeRegisterDefaults Mode) {
3133  // Set SP denorm mode to this value.
3134  unsigned SPDenormMode =
3135  Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3136 
3137  if (ST.hasDenormModeInst()) {
3138  // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3139  uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3140 
3141  uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3142  B.buildInstr(AMDGPU::S_DENORM_MODE)
3143  .addImm(NewDenormModeValue);
3144 
3145  } else {
3146  // Select FP32 bit field in mode register.
3147  unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3148  (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3149  (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3150 
3151  B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3152  .addImm(SPDenormMode)
3153  .addImm(SPDenormModeBitField);
3154  }
3155 }
3156 
3157 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3158  MachineRegisterInfo &MRI,
3159  MachineIRBuilder &B) const {
3160  Register Res = MI.getOperand(0).getReg();
3161  Register LHS = MI.getOperand(1).getReg();
3162  Register RHS = MI.getOperand(2).getReg();
3163  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3164  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3165 
3166  uint16_t Flags = MI.getFlags();
3167 
3168  LLT S32 = LLT::scalar(32);
3169  LLT S1 = LLT::scalar(1);
3170 
3171  auto One = B.buildFConstant(S32, 1.0f);
3172 
3173  auto DenominatorScaled =
3174  B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3175  .addUse(LHS)
3176  .addUse(RHS)
3177  .addImm(0)
3178  .setMIFlags(Flags);
3179  auto NumeratorScaled =
3180  B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3181  .addUse(LHS)
3182  .addUse(RHS)
3183  .addImm(1)
3184  .setMIFlags(Flags);
3185 
3186  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3187  .addUse(DenominatorScaled.getReg(0))
3188  .setMIFlags(Flags);
3189  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3190 
3191  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3192  // aren't modeled as reading it.
3193  if (!Mode.allFP32Denormals())
3194  toggleSPDenormMode(true, B, ST, Mode);
3195 
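  // Newton-Raphson style refinement on the scaled operands: Fma0 = 1 - d * r
  // is the reciprocal error, Fma1 = r + r * Fma0 the refined reciprocal,
  // Mul = n * Fma1 the quotient estimate, Fma2/Fma4 the residuals n - d * q,
  // and Fma3 the corrected quotient. div_fmas and div_fixup then handle the
  // final scaling and the special cases.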
3196  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3197  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3198  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3199  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3200  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3201  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3202 
3203  if (!Mode.allFP32Denormals())
3204  toggleSPDenormMode(false, B, ST, Mode);
3205 
3206  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3207  .addUse(Fma4.getReg(0))
3208  .addUse(Fma1.getReg(0))
3209  .addUse(Fma3.getReg(0))
3210  .addUse(NumeratorScaled.getReg(1))
3211  .setMIFlags(Flags);
3212 
3213  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3214  .addUse(Fmas.getReg(0))
3215  .addUse(RHS)
3216  .addUse(LHS)
3217  .setMIFlags(Flags);
3218 
3219  MI.eraseFromParent();
3220  return true;
3221 }
3222 
3223 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3224  MachineRegisterInfo &MRI,
3225  MachineIRBuilder &B) const {
3226  Register Res = MI.getOperand(0).getReg();
3227  Register LHS = MI.getOperand(1).getReg();
3228  Register RHS = MI.getOperand(2).getReg();
3229 
3230  uint16_t Flags = MI.getFlags();
3231 
3232  LLT S64 = LLT::scalar(64);
3233  LLT S1 = LLT::scalar(1);
3234 
3235  auto One = B.buildFConstant(S64, 1.0);
3236 
3237  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3238  .addUse(LHS)
3239  .addUse(RHS)
3240  .addImm(0)
3241  .setMIFlags(Flags);
3242 
3243  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3244 
3245  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3246  .addUse(DivScale0.getReg(0))
3247  .setMIFlags(Flags);
3248 
3249  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3250  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3251  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3252 
3253  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3254  .addUse(LHS)
3255  .addUse(RHS)
3256  .addImm(1)
3257  .setMIFlags(Flags);
3258 
3259  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3260  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3261  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3262 
3263  Register Scale;
3264  if (!ST.hasUsableDivScaleConditionOutput()) {
3265  // Workaround a hardware bug on SI where the condition output from div_scale
3266  // is not usable.
3267 
3268  LLT S32 = LLT::scalar(32);
3269 
3270  auto NumUnmerge = B.buildUnmerge(S32, LHS);
3271  auto DenUnmerge = B.buildUnmerge(S32, RHS);
3272  auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3273  auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3274 
3275  auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3276  Scale1Unmerge.getReg(1));
3277  auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3278  Scale0Unmerge.getReg(1));
3279  Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3280  } else {
3281  Scale = DivScale1.getReg(1);
3282  }
3283 
3284  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3285  .addUse(Fma4.getReg(0))
3286  .addUse(Fma3.getReg(0))
3287  .addUse(Mul.getReg(0))
3288  .addUse(Scale)
3289  .setMIFlags(Flags);
3290 
3291  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3292  .addUse(Fmas.getReg(0))
3293  .addUse(RHS)
3294  .addUse(LHS)
3295  .setMIFlags(Flags);
3296 
3297  MI.eraseFromParent();
3298  return true;
3299 }
3300 
3301 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3302  MachineRegisterInfo &MRI,
3303  MachineIRBuilder &B) const {
3304  Register Res = MI.getOperand(0).getReg();
3305  Register LHS = MI.getOperand(2).getReg();
3306  Register RHS = MI.getOperand(3).getReg();
3307  uint16_t Flags = MI.getFlags();
3308 
3309  LLT S32 = LLT::scalar(32);
3310  LLT S1 = LLT::scalar(1);
3311 
3312  auto Abs = B.buildFAbs(S32, RHS, Flags);
3313  const APFloat C0Val(1.0f);
3314 
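  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as 32-bit floats. When |RHS|
  // exceeds 2^96 the denominator is pre-scaled by 2^-32, keeping the
  // reciprocal away from the denormal range, and the final multiply by the
  // same factor undoes the scaling: 2^-32 * (LHS * 1/(RHS * 2^-32)) == LHS / RHS.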
3315  auto C0 = B.buildConstant(S32, 0x6f800000);
3316  auto C1 = B.buildConstant(S32, 0x2f800000);
3317  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3318 
3319  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3320  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3321 
3322  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3323 
3324  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3325  .addUse(Mul0.getReg(0))
3326  .setMIFlags(Flags);
3327 
3328  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3329 
3330  B.buildFMul(Res, Sel, Mul1, Flags);
3331 
3332  MI.eraseFromParent();
3333  return true;
3334 }
3335 
3336 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3337 // FIXME: Why do we handle this one but not other removed instructions?
3338 //
3339 // Reciprocal square root. The clamp prevents infinite results, clamping
3340 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
3341 // +-max_float.
3342 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3343  MachineRegisterInfo &MRI,
3344  MachineIRBuilder &B) const {
3345  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3346  return true;
3347 
3348  Register Dst = MI.getOperand(0).getReg();
3349  Register Src = MI.getOperand(2).getReg();
3350  auto Flags = MI.getFlags();
3351 
3352  LLT Ty = MRI.getType(Dst);
3353 
3354  const fltSemantics *FltSemantics;
3355  if (Ty == LLT::scalar(32))
3356  FltSemantics = &APFloat::IEEEsingle();
3357  else if (Ty == LLT::scalar(64))
3358  FltSemantics = &APFloat::IEEEdouble();
3359  else
3360  return false;
3361 
3362  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3363  .addUse(Src)
3364  .setMIFlags(Flags);
3365 
3366  // We don't need to concern ourselves with the snan handling difference,
3367  // since the rsq has already quieted (or not); use the one which will directly select.
3368  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3369  const bool UseIEEE = MFI->getMode().IEEE;
3370 
3371  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3372  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3373  B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3374 
3375  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3376 
3377  if (UseIEEE)
3378  B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3379  else
3380  B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3381  MI.eraseFromParent();
3382  return true;
3383 }
3384 
3385 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3386  switch (IID) {
3387  case Intrinsic::amdgcn_ds_fadd:
3388  return AMDGPU::G_ATOMICRMW_FADD;
3389  case Intrinsic::amdgcn_ds_fmin:
3390  return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3391  case Intrinsic::amdgcn_ds_fmax:
3392  return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3393  default:
3394  llvm_unreachable("not a DS FP intrinsic");
3395  }
3396 }
3397 
3398 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3399  MachineInstr &MI,
3400  Intrinsic::ID IID) const {
3401  GISelChangeObserver &Observer = Helper.Observer;
3402  Observer.changingInstr(MI);
3403 
3404  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3405 
3406  // The remaining operands were used to set fields in the MemOperand on
3407  // construction.
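 // They are dropped here highest-index first so the remaining operand indices
 // stay valid while erasing.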
3408  for (int I = 6; I > 3; --I)
3409  MI.RemoveOperand(I);
3410 
3411  MI.RemoveOperand(1); // Remove the intrinsic ID.
3412  Observer.changedInstr(MI);
3413  return true;
3414 }
3415 
3416 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3417  MachineRegisterInfo &MRI,
3418  MachineIRBuilder &B) const {
3419  uint64_t Offset =
3420  ST.getTargetLowering()->getImplicitParameterOffset(
3421  B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3422  LLT DstTy = MRI.getType(DstReg);
3423  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3424 
3425  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3426  if (!loadInputValue(KernargPtrReg, B,
3427  AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3428  return false;
3429 
3430  // FIXME: This should be nuw
3431  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3432  return true;
3433 }
3434 
3435 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3436  MachineRegisterInfo &MRI,
3437  MachineIRBuilder &B) const {
3438  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3439  if (!MFI->isEntryFunction()) {
3440  return legalizePreloadedArgIntrin(MI, MRI, B,
3441  AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3442  }
3443 
3444  Register DstReg = MI.getOperand(0).getReg();
3445  if (!getImplicitArgPtr(DstReg, MRI, B))
3446  return false;
3447 
3448  MI.eraseFromParent();
3449  return true;
3450 }
3451 
3452 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3453  MachineRegisterInfo &MRI,
3454  MachineIRBuilder &B,
3455  unsigned AddrSpace) const {
3456  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3457  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3458  Register Hi32 = Unmerge.getReg(1);
3459 
3460  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3461  MI.eraseFromParent();
3462  return true;
3463 }
3464 
3465 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3466 // offset (the offset that is included in bounds checking and swizzling, to be
3467 // split between the instruction's voffset and immoffset fields) and soffset
3468 // (the offset that is excluded from bounds checking and swizzling, to go in
3469 // the instruction's soffset field). This function takes the first kind of
3470 // offset and figures out how to split it between voffset and immoffset.
3471 std::tuple<Register, unsigned, unsigned>
3472 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3473  Register OrigOffset) const {
3474  const unsigned MaxImm = 4095;
3475  Register BaseReg;
3476  unsigned TotalConstOffset;
3477  const LLT S32 = LLT::scalar(32);
3478 
3479  std::tie(BaseReg, TotalConstOffset) =
3480  AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3481 
3482  unsigned ImmOffset = TotalConstOffset;
3483 
3484  // If the immediate value is too big for the immoffset field, put the value
3485  // and -4096 into the immoffset field so that the value that is copied/added
3486  // for the voffset field is a multiple of 4096, and it stands more chance
3487  // of being CSEd with the copy/add for another similar load/store.
3488  // However, do not do that rounding down to a multiple of 4096 if that is a
3489  // negative number, as it appears to be illegal to have a negative offset
3490  // in the vgpr, even if adding the immediate offset makes it positive.
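 // For example, a constant offset of 4100 becomes ImmOffset = 4 with 4096 added
 // to the voffset, and 5000 becomes ImmOffset = 904 plus a voffset of 4096; only
 // when the rounded-down part would be negative does the entire constant move to
 // the voffset with ImmOffset = 0.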
3491  unsigned Overflow = ImmOffset & ~MaxImm;
3492  ImmOffset -= Overflow;
3493  if ((int32_t)Overflow < 0) {
3494  Overflow += ImmOffset;
3495  ImmOffset = 0;
3496  }
3497 
3498  if (Overflow != 0) {
3499  if (!BaseReg) {
3500  BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3501  } else {
3502  auto OverflowVal = B.buildConstant(S32, Overflow);
3503  BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3504  }
3505  }
3506 
3507  if (!BaseReg)
3508  BaseReg = B.buildConstant(S32, 0).getReg(0);
3509 
3510  return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3511 }
3512 
3513 /// Handle register layout difference for f16 images for some subtargets.
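/// For example, on subtargets with unpacked d16 VMEM a <4 x s16> value is
/// widened to <4 x s32> with each element any-extended, while the
/// image-store-d16 bug path instead repacks the value into whole dwords.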
3514 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3515  MachineRegisterInfo &MRI,
3516  Register Reg,
3517  bool ImageStore) const {
3518  const LLT S16 = LLT::scalar(16);
3519  const LLT S32 = LLT::scalar(32);
3520  LLT StoreVT = MRI.getType(Reg);
3521  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3522 
3523  if (ST.hasUnpackedD16VMem()) {
3524  auto Unmerge = B.buildUnmerge(S16, Reg);
3525 
3526  SmallVector<Register, 4> WideRegs;
3527  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3528  WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3529 
3530  int NumElts = StoreVT.getNumElements();
3531 
3532  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3533  }
3534 
3535  if (ImageStore && ST.hasImageStoreD16Bug()) {
3536  if (StoreVT.getNumElements() == 2) {
3537  SmallVector<Register, 4> PackedRegs;
3538  Reg = B.buildBitcast(S32, Reg).getReg(0);
3539  PackedRegs.push_back(Reg);
3540  PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3541  return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3542  }
3543 
3544  if (StoreVT.getNumElements() == 3) {
3545  SmallVector<Register, 4> PackedRegs;
3546  auto Unmerge = B.buildUnmerge(S16, Reg);
3547  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3548  PackedRegs.push_back(Unmerge.getReg(I));
3549  PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3550  Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3551  return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3552  }
3553 
3554  if (StoreVT.getNumElements() == 4) {
3555  SmallVector<Register, 4> PackedRegs;
3556  Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3557  auto Unmerge = B.buildUnmerge(S32, Reg);
3558  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3559  PackedRegs.push_back(Unmerge.getReg(I));
3560  PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3561  return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3562  }
3563 
3564  llvm_unreachable("invalid data type");
3565  }
3566 
3567  return Reg;
3568 }
3569 
3570 Register AMDGPULegalizerInfo::fixStoreSourceType(
3571  MachineIRBuilder &B, Register VData, bool IsFormat) const {
3572  MachineRegisterInfo *MRI = B.getMRI();
3573  LLT Ty = MRI->getType(VData);
3574 
3575  const LLT S16 = LLT::scalar(16);
3576 
3577  // Fixup illegal register types for i8 stores.
3578  if (Ty == LLT::scalar(8) || Ty == S16) {
3579  Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3580  return AnyExt;
3581  }
3582 
3583  if (Ty.isVector()) {
3584  if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3585  if (IsFormat)
3586  return handleD16VData(B, *MRI, VData);
3587  }
3588  }
3589 
3590  return VData;
3591 }
3592 
3593 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3594  MachineRegisterInfo &MRI,
3595  MachineIRBuilder &B,
3596  bool IsTyped,
3597  bool IsFormat) const {
3598  Register VData = MI.getOperand(1).getReg();
3599  LLT Ty = MRI.getType(VData);
3600  LLT EltTy = Ty.getScalarType();
3601  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3602  const LLT S32 = LLT::scalar(32);
3603 
3604  VData = fixStoreSourceType(B, VData, IsFormat);
3605  Register RSrc = MI.getOperand(2).getReg();
3606 
3607  MachineMemOperand *MMO = *MI.memoperands_begin();
3608  const int MemSize = MMO->getSize();
3609 
3610  unsigned ImmOffset;
3611  unsigned TotalOffset;
3612 
3613  // The typed intrinsics add an immediate after the registers.
3614  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3615 
3616  // The struct intrinsic variants add one additional operand over raw.
3617  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3618  Register VIndex;
3619  int OpOffset = 0;
3620  if (HasVIndex) {
3621  VIndex = MI.getOperand(3).getReg();
3622  OpOffset = 1;
3623  }
3624 
3625  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3626  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3627 
3628  unsigned Format = 0;
3629  if (IsTyped) {
3630  Format = MI.getOperand(5 + OpOffset).getImm();
3631  ++OpOffset;
3632  }
3633 
3634  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3635 
3636  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3637  if (TotalOffset != 0)
3638  MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3639 
3640  unsigned Opc;
3641  if (IsTyped) {
3642  Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3643  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3644  } else if (IsFormat) {
3645  Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3646  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3647  } else {
3648  switch (MemSize) {
3649  case 1:
3650  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3651  break;
3652  case 2:
3653  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3654  break;
3655  default:
3656  Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3657  break;
3658  }
3659  }
3660 
3661  if (!VIndex)
3662  VIndex = B.buildConstant(S32, 0).getReg(0);
3663 
3664  auto MIB = B.buildInstr(Opc)
3665  .addUse(VData) // vdata
3666  .addUse(RSrc) // rsrc
3667  .addUse(VIndex) // vindex
3668  .addUse(VOffset) // voffset
3669  .addUse(SOffset) // soffset
3670  .addImm(ImmOffset); // offset(imm)
3671 
3672  if (IsTyped)
3673  MIB.addImm(Format);
3674 
3675  MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3676  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3677  .addMemOperand(MMO);
3678 
3679  MI.eraseFromParent();
3680  return true;
3681 }
3682 
3683 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3684  MachineRegisterInfo &MRI,
3685  MachineIRBuilder &B,
3686  bool IsFormat,
3687  bool IsTyped) const {
3688  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3689  MachineMemOperand *MMO = *MI.memoperands_begin();
3690  const int MemSize = MMO->getSize();
3691  const LLT S32 = LLT::scalar(32);
3692 
3693  Register Dst = MI.getOperand(0).getReg();
3694  Register RSrc = MI.getOperand(2).getReg();
3695 
3696  // The typed intrinsics add an immediate after the registers.
3697  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3698 
3699  // The struct intrinsic variants add one additional operand over raw.
3700  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3701  Register VIndex;
3702  int OpOffset = 0;
3703  if (HasVIndex) {
3704  VIndex = MI.getOperand(3).getReg();
3705  OpOffset = 1;
3706  }
3707 
3708  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3709  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3710 
3711  unsigned Format = 0;
3712  if (IsTyped) {
3713  Format = MI.getOperand(5 + OpOffset).getImm();
3714  ++OpOffset;
3715  }
3716 
3717  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3718  unsigned ImmOffset;
3719  unsigned TotalOffset;
3720 
3721  LLT Ty = MRI.getType(Dst);
3722  LLT EltTy = Ty.getScalarType();
3723  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3724  const bool Unpacked = ST.hasUnpackedD16VMem();
3725 
3726  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3727  if (TotalOffset != 0)
3728  MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3729 
3730  unsigned Opc;
3731 
3732  if (IsTyped) {
3733  Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3734  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3735  } else if (IsFormat) {
3736  Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3737  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3738  } else {
3739  switch (MemSize) {
3740  case 1:
3741  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3742  break;
3743  case 2:
3744  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3745  break;
3746  default:
3747  Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3748  break;
3749  }
3750  }
3751 
3752  Register LoadDstReg;
3753 
3754  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3755  LLT UnpackedTy = Ty.changeElementSize(32);
3756 
3757  if (IsExtLoad)
3758  LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3759  else if (Unpacked && IsD16 && Ty.isVector())
3760  LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3761  else
3762  LoadDstReg = Dst;
3763 
3764  if (!VIndex)
3765  VIndex = B.buildConstant(S32, 0).getReg(0);
3766 
3767  auto MIB = B.buildInstr(Opc)
3768  .addDef(LoadDstReg) // vdata
3769  .addUse(RSrc) // rsrc
3770  .addUse(VIndex) // vindex
3771  .addUse(VOffset) // voffset
3772  .addUse(SOffset) // soffset
3773  .addImm(ImmOffset); // offset(imm)
3774 
3775  if (IsTyped)
3776  MIB.addImm(Format);
3777 
3778  MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3779  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3780  .addMemOperand(MMO);
3781 
3782  if (LoadDstReg != Dst) {
3783  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3784 
3785  // The result of an extending load was widened above; truncate it back down.
3786  if (IsExtLoad)
3787  B.buildTrunc(Dst, LoadDstReg);
3788  else {
3789  // Repack to original 16-bit vector result
3790  // FIXME: G_TRUNC should work, but legalization currently fails
3791  auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3792  SmallVector<Register, 4> Repack;
3793  for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3794  Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3795  B.buildMerge(Dst, Repack);
3796  }
3797  }
3798 
3799  MI.eraseFromParent();
3800  return true;
3801 }
3802 
3803 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3804  MachineIRBuilder &B,
3805  bool IsInc) const {
3806  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3807  AMDGPU::G_AMDGPU_ATOMIC_DEC;
3808  B.buildInstr(Opc)
3809  .addDef(MI.getOperand(0).getReg())
3810  .addUse(MI.getOperand(2).getReg())
3811  .addUse(MI.getOperand(3).getReg())
3812  .cloneMemRefs(MI);
3813  MI.eraseFromParent();
3814  return true;
3815 }
3816 
3817 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3818  switch (IntrID) {
3819  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3820  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3821  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3822  case Intrinsic::amdgcn_raw_buffer_atomic_add:
3823  case Intrinsic::amdgcn_struct_buffer_atomic_add:
3824  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3825  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3826  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3827  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3828  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3829  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3830  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3831  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3832  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3833  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3834  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3835  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3836  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3837  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3838  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3839  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3840  case Intrinsic::amdgcn_raw_buffer_atomic_and:
3841  case Intrinsic::amdgcn_struct_buffer_atomic_and:
3842  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3843  case Intrinsic::amdgcn_raw_buffer_atomic_or:
3844  case Intrinsic::amdgcn_struct_buffer_atomic_or:
3845  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3846  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3847  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3848  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3849  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3850  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3851  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3852  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3853  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3854  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3855  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3856  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3857  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3858  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3859  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3860  return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3861  default:
3862  llvm_unreachable("unhandled atomic opcode");
3863  }
3864 }
3865 
3866 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3867  MachineIRBuilder &B,
3868  Intrinsic::ID IID) const {
3869  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3870  IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3871  const bool HasReturn = MI.getNumExplicitDefs() != 0;
3872 
3873  Register Dst;
3874 
3875  int OpOffset = 0;
3876  if (HasReturn) {
3877  // A few FP atomics do not support return values.
3878  Dst = MI.getOperand(0).getReg();
3879  } else {
3880  OpOffset = -1;
3881  }
3882 
3883  Register VData = MI.getOperand(2 + OpOffset).getReg();
3884  Register CmpVal;
3885 
3886  if (IsCmpSwap) {
3887  CmpVal = MI.getOperand(3 + OpOffset).getReg();
3888  ++OpOffset;
3889  }
3890 
3891  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3892  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3893 
3894  // The struct intrinsic variants add one additional operand over raw.
3895  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3896  Register VIndex;
3897  if (HasVIndex) {
3898  VIndex = MI.getOperand(4 + OpOffset).getReg();
3899  ++OpOffset;
3900  }
3901 
3902  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3903  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3904  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3905 
3906  MachineMemOperand *MMO = *MI.memoperands_begin();
3907 
3908  unsigned ImmOffset;
3909  unsigned TotalOffset;
3910  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3911  if (TotalOffset != 0)
3912  MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3913 
3914  if (!VIndex)
3915  VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3916 
3917  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3918 
3919  if (HasReturn)
3920  MIB.addDef(Dst);
3921 
3922  MIB.addUse(VData); // vdata
3923 
3924  if (IsCmpSwap)
3925  MIB.addReg(CmpVal);
3926 
3927  MIB.addUse(RSrc) // rsrc
3928  .addUse(VIndex) // vindex
3929  .addUse(VOffset) // voffset
3930  .addUse(SOffset) // soffset
3931  .addImm(ImmOffset) // offset(imm)
3932  .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
3933  .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3934  .addMemOperand(MMO);
3935 
3936  MI.eraseFromParent();
3937  return true;
3938 }
3939 
3940 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
3941 /// vector with s16 typed elements.
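/// Operands before the gradient section are simply bitcast to <2 x s16>; the
/// gradient and coordinate operands are packed pairwise, and an odd leftover
/// element (e.g. the last derivative of an odd-sized gradient set) is padded
/// with undef.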
3942 static void packImageA16AddressToDwords(
3943  MachineIRBuilder &B, MachineInstr &MI,
3944  SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
3945  const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
3946  const LLT S16 = LLT::scalar(16);
3947  const LLT V2S16 = LLT::vector(2, 16);
3948 
3949  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
3950  MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
3951  if (!SrcOp.isReg())
3952  continue; // _L to _LZ may have eliminated this.
3953 
3954  Register AddrReg = SrcOp.getReg();
3955 
3956  if (I < Intr->GradientStart) {
3957  AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3958  PackedAddrs.push_back(AddrReg);
3959  } else {
3960  // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3961  // derivatives dx/dh and dx/dv are packed with undef.
3962  if (((I + 1) >= EndIdx) ||
3963  ((Intr->NumGradients / 2) % 2 == 1 &&
3964  (I == static_cast<unsigned>(Intr->GradientStart +
3965  (Intr->NumGradients / 2) - 1) ||
3966  I == static_cast<unsigned>(Intr->GradientStart +
3967  Intr->NumGradients - 1))) ||
3968  // Check for _L to _LZ optimization
3969  !MI.getOperand(ArgOffset + I + 1).isReg()) {
3970  PackedAddrs.push_back(
3971  B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3972  .getReg(0));
3973  } else {
3974  PackedAddrs.push_back(
3975  B.buildBuildVector(
3976  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
3977  .getReg(0));
3978  ++I;
3979  }
3980  }
3981  }
3982 }
3983 
3984 /// Convert from separate vaddr components to a single vector address register,
3985 /// and replace the remaining operands with $noreg.
3986 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3987  int DimIdx, int NumVAddrs) {
3988  const LLT S32 = LLT::scalar(32);
3989 
3990  SmallVector<Register, 8> AddrRegs;
3991  for (int I = 0; I != NumVAddrs; ++I) {
3992  MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3993  if (SrcOp.isReg()) {
3994  AddrRegs.push_back(SrcOp.getReg());
3995  assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3996  }
3997  }
3998 
3999  int NumAddrRegs = AddrRegs.size();
4000  if (NumAddrRegs != 1) {
4001  // Round up to 8 elements for v5-v7
4002  // FIXME: Missing intermediate sized register classes and instructions.
4003  if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4004  const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4005  auto Undef = B.buildUndef(S32);
4006  AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4007  NumAddrRegs = RoundedNumRegs;
4008  }
4009 
4010  auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4011  MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4012  }
4013 
4014  for (int I = 1; I != NumVAddrs; ++I) {
4015  MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4016  if (SrcOp.isReg())
4017  MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4018  }
4019 }
4020 
4021 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4022 ///
4023 /// Depending on the subtarget, load/store with 16-bit element data need to be
4024 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4025 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4026 /// registers.
4027 ///
4028 /// We don't want to directly select image instructions just yet, but also want
4029 /// to expose all register repacking to the legalizer/combiners. We also don't
4030 /// want a selected instruction entering RegBankSelect. In order to avoid
4031 /// defining a multitude of intermediate image instructions, directly hack on
4032 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4033 /// now unnecessary arguments with $noreg.
4034 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4035  MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4036  const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4037 
4038  const unsigned NumDefs = MI.getNumExplicitDefs();
4039  const unsigned ArgOffset = NumDefs + 1;
4040  bool IsTFE = NumDefs == 2;
4041  // We are only processing the operands of d16 image operations on subtargets
4042  // that use the unpacked register layout, or need to repack the TFE result.
4043 
4044  // TODO: Do we need to guard against already legalized intrinsics?
4045  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4046  AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4047 
4048  MachineRegisterInfo *MRI = B.getMRI();
4049  const LLT S32 = LLT::scalar(32);
4050  const LLT S16 = LLT::scalar(16);
4051  const LLT V2S16 = LLT::vector(2, 16);
4052 
4053  unsigned DMask = 0;
4054 
4055  // Check for 16 bit addresses and pack if true.
4056  LLT GradTy =
4057  MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4058  LLT AddrTy =
4059  MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4060  const bool IsG16 = GradTy == S16;
4061  const bool IsA16 = AddrTy == S16;
4062 
4063  int DMaskLanes = 0;
4064  if (!BaseOpcode->Atomic) {
4065  DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4066  if (BaseOpcode->Gather4) {
4067  DMaskLanes = 4;
4068  } else if (DMask != 0) {
4069  DMaskLanes = countPopulation(DMask);
4070  } else if (!IsTFE && !BaseOpcode->Store) {
4071  // If dmask is 0, this is a no-op load. This can be eliminated.
4072  B.buildUndef(MI.getOperand(0));
4073  MI.eraseFromParent();
4074  return true;
4075  }
4076  }
4077 
4078  Observer.changingInstr(MI);
4079  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4080 
4081  unsigned NewOpcode = NumDefs == 0 ?
4082  AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4083 
4084  // Track that we legalized this
4085  MI.setDesc(B.getTII().get(NewOpcode));
4086 
4087  // We expect to get an error flag since TFC is on and dmask is 0. Force
4088  // dmask to be at least 1, otherwise the instruction will fail.
4089  if (IsTFE && DMask == 0) {
4090  DMask = 0x1;
4091  DMaskLanes = 1;
4092  MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4093  }
4094 
4095  if (BaseOpcode->Atomic) {
4096  Register VData0 = MI.getOperand(2).getReg();
4097  LLT Ty = MRI->getType(VData0);
4098 
4099  // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4100  if (Ty.isVector())
4101  return false;
4102 
4103  if (BaseOpcode->AtomicX2) {
4104  Register VData1 = MI.getOperand(3).getReg();
4105  // The two values are packed in one register.
4106  LLT PackedTy = LLT::vector(2, Ty);
4107  auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4108  MI.getOperand(2).setReg(Concat.getReg(0));
4109  MI.getOperand(3).setReg(AMDGPU::NoRegister);
4110  }
4111  }
4112 
4113  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4114 
4115  // Optimize _L to _LZ when _L is zero
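 // (An lod that folds to zero, or a negative lod, selects the base mip level,
 // so the _lz form without the explicit lod operand is presumably equivalent.)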
4116  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4117  AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4118  const ConstantFP *ConstantLod;
4119 
4120  if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4121  m_GFCst(ConstantLod))) {
4122  if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4123  // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4124  const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4125  AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4126  Intr->Dim);
4127 
4128  // The starting indexes should remain in the same place.
4129  --CorrectedNumVAddrs;
4130 
4131  MI.getOperand(MI.getNumExplicitDefs())
4132  .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4133  MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4134  Intr = NewImageDimIntr;
4135  }
4136  }
4137  }
4138 
4139  // Optimize _mip away, when 'lod' is zero
4140  if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4141  int64_t ConstantLod;
4142  if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4143  m_ICst(ConstantLod))) {
4144  if (ConstantLod == 0) {
4145  // TODO: Change the intrinsic opcode and remove the operand instead of replacing
4146  // it with 0, as the _L to _LZ handling is done above.
4147  MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4148  --CorrectedNumVAddrs;
4149  }
4150  }
4151  }
4152 
4153  // Rewrite the addressing register layout before doing anything else.
4154  if (IsA16 || IsG16) {
4155  if (IsA16) {
4156  // Target must support the feature and gradients need to be 16 bit too
4157  if (!ST.hasA16() || !IsG16)
4158  return false;
4159  } else if (!ST.hasG16())
4160  return false;
4161 
4162  if (Intr->NumVAddrs > 1) {
4163  SmallVector<Register, 4> PackedRegs;
4164  // Don't compress addresses for G16
4165  const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
4166  packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
4167  PackEndIdx);
4168 
4169  if (!IsA16) {
4170  // Add uncompressed address
4171  for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
4172  int AddrReg = MI.getOperand(ArgOffset + I).getReg();
4173  assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4174  PackedRegs.push_back(AddrReg);
4175  }
4176  }
4177 
4178  // See also below in the non-a16 branch
4179  const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4180 
4181  if (!UseNSA && PackedRegs.size() > 1) {
4182  LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4183  auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4184  PackedRegs[0] = Concat.getReg(0);
4185  PackedRegs.resize(1);
4186  }
4187 
4188  const unsigned NumPacked = PackedRegs.size();
4189  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4190  MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4191  if (!SrcOp.isReg()) {
4192  assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4193  continue;
4194  }
4195 
4196  assert(SrcOp.getReg() != AMDGPU::NoRegister);
4197 
4198  if (I - Intr->VAddrStart < NumPacked)
4199  SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4200  else
4201  SrcOp.setReg(AMDGPU::NoRegister);
4202  }
4203  }
4204  } else {
4205  // If the register allocator cannot place the address registers contiguously
4206  // without introducing moves, then using the non-sequential address encoding
4207  // is always preferable, since it saves VALU instructions and is usually a
4208  // wash in terms of code size or even better.
4209  //
4210  // However, we currently have no way of hinting to the register allocator
4211  // that MIMG addresses should be placed contiguously when it is possible to
4212  // do so, so force non-NSA for the common 2-address case as a heuristic.
4213  //
4214  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4215  // allocation when possible.
4216  const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4217 
4218  if (!UseNSA && Intr->NumVAddrs > 1)
4219  convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4220  Intr->NumVAddrs);
4221  }
4222 
4223  int Flags = 0;
4224  if (IsA16)
4225  Flags |= 1;
4226  if (IsG16)
4227  Flags |= 2;
4228  MI.addOperand(MachineOperand::CreateImm(Flags));
4229 
4230  if (BaseOpcode->Store) { // No TFE for stores?
4231  // TODO: Handle dmask trim
4232  Register VData = MI.getOperand(1).getReg();
4233  LLT Ty = MRI->getType(VData);
4234  if (!Ty.isVector() || Ty.getElementType() != S16)
4235  return true;
4236 
4237  Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4238  if (RepackedReg != VData) {
4239  MI.getOperand(1).setReg(RepackedReg);
4240  }
4241 
4242  return true;
4243  }
4244 
4245  Register DstReg = MI.getOperand(0).getReg();
4246  LLT Ty = MRI->getType(DstReg);
4247  const LLT EltTy = Ty.getScalarType();
4248  const bool IsD16 = Ty.getScalarType() == S16;
4249  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4250 
4251  // Confirm that the return type is large enough for the dmask specified
4252  if (NumElts < DMaskLanes)
4253  return false;
4254 
4255  if (NumElts > 4 || DMaskLanes > 4)
4256  return false;
4257 
4258  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4259  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4260 
4261  // The raw dword aligned data component of the load. The only legal cases
4262  // where this matters should be when using the packed D16 format, for
4263  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
4264  LLT RoundedTy;
4265 
4266  // S32 vector to cover all data, plus the TFE result element.
4267  LLT TFETy;
4268 
4269  // Register type to use for each loaded component. Will be S32 or V2S16.
4270  LLT RegTy;
4271 
4272  if (IsD16 && ST.hasUnpackedD16VMem()) {
4273  RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4274  TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4275  RegTy = S32;
4276  } else {
4277  unsigned EltSize = EltTy.getSizeInBits();
4278  unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4279  unsigned RoundedSize = 32 * RoundedElts;
4280  RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4281  TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4282  RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4283  }
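 // For example, a packed d16 load with 3 active dmask lanes ends up with
 // RoundedTy = <4 x s16>, TFETy = <3 x s32> and RegTy = <2 x s16>
 // (or s32 when a TFE result has to be split out).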
4284 
4285  // The return type does not need adjustment.
4286  // TODO: Should we change s16 case to s32 or <2 x s16>?
4287  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4288  return true;
4289 
4290  Register Dst1Reg;
4291 
4292  // Insert after the instruction.
4293  B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4294 
4295  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4296  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4297  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4298  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4299 
4300  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4301 
4302  MI.getOperand(0).setReg(NewResultReg);
4303 
4304  // In the IR, TFE is supposed to be used with a 2 element struct return
4305  // type. The instruction really returns these two values in one contiguous
4306  // register, with one additional dword beyond the loaded data. Rewrite the
4307  // return type to use a single register result.
4308 
4309  if (IsTFE) {
4310  Dst1Reg = MI.getOperand(1).getReg();
4311  if (MRI->getType(Dst1Reg) != S32)
4312  return false;
4313 
4314  // TODO: Make sure the TFE operand bit is set.
4315  MI.RemoveOperand(1);
4316 
4317  // Handle the easy case that requires no repack instructions.
4318  if (Ty == S32) {
4319  B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4320  return true;
4321  }
4322  }
4323 
4324  // Now figure out how to copy the new result register back into the old
4325  // result.
4326  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4327 
4328  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4329 
4330  if (ResultNumRegs == 1) {
4331  assert(!IsTFE);
4332  ResultRegs[0] = NewResultReg;
4333  } else {
4334  // We have to repack into a new vector of some kind.
4335  for (int I = 0; I != NumDataRegs; ++I)
4336  ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4337  B.buildUnmerge(ResultRegs, NewResultReg);
4338 
4339  // Drop the final TFE element to get the data part. The TFE result is
4340  // directly written to the right place already.
4341  if (IsTFE)
4342  ResultRegs.resize(NumDataRegs);
4343  }
4344 
4345  // For an s16 scalar result, we form an s32 result with a truncate regardless
4346  // of packed vs. unpacked.
4347  if (IsD16 && !Ty.isVector()) {
4348  B.buildTrunc(DstReg, ResultRegs[0]);
4349  return true;
4350  }
4351 
4352  // Avoid a build/concat_vector of 1 entry.
4353  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4354  B.buildBitcast(DstReg, ResultRegs[0]);
4355  return true;
4356  }
4357 
4358  assert(Ty.isVector());
4359 
4360  if (IsD16) {
4361  // For packed D16 results with TFE enabled, all the data components are
4362  // S32. Cast back to the expected type.
4363  //
4364  // TODO: We don't really need to load s32 elements. We would only need one
4365  // cast for the TFE result if a multiple of v2s16 was used.
4366  if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4367  for (Register &Reg : ResultRegs)
4368  Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4369  } else if (ST.hasUnpackedD16VMem()) {
4370  for (Register &Reg : ResultRegs)
4371  Reg = B.buildTrunc(S16, Reg).getReg(0);
4372  }
4373  }
4374 
4375  auto padWithUndef = [&](LLT Ty, int NumElts) {
4376  if (NumElts == 0)
4377  return;
4378  Register Undef = B.buildUndef(Ty).getReg(0);
4379  for (int I = 0; I != NumElts; ++I)
4380  ResultRegs.push_back(Undef);
4381  };
4382 
4383  // Pad out any elements eliminated due to the dmask.
4384  LLT ResTy = MRI->getType(ResultRegs[0]);
4385  if (!ResTy.isVector()) {
4386  padWithUndef(ResTy, NumElts - ResultRegs.size());
4387  B.buildBuildVector(DstReg, ResultRegs);
4388  return true;
4389  }
4390 
4391  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4392  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4393 
4394  // Deal with the one annoying legal case.
4395  const LLT V3S16 = LLT::vector(3, 16);
4396  if (Ty == V3S16) {
4397  padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4398  auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4399  B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4400  return true;
4401  }
4402 
4403  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4404  B.buildConcatVectors(DstReg, ResultRegs);
4405  return true;
4406 }
4407 
4408 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4409  LegalizerHelper &Helper, MachineInstr &MI) const {
4410  MachineIRBuilder &B = Helper.MIRBuilder;
4411  GISelChangeObserver &Observer = Helper.Observer;
4412 
4413  Register Dst = MI.getOperand(0).getReg();
4414  LLT Ty = B.getMRI()->getType(Dst);
4415  unsigned Size = Ty.getSizeInBits();
4416  MachineFunction &MF = B.getMF();
4417 
4418  Observer.changingInstr(MI);
4419 
4420  if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4421  Ty = getBitcastRegisterType(Ty);
4422  Helper.bitcastDst(MI, Ty, 0);
4423  Dst = MI.getOperand(0).getReg();
4424  B.setInsertPt(B.getMBB(), MI);
4425  }
4426 
4427  // FIXME: We don't really need this intermediate instruction. The intrinsic
4428  // should be fixed to have a memory operand. Since it's readnone, we're not
4429  // allowed to add one.
4430  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4431  MI.RemoveOperand(1); // Remove intrinsic ID
4432 
4433  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4434  // TODO: Should this use datalayout alignment?
4435  const unsigned MemSize = (Size + 7) / 8;
4436  const Align MemAlign(4);
4437  MachineMemOperand *MMO = MF.getMachineMemOperand(
4438  MachinePointerInfo(),
4439  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4440  MachineMemOperand::MOInvariant,
4441  MemSize, MemAlign);
4442  MI.addMemOperand(MF, MMO);
4443 
4444  // There are no 96-bit result scalar loads, but widening to 128-bit should
4445  // always be legal. We may need to restore this to a 96-bit result if it turns
4446  // out this needs to be converted to a vector load during RegBankSelect.
4447  if (!isPowerOf2_32(Size)) {
4448  if (Ty.isVector())
4449  Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4450  else
4451  Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4452  }
4453 
4454  Observer.changedInstr(MI);
4455  return true;
4456 }
4457 
4458 // TODO: Move to selection
4459 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4460  MachineRegisterInfo &MRI,
4461  MachineIRBuilder &B) const {
4462  // On a non-HSA path, or when the trap handler is disabled, just insert s_endpgm.
4463  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4464  !ST.isTrapHandlerEnabled()) {
4465  B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4466  } else {
4467  // Pass queue pointer to trap handler as input, and insert trap instruction
4468  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4469  MachineRegisterInfo &MRI = *B.getMRI();
4470 
4471  Register LiveIn =
4472  MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4473  if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4474  return false;
4475 
4476  Register SGPR01(AMDGPU::SGPR0_SGPR1);
4477  B.buildCopy(SGPR01, LiveIn);
4478  B.buildInstr(AMDGPU::S_TRAP)
4479  .addImm(GCNSubtarget::TrapIDLLVMTrap)
4480  .addReg(SGPR01, RegState::Implicit);
4481  }
4482 
4483  MI.eraseFromParent();
4484  return true;
4485 }
4486 
4487 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4488  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4489  // On a non-HSA path, or when the trap handler is disabled, report a warning
4490  // instead of emitting a trap.
4491  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4492  !ST.isTrapHandlerEnabled()) {
4493  DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4494  "debugtrap handler not supported",
4495  MI.getDebugLoc(), DS_Warning);
4496  LLVMContext &Ctx = B.getMF().getFunction().getContext();
4497  Ctx.diagnose(NoTrap);
4498  } else {
4499  // Insert debug-trap instruction
4500  B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4501  }
4502 
4503  MI.eraseFromParent();
4504  return true;
4505 }
4506 
4507 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4508  MachineIRBuilder &B) const {
4509  MachineRegisterInfo &MRI = *B.getMRI();
4510  const LLT S16 = LLT::scalar(16);
4511  const LLT S32 = LLT::scalar(32);
4512 
4513  Register DstReg = MI.getOperand(0).getReg();
4514  Register NodePtr = MI.getOperand(2).getReg();
4515  Register RayExtent = MI.getOperand(3).getReg();
4516  Register RayOrigin = MI.getOperand(4).getReg();
4517  Register RayDir = MI.getOperand(5).getReg();
4518  Register RayInvDir = MI.getOperand(6).getReg();
4519  Register TDescr = MI.getOperand(7).getReg();
4520 
4521  bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4522  bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4523  unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4524  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4525  : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4526  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4527 
4529  if (Is64) {
4530  auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4531  Ops.push_back(Unmerge.getReg(0));
4532  Ops.push_back(Unmerge.getReg(1));
4533  } else {
4534  Ops.push_back(NodePtr);
4535  }
4536  Ops.push_back(RayExtent);
4537 
4538  auto packLanes = [&Ops, &S32, &B] (Register Src) {
4539  auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4540  Ops.push_back(Unmerge.getReg(0));
4541  Ops.push_back(Unmerge.getReg(1));
4542  Ops.push_back(Unmerge.getReg(2));
4543  };
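 // Only the x, y and z lanes of the 4-element ray vectors are used; the fourth
 // lane is dropped here.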
4544 
4545  packLanes(RayOrigin);
4546  if (IsA16) {
4547  auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4548  auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4549  Register R1 = MRI.createGenericVirtualRegister(S32);
4550  Register R2 = MRI.createGenericVirtualRegister(S32);
4551  Register R3 = MRI.createGenericVirtualRegister(S32);
4552  B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4553  B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4554  B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4555  Ops.push_back(R1);
4556  Ops.push_back(R2);
4557  Ops.push_back(R3);
4558  } else {
4559  packLanes(RayDir);
4560  packLanes(RayInvDir);
4561  }
4562 
4563  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4564  .addDef(DstReg)
4565  .addImm(Opcode);
4566 
4567  for (Register R : Ops) {
4568  MIB.addUse(R);
4569  }
4570 
4571  MIB.addUse(TDescr)
4572  .addImm(IsA16 ? 1 : 0)
4573  .cloneMemRefs(MI);
4574 
4575  MI.eraseFromParent();
4576  return true;
4577 }
4578 
4579 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4580  MachineInstr &MI) const {
4581  MachineIRBuilder &B = Helper.MIRBuilder;
4582  MachineRegisterInfo &MRI = *B.getMRI();
4583 
4584  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4585  auto IntrID = MI.getIntrinsicID();
4586  switch (IntrID) {
4587  case Intrinsic::amdgcn_if:
4588  case Intrinsic::amdgcn_else: {
4589  MachineInstr *Br = nullptr;
4590  MachineBasicBlock *UncondBrTarget = nullptr;
4591  bool Negated = false;
4592  if (MachineInstr *BrCond =
4593  verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4594  const SIRegisterInfo *TRI
4595  = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4596 
4597  Register Def = MI.getOperand(1).getReg();
4598  Register Use = MI.getOperand(3).getReg();
4599 
4600  MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4601 
4602  if (Negated)
4603  std::swap(CondBrTarget, UncondBrTarget);
4604 
4605  B.setInsertPt(B.getMBB(), BrCond->getIterator());
4606  if (IntrID == Intrinsic::amdgcn_if) {
4607  B.buildInstr(AMDGPU::SI_IF)
4608  .addDef(Def)
4609  .addUse(Use)
4610  .addMBB(UncondBrTarget);
4611  } else {
4612  B.buildInstr(AMDGPU::SI_ELSE)
4613  .addDef(Def)
4614  .addUse(Use)
4615  .addMBB(UncondBrTarget);
4616  }
4617 
4618  if (Br) {
4619  Br->getOperand(0).setMBB(CondBrTarget);
4620  } else {
4621  // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4622  // since we're swapping branch targets it needs to be reinserted.
4623  // FIXME: IRTranslator should probably not do this
4624  B.buildBr(*CondBrTarget);
4625  }
4626 
4627  MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4628  MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4629  MI.eraseFromParent();
4630  BrCond->eraseFromParent();
4631  return true;
4632  }
4633 
4634  return false;
4635  }
4636  case Intrinsic::amdgcn_loop: {
4637  MachineInstr *Br = nullptr;
4638  MachineBasicBlock *UncondBrTarget = nullptr;
4639  bool Negated = false;
4640  if (MachineInstr *BrCond =
4641  verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4642  const SIRegisterInfo *TRI
4643  = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4644 
4645  MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4646  Register Reg = MI.getOperand(2).getReg();
4647 
4648  if (Negated)
4649  std::swap(CondBrTarget, UncondBrTarget);
4650 
4651  B.setInsertPt(B.getMBB(), BrCond->getIterator());
4652  B.buildInstr(AMDGPU::SI_LOOP)
4653  .addUse(Reg)
4654  .addMBB(UncondBrTarget);
4655 
4656  if (Br)
4657  Br->getOperand(0).setMBB(CondBrTarget);
4658  else
4659  B.buildBr(*CondBrTarget);
4660 
4661  MI.eraseFromParent();
4662  BrCond->eraseFromParent();
4663  MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4664  return true;
4665  }
4666 
4667  return false;
4668  }
4669  case Intrinsic::amdgcn_kernarg_segment_ptr:
4670  if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4671  // This only makes sense to call in a kernel, so just lower to null.
4672  B.buildConstant(MI.getOperand(0).getReg(), 0);
4673  MI.eraseFromParent();
4674  return true;
4675  }
4676 
4679  case Intrinsic::amdgcn_implicitarg_ptr:
4680  return legalizeImplicitArgPtr(MI, MRI, B);
4681  case Intrinsic::amdgcn_workitem_id_x:
4684  case Intrinsic::amdgcn_workitem_id_y:
4687  case Intrinsic::amdgcn_workitem_id_z:
4690  case Intrinsic::amdgcn_workgroup_id_x:
4693  case Intrinsic::amdgcn_workgroup_id_y:
4696  case Intrinsic::amdgcn_workgroup_id_z:
4699  case Intrinsic::amdgcn_dispatch_ptr:
4702  case Intrinsic::amdgcn_queue_ptr:
4705  case Intrinsic::amdgcn_implicit_buffer_ptr:
4708  case Intrinsic::amdgcn_dispatch_id:
4711  case Intrinsic::amdgcn_fdiv_fast:
4712  return legalizeFDIVFastIntrin(MI, MRI, B);
4713  case Intrinsic::amdgcn_is_shared:
4715  case Intrinsic::amdgcn_is_private:
4717  case Intrinsic::amdgcn_wavefrontsize: {
4718  B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4719  MI.eraseFromParent();
4720  return true;
4721  }
4722  case Intrinsic::amdgcn_s_buffer_load:
4723  return legalizeSBufferLoad(Helper, MI);
4724  case Intrinsic::amdgcn_raw_buffer_store:
4725  case Intrinsic::amdgcn_struct_buffer_store:
4726  return legalizeBufferStore(MI, MRI, B, false, false);
4727  case Intrinsic::amdgcn_raw_buffer_store_format:
4728  case Intrinsic::amdgcn_struct_buffer_store_format:
4729  return legalizeBufferStore(MI, MRI, B, false, true);
4730  case Intrinsic::amdgcn_raw_tbuffer_store:
4731  case Intrinsic::amdgcn_struct_tbuffer_store:
4732  return legalizeBufferStore(MI, MRI, B, true, true);
4733  case Intrinsic::amdgcn_raw_buffer_load:
4734  case Intrinsic::amdgcn_struct_buffer_load:
4735  return legalizeBufferLoad(MI, MRI, B, false, false);
4736  case Intrinsic::amdgcn_raw_buffer_load_format:
4737  case Intrinsic::amdgcn_struct_buffer_load_format:
4738  return legalizeBufferLoad(MI, MRI, B, true, false);
4739  case Intrinsic::amdgcn_raw_tbuffer_load:
4740  case Intrinsic::amdgcn_struct_tbuffer_load:
4741  return legalizeBufferLoad(MI, MRI, B, true, true);
4742  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4743  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4744  case Intrinsic::amdgcn_raw_buffer_atomic_add:
4745  case Intrinsic::amdgcn_struct_buffer_atomic_add:
4746  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4747  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4748  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4749  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4750  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4751  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4752  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4753  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4754  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4755  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4756  case Intrinsic::amdgcn_raw_buffer_atomic_and:
4757  case Intrinsic::amdgcn_struct_buffer_atomic_and:
4758  case Intrinsic::amdgcn_raw_buffer_atomic_or:
4759  case Intrinsic::amdgcn_struct_buffer_atomic_or:
4760  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4761  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4762  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4763  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4764  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4765  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4766  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4767  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4768  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4769  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4770  return legalizeBufferAtomic(MI, B, IntrID);
4771  case Intrinsic::amdgcn_atomic_inc:
4772  return legalizeAtomicIncDec(MI, B, true);
4773  case Intrinsic::amdgcn_atomic_dec:
4774  return legalizeAtomicIncDec(MI, B, false);
4775  case Intrinsic::trap:
4776  return legalizeTrapIntrinsic(MI, MRI, B);
4777  case Intrinsic::debugtrap:
4778  return legalizeDebugTrapIntrinsic(MI, MRI, B);
4779  case Intrinsic::amdgcn_rsq_clamp:
4780  return legalizeRsqClampIntrinsic(MI, MRI, B);
4781  case Intrinsic::amdgcn_ds_fadd:
4782  case Intrinsic::amdgcn_ds_fmin:
4783  case Intrinsic::amdgcn_ds_fmax:
4784  return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4785  case Intrinsic::amdgcn_image_bvh_intersect_ray:
4786  return legalizeBVHIntrinsic(MI, B);
4787  default: {
4788  if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4790  return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4791  return true;
4792  }
4793  }
4794 
4795  return true;
4796 }
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition: MathExtras.h:610
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
bool shouldEmitFixup(const GlobalValue *GV) const
static LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition: Utils.cpp:504
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
Diagnostic information for unsupported feature in backend.
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:22
MachineBasicBlock * getMBB() const
LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
static bool hasDefinedInitializer(const GlobalValue *GV)
SI Whole Quad Mode
Optional< ValueAndVReg > getConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true, bool HandleFConstants=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_F/CONSTANT (LookThro...
Definition: Utils.cpp:278
This class represents lattice values for constants.
Definition: AllocatorList.h:23
DEBUG_TYPE to vector
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool legalizeTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Not emitted register (e.g. carry, or temporary result).
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
unsigned getScalarSizeInBits() const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
Address space for constant memory (VTX2).
Definition: AMDGPU.h:368
Address space for local memory.
Definition: AMDGPU.h:369
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
bool isScalar() const
static bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:65
bool hasFractBug() const
unsigned Reg
constexpr float ln10f
Definition: MathExtras.h:76
bool hasTrigReducedRange() const
float BitsToFloat(uint32_t Bits)
This function takes a 32-bit integer and returns the bit equivalent float.
Definition: MathExtras.h:644
LLT getScalarType() const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
uint64_t getSize() const
Return the size in bytes of the memory reference.
LLVM_NODISCARD detail::scope_exit< typename std::decay< Callable >::type > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:58
const SIInstrInfo * getInstrInfo() const override
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx)
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
unsigned const TargetRegisterInfo * TRI
LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
#define R2(n)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalizeRuleSet & scalarize(unsigned TypeIdx)
bool legalizeDebugTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_READONLY const MIMGLZMappingInfo * getMIMGLZMappingInfo(unsigned L)
const ImageDimIntrinsicInfo * getImageDimInstrinsicByBaseOpcode(unsigned BaseOpcode, unsigned Dim)
Function & getFunction()
Return the LLVM function that this machine code represents.
void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
Value of the register doesn't matter.
int64_t getImm() const
#define FP_DENORM_FLUSH_NONE
Definition: SIDefines.h:835
bool isVector() const
static LLT getPow2ScalarType(LLT Ty)
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
A description of a memory reference used in the backend.
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, bool IsFormat) const
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
ArrayRef< T > makeArrayRef(const T &OneElt)
Construct an ArrayRef from a single element.
Definition: ArrayRef.h:458
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
A Use represents the edge between a Value definition and its users.
Definition: Use.h:44
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:43
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
Bitwise or logical AND of integers.
LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
The memory access is dereferenceable (i.e., doesn't trap).
unsigned Intr
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:475
bool isTrapHandlerEnabled() const
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
Address space for 32-bit constant memory.
Definition: AMDGPU.h:372
constexpr double inv_pi
Definition: MathExtras.h:65
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Align commonAlignment(Align A, Align B)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:221
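A standalone sketch of the guarantee commonAlignment documents for two Align values: since alignments are powers of two, the smaller of the two is already satisfied by both, so it is the common alignment. This mirrors the documented behavior with plain integers rather than the Align type.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Local stand-in: the common alignment of two power-of-two alignments is
// the smaller one (every 16-byte aligned address is also 4-byte aligned).
static uint64_t commonAlignExample(uint64_t A, uint64_t B) {
  return std::min(A, B);
}

int main() {
  assert(commonAlignExample(16, 4) == 4);
  assert(commonAlignExample(8, 8) == 8);
}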
zlib style compression
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:731
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const unsigned MemSizeInBits)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool hasExternalLinkage() const
Definition: GlobalValue.h:431
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static bool isRegisterType(LLT Ty)
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:412
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1643
LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than the second type index.
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg)
Returns base register and constant offset.
bool legalizeAtomicIncDec(MachineInstr &MI, MachineIRBuilder &B, bool IsInc) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool isNegative() const
Return true if the sign bit is set.
Definition: Constants.h:319
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, AMDGPU::SIModeRegisterDefaults Mode)
LegalizeResult lowerShuffleVector(MachineInstr &MI)
unsigned greater or equal
Definition: InstrTypes.h:746
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
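A sketch of how all() conjoins two predicates: the hypothetical rule below declares the operation legal only when type index 0 is a pointer in any address space and type index 1 is exactly s64 (the operand layout of G_PTR_ADD). This is an illustration, not a rule taken from this file.

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
using namespace llvm;
using namespace LegalityPredicates;

static void addExamplePtrAddRule(LegalizerInfo &LI) {
  // Legal only for (pointer, s64) operand pairs; any other combination
  // would need some other action supplied elsewhere.
  LI.getActionDefinitionsBuilder(TargetOpcode::G_PTR_ADD)
      .legalIf(all(isPointer(0), typeIs(1, LLT::scalar(64))));
}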
MachineFunction & getMF()
Getter for the function we currently build.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition: APInt.h:32
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
static Error unsupported(const char *Str, const Triple &T)
Definition: MachO.cpp:71
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:166
static LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
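For orientation, a small sketch of constructing the low-level types these rules operate on; the concrete widths and the address space are illustrative values only.

#include "llvm/Support/LowLevelTypeImpl.h"
using namespace llvm;

static void exampleTypes() {
  const LLT S32 = LLT::scalar(32);      // plain 32-bit "bag of bits"
  const LLT V2S16 = LLT::vector(2, 16); // <2 x s16>
  const LLT P1 = LLT::pointer(1, 64);   // 64-bit pointer in address space 1
  (void)S32; (void)V2S16; (void)P1;     // silence unused-variable warnings
}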
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
void legalizeUDIV_UREM64Impl(MachineIRBuilder &B, Register DstReg, Register Numer, Register Denom, bool IsDiv) const
Address space for private memory.
Definition: AMDGPU.h:370
Register widenWithUnmerge(LLT WideTy, Register OrigReg)
Widen OrigReg to WideTy by merging to a wider type, padding with G_IMPLICIT_DEF, and producing dead results.
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static constexpr int Concat[]
Analysis containing CSE Info
Definition: CSEInfo.cpp:25
uint32_t FloatToBits(float Float)
This function takes a float and returns the bit equivalent 32-bit integer.
Definition: MathExtras.h:664
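A standalone illustration of the reinterpretation FloatToBits performs; it mirrors the documented behavior with a local helper rather than calling the LLVM function, and the printed constants assume IEEE-754 single precision.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Local stand-in: copy the float's object representation into a 32-bit integer.
static uint32_t floatToBitsExample(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits;
}

int main() {
  std::printf("%#010x\n", (unsigned)floatToBitsExample(1.0f)); // 0x3f800000
  std::printf("%#010x\n", (unsigned)floatToBitsExample(0.5f)); // 0x3f000000
}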
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSDIV_SREM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:427
Abstract class that contains various methods for clients to notify about changes.
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than the second type index.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command line.
unsigned const MachineRegisterInfo * MRI
static LLT scalarOrVector(uint16_t NumElements, LLT ScalarTy)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:492
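A standalone sketch of the classic test isPowerOf2_32 documents: a value is a power of two exactly when it is non-zero and masking it with itself minus one clears every bit.

#include <cstdint>

// Local stand-in mirroring the documented behavior (true for powers of two > 0).
static constexpr bool isPow2Example(uint32_t V) {
  return V != 0 && (V & (V - 1)) == 0;
}

static_assert(isPow2Example(1) && isPow2Example(32), "powers of two");
static_assert(!isPow2Example(0) && !isPow2Example(24), "not powers of two");

int main() {}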
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:46
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
bool loadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const