LLVM 10.0.0svn
AMDGPULegalizerInfo.cpp
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
33 
34 #define DEBUG_TYPE "amdgpu-legalinfo"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
40 
41 
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43  unsigned MaxSize = 1024) {
44  return [=](const LegalityQuery &Query) {
45  const LLT Ty = Query.Types[TypeIdx];
46  const LLT EltTy = Ty.getScalarType();
47  return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48  };
49 }
50 
51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52  return [=](const LegalityQuery &Query) {
53  return Query.Types[TypeIdx].getSizeInBits() == Size;
54  };
55 }
56 
57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58  return [=](const LegalityQuery &Query) {
59  const LLT Ty = Query.Types[TypeIdx];
60  return Ty.isVector() &&
61  Ty.getNumElements() % 2 != 0 &&
62  Ty.getElementType().getSizeInBits() < 32 &&
63  Ty.getSizeInBits() % 32 != 0;
64  };
65 }
66 
67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68  return [=](const LegalityQuery &Query) {
69  const LLT Ty = Query.Types[TypeIdx];
70  const LLT EltTy = Ty.getScalarType();
71  return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
72  };
73 }
74 
75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76  return [=](const LegalityQuery &Query) {
77  const LLT Ty = Query.Types[TypeIdx];
78  const LLT EltTy = Ty.getElementType();
79  return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
80  };
81 }
82 
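// Split a vector so each resulting piece fits in 64 bits, e.g. a v4s32
// (128 bits) is rebuilt as v2s32 pieces; a single leftover element falls back
// to the plain scalar element type via scalarOrVector.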
83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84  return [=](const LegalityQuery &Query) {
85  const LLT Ty = Query.Types[TypeIdx];
86  const LLT EltTy = Ty.getElementType();
87  unsigned Size = Ty.getSizeInBits();
88  unsigned Pieces = (Size + 63) / 64;
89  unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90  return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
91  };
92 }
93 
94 // Increase the number of vector elements so the total size reaches the next
95 // multiple of 32 bits.
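// For example, v3s8 (24 bits) is padded to v4s8 (32 bits): NextMul32 is 1 and
// (32 * 1 + 7) / 8 rounds the element count up to 4.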
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97  return [=](const LegalityQuery &Query) {
98  const LLT Ty = Query.Types[TypeIdx];
99 
100  const LLT EltTy = Ty.getElementType();
101  const int Size = Ty.getSizeInBits();
102  const int EltSize = EltTy.getSizeInBits();
103  const int NextMul32 = (Size + 31) / 32;
104 
105  assert(EltSize < 32);
106 
107  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108  return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
109  };
110 }
111 
112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113  return [=](const LegalityQuery &Query) {
114  const LLT QueryTy = Query.Types[TypeIdx];
115  return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
116  };
117 }
118 
119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120  return [=](const LegalityQuery &Query) {
121  const LLT QueryTy = Query.Types[TypeIdx];
122  return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
123  };
124 }
125 
126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127  return [=](const LegalityQuery &Query) {
128  const LLT QueryTy = Query.Types[TypeIdx];
129  return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
130  };
131 }
132 
133 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
134 // v2s16.
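// For example, s64, v8s32 and v4s16 qualify as register types here, while s24
// or v3s16 do not.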
135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136  return [=](const LegalityQuery &Query) {
137  const LLT Ty = Query.Types[TypeIdx];
138  if (Ty.isVector()) {
139  const int EltSize = Ty.getElementType().getSizeInBits();
140  return EltSize == 32 || EltSize == 64 ||
141  (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
142  EltSize == 128 || EltSize == 256;
143  }
144 
145  return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
146  };
147 }
148 
149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150  return [=](const LegalityQuery &Query) {
151  return Query.Types[TypeIdx].getElementType() == Type;
152  };
153 }
154 
155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156  return [=](const LegalityQuery &Query) {
157  const LLT Ty = Query.Types[TypeIdx];
158  return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159  Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
160  };
161 }
162 
163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164  const GCNTargetMachine &TM)
165  : ST(ST_) {
166  using namespace TargetOpcode;
167 
168  auto GetAddrSpacePtr = [&TM](unsigned AS) {
169  return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
170  };
171 
172  const LLT S1 = LLT::scalar(1);
173  const LLT S8 = LLT::scalar(8);
174  const LLT S16 = LLT::scalar(16);
175  const LLT S32 = LLT::scalar(32);
176  const LLT S64 = LLT::scalar(64);
177  const LLT S96 = LLT::scalar(96);
178  const LLT S128 = LLT::scalar(128);
179  const LLT S256 = LLT::scalar(256);
180  const LLT S1024 = LLT::scalar(1024);
181 
182  const LLT V2S16 = LLT::vector(2, 16);
183  const LLT V4S16 = LLT::vector(4, 16);
184 
185  const LLT V2S32 = LLT::vector(2, 32);
186  const LLT V3S32 = LLT::vector(3, 32);
187  const LLT V4S32 = LLT::vector(4, 32);
188  const LLT V5S32 = LLT::vector(5, 32);
189  const LLT V6S32 = LLT::vector(6, 32);
190  const LLT V7S32 = LLT::vector(7, 32);
191  const LLT V8S32 = LLT::vector(8, 32);
192  const LLT V9S32 = LLT::vector(9, 32);
193  const LLT V10S32 = LLT::vector(10, 32);
194  const LLT V11S32 = LLT::vector(11, 32);
195  const LLT V12S32 = LLT::vector(12, 32);
196  const LLT V13S32 = LLT::vector(13, 32);
197  const LLT V14S32 = LLT::vector(14, 32);
198  const LLT V15S32 = LLT::vector(15, 32);
199  const LLT V16S32 = LLT::vector(16, 32);
200  const LLT V32S32 = LLT::vector(32, 32);
201 
202  const LLT V2S64 = LLT::vector(2, 64);
203  const LLT V3S64 = LLT::vector(3, 64);
204  const LLT V4S64 = LLT::vector(4, 64);
205  const LLT V5S64 = LLT::vector(5, 64);
206  const LLT V6S64 = LLT::vector(6, 64);
207  const LLT V7S64 = LLT::vector(7, 64);
208  const LLT V8S64 = LLT::vector(8, 64);
209  const LLT V16S64 = LLT::vector(16, 64);
210 
211  std::initializer_list<LLT> AllS32Vectors =
212  {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213  V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214  std::initializer_list<LLT> AllS64Vectors =
215  {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
216 
217  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
224 
225  const LLT CodePtr = FlatPtr;
226 
227  const std::initializer_list<LLT> AddrSpaces64 = {
228  GlobalPtr, ConstantPtr, FlatPtr
229  };
230 
231  const std::initializer_list<LLT> AddrSpaces32 = {
232  LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
233  };
234 
235  const std::initializer_list<LLT> FPTypesBase = {
236  S32, S64
237  };
238 
239  const std::initializer_list<LLT> FPTypes16 = {
240  S32, S64, S16
241  };
242 
243  const std::initializer_list<LLT> FPTypesPK16 = {
244  S32, S64, S16, V2S16
245  };
246 
247  setAction({G_BRCOND, S1}, Legal);
248 
249  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
250  // elements for v3s16
251  getActionDefinitionsBuilder(G_PHI)
252  .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
253  .legalFor(AllS32Vectors)
254  .legalFor(AllS64Vectors)
255  .legalFor(AddrSpaces64)
256  .legalFor(AddrSpaces32)
257  .clampScalar(0, S32, S256)
258  .widenScalarToNextPow2(0, 32)
259  .clampMaxNumElements(0, S32, 16)
260  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
261  .legalIf(isPointer(0));
262 
263  if (ST.has16BitInsts()) {
264  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
265  .legalFor({S32, S16})
266  .clampScalar(0, S16, S32)
267  .scalarize(0);
268  } else {
269  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
270  .legalFor({S32})
271  .clampScalar(0, S32, S32)
272  .scalarize(0);
273  }
274 
275  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
276  .legalFor({S32})
277  .clampScalar(0, S32, S32)
278  .scalarize(0);
279 
280  // Report legal for any types we can handle anywhere. For the cases only legal
281  // on the SALU, RegBankSelect will be able to re-legalize.
282  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
283  .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
284  .clampScalar(0, S32, S64)
285  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
286  .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
287  .widenScalarToNextPow2(0)
288  .scalarize(0);
289 
290  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
291  G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
292  .legalFor({{S32, S1}})
293  .clampScalar(0, S32, S32)
294  .scalarize(0); // TODO: Implement.
295 
296  getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
297  .lower();
298 
299  getActionDefinitionsBuilder(G_BITCAST)
300  // Don't worry about the size constraint.
301  .legalIf(all(isRegisterType(0), isRegisterType(1)))
302  // FIXME: Testing hack
303  .legalForCartesianProduct({S16, LLT::vector(2, 8), });
304 
305  getActionDefinitionsBuilder(G_FCONSTANT)
306  .legalFor({S32, S64, S16})
307  .clampScalar(0, S16, S64);
308 
309  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
310  .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
311  ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
312  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
313  .clampScalarOrElt(0, S32, S1024)
314  .legalIf(isMultiple32(0))
315  .widenScalarToNextPow2(0, 32)
316  .clampMaxNumElements(0, S32, 16);
317 
318 
319  // FIXME: i1 operands to intrinsics should always be legal, but other i1
320  // values may not be legal. We need to figure out how to distinguish
321  // between these two scenarios.
322  getActionDefinitionsBuilder(G_CONSTANT)
323  .legalFor({S1, S32, S64, S16, GlobalPtr,
324  LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
325  .clampScalar(0, S32, S64)
326  .widenScalarToNextPow2(0)
327  .legalIf(isPointer(0));
328 
329  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
330  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
331  .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
332 
333 
334  auto &FPOpActions = getActionDefinitionsBuilder(
335  { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
336  .legalFor({S32, S64});
337  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
338  .customFor({S32, S64});
339 
340  if (ST.has16BitInsts()) {
341  if (ST.hasVOP3PInsts())
342  FPOpActions.legalFor({S16, V2S16});
343  else
344  FPOpActions.legalFor({S16});
345 
346  TrigActions.customFor({S16});
347  }
348 
349  auto &MinNumMaxNum = getActionDefinitionsBuilder({
350  G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
351 
352  if (ST.hasVOP3PInsts()) {
353  MinNumMaxNum.customFor(FPTypesPK16)
354  .clampMaxNumElements(0, S16, 2)
355  .clampScalar(0, S16, S64)
356  .scalarize(0);
357  } else if (ST.has16BitInsts()) {
358  MinNumMaxNum.customFor(FPTypes16)
359  .clampScalar(0, S16, S64)
360  .scalarize(0);
361  } else {
362  MinNumMaxNum.customFor(FPTypesBase)
363  .clampScalar(0, S32, S64)
364  .scalarize(0);
365  }
366 
367  if (ST.hasVOP3PInsts())
368  FPOpActions.clampMaxNumElements(0, S16, 2);
369 
370  FPOpActions
371  .scalarize(0)
372  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
373 
374  TrigActions
375  .scalarize(0)
376  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
377 
378  getActionDefinitionsBuilder({G_FNEG, G_FABS})
379  .legalFor(FPTypesPK16)
380  .clampMaxNumElements(0, S16, 2)
381  .scalarize(0)
382  .clampScalar(0, S16, S64);
383 
384  // TODO: Implement
385  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
386 
387  if (ST.has16BitInsts()) {
388  getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
389  .legalFor({S32, S64, S16})
390  .scalarize(0)
391  .clampScalar(0, S16, S64);
392  } else {
393  getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
394  .legalFor({S32, S64})
395  .scalarize(0)
396  .clampScalar(0, S32, S64);
397  }
398 
399  getActionDefinitionsBuilder(G_FPTRUNC)
400  .legalFor({{S32, S64}, {S16, S32}})
401  .scalarize(0);
402 
403  getActionDefinitionsBuilder(G_FPEXT)
404  .legalFor({{S64, S32}, {S32, S16}})
405  .lowerFor({{S64, S16}}) // FIXME: Implement
406  .scalarize(0);
407 
408  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
409  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
410 
411  getActionDefinitionsBuilder(G_FSUB)
412  // Use actual fsub instruction
413  .legalFor({S32})
414  // Must use fadd + fneg
415  .lowerFor({S64, S16, V2S16})
416  .scalarize(0)
417  .clampScalar(0, S32, S64);
418 
419  // Whether this is legal depends on the floating point mode for the function.
420  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
421  if (ST.hasMadF16())
422  FMad.customFor({S32, S16});
423  else
424  FMad.customFor({S32});
425  FMad.scalarize(0)
426  .lower();
427 
428  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
429  .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
430  {S32, S1}, {S64, S1}, {S16, S1},
431  {S96, S32},
432  // FIXME: Hack
433  {S64, LLT::scalar(33)},
434  {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
435  .scalarize(0);
436 
437  // TODO: Split s1->s64 during regbankselect for VALU.
438  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
439  .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}})
440  .lowerFor({{S32, S64}})
441  .customFor({{S64, S64}});
442  if (ST.has16BitInsts())
443  IToFP.legalFor({{S16, S16}});
444  IToFP.clampScalar(1, S32, S64)
445  .scalarize(0);
446 
447  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
448  .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
449  if (ST.has16BitInsts())
450  FPToI.legalFor({{S16, S16}});
451  else
452  FPToI.minScalar(1, S32);
453 
454  FPToI.minScalar(0, S32)
455  .scalarize(0);
456 
457  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
458  .legalFor({S32, S64})
459  .scalarize(0);
460 
461  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
462  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
463  .legalFor({S32, S64})
464  .clampScalar(0, S32, S64)
465  .scalarize(0);
466  } else {
467  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
468  .legalFor({S32})
469  .customFor({S64})
470  .clampScalar(0, S32, S64)
471  .scalarize(0);
472  }
473 
474  getActionDefinitionsBuilder(G_GEP)
475  .legalForCartesianProduct(AddrSpaces64, {S64})
476  .legalForCartesianProduct(AddrSpaces32, {S32})
477  .scalarize(0);
478 
479  getActionDefinitionsBuilder(G_PTR_MASK)
480  .scalarize(0)
481  .alwaysLegal();
482 
483  setAction({G_BLOCK_ADDR, CodePtr}, Legal);
484 
485  auto &CmpBuilder =
486  getActionDefinitionsBuilder(G_ICMP)
487  .legalForCartesianProduct(
488  {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
489  .legalFor({{S1, S32}, {S1, S64}});
490  if (ST.has16BitInsts()) {
491  CmpBuilder.legalFor({{S1, S16}});
492  }
493 
494  CmpBuilder
495  .widenScalarToNextPow2(1)
496  .clampScalar(1, S32, S64)
497  .scalarize(0)
498  .legalIf(all(typeIs(0, S1), isPointer(1)));
499 
500  getActionDefinitionsBuilder(G_FCMP)
501  .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
502  .widenScalarToNextPow2(1)
503  .clampScalar(1, S32, S64)
504  .scalarize(0);
505 
506  // FIXME: fexp, flog2, flog10 needs to be custom lowered.
507  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
508  G_FLOG, G_FLOG2, G_FLOG10})
509  .legalFor({S32})
510  .scalarize(0);
511 
512  // The 64-bit versions produce 32-bit results, but only on the SALU.
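// The result type (index 0) is therefore clamped to s32, while the source
// type (index 1) may legally remain s64.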
513  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
514  G_CTTZ, G_CTTZ_ZERO_UNDEF,
515  G_CTPOP})
516  .legalFor({{S32, S32}, {S32, S64}})
517  .clampScalar(0, S32, S32)
518  .clampScalar(1, S32, S64)
519  .scalarize(0)
520  .widenScalarToNextPow2(0, 32)
521  .widenScalarToNextPow2(1, 32);
522 
523  // TODO: Expand for > s32
524  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
525  .legalFor({S32})
526  .clampScalar(0, S32, S32)
527  .scalarize(0);
528 
529  if (ST.has16BitInsts()) {
530  if (ST.hasVOP3PInsts()) {
531  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
532  .legalFor({S32, S16, V2S16})
533  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
534  .clampMaxNumElements(0, S16, 2)
535  .clampScalar(0, S16, S32)
536  .widenScalarToNextPow2(0)
537  .scalarize(0);
538  } else {
539  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
540  .legalFor({S32, S16})
541  .widenScalarToNextPow2(0)
542  .clampScalar(0, S16, S32)
543  .scalarize(0);
544  }
545  } else {
546  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
547  .legalFor({S32})
548  .clampScalar(0, S32, S32)
549  .widenScalarToNextPow2(0)
550  .scalarize(0);
551  }
552 
553  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
554  return [=](const LegalityQuery &Query) {
555  return Query.Types[TypeIdx0].getSizeInBits() <
556  Query.Types[TypeIdx1].getSizeInBits();
557  };
558  };
559 
560  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
561  return [=](const LegalityQuery &Query) {
562  return Query.Types[TypeIdx0].getSizeInBits() >
563  Query.Types[TypeIdx1].getSizeInBits();
564  };
565  };
566 
567  getActionDefinitionsBuilder(G_INTTOPTR)
568  // List the common cases
569  .legalForCartesianProduct(AddrSpaces64, {S64})
570  .legalForCartesianProduct(AddrSpaces32, {S32})
571  .scalarize(0)
572  // Accept any address space as long as the size matches
573  .legalIf(sameSize(0, 1))
574  .widenScalarIf(smallerThan(1, 0),
575  [](const LegalityQuery &Query) {
576  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
577  })
578  .narrowScalarIf(greaterThan(1, 0),
579  [](const LegalityQuery &Query) {
580  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
581  });
582 
583  getActionDefinitionsBuilder(G_PTRTOINT)
584  // List the common cases
585  .legalForCartesianProduct(AddrSpaces64, {S64})
586  .legalForCartesianProduct(AddrSpaces32, {S32})
587  .scalarize(0)
588  // Accept any address space as long as the size matches
589  .legalIf(sameSize(0, 1))
590  .widenScalarIf(smallerThan(0, 1),
591  [](const LegalityQuery &Query) {
592  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
593  })
594  .narrowScalarIf(
595  greaterThan(0, 1),
596  [](const LegalityQuery &Query) {
597  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
598  });
599 
600  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
601  .scalarize(0)
602  .custom();
603 
604  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
605  // handle some operations by just promoting the register during
606  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
607  auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
608  switch (AS) {
609  // FIXME: Private element size.
610  case AMDGPUAS::PRIVATE_ADDRESS:
611  return 32;
612  // FIXME: Check subtarget
613  case AMDGPUAS::LOCAL_ADDRESS:
614  return ST.useDS128() ? 128 : 64;
615 
616  // Treat constant and global as identical. SMRD loads are sometimes usable
617  // for global loads (ideally constant address space should be eliminated)
618  // depending on the context. Legality cannot be context dependent, but
619  // RegBankSelect can split the load as necessary depending on the pointer
620  // register bank/uniformity and if the memory is invariant or not written in
621  // a kernel.
622  case AMDGPUAS::GLOBAL_ADDRESS:
623  case AMDGPUAS::CONSTANT_ADDRESS:
624  return 512;
625  default:
626  return 128;
627  }
628  };
629 
630  const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
631  const LLT DstTy = Query.Types[0];
632 
633  // Split vector extloads.
634  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
635  if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
636  return true;
637 
638  const LLT PtrTy = Query.Types[1];
639  unsigned AS = PtrTy.getAddressSpace();
640  if (MemSize > maxSizeForAddrSpace(AS))
641  return true;
642 
643  // Catch weird sized loads that don't evenly divide into the access sizes
644  // TODO: May be able to widen depending on alignment etc.
645  unsigned NumRegs = MemSize / 32;
646  if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
647  return true;
648 
649  unsigned Align = Query.MMODescrs[0].AlignInBits;
650  if (Align < MemSize) {
651  const SITargetLowering *TLI = ST.getTargetLowering();
652  return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
653  }
654 
655  return false;
656  };
657 
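// In the memory descriptors below, the final field is the minimum alignment
// in bits; 0 means no alignment requirement, used when the subtarget allows
// unaligned buffer access.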
658  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
659  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
660  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
661 
662  // TODO: Refine based on subtargets which support unaligned access or 128-bit
663  // LDS
664  // TODO: Unsupported flat for SI.
665 
666  for (unsigned Op : {G_LOAD, G_STORE}) {
667  const bool IsStore = Op == G_STORE;
668 
669  auto &Actions = getActionDefinitionsBuilder(Op);
670  // Whitelist the common cases.
671  // TODO: Pointer loads
672  // TODO: Wide constant loads
673  // TODO: Only CI+ has 3x loads
674  // TODO: Loads to s16 on gfx9
675  Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
676  {V2S32, GlobalPtr, 64, GlobalAlign32},
677  {V3S32, GlobalPtr, 96, GlobalAlign32},
678  {S96, GlobalPtr, 96, GlobalAlign32},
679  {V4S32, GlobalPtr, 128, GlobalAlign32},
680  {S128, GlobalPtr, 128, GlobalAlign32},
681  {S64, GlobalPtr, 64, GlobalAlign32},
682  {V2S64, GlobalPtr, 128, GlobalAlign32},
683  {V2S16, GlobalPtr, 32, GlobalAlign32},
684  {S32, GlobalPtr, 8, GlobalAlign8},
685  {S32, GlobalPtr, 16, GlobalAlign16},
686 
687  {S32, LocalPtr, 32, 32},
688  {S64, LocalPtr, 64, 32},
689  {V2S32, LocalPtr, 64, 32},
690  {S32, LocalPtr, 8, 8},
691  {S32, LocalPtr, 16, 16},
692  {V2S16, LocalPtr, 32, 32},
693 
694  {S32, PrivatePtr, 32, 32},
695  {S32, PrivatePtr, 8, 8},
696  {S32, PrivatePtr, 16, 16},
697  {V2S16, PrivatePtr, 32, 32},
698 
699  {S32, FlatPtr, 32, GlobalAlign32},
700  {S32, FlatPtr, 16, GlobalAlign16},
701  {S32, FlatPtr, 8, GlobalAlign8},
702  {V2S16, FlatPtr, 32, GlobalAlign32},
703 
704  {S32, ConstantPtr, 32, GlobalAlign32},
705  {V2S32, ConstantPtr, 64, GlobalAlign32},
706  {V3S32, ConstantPtr, 96, GlobalAlign32},
707  {V4S32, ConstantPtr, 128, GlobalAlign32},
708  {S64, ConstantPtr, 64, GlobalAlign32},
709  {S128, ConstantPtr, 128, GlobalAlign32},
710  {V2S32, ConstantPtr, 32, GlobalAlign32}});
711  Actions
712  .customIf(typeIs(1, Constant32Ptr))
713  .narrowScalarIf(
714  [=](const LegalityQuery &Query) -> bool {
715  return !Query.Types[0].isVector() && needToSplitLoad(Query);
716  },
717  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
718  const LLT DstTy = Query.Types[0];
719  const LLT PtrTy = Query.Types[1];
720 
721  const unsigned DstSize = DstTy.getSizeInBits();
722  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
723 
724  // Split extloads.
725  if (DstSize > MemSize)
726  return std::make_pair(0, LLT::scalar(MemSize));
727 
728  if (DstSize > 32 && (DstSize % 32 != 0)) {
729  // FIXME: Need a way to specify non-extload of larger size if
730  // suitably aligned.
731  return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
732  }
733 
734  unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
735  if (MemSize > MaxSize)
736  return std::make_pair(0, LLT::scalar(MaxSize));
737 
738  unsigned Align = Query.MMODescrs[0].AlignInBits;
739  return std::make_pair(0, LLT::scalar(Align));
740  })
741  .fewerElementsIf(
742  [=](const LegalityQuery &Query) -> bool {
743  return Query.Types[0].isVector() && needToSplitLoad(Query);
744  },
745  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
746  const LLT DstTy = Query.Types[0];
747  const LLT PtrTy = Query.Types[1];
748 
749  LLT EltTy = DstTy.getElementType();
750  unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
751 
752  // Split if it's too large for the address space.
753  if (Query.MMODescrs[0].SizeInBits > MaxSize) {
754  unsigned NumElts = DstTy.getNumElements();
755  unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
756 
757  // FIXME: Refine when odd breakdowns handled
758  // The scalars will need to be re-legalized.
759  if (NumPieces == 1 || NumPieces >= NumElts ||
760  NumElts % NumPieces != 0)
761  return std::make_pair(0, EltTy);
762 
763  return std::make_pair(0,
764  LLT::vector(NumElts / NumPieces, EltTy));
765  }
766 
767  // Need to split because of alignment.
768  unsigned Align = Query.MMODescrs[0].AlignInBits;
769  unsigned EltSize = EltTy.getSizeInBits();
770  if (EltSize > Align &&
771  (EltSize / Align < DstTy.getNumElements())) {
772  return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
773  }
774 
775  // May need relegalization for the scalars.
776  return std::make_pair(0, EltTy);
777  })
778  .minScalar(0, S32);
779 
780  if (IsStore)
781  Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
782 
783  // TODO: Need a bitcast lower option?
784  Actions
785  .legalIf([=](const LegalityQuery &Query) {
786  const LLT Ty0 = Query.Types[0];
787  unsigned Size = Ty0.getSizeInBits();
788  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
789  unsigned Align = Query.MMODescrs[0].AlignInBits;
790 
791  // No extending vector loads.
792  if (Size > MemSize && Ty0.isVector())
793  return false;
794 
795  // FIXME: Widening store from alignment not valid.
796  if (MemSize < Size)
797  MemSize = std::max(MemSize, Align);
798 
799  switch (MemSize) {
800  case 8:
801  case 16:
802  return Size == 32;
803  case 32:
804  case 64:
805  case 128:
806  return true;
807  case 96:
808  return ST.hasDwordx3LoadStores();
809  case 256:
810  case 512:
811  return true;
812  default:
813  return false;
814  }
815  })
816  .widenScalarToNextPow2(0)
817  // TODO: v3s32->v4s32 with alignment
818  .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
819  }
820 
821  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
822  .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
823  {S32, GlobalPtr, 16, 2 * 8},
824  {S32, LocalPtr, 8, 8},
825  {S32, LocalPtr, 16, 16},
826  {S32, PrivatePtr, 8, 8},
827  {S32, PrivatePtr, 16, 16},
828  {S32, ConstantPtr, 8, 8},
829  {S32, ConstantPtr, 16, 2 * 8}});
830  if (ST.hasFlatAddressSpace()) {
831  ExtLoads.legalForTypesWithMemDesc(
832  {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
833  }
834 
835  ExtLoads.clampScalar(0, S32, S32)
836  .widenScalarToNextPow2(0)
837  .unsupportedIfMemSizeNotPow2()
838  .lower();
839 
840  auto &Atomics = getActionDefinitionsBuilder(
841  {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
842  G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
843  G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
844  G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
845  .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
846  {S64, GlobalPtr}, {S64, LocalPtr}});
847  if (ST.hasFlatAddressSpace()) {
848  Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
849  }
850 
851  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
852  .legalFor({{S32, LocalPtr}});
853 
854  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
855  .lower();
856 
857  // TODO: Pointer types, any 32-bit or 64-bit vector
858  getActionDefinitionsBuilder(G_SELECT)
859  .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
860  GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
861  LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
862  .clampScalar(0, S16, S64)
863  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
864  .fewerElementsIf(numElementsNotEven(0), scalarize(0))
865  .scalarize(1)
866  .clampMaxNumElements(0, S32, 2)
867  .clampMaxNumElements(0, LocalPtr, 2)
868  .clampMaxNumElements(0, PrivatePtr, 2)
869  .scalarize(0)
870  .widenScalarToNextPow2(0)
871  .legalIf(all(isPointer(0), typeIs(1, S1)));
872 
873  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
874  // be more flexible with the shift amount type.
875  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
876  .legalFor({{S32, S32}, {S64, S32}});
877  if (ST.has16BitInsts()) {
878  if (ST.hasVOP3PInsts()) {
879  Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
880  .clampMaxNumElements(0, S16, 2);
881  } else
882  Shifts.legalFor({{S16, S32}, {S16, S16}});
883 
884  Shifts.clampScalar(1, S16, S32);
885  Shifts.clampScalar(0, S16, S64);
886  Shifts.widenScalarToNextPow2(0, 16);
887  } else {
888  // Make sure we legalize the shift amount type first, as the general
889  // expansion for the shifted type will produce much worse code if it hasn't
890  // been truncated already.
891  Shifts.clampScalar(1, S32, S32);
892  Shifts.clampScalar(0, S32, S64);
893  Shifts.widenScalarToNextPow2(0, 32);
894  }
895  Shifts.scalarize(0);
896 
897  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
898  unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
899  unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
900  unsigned IdxTypeIdx = 2;
901 
902  getActionDefinitionsBuilder(Op)
903  .customIf([=](const LegalityQuery &Query) {
904  const LLT EltTy = Query.Types[EltTypeIdx];
905  const LLT VecTy = Query.Types[VecTypeIdx];
906  const LLT IdxTy = Query.Types[IdxTypeIdx];
907  return (EltTy.getSizeInBits() == 16 ||
908  EltTy.getSizeInBits() % 32 == 0) &&
909  VecTy.getSizeInBits() % 32 == 0 &&
910  VecTy.getSizeInBits() <= 1024 &&
911  IdxTy.getSizeInBits() == 32;
912  })
913  .clampScalar(EltTypeIdx, S32, S64)
914  .clampScalar(VecTypeIdx, S32, S64)
915  .clampScalar(IdxTypeIdx, S32, S32);
916  }
917 
918  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
919  .unsupportedIf([=](const LegalityQuery &Query) {
920  const LLT &EltTy = Query.Types[1].getElementType();
921  return Query.Types[0] != EltTy;
922  });
923 
924  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
925  unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
926  unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
927 
928  // FIXME: Doesn't handle extract of illegal sizes.
929  getActionDefinitionsBuilder(Op)
930  .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
931  // FIXME: Multiples of 16 should not be legal.
932  .legalIf([=](const LegalityQuery &Query) {
933  const LLT BigTy = Query.Types[BigTyIdx];
934  const LLT LitTy = Query.Types[LitTyIdx];
935  return (BigTy.getSizeInBits() % 32 == 0) &&
936  (LitTy.getSizeInBits() % 16 == 0);
937  })
938  .widenScalarIf(
939  [=](const LegalityQuery &Query) {
940  const LLT BigTy = Query.Types[BigTyIdx];
941  return (BigTy.getScalarSizeInBits() < 16);
942  },
943  LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
944  .widenScalarIf(
945  [=](const LegalityQuery &Query) {
946  const LLT LitTy = Query.Types[LitTyIdx];
947  return (LitTy.getScalarSizeInBits() < 16);
948  },
949  LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
950  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
951  .widenScalarToNextPow2(BigTyIdx, 32);
952 
953  }
954 
955  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
956  .legalForCartesianProduct(AllS32Vectors, {S32})
957  .legalForCartesianProduct(AllS64Vectors, {S64})
958  .clampNumElements(0, V16S32, V32S32)
959  .clampNumElements(0, V2S64, V16S64)
960  .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
961 
962  if (ST.hasScalarPackInsts())
963  BuildVector.legalFor({V2S16, S32});
964 
965  BuildVector
966  .minScalarSameAs(1, 0)
967  .legalIf(isRegisterType(0))
968  .minScalarOrElt(0, S32);
969 
970  if (ST.hasScalarPackInsts()) {
971  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
972  .legalFor({V2S16, S32})
973  .lower();
974  } else {
975  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
976  .lower();
977  }
978 
979  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
980  .legalIf(isRegisterType(0));
981 
982  // TODO: Don't fully scalarize v2s16 pieces
983  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
984 
985  // Merge/Unmerge
986  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
987  unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
988  unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
989 
990  auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
991  const LLT &Ty = Query.Types[TypeIdx];
992  if (Ty.isVector()) {
993  const LLT &EltTy = Ty.getElementType();
994  if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
995  return true;
996  if (!isPowerOf2_32(EltTy.getSizeInBits()))
997  return true;
998  }
999  return false;
1000  };
1001 
1002  auto &Builder = getActionDefinitionsBuilder(Op)
1003  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1004  // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1005  // worth considering the multiples of 64 since 2*192 and 2*384 are not
1006  // valid.
1007  .clampScalar(LitTyIdx, S16, S256)
1008  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1009  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1010  .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1011  elementTypeIs(1, S16)),
1012  changeTo(1, V2S16))
1013  // Break up vectors with weird elements into scalars
1014  .fewerElementsIf(
1015  [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1016  scalarize(0))
1017  .fewerElementsIf(
1018  [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1019  scalarize(1))
1020  .clampScalar(BigTyIdx, S32, S1024)
1021  .lowerFor({{S16, V2S16}});
1022 
1023  if (Op == G_MERGE_VALUES) {
1024  Builder.widenScalarIf(
1025  // TODO: Use 16-bit shifts if legal for 8-bit values?
1026  [=](const LegalityQuery &Query) {
1027  const LLT Ty = Query.Types[LitTyIdx];
1028  return Ty.getSizeInBits() < 32;
1029  },
1030  changeTo(LitTyIdx, S32));
1031  }
1032 
1033  Builder.widenScalarIf(
1034  [=](const LegalityQuery &Query) {
1035  const LLT Ty = Query.Types[BigTyIdx];
1036  return !isPowerOf2_32(Ty.getSizeInBits()) &&
1037  Ty.getSizeInBits() % 16 != 0;
1038  },
1039  [=](const LegalityQuery &Query) {
1040  // Pick the next power of 2, or a multiple of 64 over 128.
1041  // Whichever is smaller.
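// e.g. s65 is widened to s128, but s257 is widened to s320 (alignTo<64>(258))
// rather than all the way to s512.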
1042  const LLT &Ty = Query.Types[BigTyIdx];
1043  unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1044  if (NewSizeInBits >= 256) {
1045  unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1046  if (RoundedTo < NewSizeInBits)
1047  NewSizeInBits = RoundedTo;
1048  }
1049  return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1050  })
1051  .legalIf([=](const LegalityQuery &Query) {
1052  const LLT &BigTy = Query.Types[BigTyIdx];
1053  const LLT &LitTy = Query.Types[LitTyIdx];
1054 
1055  if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1056  return false;
1057  if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1058  return false;
1059 
1060  return BigTy.getSizeInBits() % 16 == 0 &&
1061  LitTy.getSizeInBits() % 16 == 0 &&
1062  BigTy.getSizeInBits() <= 1024;
1063  })
1064  // Any vectors left are the wrong size. Scalarize them.
1065  .scalarize(0)
1066  .scalarize(1);
1067  }
1068 
1069  getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1070 
1071  computeTables();
1072  verify(*ST.getInstrInfo());
1073 }
1074 
1075 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1076  MachineRegisterInfo &MRI,
1077  MachineIRBuilder &B,
1078  GISelChangeObserver &Observer) const {
1079  switch (MI.getOpcode()) {
1080  case TargetOpcode::G_ADDRSPACE_CAST:
1081  return legalizeAddrSpaceCast(MI, MRI, B);
1082  case TargetOpcode::G_FRINT:
1083  return legalizeFrint(MI, MRI, B);
1084  case TargetOpcode::G_FCEIL:
1085  return legalizeFceil(MI, MRI, B);
1086  case TargetOpcode::G_INTRINSIC_TRUNC:
1087  return legalizeIntrinsicTrunc(MI, MRI, B);
1088  case TargetOpcode::G_SITOFP:
1089  return legalizeITOFP(MI, MRI, B, true);
1090  case TargetOpcode::G_UITOFP:
1091  return legalizeITOFP(MI, MRI, B, false);
1092  case TargetOpcode::G_FMINNUM:
1093  case TargetOpcode::G_FMAXNUM:
1094  case TargetOpcode::G_FMINNUM_IEEE:
1095  case TargetOpcode::G_FMAXNUM_IEEE:
1096  return legalizeMinNumMaxNum(MI, MRI, B);
1097  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1098  return legalizeExtractVectorElt(MI, MRI, B);
1099  case TargetOpcode::G_INSERT_VECTOR_ELT:
1100  return legalizeInsertVectorElt(MI, MRI, B);
1101  case TargetOpcode::G_FSIN:
1102  case TargetOpcode::G_FCOS:
1103  return legalizeSinCos(MI, MRI, B);
1104  case TargetOpcode::G_GLOBAL_VALUE:
1105  return legalizeGlobalValue(MI, MRI, B);
1106  case TargetOpcode::G_LOAD:
1107  return legalizeLoad(MI, MRI, B, Observer);
1108  case TargetOpcode::G_FMAD:
1109  return legalizeFMad(MI, MRI, B);
1110  default:
1111  return false;
1112  }
1113 
1114  llvm_unreachable("expected switch to return");
1115 }
1116 
1117 Register AMDGPULegalizerInfo::getSegmentAperture(
1118  unsigned AS,
1119  MachineRegisterInfo &MRI,
1120  MachineIRBuilder &B) const {
1121  MachineFunction &MF = B.getMF();
1122  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1123  const LLT S32 = LLT::scalar(32);
1124 
1126 
1127  if (ST.hasApertureRegs()) {
1128  // FIXME: Use inline constants (src_{shared, private}_base) instead of
1129  // getreg.
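// The aperture base lives in a bitfield of the MEM_BASES hardware register:
// Offset/WidthM1 select the field for the shared (LDS) or private (scratch)
// aperture, and the left shift below scales the field back up to the full
// 32-bit aperture base used as the high half of a flat address.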
1130  unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1133  unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1136  unsigned Encoding =
1138  Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1139  WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1140 
1141  Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1142  Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1143 
1144  B.buildInstr(AMDGPU::S_GETREG_B32)
1145  .addDef(GetReg)
1146  .addImm(Encoding);
1147  MRI.setType(GetReg, S32);
1148 
1149  auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1150  B.buildInstr(TargetOpcode::G_SHL)
1151  .addDef(ApertureReg)
1152  .addUse(GetReg)
1153  .addUse(ShiftAmt.getReg(0));
1154 
1155  return ApertureReg;
1156  }
1157 
1158  Register QueuePtr = MRI.createGenericVirtualRegister(
1159  LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1160 
1161  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1162  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1163  return Register();
1164 
1165  // Offset into amd_queue_t for group_segment_aperture_base_hi /
1166  // private_segment_aperture_base_hi.
1167  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1168 
1169  // FIXME: Don't use undef
1170  Value *V = UndefValue::get(PointerType::get(
1171  Type::getInt8Ty(MF.getFunction().getContext()),
1172  AMDGPUAS::CONSTANT_ADDRESS));
1173 
1174  MachinePointerInfo PtrInfo(V, StructOffset);
1175  MachineMemOperand *MMO = MF.getMachineMemOperand(
1176  PtrInfo,
1177  MachineMemOperand::MOLoad |
1178  MachineMemOperand::MODereferenceable |
1179  MachineMemOperand::MOInvariant,
1180  4,
1181  MinAlign(64, StructOffset));
1182 
1183  Register LoadResult = MRI.createGenericVirtualRegister(S32);
1184  Register LoadAddr;
1185 
1186  B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1187  B.buildLoad(LoadResult, LoadAddr, *MMO);
1188  return LoadResult;
1189 }
1190 
1191 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1192  MachineInstr &MI, MachineRegisterInfo &MRI,
1193  MachineIRBuilder &B) const {
1194  MachineFunction &MF = B.getMF();
1195 
1196  B.setInstr(MI);
1197 
1198  const LLT S32 = LLT::scalar(32);
1199  Register Dst = MI.getOperand(0).getReg();
1200  Register Src = MI.getOperand(1).getReg();
1201 
1202  LLT DstTy = MRI.getType(Dst);
1203  LLT SrcTy = MRI.getType(Src);
1204  unsigned DestAS = DstTy.getAddressSpace();
1205  unsigned SrcAS = SrcTy.getAddressSpace();
1206 
1207  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1208  // vector element.
1209  assert(!DstTy.isVector());
1210 
1211  const AMDGPUTargetMachine &TM
1212  = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1213 
1214  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1215  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1216  MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1217  return true;
1218  }
1219 
1220  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1221  // Truncate.
1222  B.buildExtract(Dst, Src, 0);
1223  MI.eraseFromParent();
1224  return true;
1225  }
1226 
1227  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1228  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1229  uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1230 
1231  // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1232  // another. Merge operands are required to be the same type, but creating an
1233  // extra ptrtoint would be kind of pointless.
1234  auto HighAddr = B.buildConstant(
1235  LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1236  B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1237  MI.eraseFromParent();
1238  return true;
1239  }
1240 
1241  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1242  assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1243  DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1244  unsigned NullVal = TM.getNullPointerValue(DestAS);
1245 
1246  auto SegmentNull = B.buildConstant(DstTy, NullVal);
1247  auto FlatNull = B.buildConstant(SrcTy, 0);
1248 
1249  Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1250 
1251  // Extract low 32-bits of the pointer.
1252  B.buildExtract(PtrLo32, Src, 0);
1253 
1254  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1255  B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1256  B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1257 
1258  MI.eraseFromParent();
1259  return true;
1260  }
1261 
1262  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1263  return false;
1264 
1265  if (!ST.hasFlatAddressSpace())
1266  return false;
1267 
1268  auto SegmentNull =
1269  B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1270  auto FlatNull =
1271  B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1272 
1273  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1274  if (!ApertureReg.isValid())
1275  return false;
1276 
1277  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1278  B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1279 
1280  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1281 
1282  // Coerce the type of the low half of the result so we can use merge_values.
1283  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1284  B.buildInstr(TargetOpcode::G_PTRTOINT)
1285  .addDef(SrcAsInt)
1286  .addUse(Src);
1287 
1288  // TODO: Should we allow mismatched types but matching sizes in merges to
1289  // avoid the ptrtoint?
1290  B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1291  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1292 
1293  MI.eraseFromParent();
1294  return true;
1295 }
1296 
1297 bool AMDGPULegalizerInfo::legalizeFrint(
1298  MachineInstr &MI, MachineRegisterInfo &MRI,
1299  MachineIRBuilder &B) const {
1300  B.setInstr(MI);
1301 
1302  Register Src = MI.getOperand(1).getReg();
1303  LLT Ty = MRI.getType(Src);
1304  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1305 
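// Adding then subtracting copysign(2^52, Src) (0x1.0p+52) forces the fraction
// bits of a double to round away, leaving the nearest integral value; inputs
// with magnitude above 0x1.fffffffffffffp+51 are already integral, so the
// final select passes them through unchanged.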
1306  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1307  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1308 
1309  auto C1 = B.buildFConstant(Ty, C1Val);
1310  auto CopySign = B.buildFCopysign(Ty, C1, Src);
1311 
1312  // TODO: Should this propagate fast-math-flags?
1313  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1314  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1315 
1316  auto C2 = B.buildFConstant(Ty, C2Val);
1317  auto Fabs = B.buildFAbs(Ty, Src);
1318 
1319  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1320  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1321  return true;
1322 }
1323 
1324 bool AMDGPULegalizerInfo::legalizeFceil(
1325  MachineInstr &MI, MachineRegisterInfo &MRI,
1326  MachineIRBuilder &B) const {
1327  B.setInstr(MI);
1328 
1329  const LLT S1 = LLT::scalar(1);
1330  const LLT S64 = LLT::scalar(64);
1331 
1332  Register Src = MI.getOperand(1).getReg();
1333  assert(MRI.getType(Src) == S64);
1334 
1335  // result = trunc(src)
1336  // if (src > 0.0 && src != result)
1337  // result += 1.0
1338 
1339  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1340 
1341  const auto Zero = B.buildFConstant(S64, 0.0);
1342  const auto One = B.buildFConstant(S64, 1.0);
1343  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1344  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1345  auto And = B.buildAnd(S1, Lt0, NeTrunc);
1346  auto Add = B.buildSelect(S64, And, One, Zero);
1347 
1348  // TODO: Should this propagate fast-math-flags?
1349  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1350  return true;
1351 }
1352 
1353 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1354  MachineIRBuilder &B) {
1355  const unsigned FractBits = 52;
1356  const unsigned ExpBits = 11;
1357  LLT S32 = LLT::scalar(32);
1358 
1359  auto Const0 = B.buildConstant(S32, FractBits - 32);
1360  auto Const1 = B.buildConstant(S32, ExpBits);
1361 
1362  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
  .addUse(Hi)
1363  .addUse(Const0.getReg(0))
1364  .addUse(Const1.getReg(0));
1365 
1366  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1367 }
1368 
1369 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1370  MachineInstr &MI, MachineRegisterInfo &MRI,
1371  MachineIRBuilder &B) const {
1372  B.setInstr(MI);
1373 
1374  const LLT S1 = LLT::scalar(1);
1375  const LLT S32 = LLT::scalar(32);
1376  const LLT S64 = LLT::scalar(64);
1377 
1378  Register Src = MI.getOperand(1).getReg();
1379  assert(MRI.getType(Src) == S64);
1380 
1381  // TODO: Should this use extract since the low half is unused?
1382  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1383  Register Hi = Unmerge.getReg(1);
1384 
1385  // Extract the upper half, since this is where we will find the sign and
1386  // exponent.
1387  auto Exp = extractF64Exponent(Hi, B);
1388 
1389  const unsigned FractBits = 52;
1390 
1391  // Extract the sign bit.
1392  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1393  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1394 
1395  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1396 
1397  const auto Zero32 = B.buildConstant(S32, 0);
1398 
1399  // Extend back to 64-bits.
1400  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1401 
1402  auto Shr = B.buildAShr(S64, FractMask, Exp);
1403  auto Not = B.buildNot(S64, Shr);
1404  auto Tmp0 = B.buildAnd(S64, Src, Not);
1405  auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1406 
1407  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1408  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1409 
1410  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1411  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1412  return true;
1413 }
1414 
1415 bool AMDGPULegalizerInfo::legalizeITOFP(
1416  MachineInstr &MI, MachineRegisterInfo &MRI,
1417  MachineIRBuilder &B, bool Signed) const {
1418  B.setInstr(MI);
1419 
1420  Register Dst = MI.getOperand(0).getReg();
1421  Register Src = MI.getOperand(1).getReg();
1422 
1423  const LLT S64 = LLT::scalar(64);
1424  const LLT S32 = LLT::scalar(32);
1425 
1426  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1427 
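// Convert by halves: the high 32 bits are converted (signed or unsigned as
// requested) and scaled by 2^32 with ldexp, then the always-unsigned low
// 32 bits are added on.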
1428  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1429 
1430  auto CvtHi = Signed ?
1431  B.buildSITOFP(S64, Unmerge.getReg(1)) :
1432  B.buildUITOFP(S64, Unmerge.getReg(1));
1433 
1434  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1435 
1436  auto ThirtyTwo = B.buildConstant(S32, 32);
1437  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1438  .addUse(CvtHi.getReg(0))
1439  .addUse(ThirtyTwo.getReg(0));
1440 
1441  // TODO: Should this propagate fast-math-flags?
1442  B.buildFAdd(Dst, LdExp, CvtLo);
1443  MI.eraseFromParent();
1444  return true;
1445 }
1446 
1447 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1448  MachineInstr &MI, MachineRegisterInfo &MRI,
1449  MachineIRBuilder &B) const {
1450  MachineFunction &MF = B.getMF();
1451  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1452 
1453  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1454  MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1455 
1456  // With ieee_mode disabled, the instructions have the correct behavior
1457  // already for G_FMINNUM/G_FMAXNUM
1458  if (!MFI->getMode().IEEE)
1459  return !IsIEEEOp;
1460 
1461  if (IsIEEEOp)
1462  return true;
1463 
1464  MachineIRBuilder HelperBuilder(MI);
1465  GISelObserverWrapper DummyObserver;
1466  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1467  HelperBuilder.setInstr(MI);
1468  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1469 }
1470 
1471 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1472  MachineInstr &MI, MachineRegisterInfo &MRI,
1473  MachineIRBuilder &B) const {
1474  // TODO: Should move some of this into LegalizerHelper.
1475 
1476  // TODO: Promote dynamic indexing of s16 to s32
1477  // TODO: Dynamic s64 indexing is only legal for SGPR.
1478  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1479  if (!IdxVal) // Dynamic case will be selected to register indexing.
1480  return true;
1481 
1482  Register Dst = MI.getOperand(0).getReg();
1483  Register Vec = MI.getOperand(1).getReg();
1484 
1485  LLT VecTy = MRI.getType(Vec);
1486  LLT EltTy = VecTy.getElementType();
1487  assert(EltTy == MRI.getType(Dst));
1488 
1489  B.setInstr(MI);
1490 
1491  if (IdxVal.getValue() < VecTy.getNumElements())
1492  B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1493  else
1494  B.buildUndef(Dst);
1495 
1496  MI.eraseFromParent();
1497  return true;
1498 }
1499 
1500 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1501  MachineInstr &MI, MachineRegisterInfo &MRI,
1502  MachineIRBuilder &B) const {
1503  // TODO: Should move some of this into LegalizerHelper.
1504 
1505  // TODO: Promote dynamic indexing of s16 to s32
1506  // TODO: Dynamic s64 indexing is only legal for SGPR.
1507  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1508  if (!IdxVal) // Dynamic case will be selected to register indexing.
1509  return true;
1510 
1511  Register Dst = MI.getOperand(0).getReg();
1512  Register Vec = MI.getOperand(1).getReg();
1513  Register Ins = MI.getOperand(2).getReg();
1514 
1515  LLT VecTy = MRI.getType(Vec);
1516  LLT EltTy = VecTy.getElementType();
1517  assert(EltTy == MRI.getType(Ins));
1518 
1519  B.setInstr(MI);
1520 
1521  if (IdxVal.getValue() < VecTy.getNumElements())
1522  B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1523  else
1524  B.buildUndef(Dst);
1525 
1526  MI.eraseFromParent();
1527  return true;
1528 }
1529 
1530 bool AMDGPULegalizerInfo::legalizeSinCos(
1531  MachineInstr &MI, MachineRegisterInfo &MRI,
1532  MachineIRBuilder &B) const {
1533  B.setInstr(MI);
1534 
1535  Register DstReg = MI.getOperand(0).getReg();
1536  Register SrcReg = MI.getOperand(1).getReg();
1537  LLT Ty = MRI.getType(DstReg);
1538  unsigned Flags = MI.getFlags();
1539 
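// The hardware sin/cos take an angle in normalized units of 2*pi, hence the
// multiply by 0.5/PI; subtargets with a reduced trig input range also need an
// explicit fract to wrap the scaled argument into [0, 1).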
1540  Register TrigVal;
1541  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1542  if (ST.hasTrigReducedRange()) {
1543  auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1544  TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1545  .addUse(MulVal.getReg(0))
1546  .setMIFlags(Flags).getReg(0);
1547  } else
1548  TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1549 
1550  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1551  Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1552  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1553  .addUse(TrigVal)
1554  .setMIFlags(Flags);
1555  MI.eraseFromParent();
1556  return true;
1557 }
1558 
1559 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1560  Register DstReg, LLT PtrTy,
1561  MachineIRBuilder &B, const GlobalValue *GV,
1562  unsigned Offset, unsigned GAFlags) const {
1563  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1564  // to the following code sequence:
1565  //
1566  // For constant address space:
1567  // s_getpc_b64 s[0:1]
1568  // s_add_u32 s0, s0, $symbol
1569  // s_addc_u32 s1, s1, 0
1570  //
1571  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1572  // a fixup or relocation is emitted to replace $symbol with a literal
1573  // constant, which is a pc-relative offset from the encoding of the $symbol
1574  // operand to the global variable.
1575  //
1576  // For global address space:
1577  // s_getpc_b64 s[0:1]
1578  // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1579  // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1580  //
1581  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1582  // fixups or relocations are emitted to replace $symbol@*@lo and
1583  // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1584  // which is a 64-bit pc-relative offset from the encoding of the $symbol
1585  // operand to the global variable.
1586  //
1587  // What we want here is an offset from the value returned by s_getpc
1588  // (which is the address of the s_add_u32 instruction) to the global
1589  // variable, but since the encoding of $symbol starts 4 bytes after the start
1590  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1591  // small. This requires us to add 4 to the global variable offset in order to
1592  // compute the correct address.
1593 
1594  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1595 
1596  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1597  B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1598 
1599  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1600  .addDef(PCReg);
1601 
1602  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1603  if (GAFlags == SIInstrInfo::MO_NONE)
1604  MIB.addImm(0);
1605  else
1606  MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1607 
1608  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1609 
1610  if (PtrTy.getSizeInBits() == 32)
1611  B.buildExtract(DstReg, PCReg, 0);
1612  return true;
1613  }
1614 
1615 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1616  MachineInstr &MI, MachineRegisterInfo &MRI,
1617  MachineIRBuilder &B) const {
1618  Register DstReg = MI.getOperand(0).getReg();
1619  LLT Ty = MRI.getType(DstReg);
1620  unsigned AS = Ty.getAddressSpace();
1621 
1622  const GlobalValue *GV = MI.getOperand(1).getGlobal();
1623  MachineFunction &MF = B.getMF();
1624  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1625  B.setInstr(MI);
1626 
1627  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1628  if (!MFI->isEntryFunction()) {
1629  const Function &Fn = MF.getFunction();
1630  DiagnosticInfoUnsupported BadLDSDecl(
1631  Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1632  Fn.getContext().diagnose(BadLDSDecl);
1633  }
1634 
1635  // TODO: We could emit code to handle the initialization somewhere.
1636  if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1637  B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1638  MI.eraseFromParent();
1639  return true;
1640  }
1641 
1642  const Function &Fn = MF.getFunction();
1643  DiagnosticInfoUnsupported BadInit(
1644  Fn, "unsupported initializer for address space", MI.getDebugLoc());
1645  Fn.getContext().diagnose(BadInit);
1646  return true;
1647  }
1648 
1649  const SITargetLowering *TLI = ST.getTargetLowering();
1650 
1651  if (TLI->shouldEmitFixup(GV)) {
1652  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1653  MI.eraseFromParent();
1654  return true;
1655  }
1656 
1657  if (TLI->shouldEmitPCReloc(GV)) {
1658  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1659  MI.eraseFromParent();
1660  return true;
1661  }
1662 
1663  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1664  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1665 
1666  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1667  MachinePointerInfo::getGOT(MF),
1668  MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1669  MachineMemOperand::MOInvariant,
1670  8 /*Size*/, 8 /*Align*/);
1671 
1672  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1673 
1674  if (Ty.getSizeInBits() == 32) {
1675  // Truncate if this is a 32-bit constant address.
1676  auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1677  B.buildExtract(DstReg, Load, 0);
1678  } else
1679  B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1680 
1681  MI.eraseFromParent();
1682  return true;
1683 }
1684 
1685 bool AMDGPULegalizerInfo::legalizeLoad(
1686  MachineInstr &MI, MachineRegisterInfo &MRI,
1687  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1688  B.setInstr(MI);
1689  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1690  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1691  Observer.changingInstr(MI);
1692  MI.getOperand(1).setReg(Cast.getReg(0));
1693  Observer.changedInstr(MI);
1694  return true;
1695 }
1696 
1697 bool AMDGPULegalizerInfo::legalizeFMad(
1698  MachineInstr &MI, MachineRegisterInfo &MRI,
1699  MachineIRBuilder &B) const {
1700  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1701  assert(Ty.isScalar());
1702 
1703  // TODO: Always legal with future ftz flag.
1704  if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
1705  return true;
1706  if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
1707  return true;
1708 
1709  MachineFunction &MF = B.getMF();
1710 
1711  MachineIRBuilder HelperBuilder(MI);
1712  GISelObserverWrapper DummyObserver;
1713  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1714  HelperBuilder.setMBB(*MI.getParent());
1715  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1716 }
1717 
1718 // Return the use branch instruction, otherwise null if the usage is invalid.
1719 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1720  MachineRegisterInfo &MRI) {
1721  Register CondDef = MI.getOperand(0).getReg();
1722  if (!MRI.hasOneNonDBGUse(CondDef))
1723  return nullptr;
1724 
1725  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1726  return UseMI.getParent() == MI.getParent() &&
1727  UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1728 }
1729 
1730 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1731  Register Reg, LLT Ty) const {
1732  Register LiveIn = MRI.getLiveInVirtReg(Reg);
1733  if (LiveIn)
1734  return LiveIn;
1735 
1736  Register NewReg = MRI.createGenericVirtualRegister(Ty);
1737  MRI.addLiveIn(Reg, NewReg);
1738  return NewReg;
1739 }
1740 
1741 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1742  const ArgDescriptor *Arg) const {
1743  if (!Arg->isRegister() || !Arg->getRegister().isValid())
1744  return false; // TODO: Handle these
1745 
1746  assert(Arg->getRegister().isPhysical());
1747 
1748  MachineRegisterInfo &MRI = *B.getMRI();
1749 
1750  LLT Ty = MRI.getType(DstReg);
1751  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1752 
1753  if (Arg->isMasked()) {
1754  // TODO: Should we try to emit this once in the entry block?
1755  const LLT S32 = LLT::scalar(32);
1756  const unsigned Mask = Arg->getMask();
1757  const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1758 
1759  Register AndMaskSrc = LiveIn;
1760 
1761  if (Shift != 0) {
1762  auto ShiftAmt = B.buildConstant(S32, Shift);
1763  AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1764  }
1765 
1766  B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1767  } else
1768  B.buildCopy(DstReg, LiveIn);
1769 
1770  // Insert the argument copy if it doesn't already exist.
1771  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1772  if (!MRI.getVRegDef(LiveIn)) {
1773  // FIXME: Should have scoped insert pt
1774  MachineBasicBlock &OrigInsBB = B.getMBB();
1775  auto OrigInsPt = B.getInsertPt();
1776 
1777  MachineBasicBlock &EntryMBB = B.getMF().front();
1778  EntryMBB.addLiveIn(Arg->getRegister());
1779  B.setInsertPt(EntryMBB, EntryMBB.begin());
1780  B.buildCopy(LiveIn, Arg->getRegister());
1781 
1782  B.setInsertPt(OrigInsBB, OrigInsPt);
1783  }
1784 
1785  return true;
1786 }
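
For a masked argument, the code above shifts the live-in value right by the number of trailing zeros in the mask, then ANDs with the shifted mask. A standalone sketch of that extraction (the field layout is made up; the mask is nonzero whenever Arg->isMasked()):

// Models the G_LSHR + G_AND sequence built above for masked arguments.
#include <cassert>

static unsigned countTrailingZeroBits(unsigned V) { // assumes V != 0
  unsigned N = 0;
  while ((V & 1u) == 0) { V >>= 1u; ++N; }
  return N;
}

static unsigned extractField(unsigned LiveIn, unsigned Mask) {
  unsigned Shift = countTrailingZeroBits(Mask);
  return (LiveIn >> Shift) & (Mask >> Shift);
}

int main() {
  unsigned Mask = 0xFC0u;       // a 6-bit field in bits [6, 12)
  unsigned Packed = 0x15u << 6; // field value 0x15
  assert(extractField(Packed, Mask) == 0x15u);
  return 0;
}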
1787 
1788 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1789  MachineInstr &MI,
1790  MachineRegisterInfo &MRI,
1791  MachineIRBuilder &B,
1792  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1793  B.setInstr(MI);
1794 
1795  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1796 
1797  const ArgDescriptor *Arg;
1798  const TargetRegisterClass *RC;
1799  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1800  if (!Arg) {
1801  LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1802  return false;
1803  }
1804 
1805  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1806  MI.eraseFromParent();
1807  return true;
1808  }
1809 
1810  return false;
1811 }
1812 
1813 bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
1814  MachineRegisterInfo &MRI,
1815  MachineIRBuilder &B) const {
1816  B.setInstr(MI);
1817  Register Res = MI.getOperand(0).getReg();
1818  Register LHS = MI.getOperand(2).getReg();
1819  Register RHS = MI.getOperand(3).getReg();
1820  uint16_t Flags = MI.getFlags();
1821 
1822  LLT S32 = LLT::scalar(32);
1823  LLT S1 = LLT::scalar(1);
1824 
1825  auto Abs = B.buildFAbs(S32, RHS, Flags);
1826  const APFloat C0Val(1.0f);
1827 
1828  auto C0 = B.buildConstant(S32, 0x6f800000);
1829  auto C1 = B.buildConstant(S32, 0x2f800000);
1830  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
1831 
1832  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
1833  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
1834 
1835  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
1836 
1837  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1838  .addUse(Mul0.getReg(0))
1839  .setMIFlags(Flags);
1840 
1841  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
1842 
1843  B.buildFMul(Res, Sel, Mul1, Flags);
1844 
1845  MI.eraseFromParent();
1846  return true;
1847 }
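
The two magic constants above are float bit patterns: 0x6f800000 is 2^96 and 0x2f800000 is 2^-32. When |RHS| is very large, the denominator is pre-scaled by 2^-32 so the reciprocal stays in range, and the same scale factor is reapplied to the quotient. A standalone float model of the expansion (plain division stands in for the amdgcn.rcp intrinsic):

// Mirrors the G_FCMP/G_SELECT/G_FMUL sequence built above.
#include <cmath>
#include <cstdio>

static float fdivFast(float LHS, float RHS) {
  const float C0 = 0x1.0p+96f;  // bits 0x6f800000
  const float C1 = 0x1.0p-32f;  // bits 0x2f800000
  float Sel = (std::fabs(RHS) > C0) ? C1 : 1.0f;
  float Mul0 = RHS * Sel;
  float Rcp = 1.0f / Mul0;      // stand-in for amdgcn.rcp
  float Mul1 = LHS * Rcp;
  return Sel * Mul1;            // Sel * (LHS * rcp(RHS * Sel)) == LHS / RHS
}

int main() {
  std::printf("%a\n", fdivFast(1.0f, 0x1.0p+100f)); // ~2^-100
  return 0;
}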
1848 
1849 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
1850  MachineRegisterInfo &MRI,
1851  MachineIRBuilder &B) const {
1852  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1853  if (!MFI->isEntryFunction()) {
1854  return legalizePreloadedArgIntrin(MI, MRI, B,
1855  AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1856  }
1857 
1858  B.setInstr(MI);
1859 
1860  uint64_t Offset =
1861  ST.getTargetLowering()->getImplicitParameterOffset(
1862  B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
1863  Register DstReg = MI.getOperand(0).getReg();
1864  LLT DstTy = MRI.getType(DstReg);
1865  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
1866 
1867  const ArgDescriptor *Arg;
1868  const TargetRegisterClass *RC;
1869  std::tie(Arg, RC)
1870  = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1871  if (!Arg)
1872  return false;
1873 
1874  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
1875  if (!loadInputValue(KernargPtrReg, B, Arg))
1876  return false;
1877 
1878  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
1879  MI.eraseFromParent();
1880  return true;
1881 }
1882 
1883 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
1884  MachineRegisterInfo &MRI,
1885  MachineIRBuilder &B,
1886  unsigned AddrSpace) const {
1887  B.setInstr(MI);
1888  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
1889  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
1890  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
1891  MI.eraseFromParent();
1892  return true;
1893 }
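
amdgcn.is.shared and amdgcn.is.private reduce to comparing the high 32 bits of the 64-bit flat pointer against the segment's aperture base, as built above with G_EXTRACT and G_ICMP. A standalone sketch (the aperture value is made up):

// Models the Hi32 extract and equality compare above.
#include <cassert>
#include <cstdint>

static bool isInSegment(uint64_t FlatPtr, uint32_t Aperture) {
  uint32_t Hi32 = static_cast<uint32_t>(FlatPtr >> 32);
  return Hi32 == Aperture;
}

int main() {
  const uint32_t SharedAperture = 0x10000u; // hypothetical LDS aperture
  uint64_t P = (uint64_t(SharedAperture) << 32) | 0x1234u;
  assert(isInSegment(P, SharedAperture));
  return 0;
}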
1894 
1895 /// Handle register layout difference for f16 images for some subtargets.
1896 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
1897  MachineRegisterInfo &MRI,
1898  Register Reg) const {
1899  if (!ST.hasUnpackedD16VMem())
1900  return Reg;
1901 
1902  const LLT S16 = LLT::scalar(16);
1903  const LLT S32 = LLT::scalar(32);
1904  LLT StoreVT = MRI.getType(Reg);
1905  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
1906 
1907  auto Unmerge = B.buildUnmerge(S16, Reg);
1908 
1909  SmallVector<Register, 4> WideRegs;
1910  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1911  WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
1912 
1913  int NumElts = StoreVT.getNumElements();
1914 
1915  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1916 }
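
On subtargets with unpacked D16 memory instructions, each 16-bit element of the store data must occupy its own 32-bit register lane, which is what the unmerge/any-extend/build-vector sequence above produces. A standalone model of the repacking:

// Each s16 element is widened to its own 32-bit lane; G_ANYEXT leaves the
// high bits unspecified, so zero-extension is a valid model.
#include <cstdint>
#include <vector>

static std::vector<uint32_t> unpackD16(const std::vector<uint16_t> &Elts) {
  std::vector<uint32_t> Wide;
  Wide.reserve(Elts.size());
  for (uint16_t E : Elts)
    Wide.push_back(E); // one element per 32-bit lane
  return Wide;
}

int main() {
  std::vector<uint16_t> V{0x3C00u, 0x4000u}; // two half-precision values
  return unpackD16(V).size() == 2 ? 0 : 1;
}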
1917 
1918 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
1919  MachineRegisterInfo &MRI,
1920  MachineIRBuilder &B,
1921  bool IsFormat) const {
1922  // TODO: Reject f16 format on targets where unsupported.
1923  Register VData = MI.getOperand(1).getReg();
1924  LLT Ty = MRI.getType(VData);
1925 
1926  B.setInstr(MI);
1927 
1928  const LLT S32 = LLT::scalar(32);
1929  const LLT S16 = LLT::scalar(16);
1930 
1931  // Fixup illegal register types for i8 stores.
1932  if (Ty == LLT::scalar(8) || Ty == S16) {
1933  Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
1934  MI.getOperand(1).setReg(AnyExt);
1935  return true;
1936  }
1937 
1938  if (Ty.isVector()) {
1939  if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
1940  if (IsFormat)
1941  MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
1942  return true;
1943  }
1944 
1945  return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
1946  }
1947 
1948  return Ty == S32;
1949 }
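
The legality decision above can be summarized as: scalars must already be s32 (s8/s16 data is widened in place), and vectors are accepted up to four s16 or s32 elements. A standalone model of that predicate (a simplification for illustration, not the authoritative rule set):

// Simplified model of legalizeRawBufferStore's accept/reject logic.
#include <cassert>

struct SimpleTy {
  bool IsVector;
  unsigned EltBits;
  unsigned NumElts;
};

static bool rawBufferStoreDataLegal(SimpleTy T) {
  if (!T.IsVector)
    return T.EltBits == 32; // s8/s16 were any-extended to s32 first
  return (T.EltBits == 16 || T.EltBits == 32) && T.NumElts <= 4;
}

int main() {
  assert(rawBufferStoreDataLegal({false, 32, 1}));
  assert(rawBufferStoreDataLegal({true, 16, 4}));
  assert(!rawBufferStoreDataLegal({true, 32, 8}));
  return 0;
}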
1950 
1951 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
1952  MachineRegisterInfo &MRI,
1953  MachineIRBuilder &B) const {
1954  // Replace the use G_BRCOND with the exec-manipulating branch pseudos.
1955  switch (MI.getIntrinsicID()) {
1956  case Intrinsic::amdgcn_if: {
1957  if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1958  const SIRegisterInfo *TRI
1959  = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1960 
1961  B.setInstr(*BrCond);
1962  Register Def = MI.getOperand(1).getReg();
1963  Register Use = MI.getOperand(3).getReg();
1964  B.buildInstr(AMDGPU::SI_IF)
1965  .addDef(Def)
1966  .addUse(Use)
1967  .addMBB(BrCond->getOperand(1).getMBB());
1968 
1969  MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
1970  MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
1971  MI.eraseFromParent();
1972  BrCond->eraseFromParent();
1973  return true;
1974  }
1975 
1976  return false;
1977  }
1978  case Intrinsic::amdgcn_loop: {
1979  if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1980  const SIRegisterInfo *TRI
1981  = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1982 
1983  B.setInstr(*BrCond);
1984  Register Reg = MI.getOperand(2).getReg();
1985  B.buildInstr(AMDGPU::SI_LOOP)
1986  .addUse(Reg)
1987  .addMBB(BrCond->getOperand(1).getMBB());
1988  MI.eraseFromParent();
1989  BrCond->eraseFromParent();
1990  MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
1991  return true;
1992  }
1993 
1994  return false;
1995  }
1996  case Intrinsic::amdgcn_kernarg_segment_ptr:
1997  return legalizePreloadedArgIntrin(
1998  MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1999  case Intrinsic::amdgcn_implicitarg_ptr:
2000  return legalizeImplicitArgPtr(MI, MRI, B);
2001  case Intrinsic::amdgcn_workitem_id_x:
2002  return legalizePreloadedArgIntrin(MI, MRI, B,
2003  AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2004  case Intrinsic::amdgcn_workitem_id_y:
2005  return legalizePreloadedArgIntrin(MI, MRI, B,
2006  AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2007  case Intrinsic::amdgcn_workitem_id_z:
2008  return legalizePreloadedArgIntrin(MI, MRI, B,
2009  AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2010  case Intrinsic::amdgcn_workgroup_id_x:
2011  return legalizePreloadedArgIntrin(MI, MRI, B,
2012  AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2013  case Intrinsic::amdgcn_workgroup_id_y:
2014  return legalizePreloadedArgIntrin(MI, MRI, B,
2015  AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2016  case Intrinsic::amdgcn_workgroup_id_z:
2017  return legalizePreloadedArgIntrin(MI, MRI, B,
2018  AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2019  case Intrinsic::amdgcn_dispatch_ptr:
2020  return legalizePreloadedArgIntrin(MI, MRI, B,
2021  AMDGPUFunctionArgInfo::DISPATCH_PTR);
2022  case Intrinsic::amdgcn_queue_ptr:
2023  return legalizePreloadedArgIntrin(MI, MRI, B,
2024  AMDGPUFunctionArgInfo::QUEUE_PTR);
2025  case Intrinsic::amdgcn_implicit_buffer_ptr:
2026  return legalizePreloadedArgIntrin(
2027  MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2028  case Intrinsic::amdgcn_dispatch_id:
2029  return legalizePreloadedArgIntrin(MI, MRI, B,
2030  AMDGPUFunctionArgInfo::DISPATCH_ID);
2031  case Intrinsic::amdgcn_fdiv_fast:
2032  return legalizeFDIVFast(MI, MRI, B);
2033  case Intrinsic::amdgcn_is_shared:
2034  return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2035  case Intrinsic::amdgcn_is_private:
2036  return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2037  case Intrinsic::amdgcn_wavefrontsize: {
2038  B.setInstr(MI);
2039  B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2040  MI.eraseFromParent();
2041  return true;
2042  }
2043  case Intrinsic::amdgcn_raw_buffer_store:
2044  return legalizeRawBufferStore(MI, MRI, B, false);
2045  case Intrinsic::amdgcn_raw_buffer_store_format:
2046  return legalizeRawBufferStore(MI, MRI, B, true);
2047  default:
2048  return true;
2049  }
2050 
2051  return true;
2052 }