AMDGPULegalizerInfo.cpp
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
33 
34 #define DEBUG_TYPE "amdgpu-legalinfo"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
40 
41 
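// The helpers below are LegalityPredicate / LegalizeMutation factories used to
// express the AMDGPU legalization rules that the constructor registers further
// down in this file.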
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43  unsigned MaxSize = 1024) {
44  return [=](const LegalityQuery &Query) {
45  const LLT Ty = Query.Types[TypeIdx];
46  const LLT EltTy = Ty.getScalarType();
47  return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48  };
49 }
50 
51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52  return [=](const LegalityQuery &Query) {
53  return Query.Types[TypeIdx].getSizeInBits() == Size;
54  };
55 }
56 
57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58  return [=](const LegalityQuery &Query) {
59  const LLT Ty = Query.Types[TypeIdx];
60  return Ty.isVector() &&
61  Ty.getNumElements() % 2 != 0 &&
62  Ty.getElementType().getSizeInBits() < 32 &&
63  Ty.getSizeInBits() % 32 != 0;
64  };
65 }
66 
67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68  return [=](const LegalityQuery &Query) {
69  const LLT Ty = Query.Types[TypeIdx];
70  const LLT EltTy = Ty.getScalarType();
71  return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
72  };
73 }
74 
75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76  return [=](const LegalityQuery &Query) {
77  const LLT Ty = Query.Types[TypeIdx];
78  const LLT EltTy = Ty.getElementType();
79  return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
80  };
81 }
82 
83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84  return [=](const LegalityQuery &Query) {
85  const LLT Ty = Query.Types[TypeIdx];
86  const LLT EltTy = Ty.getElementType();
87  unsigned Size = Ty.getSizeInBits();
88  unsigned Pieces = (Size + 63) / 64;
89  unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90  return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
91  };
92 }
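// For example, a v4s32 (128 bits) is split into 64-bit pieces:
// Pieces = 2, NewNumElts = (4 + 1) / 2 = 2, giving v2s32.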
93 
94 // Increase the number of vector elements to reach the next multiple-of-32-bit
95 // total size.
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97  return [=](const LegalityQuery &Query) {
98  const LLT Ty = Query.Types[TypeIdx];
99 
100  const LLT EltTy = Ty.getElementType();
101  const int Size = Ty.getSizeInBits();
102  const int EltSize = EltTy.getSizeInBits();
103  const int NextMul32 = (Size + 31) / 32;
104 
105  assert(EltSize < 32);
106 
107  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108  return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
109  };
110 }
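// For example, a v3s16 (48 bits) is padded to the next 32-bit boundary:
// NextMul32 = 2, NewNumElts = (64 + 15) / 16 = 4, giving v4s16.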
111 
112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113  return [=](const LegalityQuery &Query) {
114  const LLT QueryTy = Query.Types[TypeIdx];
115  return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
116  };
117 }
118 
119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120  return [=](const LegalityQuery &Query) {
121  const LLT QueryTy = Query.Types[TypeIdx];
122  return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
123  };
124 }
125 
126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127  return [=](const LegalityQuery &Query) {
128  const LLT QueryTy = Query.Types[TypeIdx];
129  return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
130  };
131 }
132 
133 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
134 // v2s16.
135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136  return [=](const LegalityQuery &Query) {
137  const LLT Ty = Query.Types[TypeIdx];
138  if (Ty.isVector()) {
139  const int EltSize = Ty.getElementType().getSizeInBits();
140  return EltSize == 32 || EltSize == 64 ||
141  (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
142  EltSize == 128 || EltSize == 256;
143  }
144 
145  return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
146  };
147 }
148 
149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150  return [=](const LegalityQuery &Query) {
151  return Query.Types[TypeIdx].getElementType() == Type;
152  };
153 }
154 
155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156  return [=](const LegalityQuery &Query) {
157  const LLT Ty = Query.Types[TypeIdx];
158  return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159  Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
160  };
161 }
162 
163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164  const GCNTargetMachine &TM)
165  : ST(ST_) {
166  using namespace TargetOpcode;
167 
168  auto GetAddrSpacePtr = [&TM](unsigned AS) {
169  return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
170  };
171 
172  const LLT S1 = LLT::scalar(1);
173  const LLT S8 = LLT::scalar(8);
174  const LLT S16 = LLT::scalar(16);
175  const LLT S32 = LLT::scalar(32);
176  const LLT S64 = LLT::scalar(64);
177  const LLT S96 = LLT::scalar(96);
178  const LLT S128 = LLT::scalar(128);
179  const LLT S256 = LLT::scalar(256);
180  const LLT S1024 = LLT::scalar(1024);
181 
182  const LLT V2S16 = LLT::vector(2, 16);
183  const LLT V4S16 = LLT::vector(4, 16);
184 
185  const LLT V2S32 = LLT::vector(2, 32);
186  const LLT V3S32 = LLT::vector(3, 32);
187  const LLT V4S32 = LLT::vector(4, 32);
188  const LLT V5S32 = LLT::vector(5, 32);
189  const LLT V6S32 = LLT::vector(6, 32);
190  const LLT V7S32 = LLT::vector(7, 32);
191  const LLT V8S32 = LLT::vector(8, 32);
192  const LLT V9S32 = LLT::vector(9, 32);
193  const LLT V10S32 = LLT::vector(10, 32);
194  const LLT V11S32 = LLT::vector(11, 32);
195  const LLT V12S32 = LLT::vector(12, 32);
196  const LLT V13S32 = LLT::vector(13, 32);
197  const LLT V14S32 = LLT::vector(14, 32);
198  const LLT V15S32 = LLT::vector(15, 32);
199  const LLT V16S32 = LLT::vector(16, 32);
200  const LLT V32S32 = LLT::vector(32, 32);
201 
202  const LLT V2S64 = LLT::vector(2, 64);
203  const LLT V3S64 = LLT::vector(3, 64);
204  const LLT V4S64 = LLT::vector(4, 64);
205  const LLT V5S64 = LLT::vector(5, 64);
206  const LLT V6S64 = LLT::vector(6, 64);
207  const LLT V7S64 = LLT::vector(7, 64);
208  const LLT V8S64 = LLT::vector(8, 64);
209  const LLT V16S64 = LLT::vector(16, 64);
210 
211  std::initializer_list<LLT> AllS32Vectors =
212  {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213  V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214  std::initializer_list<LLT> AllS64Vectors =
215  {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
216 
217  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
224 
225  const LLT CodePtr = FlatPtr;
226 
227  const std::initializer_list<LLT> AddrSpaces64 = {
228  GlobalPtr, ConstantPtr, FlatPtr
229  };
230 
231  const std::initializer_list<LLT> AddrSpaces32 = {
232  LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
233  };
234 
235  const std::initializer_list<LLT> FPTypesBase = {
236  S32, S64
237  };
238 
239  const std::initializer_list<LLT> FPTypes16 = {
240  S32, S64, S16
241  };
242 
243  const std::initializer_list<LLT> FPTypesPK16 = {
244  S32, S64, S16, V2S16
245  };
246 
247  setAction({G_BRCOND, S1}, Legal);
248 
249  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
250  // elements for v3s16
251  getActionDefinitionsBuilder(G_PHI)
252  .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
253  .legalFor(AllS32Vectors)
254  .legalFor(AllS64Vectors)
255  .legalFor(AddrSpaces64)
256  .legalFor(AddrSpaces32)
257  .clampScalar(0, S32, S256)
258  .widenScalarToNextPow2(0, 32)
259  .clampMaxNumElements(0, S32, 16)
261  .legalIf(isPointer(0));
262 
263  if (ST.has16BitInsts()) {
264  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
265  .legalFor({S32, S16})
266  .clampScalar(0, S16, S32)
267  .scalarize(0);
268  } else {
269  getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
270  .legalFor({S32})
271  .clampScalar(0, S32, S32)
272  .scalarize(0);
273  }
274 
275  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
276  .legalFor({S32})
277  .clampScalar(0, S32, S32)
278  .scalarize(0);
279 
280  // Report legal for any types we can handle anywhere. For the cases only legal
281  // on the SALU, RegBankSelect will be able to re-legalize.
282  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
283  .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
284  .clampScalar(0, S32, S64)
288  .scalarize(0);
289 
290  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
291  G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
292  .legalFor({{S32, S1}})
293  .clampScalar(0, S32, S32)
294  .scalarize(0); // TODO: Implement.
295 
296  getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
297  .lower();
298 
299  getActionDefinitionsBuilder(G_BITCAST)
300  // Don't worry about the size constraint.
302  // FIXME: Testing hack
303  .legalForCartesianProduct({S16, LLT::vector(2, 8), });
304 
305  getActionDefinitionsBuilder(G_FCONSTANT)
306  .legalFor({S32, S64, S16})
307  .clampScalar(0, S16, S64);
308 
309  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
310  .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
311  ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
312  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
313  .clampScalarOrElt(0, S32, S1024)
314  .legalIf(isMultiple32(0))
315  .widenScalarToNextPow2(0, 32)
316  .clampMaxNumElements(0, S32, 16);
317 
318 
319  // FIXME: i1 operands to intrinsics should always be legal, but other i1
320  // values may not be legal. We need to figure out how to distinguish
321  // between these two scenarios.
322  getActionDefinitionsBuilder(G_CONSTANT)
323  .legalFor({S1, S32, S64, S16, GlobalPtr,
324  LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
325  .clampScalar(0, S32, S64)
327  .legalIf(isPointer(0));
328 
329  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
330  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
331  .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
332 
333 
334  auto &FPOpActions = getActionDefinitionsBuilder(
335  { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
336  .legalFor({S32, S64});
337  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
338  .customFor({S32, S64});
339  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
340  .customFor({S32, S64});
341 
342  if (ST.has16BitInsts()) {
343  if (ST.hasVOP3PInsts())
344  FPOpActions.legalFor({S16, V2S16});
345  else
346  FPOpActions.legalFor({S16});
347 
348  TrigActions.customFor({S16});
349  FDIVActions.customFor({S16});
350  }
351 
352  auto &MinNumMaxNum = getActionDefinitionsBuilder({
353  G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
354 
355  if (ST.hasVOP3PInsts()) {
356  MinNumMaxNum.customFor(FPTypesPK16)
357  .clampMaxNumElements(0, S16, 2)
358  .clampScalar(0, S16, S64)
359  .scalarize(0);
360  } else if (ST.has16BitInsts()) {
361  MinNumMaxNum.customFor(FPTypes16)
362  .clampScalar(0, S16, S64)
363  .scalarize(0);
364  } else {
365  MinNumMaxNum.customFor(FPTypesBase)
366  .clampScalar(0, S32, S64)
367  .scalarize(0);
368  }
369 
370  if (ST.hasVOP3PInsts())
371  FPOpActions.clampMaxNumElements(0, S16, 2);
372 
373  FPOpActions
374  .scalarize(0)
375  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
376 
377  TrigActions
378  .scalarize(0)
379  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
380 
381  FDIVActions
382  .scalarize(0)
383  .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
384 
385  getActionDefinitionsBuilder({G_FNEG, G_FABS})
386  .legalFor(FPTypesPK16)
387  .clampMaxNumElements(0, S16, 2)
388  .scalarize(0)
389  .clampScalar(0, S16, S64);
390 
391  // TODO: Implement
392  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
393 
394  if (ST.has16BitInsts()) {
395  getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
396  .legalFor({S32, S64, S16})
397  .scalarize(0)
398  .clampScalar(0, S16, S64);
399  } else {
400  getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
401  .legalFor({S32, S64})
402  .scalarize(0)
403  .clampScalar(0, S32, S64);
404  }
405 
406  getActionDefinitionsBuilder(G_FPTRUNC)
407  .legalFor({{S32, S64}, {S16, S32}})
408  .scalarize(0);
409 
410  getActionDefinitionsBuilder(G_FPEXT)
411  .legalFor({{S64, S32}, {S32, S16}})
412  .lowerFor({{S64, S16}}) // FIXME: Implement
413  .scalarize(0);
414 
415  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
416  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
417 
418  getActionDefinitionsBuilder(G_FSUB)
419  // Use actual fsub instruction
420  .legalFor({S32})
421  // Must use fadd + fneg
422  .lowerFor({S64, S16, V2S16})
423  .scalarize(0)
424  .clampScalar(0, S32, S64);
425 
426  // Whether this is legal depends on the floating point mode for the function.
427  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
428  if (ST.hasMadF16())
429  FMad.customFor({S32, S16});
430  else
431  FMad.customFor({S32});
432  FMad.scalarize(0)
433  .lower();
434 
435  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
436  .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
437  {S32, S1}, {S64, S1}, {S16, S1},
438  {S96, S32},
439  // FIXME: Hack
440  {S64, LLT::scalar(33)},
441  {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
442  .scalarize(0);
443 
444  // TODO: Split s1->s64 during regbankselect for VALU.
445  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
446  .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}})
447  .lowerFor({{S32, S64}})
448  .customFor({{S64, S64}});
449  if (ST.has16BitInsts())
450  IToFP.legalFor({{S16, S16}});
451  IToFP.clampScalar(1, S32, S64)
452  .scalarize(0);
453 
454  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
455  .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
456  if (ST.has16BitInsts())
457  FPToI.legalFor({{S16, S16}});
458  else
459  FPToI.minScalar(1, S32);
460 
461  FPToI.minScalar(0, S32)
462  .scalarize(0);
463 
464  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
465  .legalFor({S32, S64})
466  .scalarize(0);
467 
468  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
469  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
470  .legalFor({S32, S64})
471  .clampScalar(0, S32, S64)
472  .scalarize(0);
473  } else {
474  getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
475  .legalFor({S32})
476  .customFor({S64})
477  .clampScalar(0, S32, S64)
478  .scalarize(0);
479  }
480 
482  .legalForCartesianProduct(AddrSpaces64, {S64})
483  .legalForCartesianProduct(AddrSpaces32, {S32})
484  .scalarize(0);
485 
486  getActionDefinitionsBuilder(G_PTR_MASK)
487  .scalarize(0)
488  .alwaysLegal();
489 
490  setAction({G_BLOCK_ADDR, CodePtr}, Legal);
491 
492  auto &CmpBuilder =
493  getActionDefinitionsBuilder(G_ICMP)
494  .legalForCartesianProduct(
495  {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
496  .legalFor({{S1, S32}, {S1, S64}});
497  if (ST.has16BitInsts()) {
498  CmpBuilder.legalFor({{S1, S16}});
499  }
500 
501  CmpBuilder
503  .clampScalar(1, S32, S64)
504  .scalarize(0)
505  .legalIf(all(typeIs(0, S1), isPointer(1)));
506 
507  getActionDefinitionsBuilder(G_FCMP)
508  .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
509  .widenScalarToNextPow2(1)
510  .clampScalar(1, S32, S64)
511  .scalarize(0);
512 
513  // FIXME: fexp, flog2, flog10 need to be custom lowered.
514  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
515  G_FLOG, G_FLOG2, G_FLOG10})
516  .legalFor({S32})
517  .scalarize(0);
518 
519  // The 64-bit versions produce 32-bit results, but only on the SALU.
520  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
521  G_CTTZ, G_CTTZ_ZERO_UNDEF,
522  G_CTPOP})
523  .legalFor({{S32, S32}, {S32, S64}})
524  .clampScalar(0, S32, S32)
525  .clampScalar(1, S32, S64)
526  .scalarize(0)
527  .widenScalarToNextPow2(0, 32)
528  .widenScalarToNextPow2(1, 32);
529 
530  // TODO: Expand for > s32
531  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
532  .legalFor({S32})
533  .clampScalar(0, S32, S32)
534  .scalarize(0);
535 
536  if (ST.has16BitInsts()) {
537  if (ST.hasVOP3PInsts()) {
538  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
539  .legalFor({S32, S16, V2S16})
540  .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
541  .clampMaxNumElements(0, S16, 2)
542  .clampScalar(0, S16, S32)
544  .scalarize(0);
545  } else {
546  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
547  .legalFor({S32, S16})
548  .widenScalarToNextPow2(0)
549  .clampScalar(0, S16, S32)
550  .scalarize(0);
551  }
552  } else {
553  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
554  .legalFor({S32})
555  .clampScalar(0, S32, S32)
557  .scalarize(0);
558  }
559 
560  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
561  return [=](const LegalityQuery &Query) {
562  return Query.Types[TypeIdx0].getSizeInBits() <
563  Query.Types[TypeIdx1].getSizeInBits();
564  };
565  };
566 
567  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
568  return [=](const LegalityQuery &Query) {
569  return Query.Types[TypeIdx0].getSizeInBits() >
570  Query.Types[TypeIdx1].getSizeInBits();
571  };
572  };
573 
574  getActionDefinitionsBuilder(G_INTTOPTR)
575  // List the common cases
576  .legalForCartesianProduct(AddrSpaces64, {S64})
577  .legalForCartesianProduct(AddrSpaces32, {S32})
578  .scalarize(0)
579  // Accept any address space as long as the size matches
580  .legalIf(sameSize(0, 1))
581  .widenScalarIf(smallerThan(1, 0),
582  [](const LegalityQuery &Query) {
583  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
584  })
585  .narrowScalarIf(greaterThan(1, 0),
586  [](const LegalityQuery &Query) {
587  return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
588  });
589 
590  getActionDefinitionsBuilder(G_PTRTOINT)
591  // List the common cases
592  .legalForCartesianProduct(AddrSpaces64, {S64})
593  .legalForCartesianProduct(AddrSpaces32, {S32})
594  .scalarize(0)
595  // Accept any address space as long as the size matches
596  .legalIf(sameSize(0, 1))
597  .widenScalarIf(smallerThan(0, 1),
598  [](const LegalityQuery &Query) {
599  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
600  })
601  .narrowScalarIf(
602  greaterThan(0, 1),
603  [](const LegalityQuery &Query) {
604  return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
605  });
606 
607  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
608  .scalarize(0)
609  .custom();
610 
611  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
612  // handle some operations by just promoting the register during
613  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
614  auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
615  switch (AS) {
616  // FIXME: Private element size.
617  case AMDGPUAS::PRIVATE_ADDRESS:
618  return 32;
619  // FIXME: Check subtarget
620  case AMDGPUAS::LOCAL_ADDRESS:
621  return ST.useDS128() ? 128 : 64;
622 
623  // Treat constant and global as identical. SMRD loads are sometimes usable
624  // for global loads (ideally constant address space should be eliminated)
625  // depending on the context. Legality cannot be context dependent, but
626  // RegBankSelect can split the load as necessary depending on the pointer
627  // register bank/uniformity and if the memory is invariant or not written in
628  // a kernel.
629  case AMDGPUAS::GLOBAL_ADDRESS:
630  case AMDGPUAS::CONSTANT_ADDRESS:
631  return 512;
632  default:
633  return 128;
634  }
635  };
636 
637  const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
638  const LLT DstTy = Query.Types[0];
639 
640  // Split vector extloads.
641  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
642  if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
643  return true;
644 
645  const LLT PtrTy = Query.Types[1];
646  unsigned AS = PtrTy.getAddressSpace();
647  if (MemSize > maxSizeForAddrSpace(AS))
648  return true;
649 
650  // Catch weird sized loads that don't evenly divide into the access sizes
651  // TODO: May be able to widen depending on alignment etc.
652  unsigned NumRegs = MemSize / 32;
653  if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
654  return true;
655 
656  unsigned Align = Query.MMODescrs[0].AlignInBits;
657  if (Align < MemSize) {
658  const SITargetLowering *TLI = ST.getTargetLowering();
659  return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
660  }
661 
662  return false;
663  };
664 
665  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
666  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
667  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
668 
669  // TODO: Refine based on subtargets which support unaligned access or 128-bit
670  // LDS
671  // TODO: Unsupported flat for SI.
672 
673  for (unsigned Op : {G_LOAD, G_STORE}) {
674  const bool IsStore = Op == G_STORE;
675 
676  auto &Actions = getActionDefinitionsBuilder(Op);
677  // Whitelist the common cases.
678  // TODO: Pointer loads
679  // TODO: Wide constant loads
680  // TODO: Only CI+ has 3x loads
681  // TODO: Loads to s16 on gfx9
682  Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
683  {V2S32, GlobalPtr, 64, GlobalAlign32},
684  {V3S32, GlobalPtr, 96, GlobalAlign32},
685  {S96, GlobalPtr, 96, GlobalAlign32},
686  {V4S32, GlobalPtr, 128, GlobalAlign32},
687  {S128, GlobalPtr, 128, GlobalAlign32},
688  {S64, GlobalPtr, 64, GlobalAlign32},
689  {V2S64, GlobalPtr, 128, GlobalAlign32},
690  {V2S16, GlobalPtr, 32, GlobalAlign32},
691  {S32, GlobalPtr, 8, GlobalAlign8},
692  {S32, GlobalPtr, 16, GlobalAlign16},
693 
694  {S32, LocalPtr, 32, 32},
695  {S64, LocalPtr, 64, 32},
696  {V2S32, LocalPtr, 64, 32},
697  {S32, LocalPtr, 8, 8},
698  {S32, LocalPtr, 16, 16},
699  {V2S16, LocalPtr, 32, 32},
700 
701  {S32, PrivatePtr, 32, 32},
702  {S32, PrivatePtr, 8, 8},
703  {S32, PrivatePtr, 16, 16},
704  {V2S16, PrivatePtr, 32, 32},
705 
706  {S32, FlatPtr, 32, GlobalAlign32},
707  {S32, FlatPtr, 16, GlobalAlign16},
708  {S32, FlatPtr, 8, GlobalAlign8},
709  {V2S16, FlatPtr, 32, GlobalAlign32},
710 
711  {S32, ConstantPtr, 32, GlobalAlign32},
712  {V2S32, ConstantPtr, 64, GlobalAlign32},
713  {V3S32, ConstantPtr, 96, GlobalAlign32},
714  {V4S32, ConstantPtr, 128, GlobalAlign32},
715  {S64, ConstantPtr, 64, GlobalAlign32},
716  {S128, ConstantPtr, 128, GlobalAlign32},
717  {V2S32, ConstantPtr, 32, GlobalAlign32}});
718  Actions
719  .customIf(typeIs(1, Constant32Ptr))
720  .narrowScalarIf(
721  [=](const LegalityQuery &Query) -> bool {
722  return !Query.Types[0].isVector() && needToSplitLoad(Query);
723  },
724  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
725  const LLT DstTy = Query.Types[0];
726  const LLT PtrTy = Query.Types[1];
727 
728  const unsigned DstSize = DstTy.getSizeInBits();
729  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
730 
731  // Split extloads.
732  if (DstSize > MemSize)
733  return std::make_pair(0, LLT::scalar(MemSize));
734 
735  if (DstSize > 32 && (DstSize % 32 != 0)) {
736  // FIXME: Need a way to specify non-extload of larger size if
737  // suitably aligned.
738  return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
739  }
740 
741  unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
742  if (MemSize > MaxSize)
743  return std::make_pair(0, LLT::scalar(MaxSize));
744 
745  unsigned Align = Query.MMODescrs[0].AlignInBits;
746  return std::make_pair(0, LLT::scalar(Align));
747  })
748  .fewerElementsIf(
749  [=](const LegalityQuery &Query) -> bool {
750  return Query.Types[0].isVector() && needToSplitLoad(Query);
751  },
752  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
753  const LLT DstTy = Query.Types[0];
754  const LLT PtrTy = Query.Types[1];
755 
756  LLT EltTy = DstTy.getElementType();
757  unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
758 
759  // Split if it's too large for the address space.
760  if (Query.MMODescrs[0].SizeInBits > MaxSize) {
761  unsigned NumElts = DstTy.getNumElements();
762  unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
763 
764  // FIXME: Refine when odd breakdowns handled
765  // The scalars will need to be re-legalized.
766  if (NumPieces == 1 || NumPieces >= NumElts ||
767  NumElts % NumPieces != 0)
768  return std::make_pair(0, EltTy);
769 
770  return std::make_pair(0,
771  LLT::vector(NumElts / NumPieces, EltTy));
772  }
773 
774  // Need to split because of alignment.
775  unsigned Align = Query.MMODescrs[0].AlignInBits;
776  unsigned EltSize = EltTy.getSizeInBits();
777  if (EltSize > Align &&
778  (EltSize / Align < DstTy.getNumElements())) {
779  return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
780  }
781 
782  // May need relegalization for the scalars.
783  return std::make_pair(0, EltTy);
784  })
785  .minScalar(0, S32);
786 
787  if (IsStore)
788  Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
789 
790  // TODO: Need a bitcast lower option?
791  Actions
792  .legalIf([=](const LegalityQuery &Query) {
793  const LLT Ty0 = Query.Types[0];
794  unsigned Size = Ty0.getSizeInBits();
795  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
796  unsigned Align = Query.MMODescrs[0].AlignInBits;
797 
798  // No extending vector loads.
799  if (Size > MemSize && Ty0.isVector())
800  return false;
801 
802  // FIXME: Widening store from alignment not valid.
803  if (MemSize < Size)
804  MemSize = std::max(MemSize, Align);
805 
806  switch (MemSize) {
807  case 8:
808  case 16:
809  return Size == 32;
810  case 32:
811  case 64:
812  case 128:
813  return true;
814  case 96:
815  return ST.hasDwordx3LoadStores();
816  case 256:
817  case 512:
818  return true;
819  default:
820  return false;
821  }
822  })
823  .widenScalarToNextPow2(0)
824  // TODO: v3s32->v4s32 with alignment
825  .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
826  }
827 
828  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
829  .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
830  {S32, GlobalPtr, 16, 2 * 8},
831  {S32, LocalPtr, 8, 8},
832  {S32, LocalPtr, 16, 16},
833  {S32, PrivatePtr, 8, 8},
834  {S32, PrivatePtr, 16, 16},
835  {S32, ConstantPtr, 8, 8},
836  {S32, ConstantPtr, 16, 2 * 8}});
837  if (ST.hasFlatAddressSpace()) {
838  ExtLoads.legalForTypesWithMemDesc(
839  {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
840  }
841 
842  ExtLoads.clampScalar(0, S32, S32)
843  .widenScalarToNextPow2(0)
844  .unsupportedIfMemSizeNotPow2()
845  .lower();
846 
847  auto &Atomics = getActionDefinitionsBuilder(
848  {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
849  G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
850  G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
851  G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
852  .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
853  {S64, GlobalPtr}, {S64, LocalPtr}});
854  if (ST.hasFlatAddressSpace()) {
855  Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
856  }
857 
858  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
859  .legalFor({{S32, LocalPtr}});
860 
861  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
862  .lower();
863 
864  // TODO: Pointer types, any 32-bit or 64-bit vector
865  getActionDefinitionsBuilder(G_SELECT)
866  .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
867  GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
868  LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
869  .clampScalar(0, S16, S64)
872  .scalarize(1)
873  .clampMaxNumElements(0, S32, 2)
874  .clampMaxNumElements(0, LocalPtr, 2)
875  .clampMaxNumElements(0, PrivatePtr, 2)
876  .scalarize(0)
878  .legalIf(all(isPointer(0), typeIs(1, S1)));
879 
880  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
881  // be more flexible with the shift amount type.
882  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
883  .legalFor({{S32, S32}, {S64, S32}});
884  if (ST.has16BitInsts()) {
885  if (ST.hasVOP3PInsts()) {
886  Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
887  .clampMaxNumElements(0, S16, 2);
888  } else
889  Shifts.legalFor({{S16, S32}, {S16, S16}});
890 
891  Shifts.clampScalar(1, S16, S32);
892  Shifts.clampScalar(0, S16, S64);
893  Shifts.widenScalarToNextPow2(0, 16);
894  } else {
895  // Make sure we legalize the shift amount type first, as the general
896  // expansion for the shifted type will produce much worse code if it hasn't
897  // been truncated already.
898  Shifts.clampScalar(1, S32, S32);
899  Shifts.clampScalar(0, S32, S64);
900  Shifts.widenScalarToNextPow2(0, 32);
901  }
902  Shifts.scalarize(0);
903 
904  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
905  unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
906  unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
907  unsigned IdxTypeIdx = 2;
908 
909  getActionDefinitionsBuilder(Op)
910  .customIf([=](const LegalityQuery &Query) {
911  const LLT EltTy = Query.Types[EltTypeIdx];
912  const LLT VecTy = Query.Types[VecTypeIdx];
913  const LLT IdxTy = Query.Types[IdxTypeIdx];
914  return (EltTy.getSizeInBits() == 16 ||
915  EltTy.getSizeInBits() % 32 == 0) &&
916  VecTy.getSizeInBits() % 32 == 0 &&
917  VecTy.getSizeInBits() <= 1024 &&
918  IdxTy.getSizeInBits() == 32;
919  })
920  .clampScalar(EltTypeIdx, S32, S64)
921  .clampScalar(VecTypeIdx, S32, S64)
922  .clampScalar(IdxTypeIdx, S32, S32);
923  }
924 
925  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
926  .unsupportedIf([=](const LegalityQuery &Query) {
927  const LLT &EltTy = Query.Types[1].getElementType();
928  return Query.Types[0] != EltTy;
929  });
930 
931  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
932  unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
933  unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
934 
935  // FIXME: Doesn't handle extract of illegal sizes.
936  getActionDefinitionsBuilder(Op)
937  .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
938  // FIXME: Multiples of 16 should not be legal.
939  .legalIf([=](const LegalityQuery &Query) {
940  const LLT BigTy = Query.Types[BigTyIdx];
941  const LLT LitTy = Query.Types[LitTyIdx];
942  return (BigTy.getSizeInBits() % 32 == 0) &&
943  (LitTy.getSizeInBits() % 16 == 0);
944  })
945  .widenScalarIf(
946  [=](const LegalityQuery &Query) {
947  const LLT BigTy = Query.Types[BigTyIdx];
948  return (BigTy.getScalarSizeInBits() < 16);
949  },
950  LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
951  .widenScalarIf(
952  [=](const LegalityQuery &Query) {
953  const LLT LitTy = Query.Types[LitTyIdx];
954  return (LitTy.getScalarSizeInBits() < 16);
955  },
956  LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
957  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
958  .widenScalarToNextPow2(BigTyIdx, 32);
959 
960  }
961 
962  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
963  .legalForCartesianProduct(AllS32Vectors, {S32})
964  .legalForCartesianProduct(AllS64Vectors, {S64})
965  .clampNumElements(0, V16S32, V32S32)
966  .clampNumElements(0, V2S64, V16S64)
967  .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
968 
969  if (ST.hasScalarPackInsts())
970  BuildVector.legalFor({V2S16, S32});
971 
972  BuildVector
973  .minScalarSameAs(1, 0)
975  .minScalarOrElt(0, S32);
976 
977  if (ST.hasScalarPackInsts()) {
978  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
979  .legalFor({V2S16, S32})
980  .lower();
981  } else {
982  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
983  .lower();
984  }
985 
986  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
987  .legalIf(isRegisterType(0));
988 
989  // TODO: Don't fully scalarize v2s16 pieces
990  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
991 
992  // Merge/Unmerge
993  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
994  unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
995  unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
996 
997  auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
998  const LLT &Ty = Query.Types[TypeIdx];
999  if (Ty.isVector()) {
1000  const LLT &EltTy = Ty.getElementType();
1001  if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1002  return true;
1003  if (!isPowerOf2_32(EltTy.getSizeInBits()))
1004  return true;
1005  }
1006  return false;
1007  };
1008 
1009  auto &Builder = getActionDefinitionsBuilder(Op)
1010  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1011  // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1012  // worth considering the multiples of 64 since 2*192 and 2*384 are not
1013  // valid.
1014  .clampScalar(LitTyIdx, S16, S256)
1015  .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1016  .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1017  .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1018  elementTypeIs(1, S16)),
1019  changeTo(1, V2S16))
1020  // Break up vectors with weird elements into scalars
1021  .fewerElementsIf(
1022  [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1023  scalarize(0))
1024  .fewerElementsIf(
1025  [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1026  scalarize(1))
1027  .clampScalar(BigTyIdx, S32, S1024)
1028  .lowerFor({{S16, V2S16}});
1029 
1030  if (Op == G_MERGE_VALUES) {
1031  Builder.widenScalarIf(
1032  // TODO: Use 16-bit shifts if legal for 8-bit values?
1033  [=](const LegalityQuery &Query) {
1034  const LLT Ty = Query.Types[LitTyIdx];
1035  return Ty.getSizeInBits() < 32;
1036  },
1037  changeTo(LitTyIdx, S32));
1038  }
1039 
1040  Builder.widenScalarIf(
1041  [=](const LegalityQuery &Query) {
1042  const LLT Ty = Query.Types[BigTyIdx];
1043  return !isPowerOf2_32(Ty.getSizeInBits()) &&
1044  Ty.getSizeInBits() % 16 != 0;
1045  },
1046  [=](const LegalityQuery &Query) {
1047  // Pick the next power of 2, or a multiple of 64 over 128,
1048  // whichever is smaller.
1049  const LLT &Ty = Query.Types[BigTyIdx];
1050  unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1051  if (NewSizeInBits >= 256) {
1052  unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1053  if (RoundedTo < NewSizeInBits)
1054  NewSizeInBits = RoundedTo;
1055  }
1056  return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1057  })
1058  .legalIf([=](const LegalityQuery &Query) {
1059  const LLT &BigTy = Query.Types[BigTyIdx];
1060  const LLT &LitTy = Query.Types[LitTyIdx];
1061 
1062  if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1063  return false;
1064  if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1065  return false;
1066 
1067  return BigTy.getSizeInBits() % 16 == 0 &&
1068  LitTy.getSizeInBits() % 16 == 0 &&
1069  BigTy.getSizeInBits() <= 1024;
1070  })
1071  // Any vectors left are the wrong size. Scalarize them.
1072  .scalarize(0)
1073  .scalarize(1);
1074  }
1075 
1076  getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1077 
1078  computeTables();
1079  verify(*ST.getInstrInfo());
1080 }
1081 
1082 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1083  MachineRegisterInfo &MRI,
1084  MachineIRBuilder &B,
1085  GISelChangeObserver &Observer) const {
1086  switch (MI.getOpcode()) {
1087  case TargetOpcode::G_ADDRSPACE_CAST:
1088  return legalizeAddrSpaceCast(MI, MRI, B);
1089  case TargetOpcode::G_FRINT:
1090  return legalizeFrint(MI, MRI, B);
1091  case TargetOpcode::G_FCEIL:
1092  return legalizeFceil(MI, MRI, B);
1093  case TargetOpcode::G_INTRINSIC_TRUNC:
1094  return legalizeIntrinsicTrunc(MI, MRI, B);
1095  case TargetOpcode::G_SITOFP:
1096  return legalizeITOFP(MI, MRI, B, true);
1097  case TargetOpcode::G_UITOFP:
1098  return legalizeITOFP(MI, MRI, B, false);
1099  case TargetOpcode::G_FMINNUM:
1100  case TargetOpcode::G_FMAXNUM:
1101  case TargetOpcode::G_FMINNUM_IEEE:
1102  case TargetOpcode::G_FMAXNUM_IEEE:
1103  return legalizeMinNumMaxNum(MI, MRI, B);
1104  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1105  return legalizeExtractVectorElt(MI, MRI, B);
1106  case TargetOpcode::G_INSERT_VECTOR_ELT:
1107  return legalizeInsertVectorElt(MI, MRI, B);
1108  case TargetOpcode::G_FSIN:
1109  case TargetOpcode::G_FCOS:
1110  return legalizeSinCos(MI, MRI, B);
1111  case TargetOpcode::G_GLOBAL_VALUE:
1112  return legalizeGlobalValue(MI, MRI, B);
1113  case TargetOpcode::G_LOAD:
1114  return legalizeLoad(MI, MRI, B, Observer);
1115  case TargetOpcode::G_FMAD:
1116  return legalizeFMad(MI, MRI, B);
1117  case TargetOpcode::G_FDIV:
1118  return legalizeFDIV(MI, MRI, B);
1119  default:
1120  return false;
1121  }
1122 
1123  llvm_unreachable("expected switch to return");
1124 }
1125 
1126 Register AMDGPULegalizerInfo::getSegmentAperture(
1127  unsigned AS,
1128  MachineRegisterInfo &MRI,
1129  MachineIRBuilder &B) const {
1130  MachineFunction &MF = B.getMF();
1131  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1132  const LLT S32 = LLT::scalar(32);
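  // The aperture is the high 32 bits of the 64-bit address at which the LDS
  // (local) or scratch (private) segment is mapped in the flat address space.
  // Subtargets with aperture registers read it with s_getreg; otherwise it is
  // loaded from the queue descriptor via the queue pointer.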
1133 
1135 
1136  if (ST.hasApertureRegs()) {
1137  // FIXME: Use inline constants (src_{shared, private}_base) instead of
1138  // getreg.
1139  unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1142  unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1145  unsigned Encoding =
1147  Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1148  WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1149 
1150  Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1151  Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1152 
1153  B.buildInstr(AMDGPU::S_GETREG_B32)
1154  .addDef(GetReg)
1155  .addImm(Encoding);
1156  MRI.setType(GetReg, S32);
1157 
1158  auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1159  B.buildInstr(TargetOpcode::G_SHL)
1160  .addDef(ApertureReg)
1161  .addUse(GetReg)
1162  .addUse(ShiftAmt.getReg(0));
1163 
1164  return ApertureReg;
1165  }
1166 
1167  Register QueuePtr = MRI.createGenericVirtualRegister(
1168  LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1169 
1170  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1171  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1172  return Register();
1173 
1174  // Offset into amd_queue_t for group_segment_aperture_base_hi /
1175  // private_segment_aperture_base_hi.
1176  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1177 
1178  // FIXME: Don't use undef
1182 
1183  MachinePointerInfo PtrInfo(V, StructOffset);
1185  PtrInfo,
1189  4,
1190  MinAlign(64, StructOffset));
1191 
1192  Register LoadResult = MRI.createGenericVirtualRegister(S32);
1193  Register LoadAddr;
1194 
1195  B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1196  B.buildLoad(LoadResult, LoadAddr, *MMO);
1197  return LoadResult;
1198 }
1199 
1200 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1201  MachineInstr &MI, MachineRegisterInfo &MRI,
1202  MachineIRBuilder &B) const {
1203  MachineFunction &MF = B.getMF();
1204 
1205  B.setInstr(MI);
1206 
1207  const LLT S32 = LLT::scalar(32);
1208  Register Dst = MI.getOperand(0).getReg();
1209  Register Src = MI.getOperand(1).getReg();
1210 
1211  LLT DstTy = MRI.getType(Dst);
1212  LLT SrcTy = MRI.getType(Src);
1213  unsigned DestAS = DstTy.getAddressSpace();
1214  unsigned SrcAS = SrcTy.getAddressSpace();
1215 
1216  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1217  // vector element.
1218  assert(!DstTy.isVector());
1219 
1220  const AMDGPUTargetMachine &TM
1221  = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1222 
1223  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1224  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1225  MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1226  return true;
1227  }
1228 
1229  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1230  // Truncate.
1231  B.buildExtract(Dst, Src, 0);
1232  MI.eraseFromParent();
1233  return true;
1234  }
1235 
1236  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1237  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1238  uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1239 
1240  // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1241  // another. Merge operands are required to be the same type, but creating an
1242  // extra ptrtoint would be kind of pointless.
1243  auto HighAddr = B.buildConstant(
1245  B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1246  MI.eraseFromParent();
1247  return true;
1248  }
1249 
1250  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1251  assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1252  DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1253  unsigned NullVal = TM.getNullPointerValue(DestAS);
1254 
1255  auto SegmentNull = B.buildConstant(DstTy, NullVal);
1256  auto FlatNull = B.buildConstant(SrcTy, 0);
1257 
1258  Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1259 
1260  // Extract low 32-bits of the pointer.
1261  B.buildExtract(PtrLo32, Src, 0);
1262 
1264  B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1265  B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1266 
1267  MI.eraseFromParent();
1268  return true;
1269  }
1270 
1271  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1272  return false;
1273 
1274  if (!ST.hasFlatAddressSpace())
1275  return false;
1276 
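  // local/private -> flat: merge the 32-bit segment offset with the segment
  // aperture to form the 64-bit flat pointer, and map the segment null value
  // to the flat null value with a compare and select.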
1277  auto SegmentNull =
1278  B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1279  auto FlatNull =
1280  B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1281 
1282  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1283  if (!ApertureReg.isValid())
1284  return false;
1285 
1287  B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1288 
1289  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1290 
1291  // Coerce the type of the low half of the result so we can use merge_values.
1292  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1293  B.buildInstr(TargetOpcode::G_PTRTOINT)
1294  .addDef(SrcAsInt)
1295  .addUse(Src);
1296 
1297  // TODO: Should we allow mismatched types but matching sizes in merges to
1298  // avoid the ptrtoint?
1299  B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1300  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1301 
1302  MI.eraseFromParent();
1303  return true;
1304 }
1305 
1306 bool AMDGPULegalizerInfo::legalizeFrint(MachineInstr &MI,
1307  MachineRegisterInfo &MRI,
1308  MachineIRBuilder &B) const {
1309  B.setInstr(MI);
1310 
1311  Register Src = MI.getOperand(1).getReg();
1312  LLT Ty = MRI.getType(Src);
1313  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1314 
1315  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1316  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1317 
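  // Round by adding and subtracting a copysigned 2^52: doubles with magnitude
  // >= 2^52 have no fraction bits, so the addition forces rounding to an
  // integer. Inputs with |src| > C2 are already integral and are passed
  // through unchanged by the select below.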
1318  auto C1 = B.buildFConstant(Ty, C1Val);
1319  auto CopySign = B.buildFCopysign(Ty, C1, Src);
1320 
1321  // TODO: Should this propagate fast-math-flags?
1322  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1323  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1324 
1325  auto C2 = B.buildFConstant(Ty, C2Val);
1326  auto Fabs = B.buildFAbs(Ty, Src);
1327 
1328  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1329  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1330  return true;
1331 }
1332 
1333 bool AMDGPULegalizerInfo::legalizeFceil(MachineInstr &MI,
1334  MachineRegisterInfo &MRI,
1335  MachineIRBuilder &B) const {
1336  B.setInstr(MI);
1337 
1338  const LLT S1 = LLT::scalar(1);
1339  const LLT S64 = LLT::scalar(64);
1340 
1341  Register Src = MI.getOperand(1).getReg();
1342  assert(MRI.getType(Src) == S64);
1343 
1344  // result = trunc(src)
1345  // if (src > 0.0 && src != result)
1346  // result += 1.0
1347 
1348  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1349 
1350  const auto Zero = B.buildFConstant(S64, 0.0);
1351  const auto One = B.buildFConstant(S64, 1.0);
1352  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1353  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1354  auto And = B.buildAnd(S1, Lt0, NeTrunc);
1355  auto Add = B.buildSelect(S64, And, One, Zero);
1356 
1357  // TODO: Should this propagate fast-math-flags?
1358  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1359  return true;
1360 }
1361 
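// Extract the 11-bit biased exponent from the high 32 bits of an f64 using
// amdgcn_ubfe, then subtract the bias (1023) to get the unbiased exponent.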
1362 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1363  MachineIRBuilder &B) {
1364  const unsigned FractBits = 52;
1365  const unsigned ExpBits = 11;
1366  LLT S32 = LLT::scalar(32);
1367 
1368  auto Const0 = B.buildConstant(S32, FractBits - 32);
1369  auto Const1 = B.buildConstant(S32, ExpBits);
1370 
1371  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
  .addUse(Hi)
1372  .addUse(Const0.getReg(0))
1373  .addUse(Const1.getReg(0));
1374 
1375  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1376 }
1377 
1378 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(MachineInstr &MI,
1379  MachineRegisterInfo &MRI,
1380  MachineIRBuilder &B) const {
1381  B.setInstr(MI);
1382 
1383  const LLT S1 = LLT::scalar(1);
1384  const LLT S32 = LLT::scalar(32);
1385  const LLT S64 = LLT::scalar(64);
1386 
1387  Register Src = MI.getOperand(1).getReg();
1388  assert(MRI.getType(Src) == S64);
1389 
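  // Compute trunc(f64) by clearing the fraction bits that lie below the
  // exponent: for 0 <= exp <= 51 the low (52 - exp) fraction bits are masked
  // off, for exp < 0 the result is +/-0 (sign bit only), and for exp > 51 the
  // value is already an integer and is returned unchanged.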
1390  // TODO: Should this use extract since the low half is unused?
1391  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1392  Register Hi = Unmerge.getReg(1);
1393 
1394  // Extract the upper half, since this is where we will find the sign and
1395  // exponent.
1396  auto Exp = extractF64Exponent(Hi, B);
1397 
1398  const unsigned FractBits = 52;
1399 
1400  // Extract the sign bit.
1401  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1402  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1403 
1404  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1405 
1406  const auto Zero32 = B.buildConstant(S32, 0);
1407 
1408  // Extend back to 64-bits.
1409  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1410 
1411  auto Shr = B.buildAShr(S64, FractMask, Exp);
1412  auto Not = B.buildNot(S64, Shr);
1413  auto Tmp0 = B.buildAnd(S64, Src, Not);
1414  auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1415 
1416  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1417  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1418 
1419  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1420  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1421  return true;
1422 }
1423 
1424 bool AMDGPULegalizerInfo::legalizeITOFP(MachineInstr &MI,
1425  MachineRegisterInfo &MRI,
1426  MachineIRBuilder &B, bool Signed) const {
1427  B.setInstr(MI);
1428 
1429  Register Dst = MI.getOperand(0).getReg();
1430  Register Src = MI.getOperand(1).getReg();
1431 
1432  const LLT S64 = LLT::scalar(64);
1433  const LLT S32 = LLT::scalar(32);
1434 
1435  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
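  // Convert a 64-bit integer by converting each 32-bit half separately:
  // result = (fp)hi * 2^32 + (fp)(unsigned)lo, where the 2^32 scale is applied
  // with amdgcn_ldexp. Only the high-half conversion depends on signedness.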
1436 
1437  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1438 
1439  auto CvtHi = Signed ?
1440  B.buildSITOFP(S64, Unmerge.getReg(1)) :
1441  B.buildUITOFP(S64, Unmerge.getReg(1));
1442 
1443  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1444 
1445  auto ThirtyTwo = B.buildConstant(S32, 32);
1446  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1447  .addUse(CvtHi.getReg(0))
1448  .addUse(ThirtyTwo.getReg(0));
1449 
1450  // TODO: Should this propagate fast-math-flags?
1451  B.buildFAdd(Dst, LdExp, CvtLo);
1452  MI.eraseFromParent();
1453  return true;
1454 }
1455 
1456 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(MachineInstr &MI,
1457  MachineRegisterInfo &MRI,
1458  MachineIRBuilder &B) const {
1459  MachineFunction &MF = B.getMF();
1460  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1461 
1462  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1463  MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1464 
1465  // With ieee_mode disabled, the instructions have the correct behavior
1466  // already for G_FMINNUM/G_FMAXNUM
1467  if (!MFI->getMode().IEEE)
1468  return !IsIEEEOp;
1469 
1470  if (IsIEEEOp)
1471  return true;
1472 
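  // In IEEE mode, G_FMINNUM/G_FMAXNUM must be expanded to the IEEE variants
  // with canonicalized (quieted) inputs; defer to the generic lowering in
  // LegalizerHelper for that.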
1473  MachineIRBuilder HelperBuilder(MI);
1474  GISelObserverWrapper DummyObserver;
1475  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1476  HelperBuilder.setInstr(MI);
1477  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1478 }
1479 
1480 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1481  MachineInstr &MI, MachineRegisterInfo &MRI,
1482  MachineIRBuilder &B) const {
1483  // TODO: Should move some of this into LegalizerHelper.
1484 
1485  // TODO: Promote dynamic indexing of s16 to s32
1486  // TODO: Dynamic s64 indexing is only legal for SGPR.
1488  if (!IdxVal) // Dynamic case will be selected to register indexing.
1489  return true;
1490 
1491  Register Dst = MI.getOperand(0).getReg();
1492  Register Vec = MI.getOperand(1).getReg();
1493 
1494  LLT VecTy = MRI.getType(Vec);
1495  LLT EltTy = VecTy.getElementType();
1496  assert(EltTy == MRI.getType(Dst));
1497 
1498  B.setInstr(MI);
1499 
1500  if (IdxVal.getValue() < VecTy.getNumElements())
1501  B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1502  else
1503  B.buildUndef(Dst);
1504 
1505  MI.eraseFromParent();
1506  return true;
1507 }
1508 
1509 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1510  MachineInstr &MI, MachineRegisterInfo &MRI,
1511  MachineIRBuilder &B) const {
1512  // TODO: Should move some of this into LegalizerHelper.
1513 
1514  // TODO: Promote dynamic indexing of s16 to s32
1515  // TODO: Dynamic s64 indexing is only legal for SGPR.
1517  if (!IdxVal) // Dynamic case will be selected to register indexing.
1518  return true;
1519 
1520  Register Dst = MI.getOperand(0).getReg();
1521  Register Vec = MI.getOperand(1).getReg();
1522  Register Ins = MI.getOperand(2).getReg();
1523 
1524  LLT VecTy = MRI.getType(Vec);
1525  LLT EltTy = VecTy.getElementType();
1526  assert(EltTy == MRI.getType(Ins));
1527 
1528  B.setInstr(MI);
1529 
1530  if (IdxVal.getValue() < VecTy.getNumElements())
1531  B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1532  else
1533  B.buildUndef(Dst);
1534 
1535  MI.eraseFromParent();
1536  return true;
1537 }
1538 
1539 bool AMDGPULegalizerInfo::legalizeSinCos(MachineInstr &MI,
1540  MachineRegisterInfo &MRI,
1541  MachineIRBuilder &B) const {
1542  B.setInstr(MI);
1543 
1544  Register DstReg = MI.getOperand(0).getReg();
1545  Register SrcReg = MI.getOperand(1).getReg();
1546  LLT Ty = MRI.getType(DstReg);
1547  unsigned Flags = MI.getFlags();
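  // The hardware sin/cos intrinsics take the input scaled by 1/(2*pi), i.e. in
  // turns. On subtargets with a reduced valid input range, the scaled value is
  // first range-reduced to [0, 1) with amdgcn_fract.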
1548 
1549  Register TrigVal;
1550  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1551  if (ST.hasTrigReducedRange()) {
1552  auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1553  TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1554  .addUse(MulVal.getReg(0))
1555  .setMIFlags(Flags).getReg(0);
1556  } else
1557  TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1558 
1559  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1560  Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1561  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1562  .addUse(TrigVal)
1563  .setMIFlags(Flags);
1564  MI.eraseFromParent();
1565  return true;
1566 }
1567 
1568 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1569  Register DstReg, LLT PtrTy,
1570  MachineIRBuilder &B, const GlobalValue *GV,
1571  unsigned Offset, unsigned GAFlags) const {
1572  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1573  // to the following code sequence:
1574  //
1575  // For constant address space:
1576  // s_getpc_b64 s[0:1]
1577  // s_add_u32 s0, s0, $symbol
1578  // s_addc_u32 s1, s1, 0
1579  //
1580  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1581  // a fixup or relocation is emitted to replace $symbol with a literal
1582  // constant, which is a pc-relative offset from the encoding of the $symbol
1583  // operand to the global variable.
1584  //
1585  // For global address space:
1586  // s_getpc_b64 s[0:1]
1587  // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1588  // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1589  //
1590  // s_getpc_b64 returns the address of the s_add_u32 instruction and then
1591  // fixups or relocations are emitted to replace $symbol@*@lo and
1592  // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1593  // which is a 64-bit pc-relative offset from the encoding of the $symbol
1594  // operand to the global variable.
1595  //
1596  // What we want here is an offset from the value returned by s_getpc
1597  // (which is the address of the s_add_u32 instruction) to the global
1598  // variable, but since the encoding of $symbol starts 4 bytes after the start
1599  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1600  // small. This requires us to add 4 to the global variable offset in order to
1601  // compute the correct address.
1602 
1603  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1604 
1605  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1606  B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1607 
1608  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1609  .addDef(PCReg);
1610 
1611  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1612  if (GAFlags == SIInstrInfo::MO_NONE)
1613  MIB.addImm(0);
1614  else
1615  MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1616 
1617  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1618 
1619  if (PtrTy.getSizeInBits() == 32)
1620  B.buildExtract(DstReg, PCReg, 0);
1621  return true;
1622  }
1623 
1624 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1625  MachineInstr &MI, MachineRegisterInfo &MRI,
1626  MachineIRBuilder &B) const {
1627  Register DstReg = MI.getOperand(0).getReg();
1628  LLT Ty = MRI.getType(DstReg);
1629  unsigned AS = Ty.getAddressSpace();
1630 
1631  const GlobalValue *GV = MI.getOperand(1).getGlobal();
1632  MachineFunction &MF = B.getMF();
1634  B.setInstr(MI);
1635 
1637  if (!MFI->isEntryFunction()) {
1638  const Function &Fn = MF.getFunction();
1639  DiagnosticInfoUnsupported BadLDSDecl(
1640  Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1641  Fn.getContext().diagnose(BadLDSDecl);
1642  }
1643 
1644  // TODO: We could emit code to handle the initialization somewhere.
1646  B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1647  MI.eraseFromParent();
1648  return true;
1649  }
1650 
1651  const Function &Fn = MF.getFunction();
1652  DiagnosticInfoUnsupported BadInit(
1653  Fn, "unsupported initializer for address space", MI.getDebugLoc());
1654  Fn.getContext().diagnose(BadInit);
1655  return true;
1656  }
1657 
1658  const SITargetLowering *TLI = ST.getTargetLowering();
1659 
1660  if (TLI->shouldEmitFixup(GV)) {
1661  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1662  MI.eraseFromParent();
1663  return true;
1664  }
1665 
1666  if (TLI->shouldEmitPCReloc(GV)) {
1667  buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1668  MI.eraseFromParent();
1669  return true;
1670  }
1671 
1673  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1674 
1679  8 /*Size*/, 8 /*Align*/);
1680 
1681  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1682 
1683  if (Ty.getSizeInBits() == 32) {
1684  // Truncate if this is a 32-bit constant address.
1685  auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1686  B.buildExtract(DstReg, Load, 0);
1687  } else
1688  B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1689 
1690  MI.eraseFromParent();
1691  return true;
1692 }
1693 
1694 bool AMDGPULegalizerInfo::legalizeLoad(MachineInstr &MI,
1695  MachineRegisterInfo &MRI,
1696  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1697  B.setInstr(MI);
1698  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1699  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1700  Observer.changingInstr(MI);
1701  MI.getOperand(1).setReg(Cast.getReg(0));
1702  Observer.changedInstr(MI);
1703  return true;
1704 }
1705 
1706 bool AMDGPULegalizerInfo::legalizeFMad(MachineInstr &MI,
1707  MachineRegisterInfo &MRI,
1708  MachineIRBuilder &B) const {
1709  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1710  assert(Ty.isScalar());
1711 
1712  // TODO: Always legal with future ftz flag.
1713  if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
1714  return true;
1715  if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
1716  return true;
1717 
1718  MachineFunction &MF = B.getMF();
1719 
1720  MachineIRBuilder HelperBuilder(MI);
1721  GISelObserverWrapper DummyObserver;
1722  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1723  HelperBuilder.setMBB(*MI.getParent());
1724  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1725 }
1726 
1727 // Return the use branch instruction, otherwise null if the usage is invalid.
1730  Register CondDef = MI.getOperand(0).getReg();
1731  if (!MRI.hasOneNonDBGUse(CondDef))
1732  return nullptr;
1733 
1734  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1735  return UseMI.getParent() == MI.getParent() &&
1736  UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1737 }
1738 
1739 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1740  Register Reg, LLT Ty) const {
1741  Register LiveIn = MRI.getLiveInVirtReg(Reg);
1742  if (LiveIn)
1743  return LiveIn;
1744 
1745  Register NewReg = MRI.createGenericVirtualRegister(Ty);
1746  MRI.addLiveIn(Reg, NewReg);
1747  return NewReg;
1748 }
1749 
1750 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1751  const ArgDescriptor *Arg) const {
1752  if (!Arg->isRegister() || !Arg->getRegister().isValid())
1753  return false; // TODO: Handle these
1754 
1755  assert(Arg->getRegister().isPhysical());
1756 
1757  MachineRegisterInfo &MRI = *B.getMRI();
1758 
1759  LLT Ty = MRI.getType(DstReg);
1760  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1761 
1762  if (Arg->isMasked()) {
1763  // TODO: Should we try to emit this once in the entry block?
1764  const LLT S32 = LLT::scalar(32);
1765  const unsigned Mask = Arg->getMask();
1766  const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1767 
1768  Register AndMaskSrc = LiveIn;
1769 
1770  if (Shift != 0) {
1771  auto ShiftAmt = B.buildConstant(S32, Shift);
1772  AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1773  }
1774 
1775  B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1776  } else
1777  B.buildCopy(DstReg, LiveIn);
1778 
1779  // Insert the argument copy if it doesn't already exist.
1780  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1781  if (!MRI.getVRegDef(LiveIn)) {
1782  // FIXME: Should have scoped insert pt
1783  MachineBasicBlock &OrigInsBB = B.getMBB();
1784  auto OrigInsPt = B.getInsertPt();
1785 
1786  MachineBasicBlock &EntryMBB = B.getMF().front();
1787  EntryMBB.addLiveIn(Arg->getRegister());
1788  B.setInsertPt(EntryMBB, EntryMBB.begin());
1789  B.buildCopy(LiveIn, Arg->getRegister());
1790 
1791  B.setInsertPt(OrigInsBB, OrigInsPt);
1792  }
1793 
1794  return true;
1795 }
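// Sketch of the masked-argument path above, assuming a hypothetical packed
// descriptor with Mask = 0x000ffc00 (bits 10..19):
//   %shifted = G_LSHR %livein, 10        ; 10 = countTrailingZeros(Mask)
//   %dst     = G_AND  %shifted, 0x3ff    ; 0x3ff = Mask >> 10
// Unmasked arguments are simply copied from the live-in register.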
1796 
1797 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1798  MachineInstr &MI,
1799  MachineRegisterInfo &MRI,
1800  MachineIRBuilder &B,
1801  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1802  B.setInstr(MI);
1803 
1804  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1805 
1806  const ArgDescriptor *Arg;
1807  const TargetRegisterClass *RC;
1808  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1809  if (!Arg) {
1810  LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1811  return false;
1812  }
1813 
1814  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1815  MI.eraseFromParent();
1816  return true;
1817  }
1818 
1819  return false;
1820 }
1821 
1822 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1823  MachineRegisterInfo &MRI,
1824  MachineIRBuilder &B) const {
1825  B.setInstr(MI);
1826 
1827  if (legalizeFastUnsafeFDIV(MI, MRI, B))
1828  return true;
1829 
1830  return false;
1831 }
1832 
1833 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1834  MachineRegisterInfo &MRI,
1835  MachineIRBuilder &B) const {
1836  Register Res = MI.getOperand(0).getReg();
1837  Register LHS = MI.getOperand(1).getReg();
1838  Register RHS = MI.getOperand(2).getReg();
1839 
1840  uint16_t Flags = MI.getFlags();
1841 
1842  LLT ResTy = MRI.getType(Res);
1843  LLT S32 = LLT::scalar(32);
1844  LLT S64 = LLT::scalar(64);
1845 
1846  const MachineFunction &MF = B.getMF();
1847  bool Unsafe =
1848  MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
1849 
1850  if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
1851  return false;
1852 
1853  if (!Unsafe && ResTy == S32 && ST.hasFP32Denormals())
1854  return false;
1855 
1856  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
1857  // 1 / x -> RCP(x)
1858  if (CLHS->isExactlyValue(1.0)) {
1859  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1860  .addUse(RHS)
1861  .setMIFlags(Flags);
1862 
1863  MI.eraseFromParent();
1864  return true;
1865  }
1866 
1867  // -1 / x -> RCP( FNEG(x) )
1868  if (CLHS->isExactlyValue(-1.0)) {
1869  auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
1870  B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1871  .addUse(FNeg.getReg(0))
1872  .setMIFlags(Flags);
1873 
1874  MI.eraseFromParent();
1875  return true;
1876  }
1877  }
1878 
1879  // x / y -> x * (1.0 / y)
1880  if (Unsafe) {
1881  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
1882  .addUse(RHS)
1883  .setMIFlags(Flags);
1884  B.buildFMul(Res, LHS, RCP, Flags);
1885 
1886  MI.eraseFromParent();
1887  return true;
1888  }
1889 
1890  return false;
1891 }
1892 
1893 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
1894  MachineRegisterInfo &MRI,
1895  MachineIRBuilder &B) const {
1896  B.setInstr(MI);
1897  Register Res = MI.getOperand(0).getReg();
1898  Register LHS = MI.getOperand(2).getReg();
1899  Register RHS = MI.getOperand(3).getReg();
1900  uint16_t Flags = MI.getFlags();
1901 
1902  LLT S32 = LLT::scalar(32);
1903  LLT S1 = LLT::scalar(1);
1904 
1905  auto Abs = B.buildFAbs(S32, RHS, Flags);
1906  const APFloat C0Val(1.0f);
1907 
1908  auto C0 = B.buildConstant(S32, 0x6f800000);
1909  auto C1 = B.buildConstant(S32, 0x2f800000);
1910  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
1911 
1912  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
1913  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
1914 
1915  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
1916 
1917  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1918  .addUse(Mul0.getReg(0))
1919  .setMIFlags(Flags);
1920 
1921  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
1922 
1923  B.buildFMul(Res, Sel, Mul1, Flags);
1924 
1925  MI.eraseFromParent();
1926  return true;
1927 }
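// Worked numbers for the constants above: 0x6f800000 is 2^+96 and 0x2f800000
// is 2^-32 as single-precision bit patterns. When |rhs| exceeds 2^+96 the
// denominator is pre-scaled by 2^-32 so amdgcn_rcp stays in range, and the
// same scale factor is folded back into the result:
//   s      = |rhs| > 0x1.0p+96 ? 0x1.0p-32 : 1.0
//   result = s * (lhs * rcp(rhs * s))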
1928 
1929 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
1930  MachineRegisterInfo &MRI,
1931  MachineIRBuilder &B) const {
1932  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1933  if (!MFI->isEntryFunction()) {
1934  return legalizePreloadedArgIntrin(MI, MRI, B,
1935  AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1936  }
1937 
1938  B.setInstr(MI);
1939 
1940  uint64_t Offset =
1941  ST.getTargetLowering()->getImplicitParameterOffset(
1942  B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
1943  Register DstReg = MI.getOperand(0).getReg();
1944  LLT DstTy = MRI.getType(DstReg);
1945  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
1946 
1947  const ArgDescriptor *Arg;
1948  const TargetRegisterClass *RC;
1949  std::tie(Arg, RC)
1950  = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1951  if (!Arg)
1952  return false;
1953 
1954  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
1955  if (!loadInputValue(KernargPtrReg, B, Arg))
1956  return false;
1957 
1958  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
1959  MI.eraseFromParent();
1960  return true;
1961 }
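// Sketch of the entry-function path above: the implicit argument pointer is
// the preloaded kernarg segment pointer advanced by the implicit parameter
// offset (placeholder names):
//   %kernarg = <preloaded KERNARG_SEGMENT_PTR>
//   %dst     = G_GEP %kernarg, G_CONSTANT <Offset>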
1962 
1963 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
1964  MachineRegisterInfo &MRI,
1965  MachineIRBuilder &B,
1966  unsigned AddrSpace) const {
1967  B.setInstr(MI);
1968  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
1969  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
1970  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
1971  MI.eraseFromParent();
1972  return true;
1973 }
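// Sketch of the check above: a flat pointer points into the queried segment
// iff its high 32 bits equal that segment's aperture base (placeholder names):
//   %hi  = G_EXTRACT %flatptr, 32        ; upper half of the 64-bit pointer
//   %dst = G_ICMP eq %hi, %aperture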
1974 
1975 /// Handle register layout difference for f16 images for some subtargets.
1976 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
1977  MachineRegisterInfo &MRI,
1978  Register Reg) const {
1979  if (!ST.hasUnpackedD16VMem())
1980  return Reg;
1981 
1982  const LLT S16 = LLT::scalar(16);
1983  const LLT S32 = LLT::scalar(32);
1984  LLT StoreVT = MRI.getType(Reg);
1985  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
1986 
1987  auto Unmerge = B.buildUnmerge(S16, Reg);
1988 
1989  SmallVector<Register, 4> WideRegs;
1990  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1991  WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
1992 
1993  int NumElts = StoreVT.getNumElements();
1994 
1995  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1996 }
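// Sketch of the unpacked-D16 layout produced above: a <4 x s16> payload is
// widened so that each half occupies its own 32-bit lane (placeholder names):
//   %e0, %e1, %e2, %e3 = G_UNMERGE_VALUES %vdata(<4 x s16>)
//   %w0..%w3           = G_ANYEXT of each element to s32
//   %newdata           = G_BUILD_VECTOR %w0, %w1, %w2, %w3   ; <4 x s32>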
1997 
1998 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
1999  MachineRegisterInfo &MRI,
2000  MachineIRBuilder &B,
2001  bool IsFormat) const {
2002  // TODO: Reject f16 format on targets where unsupported.
2003  Register VData = MI.getOperand(1).getReg();
2004  LLT Ty = MRI.getType(VData);
2005 
2006  B.setInstr(MI);
2007 
2008  const LLT S32 = LLT::scalar(32);
2009  const LLT S16 = LLT::scalar(16);
2010 
2011  // Fixup illegal register types for i8 stores.
2012  if (Ty == LLT::scalar(8) || Ty == S16) {
2013  Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2014  MI.getOperand(1).setReg(AnyExt);
2015  return true;
2016  }
2017 
2018  if (Ty.isVector()) {
2019  if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2020  if (IsFormat)
2021  MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2022  return true;
2023  }
2024 
2025  return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2026  }
2027 
2028  return Ty == S32;
2029 }
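// Example of the small-type fixup above: a raw buffer store of an s8 or s16
// value keeps the intrinsic but widens its data operand (placeholder names):
//   %wide = G_ANYEXT %val(s8) to s32
//   ; operand 1 of the intrinsic is then rewritten to use %wide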
2030 
2031 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2032  MachineRegisterInfo &MRI,
2033  MachineIRBuilder &B) const {
2034  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
2035  switch (MI.getIntrinsicID()) {
2036  case Intrinsic::amdgcn_if: {
2037  if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2038  const SIRegisterInfo *TRI
2039  = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2040 
2041  B.setInstr(*BrCond);
2042  Register Def = MI.getOperand(1).getReg();
2043  Register Use = MI.getOperand(3).getReg();
2044  B.buildInstr(AMDGPU::SI_IF)
2045  .addDef(Def)
2046  .addUse(Use)
2047  .addMBB(BrCond->getOperand(1).getMBB());
2048 
2049  MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2050  MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2051  MI.eraseFromParent();
2052  BrCond->eraseFromParent();
2053  return true;
2054  }
2055 
2056  return false;
2057  }
2058  case Intrinsic::amdgcn_loop: {
2059  if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2060  const SIRegisterInfo *TRI
2061  = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2062 
2063  B.setInstr(*BrCond);
2064  Register Reg = MI.getOperand(2).getReg();
2065  B.buildInstr(AMDGPU::SI_LOOP)
2066  .addUse(Reg)
2067  .addMBB(BrCond->getOperand(1).getMBB());
2068  MI.eraseFromParent();
2069  BrCond->eraseFromParent();
2070  MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2071  return true;
2072  }
2073 
2074  return false;
2075  }
2076  case Intrinsic::amdgcn_kernarg_segment_ptr:
2077  return legalizePreloadedArgIntrin(MI, MRI, B,
2078  AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2079  case Intrinsic::amdgcn_implicitarg_ptr:
2080  return legalizeImplicitArgPtr(MI, MRI, B);
2081  case Intrinsic::amdgcn_workitem_id_x:
2082  return legalizePreloadedArgIntrin(MI, MRI, B,
2083  AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2084  case Intrinsic::amdgcn_workitem_id_y:
2085  return legalizePreloadedArgIntrin(MI, MRI, B,
2086  AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2087  case Intrinsic::amdgcn_workitem_id_z:
2088  return legalizePreloadedArgIntrin(MI, MRI, B,
2089  AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2090  case Intrinsic::amdgcn_workgroup_id_x:
2091  return legalizePreloadedArgIntrin(MI, MRI, B,
2092  AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2093  case Intrinsic::amdgcn_workgroup_id_y:
2094  return legalizePreloadedArgIntrin(MI, MRI, B,
2095  AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2096  case Intrinsic::amdgcn_workgroup_id_z:
2097  return legalizePreloadedArgIntrin(MI, MRI, B,
2098  AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2099  case Intrinsic::amdgcn_dispatch_ptr:
2100  return legalizePreloadedArgIntrin(MI, MRI, B,
2101  AMDGPUFunctionArgInfo::DISPATCH_PTR);
2102  case Intrinsic::amdgcn_queue_ptr:
2103  return legalizePreloadedArgIntrin(MI, MRI, B,
2104  AMDGPUFunctionArgInfo::QUEUE_PTR);
2105  case Intrinsic::amdgcn_implicit_buffer_ptr:
2106  return legalizePreloadedArgIntrin(MI, MRI, B,
2107  AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2108  case Intrinsic::amdgcn_dispatch_id:
2109  return legalizePreloadedArgIntrin(MI, MRI, B,
2110  AMDGPUFunctionArgInfo::DISPATCH_ID);
2111  case Intrinsic::amdgcn_fdiv_fast:
2112  return legalizeFDIVFastIntrin(MI, MRI, B);
2113  case Intrinsic::amdgcn_is_shared:
2114  return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2115  case Intrinsic::amdgcn_is_private:
2116  return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2117  case Intrinsic::amdgcn_wavefrontsize: {
2118  B.setInstr(MI);
2119  B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2120  MI.eraseFromParent();
2121  return true;
2122  }
2123  case Intrinsic::amdgcn_raw_buffer_store:
2124  return legalizeRawBufferStore(MI, MRI, B, false);
2125  case Intrinsic::amdgcn_raw_buffer_store_format:
2126  return legalizeRawBufferStore(MI, MRI, B, true);
2127  default:
2128  return true;
2129  }
2130 
2131  return true;
2132 }