LLVM 23.0.0git
AMDGPURegBankLegalizeRules.cpp
Go to the documentation of this file.
1//===-- AMDGPURegBankLegalizeRules.cpp ------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Definitions of RegBankLegalize Rules for all opcodes.
10/// Implementation of container for all the Rules and search.
11/// Fast search for most common case when Rule.Predicate checks LLT and
12/// uniformity of register in operand 0.
13//
14//===----------------------------------------------------------------------===//
15
17#include "AMDGPUInstrInfo.h"
18#include "GCNSubtarget.h"
21#include "llvm/IR/IntrinsicsAMDGPU.h"
23
24#define DEBUG_TYPE "amdgpu-regbanklegalize"
25
26using namespace llvm;
27using namespace AMDGPU;
28
29bool AMDGPU::isAnyPtr(LLT Ty, unsigned Width) {
30 return Ty.isPointer() && Ty.getSizeInBits() == Width;
31}
32
34 std::initializer_list<RegBankLLTMappingApplyID> DstOpMappingList,
35 std::initializer_list<RegBankLLTMappingApplyID> SrcOpMappingList,
37 : DstOpMapping(DstOpMappingList), SrcOpMapping(SrcOpMappingList),
39
41 std::initializer_list<UniformityLLTOpPredicateID> OpList,
42 std::function<bool(const MachineInstr &)> TestFunc)
44
46 const MachineUniformityInfo &MUI,
47 const MachineRegisterInfo &MRI) {
48 switch (UniID) {
49 case S1:
50 return MRI.getType(Reg) == LLT::scalar(1);
51 case S16:
52 return MRI.getType(Reg) == LLT::scalar(16);
53 case S32:
54 return MRI.getType(Reg) == LLT::scalar(32);
55 case S64:
56 return MRI.getType(Reg) == LLT::scalar(64);
57 case S128:
58 return MRI.getType(Reg) == LLT::scalar(128);
59 case P0:
60 return MRI.getType(Reg) == LLT::pointer(0, 64);
61 case P1:
62 return MRI.getType(Reg) == LLT::pointer(1, 64);
63 case P2:
64 return MRI.getType(Reg) == LLT::pointer(2, 32);
65 case P3:
66 return MRI.getType(Reg) == LLT::pointer(3, 32);
67 case P4:
68 return MRI.getType(Reg) == LLT::pointer(4, 64);
69 case P5:
70 return MRI.getType(Reg) == LLT::pointer(5, 32);
71 case P8:
72 return MRI.getType(Reg) == LLT::pointer(8, 128);
73 case Ptr32:
74 return isAnyPtr(MRI.getType(Reg), 32);
75 case Ptr64:
76 return isAnyPtr(MRI.getType(Reg), 64);
77 case Ptr128:
78 return isAnyPtr(MRI.getType(Reg), 128);
79 case V2S16:
80 return MRI.getType(Reg) == LLT::fixed_vector(2, 16);
81 case V2S32:
82 return MRI.getType(Reg) == LLT::fixed_vector(2, 32);
83 case V3S32:
84 return MRI.getType(Reg) == LLT::fixed_vector(3, 32);
85 case V4S32:
86 return MRI.getType(Reg) == LLT::fixed_vector(4, 32);
87 case B32:
88 return MRI.getType(Reg).getSizeInBits() == 32;
89 case B64:
90 return MRI.getType(Reg).getSizeInBits() == 64;
91 case B96:
92 return MRI.getType(Reg).getSizeInBits() == 96;
93 case B128:
94 return MRI.getType(Reg).getSizeInBits() == 128;
95 case B160:
96 return MRI.getType(Reg).getSizeInBits() == 160;
97 case B256:
98 return MRI.getType(Reg).getSizeInBits() == 256;
99 case B512:
100 return MRI.getType(Reg).getSizeInBits() == 512;
101 case DivAnyTy:
102 return MUI.isDivergentAtDef(Reg);
103 case UniS1:
104 return MRI.getType(Reg) == LLT::scalar(1) && MUI.isUniformAtDef(Reg);
105 case UniS16:
106 return MRI.getType(Reg) == LLT::scalar(16) && MUI.isUniformAtDef(Reg);
107 case UniS32:
108 return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniformAtDef(Reg);
109 case UniS64:
110 return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniformAtDef(Reg);
111 case UniS128:
112 return MRI.getType(Reg) == LLT::scalar(128) && MUI.isUniformAtDef(Reg);
113 case UniP0:
114 return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isUniformAtDef(Reg);
115 case UniP1:
116 return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isUniformAtDef(Reg);
117 case UniP2:
118 return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isUniformAtDef(Reg);
119 case UniP3:
120 return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isUniformAtDef(Reg);
121 case UniP4:
122 return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniformAtDef(Reg);
123 case UniP5:
124 return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniformAtDef(Reg);
125 case UniP6:
126 return MRI.getType(Reg) == LLT::pointer(6, 32) && MUI.isUniformAtDef(Reg);
127 case UniP8:
128 return MRI.getType(Reg) == LLT::pointer(8, 128) && MUI.isUniformAtDef(Reg);
129 case UniPtr32:
130 return isAnyPtr(MRI.getType(Reg), 32) && MUI.isUniformAtDef(Reg);
131 case UniPtr64:
132 return isAnyPtr(MRI.getType(Reg), 64) && MUI.isUniformAtDef(Reg);
133 case UniPtr128:
134 return isAnyPtr(MRI.getType(Reg), 128) && MUI.isUniformAtDef(Reg);
135 case UniV2S16:
136 return MRI.getType(Reg) == LLT::fixed_vector(2, 16) &&
137 MUI.isUniformAtDef(Reg);
138 case UniV2S32:
139 return MRI.getType(Reg) == LLT::fixed_vector(2, 32) &&
140 MUI.isUniformAtDef(Reg);
141 case UniV3S32:
142 return MRI.getType(Reg) == LLT::fixed_vector(3, 32) &&
143 MUI.isUniformAtDef(Reg);
144 case UniV4S32:
145 return MRI.getType(Reg) == LLT::fixed_vector(4, 32) &&
146 MUI.isUniformAtDef(Reg);
147 case UniV6S32:
148 return MRI.getType(Reg) == LLT::fixed_vector(6, 32) &&
149 MUI.isUniformAtDef(Reg);
150 case UniV8S16:
151 return MRI.getType(Reg) == LLT::fixed_vector(8, 16) &&
152 MUI.isUniformAtDef(Reg);
153 case UniV8S32:
154 return MRI.getType(Reg) == LLT::fixed_vector(8, 32) &&
155 MUI.isUniformAtDef(Reg);
156 case UniV16S16:
157 return MRI.getType(Reg) == LLT::fixed_vector(16, 16) &&
158 MUI.isUniformAtDef(Reg);
159 case UniV16S32:
160 return MRI.getType(Reg) == LLT::fixed_vector(16, 32) &&
161 MUI.isUniformAtDef(Reg);
162 case UniV32S16:
163 return MRI.getType(Reg) == LLT::fixed_vector(32, 16) &&
164 MUI.isUniformAtDef(Reg);
165 case UniV32S32:
166 return MRI.getType(Reg) == LLT::fixed_vector(32, 32) &&
167 MUI.isUniformAtDef(Reg);
168 case UniV2S64:
169 return MRI.getType(Reg) == LLT::fixed_vector(2, 64) &&
170 MUI.isUniformAtDef(Reg);
171 case UniB32:
172 return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniformAtDef(Reg);
173 case UniB64:
174 return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isUniformAtDef(Reg);
175 case UniB96:
176 return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isUniformAtDef(Reg);
177 case UniB128:
178 return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isUniformAtDef(Reg);
179 case UniB160:
180 return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isUniformAtDef(Reg);
181 case UniB256:
182 return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isUniformAtDef(Reg);
183 case UniB512:
184 return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isUniformAtDef(Reg);
185 case UniBRC: {
186 if (MUI.isDivergentAtDef(Reg))
187 return false;
188 // Check if there is SGPR register class of same size as the LLT.
189 const SIRegisterInfo *TRI =
190 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
191 // There is no 16 bit SGPR register class. Extra size check is required
192 // since getSGPRClassForBitWidth returns SReg_32RegClass for Size 16.
193 unsigned LLTSize = MRI.getType(Reg).getSizeInBits();
194 return LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize);
195 }
196 case DivS1:
197 return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergentAtDef(Reg);
198 case DivS16:
199 return MRI.getType(Reg) == LLT::scalar(16) && MUI.isDivergentAtDef(Reg);
200 case DivS32:
201 return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergentAtDef(Reg);
202 case DivS64:
203 return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergentAtDef(Reg);
204 case DivS128:
205 return MRI.getType(Reg) == LLT::scalar(128) && MUI.isDivergentAtDef(Reg);
206 case DivP0:
207 return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isDivergentAtDef(Reg);
208 case DivP1:
209 return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergentAtDef(Reg);
210 case DivP2:
211 return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isDivergentAtDef(Reg);
212 case DivP3:
213 return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isDivergentAtDef(Reg);
214 case DivP4:
215 return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isDivergentAtDef(Reg);
216 case DivP5:
217 return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isDivergentAtDef(Reg);
218 case DivPtr32:
219 return isAnyPtr(MRI.getType(Reg), 32) && MUI.isDivergentAtDef(Reg);
220 case DivPtr64:
221 return isAnyPtr(MRI.getType(Reg), 64) && MUI.isDivergentAtDef(Reg);
222 case DivPtr128:
223 return isAnyPtr(MRI.getType(Reg), 128) && MUI.isDivergentAtDef(Reg);
224 case DivV2S16:
225 return MRI.getType(Reg) == LLT::fixed_vector(2, 16) &&
227 case DivV2S32:
228 return MRI.getType(Reg) == LLT::fixed_vector(2, 32) &&
230 case DivV4S32:
231 return MRI.getType(Reg) == LLT::fixed_vector(4, 32) &&
233 case DivV2S64:
234 return MRI.getType(Reg) == LLT::fixed_vector(2, 64) &&
236 case DivV3S32:
237 return MRI.getType(Reg) == LLT::fixed_vector(3, 32) &&
239 case DivV4S16:
240 return MRI.getType(Reg) == LLT::fixed_vector(4, 16) &&
242 case DivV8S16:
243 return MRI.getType(Reg) == LLT::fixed_vector(8, 16) &&
245 case DivV8S32:
246 return MRI.getType(Reg) == LLT::fixed_vector(8, 32) &&
248 case DivV16S16:
249 return MRI.getType(Reg) == LLT::fixed_vector(16, 16) &&
251 case DivV16S32:
252 return MRI.getType(Reg) == LLT::fixed_vector(16, 32) &&
254 case DivV6S32:
255 return MRI.getType(Reg) == LLT::fixed_vector(6, 32) &&
257 case DivV32S16:
258 return MRI.getType(Reg) == LLT::fixed_vector(32, 16) &&
260 case DivV32S32:
261 return MRI.getType(Reg) == LLT::fixed_vector(32, 32) &&
263 case DivB32:
264 return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergentAtDef(Reg);
265 case DivB64:
266 return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isDivergentAtDef(Reg);
267 case DivB96:
268 return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isDivergentAtDef(Reg);
269 case DivB128:
270 return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isDivergentAtDef(Reg);
271 case DivB160:
272 return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isDivergentAtDef(Reg);
273 case DivB256:
274 return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isDivergentAtDef(Reg);
275 case DivB512:
276 return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isDivergentAtDef(Reg);
277 case DivBRC: {
278 if (MUI.isUniformAtDef(Reg))
279 return false;
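    // Check if there is VGPR register class of same size as the LLT.
    // NOTE(review): the query below calls getSGPRClassForBitWidth, not
    // getVGPRClassForBitWidth (the BRC case queries both) -- confirm which
    // register file is intended here.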
281 const SIRegisterInfo *TRI =
282 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
283 return TRI->getSGPRClassForBitWidth(MRI.getType(Reg).getSizeInBits());
284 }
285 case BRC: {
286 // Check if there is SGPR and VGPR register class of same size as the LLT.
287 const SIRegisterInfo *TRI =
288 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
289 unsigned LLTSize = MRI.getType(Reg).getSizeInBits();
290 return LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize) &&
291 TRI->getVGPRClassForBitWidth(LLTSize);
292 }
293 case _:
294 return true;
295 default:
296 llvm_unreachable("missing matchUniformityAndLLT");
297 }
298}
299
301 const MachineUniformityInfo &MUI,
302 const MachineRegisterInfo &MRI) const {
303 // Check LLT signature.
304 for (unsigned i = 0; i < OpUniformityAndTypes.size(); ++i) {
305 const MachineOperand &MO = MI.getOperand(i);
306 if (OpUniformityAndTypes[i] == _) {
307 assert((!MI.getOperand(i).isReg() ||
308 !MI.getOperand(i).getReg().isVirtual()) &&
309 "_ is for non-register and physical register operands only");
310 continue;
311 }
312
313 // Remaining IDs check registers.
314 if (!MO.isReg())
315 return false;
316
317 if (!matchUniformityAndLLT(MO.getReg(), OpUniformityAndTypes[i], MUI, MRI))
318 return false;
319 }
320
321 // More complex check.
322 if (TestFunc)
323 return TestFunc(MI);
324
325 return true;
326}
327
329
331 : FastTypes(FastTypes) {}
332
334 if (Ty == LLT::scalar(16))
335 return S16;
336 if (Ty == LLT::scalar(32))
337 return S32;
338 if (Ty == LLT::scalar(64))
339 return S64;
340 if (Ty == LLT::fixed_vector(2, 16))
341 return V2S16;
342 if (Ty == LLT::fixed_vector(2, 32))
343 return V2S32;
344 if (Ty == LLT::fixed_vector(3, 32))
345 return V3S32;
346 if (Ty == LLT::fixed_vector(4, 32))
347 return V4S32;
348 return _;
349}
350
352 if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
353 isAnyPtr(Ty, 32))
354 return B32;
355 if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
356 Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
357 return B64;
358 if (Ty == LLT::fixed_vector(3, 32))
359 return B96;
360 if (Ty == LLT::fixed_vector(4, 32) || Ty == LLT::fixed_vector(2, 64) ||
361 Ty == LLT::fixed_vector(8, 16) || isAnyPtr(Ty, 128))
362 return B128;
363 return _;
364}
365
366const RegBankLLTMapping *
368 const MachineRegisterInfo &MRI,
369 const MachineUniformityInfo &MUI) const {
370 // Search in "Fast Rules".
371 // Note: if fast rules are enabled, RegBankLLTMapping must be added in each
372 // slot that could "match fast Predicate". If not, InvalidMapping is
373 // returned which results in failure, does not search "Slow Rules".
374 if (FastTypes != NoFastRules) {
375 Register Reg = MI.getOperand(0).getReg();
376 int Slot;
377 if (FastTypes == StandardB)
378 Slot = getFastPredicateSlot(LLTToBId(MRI.getType(Reg)));
379 else
380 Slot = getFastPredicateSlot(LLTToId(MRI.getType(Reg)));
381
382 if (Slot != -1)
383 return MUI.isUniformAtDef(Reg) ? &Uni[Slot] : &Div[Slot];
384 }
385
386 // Slow search for more complex rules.
387 for (const RegBankLegalizeRule &Rule : Rules) {
388 if (Rule.Predicate.match(MI, MUI, MRI))
389 return &Rule.OperandMapping;
390 }
391
392 return nullptr;
393}
394
396 Rules.push_back(Rule);
397}
398
400 RegBankLLTMapping RuleApplyIDs) {
401 int Slot = getFastPredicateSlot(Ty);
402 assert(Slot != -1 && "Ty unsupported in this FastRulesTypes");
403 Div[Slot] = std::move(RuleApplyIDs);
404}
405
407 RegBankLLTMapping RuleApplyIDs) {
408 int Slot = getFastPredicateSlot(Ty);
409 assert(Slot != -1 && "Ty unsupported in this FastRulesTypes");
410 Uni[Slot] = std::move(RuleApplyIDs);
411}
412
413int SetOfRulesForOpcode::getFastPredicateSlot(
415 switch (FastTypes) {
416 case Standard: {
417 switch (Ty) {
418 case S32:
419 return 0;
420 case S16:
421 return 1;
422 case S64:
423 return 2;
424 case V2S16:
425 return 3;
426 default:
427 return -1;
428 }
429 }
430 case StandardB: {
431 switch (Ty) {
432 case B32:
433 return 0;
434 case B64:
435 return 1;
436 case B96:
437 return 2;
438 case B128:
439 return 3;
440 default:
441 return -1;
442 }
443 }
444 case Vector: {
445 switch (Ty) {
446 case S32:
447 return 0;
448 case V2S32:
449 return 1;
450 case V3S32:
451 return 2;
452 case V4S32:
453 return 3;
454 default:
455 return -1;
456 }
457 }
458 default:
459 return -1;
460 }
461}
462
463RegBankLegalizeRules::RuleSetInitializer
464RegBankLegalizeRules::addRulesForGOpcs(std::initializer_list<unsigned> OpcList,
465 FastRulesTypes FastTypes) {
466 return RuleSetInitializer(OpcList, GRulesAlias, GRules, FastTypes);
467}
468
469RegBankLegalizeRules::RuleSetInitializer
470RegBankLegalizeRules::addRulesForIOpcs(std::initializer_list<unsigned> OpcList,
471 FastRulesTypes FastTypes) {
472 return RuleSetInitializer(OpcList, IRulesAlias, IRules, FastTypes);
473}
474
477 unsigned Opc = MI.getOpcode();
478 if (Opc == AMDGPU::G_INTRINSIC || Opc == AMDGPU::G_INTRINSIC_CONVERGENT ||
479 Opc == AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS ||
480 Opc == AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS) {
481 unsigned IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
482 auto IRAIt = IRulesAlias.find(IntrID);
483 if (IRAIt == IRulesAlias.end())
484 return nullptr;
485 return &IRules.at(IRAIt->second);
486 }
487
488 auto GRAIt = GRulesAlias.find(Opc);
489 if (GRAIt == GRulesAlias.end())
490 return nullptr;
491 return &GRules.at(GRAIt->second);
492}
493
494// Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and '!'.
495class Predicate {
496private:
497 struct Elt {
498 // Save formula composed of Pred, '&&', '||' and '!' as a jump table.
499 // Sink ! to Pred. For example !((A && !B) || C) -> (!A || B) && !C
500 // Sequences of && and || will be represented by jumps, for example:
501 // (A && B && ... X) or (A && B && ... X) || Y
502 // A == true jump to B
503 // A == false jump to end or Y, result is A(false) or Y
504 // (A || B || ... X) or (A || B || ... X) && Y
505 // A == true jump to end or Y, result is A(true) or Y
506 // A == false jump to B
507 // Notice that when negating expression, we simply flip Neg on each Pred
508 // and swap TJumpOffset and FJumpOffset (&& becomes ||, || becomes &&).
509 std::function<bool(const MachineInstr &)> Pred;
510 bool Neg; // Neg of Pred is calculated before jump
511 unsigned TJumpOffset;
512 unsigned FJumpOffset;
513 };
514
515 SmallVector<Elt, 8> Expression;
516
517 Predicate(SmallVectorImpl<Elt> &&Expr) { Expression.swap(Expr); };
518
519public:
520 Predicate(std::function<bool(const MachineInstr &)> Pred) {
521 Expression.push_back({Pred, false, 1, 1});
522 };
523
524 bool operator()(const MachineInstr &MI) const {
525 unsigned Idx = 0;
526 unsigned ResultIdx = Expression.size();
527 bool Result;
528 do {
529 Result = Expression[Idx].Pred(MI);
530 Result = Expression[Idx].Neg ? !Result : Result;
531 if (Result) {
532 Idx += Expression[Idx].TJumpOffset;
533 } else {
534 Idx += Expression[Idx].FJumpOffset;
535 }
536 } while ((Idx != ResultIdx));
537
538 return Result;
539 };
540
541 Predicate operator!() const {
542 SmallVector<Elt, 8> NegExpression;
543 for (const Elt &ExprElt : Expression) {
544 NegExpression.push_back({ExprElt.Pred, !ExprElt.Neg, ExprElt.FJumpOffset,
545 ExprElt.TJumpOffset});
546 }
547 return Predicate(std::move(NegExpression));
548 };
549
550 Predicate operator&&(const Predicate &RHS) const {
551 SmallVector<Elt, 8> AndExpression = Expression;
552
553 unsigned RHSSize = RHS.Expression.size();
554 unsigned ResultIdx = Expression.size();
555 for (unsigned i = 0; i < ResultIdx; ++i) {
556 // LHS results in false, whole expression results in false.
557 if (i + AndExpression[i].FJumpOffset == ResultIdx)
558 AndExpression[i].FJumpOffset += RHSSize;
559 }
560
561 AndExpression.append(RHS.Expression);
562
563 return Predicate(std::move(AndExpression));
564 }
565
566 Predicate operator||(const Predicate &RHS) const {
567 SmallVector<Elt, 8> OrExpression = Expression;
568
569 unsigned RHSSize = RHS.Expression.size();
570 unsigned ResultIdx = Expression.size();
571 for (unsigned i = 0; i < ResultIdx; ++i) {
572 // LHS results in true, whole expression results in true.
573 if (i + OrExpression[i].TJumpOffset == ResultIdx)
574 OrExpression[i].TJumpOffset += RHSSize;
575 }
576
577 OrExpression.append(RHS.Expression);
578
579 return Predicate(std::move(OrExpression));
580 }
581};
582
583// Initialize rules
586 : ST(&_ST), MRI(&_MRI) {
587
588 addRulesForGOpcs({G_ADD, G_SUB}, Standard)
589 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})
590 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
591 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
592 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
594 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
595 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}})
596 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
599
600 addRulesForGOpcs({G_UADDO, G_USUBO}, Standard)
601 .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32}})
602 .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32}});
603
604 addRulesForGOpcs({G_UADDE, G_USUBE, G_SADDE, G_SSUBE}, Standard)
606 .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32, Vcc}});
607
608 addRulesForGOpcs({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT}, Standard)
609 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}})
610 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
611 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
612 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
614 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
615
616 bool HasVecMulU64 = ST->hasVMulU64Inst();
617 addRulesForGOpcs({G_MUL}, Standard)
618 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
619 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
620 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
621 .Uni(S64, {{SgprB64}, {SgprB64, SgprB64}})
623 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
624 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})
625 .Div(S64, {{VgprB64}, {VgprB64, VgprB64}}, HasVecMulU64)
626 .Div(S64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32Mul}, !HasVecMulU64);
627
628 bool hasMulHi = ST->hasScalarMulHiInsts();
629 addRulesForGOpcs({G_UMULH, G_SMULH}, Standard)
630 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
631 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasMulHi)
632 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasMulHi);
633
634 addRulesForGOpcs({G_AMDGPU_MAD_U64_U32}, Standard)
635 .Div(S64, {{Vgpr64, Vcc}, {Vgpr32, Vgpr32, Vgpr64}})
637
638 bool HasScalarSMulU64 = ST->hasScalarSMulU64();
639 addRulesForGOpcs({G_AMDGPU_S_MUL_U64_U32, G_AMDGPU_S_MUL_I64_I32}, Standard)
640 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}, UniMul64}, HasScalarSMulU64)
641 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}, DivSMulToMAD});
642
643 addRulesForGOpcs({G_XOR, G_OR, G_AND}, StandardB)
645 .Any({{DivS1}, {{Vcc}, {Vcc, Vcc}}})
646 .Any({{UniS16}, {{Sgpr16}, {Sgpr16, Sgpr16}}})
647 .Any({{DivS16}, {{Vgpr16}, {Vgpr16, Vgpr16}}})
648 .Uni(B32, {{SgprB32}, {SgprB32, SgprB32}})
649 .Div(B32, {{VgprB32}, {VgprB32, VgprB32}})
650 .Uni(B64, {{SgprB64}, {SgprB64, SgprB64}})
651 .Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32});
652
653 addRulesForGOpcs({G_SHL}, Standard)
654 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32ZExt}})
655 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
657 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
658 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
659 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
660 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
661 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
662
663 addRulesForGOpcs({G_LSHR}, Standard)
664 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}})
665 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
667 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
668 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
669 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
670 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
671 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
672
673 addRulesForGOpcs({G_ASHR}, Standard)
674 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32ZExt}})
675 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
677 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
678 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
679 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
680 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
681 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
682
683 addRulesForGOpcs({G_FSHR}, Standard)
684 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
685 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
686
687 addRulesForGOpcs({G_BSWAP}, Standard)
688 .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
689 .Div(S16, {{Vgpr16}, {Vgpr16}})
690 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
691 .Div(S32, {{Vgpr32}, {Vgpr32}})
692 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}})
693 .Div(V2S16, {{VgprV2S16}, {VgprV2S16}});
694
695 addRulesForGOpcs({G_AMDGPU_CVT_F32_UBYTE0, G_AMDGPU_CVT_F32_UBYTE1,
696 G_AMDGPU_CVT_F32_UBYTE2, G_AMDGPU_CVT_F32_UBYTE3,
697 G_AMDGPU_RCP_IFLAG},
698 Standard)
699 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
700 .Div(S32, {{Vgpr32}, {Vgpr32}});
701
702 addRulesForGOpcs({G_FRAME_INDEX}).Any({{UniP5, _}, {{SgprP5}, {None}}});
703
704 addRulesForGOpcs({G_UBFX, G_SBFX}, Standard)
705 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}, S_BFE})
706 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}})
707 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32, Sgpr32}, S_BFE})
708 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32, Vgpr32}, V_BFE});
709
710 addRulesForGOpcs({G_SMIN, G_SMAX}, Standard)
711 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32SExt}})
712 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
713 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
714 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
716 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
717 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
718 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}});
719
720 addRulesForGOpcs({G_UMIN, G_UMAX}, Standard)
721 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}})
722 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
723 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
724 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
726 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
727 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
728 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}});
729
730 addRulesForGOpcs({G_IMPLICIT_DEF})
731 .Any({{UniS1}, {{Sgpr32Trunc}, {}}})
732 .Any({{UniS16}, {{Sgpr16}, {}}})
733 .Any({{UniBRC}, {{SgprBRC}, {}}});
734
735 addRulesForGOpcs({G_CONSTANT}, Standard)
736 .Any({{UniS1, _}, {{Sgpr32Trunc}, {}, UniCstExt}})
737 .Uni(S16, {{Sgpr16}, {}})
738 .Uni(S32, {{Sgpr32}, {}})
739 .Uni(S64, {{Sgpr64}, {}})
740 .Any({{UniPtr32, _}, {{SgprPtr32}, {}}})
741 .Any({{UniPtr64, _}, {{SgprPtr64}, {}}});
742
743 addRulesForGOpcs({G_FCONSTANT}, Standard)
744 .Uni(S16, {{Sgpr16}, {}})
745 .Uni(S32, {{Sgpr32}, {}})
746 .Uni(S64, {{Sgpr64}, {}});
747
748 addRulesForGOpcs({G_FREEZE})
749 .Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt}}})
750 .Any({{DivS1}, {{Vcc}, {Vcc}}})
751 .Any({{UniS16}, {{Sgpr16}, {Sgpr16}}})
752 .Any({{UniBRC}, {{SgprBRC}, {SgprBRC}}})
753 .Any({{DivBRC}, {{VgprBRC}, {VgprBRC}}});
754
755 addRulesForGOpcs({G_BITCAST})
756 .Any({{UniBRC}, {{SgprBRC}, {SgprBRC}}})
757 .Any({{DivBRC}, {{VgprBRC}, {VgprBRC}}});
758
759 addRulesForGOpcs({G_UNMERGE_VALUES})
760 .Any({{UniS16}, {{}, {}, UnmergeToShiftTrunc}})
761 .Any({{UniBRC}, {{}, {}, VerifyAllSgpr}})
762 .Any({{DivBRC}, {{}, {}, ApplyAllVgpr}});
763
764 addRulesForGOpcs({G_BUILD_VECTOR, G_MERGE_VALUES})
765 .Any({{UniBRC, S16}, {{}, {}, VerifyAllSgpr}})
766 .Any({{UniBRC, BRC}, {{}, {}, VerifyAllSgpr}})
767 .Any({{DivBRC, S16}, {{}, {}, ApplyAllVgpr}})
768 .Any({{DivBRC, BRC}, {{}, {}, ApplyAllVgpr}});
769
770 addRulesForGOpcs({G_CONCAT_VECTORS})
771 .Any({{UniBRC, BRC}, {{}, {}, VerifyAllSgpr}})
772 .Any({{DivBRC, BRC}, {{}, {}, ApplyAllVgpr}});
773
774 addRulesForGOpcs({G_PHI})
775 .Any({{UniS1}, {{}, {}, AextToS32InIncomingBlockGPHI}})
776 .Any({{UniS16}, {{}, {}, VerifyAllSgprGPHI}})
777 .Any({{UniBRC}, {{}, {}, VerifyAllSgprGPHI}})
778 .Any({{DivBRC}, {{}, {}, VerifyAllSgprOrVgprGPHI}});
779
780 addRulesForGOpcs({G_EXTRACT_VECTOR_ELT})
781 .Any({{UniB32, UniBRC, UniS32}, {{SgprB32}, {SgprBRC, Sgpr32}}})
782 .Any({{DivB32, DivBRC, UniS32}, {{VgprB32}, {VgprBRC, Sgpr32}}})
783 .Any({{DivB32, BRC, DivS32},
785 .Any({{UniB64, UniBRC, UniS32}, {{SgprB64}, {SgprBRC, Sgpr32}}})
786 .Any({{DivB64, DivBRC, UniS32},
788 .Any({{DivB64, BRC, DivS32},
790
791 addRulesForGOpcs({G_INSERT_VECTOR_ELT})
793 {{SgprBRC}, {SgprBRC, SgprB32, Sgpr32}}})
794 .Any(
795 {{DivBRC, BRC, B32, UniS32}, {{VgprBRC}, {VgprBRC, VgprB32, Sgpr32}}})
796 .Any({{DivBRC, BRC, B32, DivS32},
800 .Any({{DivBRC, BRC, B64, UniS32},
802 .Any({{DivBRC, BRC, B64, DivS32},
804
805 // INTERSECT_RAY {Div}, {{VgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
806 // INTERSECT_RAY {Uni}, {{UniInVgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
807 addRulesForGOpcs({G_AMDGPU_BVH_INTERSECT_RAY, G_AMDGPU_BVH_DUAL_INTERSECT_RAY,
808 G_AMDGPU_BVH8_INTERSECT_RAY})
809 .Any({{}, {{}, {}, ApplyBVH_INTERSECT_RAY}});
810
811 // LOAD {Div}, {{VgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
812 // LOAD {Uni}, {{UniInVgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
813 // LOAD_NORET {}, {{}, {Imm, VgprSrc, ..., Sgpr_WF_RsrcIdx}}
814 // STORE {}, {{}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
815 addRulesForGOpcs({G_AMDGPU_INTRIN_IMAGE_LOAD, G_AMDGPU_INTRIN_IMAGE_LOAD_D16,
816 G_AMDGPU_INTRIN_IMAGE_LOAD_NORET,
817 G_AMDGPU_INTRIN_IMAGE_STORE,
818 G_AMDGPU_INTRIN_IMAGE_STORE_D16})
819 .Any({{}, {{}, {}, ApplyINTRIN_IMAGE}});
820
821 Predicate isSignedICmp([](const MachineInstr &MI) -> bool {
822 auto Pred =
823 static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
824 return CmpInst::isSigned(Pred);
825 });
826
827 Predicate isEqualityICmp([](const MachineInstr &MI) -> bool {
828 auto Pred =
829 static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
830 return ICmpInst::isEquality(Pred);
831 });
832
833 bool HasScalarCompareEq64 = ST->hasScalarCompareEq64();
834 // clang-format off
835 addRulesForGOpcs({G_ICMP})
836 .Any({{{UniS1, _, S16}, isEqualityICmp}, {{Sgpr32Trunc}, {None, Sgpr32ZExt, Sgpr32ZExt}}})
837 .Any({{{UniS1, _, S16}, !isEqualityICmp && isSignedICmp}, {{Sgpr32Trunc}, {None, Sgpr32SExt, Sgpr32SExt}}})
838 .Any({{{UniS1, _, S16}, !isEqualityICmp && !isSignedICmp}, {{Sgpr32Trunc}, {None, Sgpr32ZExt, Sgpr32ZExt}}})
839 .Any({{{DivS1, _, S16}}, {{Vcc}, {None, Vgpr16, Vgpr16}}})
840 .Any({{{UniS1, _, S32}}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
841 .Any({{{DivS1, _, S32}}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
842 .Any({{{UniS1, _, S64}, isEqualityICmp}, {{Sgpr32Trunc}, {None, Sgpr64, Sgpr64}}}, HasScalarCompareEq64)
843 .Any({{{UniS1, _, S64}, isEqualityICmp}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}}, !HasScalarCompareEq64)
844 .Any({{{UniS1, _, S64}, !isEqualityICmp}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}})
845 .Any({{{DivS1, _, S64}}, {{Vcc}, {None, Vgpr64, Vgpr64}}})
846 .Any({{{UniS1, _, Ptr32}}, {{Sgpr32Trunc}, {None, SgprPtr32, SgprPtr32}}})
847 .Any({{{DivS1, _, Ptr32}}, {{Vcc}, {None, VgprPtr32, VgprPtr32}}})
848 .Any({{{UniS1, _, Ptr64}, isEqualityICmp}, {{Sgpr32Trunc}, {None, SgprPtr64, SgprPtr64}}}, HasScalarCompareEq64)
849 .Any({{{UniS1, _, Ptr64}, isEqualityICmp}, {{UniInVcc}, {None, VgprPtr64, VgprPtr64}}}, !HasScalarCompareEq64)
850 .Any({{{UniS1, _, Ptr64}, !isEqualityICmp}, {{UniInVcc}, {None, VgprPtr64, VgprPtr64}}})
851 .Any({{{DivS1, _, Ptr64}}, {{Vcc}, {None, VgprPtr64, VgprPtr64}}});
852 // clang-format on
853
854 addRulesForGOpcs({G_BRCOND})
855 .Any({{UniS1}, {{}, {Sgpr32AExtBoolInReg}}})
856 .Any({{DivS1}, {{}, {Vcc}}});
857
858 addRulesForGOpcs({G_BR}).Any({{_}, {{}, {None}}});
859
860 addRulesForGOpcs({G_SELECT}, StandardB)
861 .Any({{DivS16}, {{Vgpr16}, {Vcc, Vgpr16, Vgpr16}}})
863 .Div(B32, {{VgprB32}, {Vcc, VgprB32, VgprB32}})
867
868 addRulesForGOpcs({G_ANYEXT})
869 .Any({{UniS16, S1}, {{None}, {None}}}) // should be combined away
870 .Any({{UniS32, S1}, {{None}, {None}}}) // should be combined away
871 .Any({{UniS64, S1}, {{None}, {None}}}) // should be combined away
872 .Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
873 .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
874 .Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
875 .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
876 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
877 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
878 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
879
880 bool Has16bitCmp = ST->has16BitInsts();
881
882 // In global-isel G_TRUNC in-reg is treated as no-op, inst selected into COPY.
883 // It is up to user to deal with truncated bits.
884 // S1, S16, S32 and S64 results are handled with specific rules. Remaining
885 // (result, source) pairs with valid register classes are covered by the
886 // generic UniBRC/DivBRC wildcard rules.
887 addRulesForGOpcs({G_TRUNC})
888 .Any({{UniS1, UniS16}, {{None}, {None}}}) // should be combined away
889 .Any({{UniS1, UniS32}, {{None}, {None}}}) // should be combined away
890 .Any({{UniS1, UniS64}, {{None}, {None}}}) // should be combined away
891 .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}})
892 .Any({{UniBRC, UniBRC}, {{SgprBRC}, {SgprBRC}}})
893 .Any({{DivBRC, DivBRC}, {{VgprBRC}, {VgprBRC}}})
894 .Any({{UniV2S16, V2S32}, {{SgprV2S16}, {SgprV2S32}}})
895 .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}})
896 // This is non-trivial. VgprToVccCopy is done using compare instruction.
897 .Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}}, Has16bitCmp)
899 !Has16bitCmp)
900 .Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}})
901 .Any({{DivS1, DivS64}, {{Vcc}, {Vgpr64}, VgprToVccCopy}});
902
903 addRulesForGOpcs({G_ZEXT})
907 .Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
908 .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
909 .Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
910 .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
911 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
912 // not extending S16 to S32 is questionable.
913 .Any({{UniS64, S16}, {{Sgpr64}, {Sgpr32ZExt}, Ext32To64}})
914 .Any({{DivS64, S16}, {{Vgpr64}, {Vgpr32ZExt}, Ext32To64}})
915 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
916 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
917
918 addRulesForGOpcs({G_SEXT})
922 .Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
923 .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
924 .Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
925 .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
926 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
927 // not extending S16 to S32 is questionable.
928 .Any({{UniS64, S16}, {{Sgpr64}, {Sgpr32SExt}, Ext32To64}})
929 .Any({{DivS64, S16}, {{Vgpr64}, {Vgpr32SExt}, Ext32To64}})
930 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
931 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
932
933 addRulesForGOpcs({G_SEXT_INREG})
934 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}})
935 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
936 .Any({{UniS64, S64}, {{Sgpr64}, {Sgpr64}}})
938
939 addRulesForGOpcs({G_ASSERT_ZEXT, G_ASSERT_SEXT}, Standard)
940 .Uni(S32, {{Sgpr32}, {Sgpr32, Imm}})
941 .Div(S32, {{Vgpr32}, {Vgpr32, Imm}})
942 .Uni(S64, {{Sgpr64}, {Sgpr64, Imm}})
943 .Div(S64, {{Vgpr64}, {Vgpr64, Imm}});
944
945 addRulesForGOpcs({G_ASSERT_ALIGN}, Standard)
946 .Uni(S32, {{Sgpr32}, {Sgpr32}})
947 .Div(S32, {{Vgpr32}, {Vgpr32}})
948 .Uni(S64, {{Sgpr64}, {Sgpr64}})
949 .Div(S64, {{Vgpr64}, {Vgpr64}})
950 .Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32}}})
951 .Any({{DivPtr32}, {{VgprPtr32}, {VgprPtr32}}})
952 .Any({{UniPtr64}, {{SgprPtr64}, {SgprPtr64}}})
953 .Any({{DivPtr64}, {{VgprPtr64}, {VgprPtr64}}});
954
955 // Atomic read-modify-write operations: result and value are always VGPR,
956 // pointer varies by address space.
957 addRulesForGOpcs({G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_XCHG,
958 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
959 G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN,
960 G_ATOMICRMW_UMAX, G_ATOMICRMW_UINC_WRAP,
961 G_ATOMICRMW_UDEC_WRAP, G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
962 .Any({{DivS32, P0, S32}, {{Vgpr32}, {VgprP0, Vgpr32}}})
963 .Any({{DivS64, P0, S64}, {{Vgpr64}, {VgprP0, Vgpr64}}})
964 .Any({{DivS32, P1, S32}, {{Vgpr32}, {VgprP1, Vgpr32}}})
965 .Any({{DivS64, P1, S64}, {{Vgpr64}, {VgprP1, Vgpr64}}})
966 .Any({{DivS32, P3, S32}, {{Vgpr32}, {VgprP3, Vgpr32}}})
967 .Any({{DivS64, P3, S64}, {{Vgpr64}, {VgprP3, Vgpr64}}});
968
969 addRulesForGOpcs({G_ATOMICRMW_USUB_SAT, G_ATOMICRMW_USUB_COND})
970 .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0, Vgpr32}}})
971 .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1, Vgpr32}}})
972 .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3, Vgpr32}}});
973
974 bool HasAtomicFlatPkAdd16Insts = ST->hasAtomicFlatPkAdd16Insts();
975 bool HasAtomicBufferGlobalPkAddF16Insts =
976 ST->hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
977 ST->hasAtomicBufferGlobalPkAddF16Insts();
978 bool HasAtomicDsPkAdd16Insts = ST->hasAtomicDsPkAdd16Insts();
979 addRulesForGOpcs({G_ATOMICRMW_FADD})
980 .Any({{DivS32, P0, S32}, {{Vgpr32}, {VgprP0, Vgpr32}}})
981 .Any({{DivS64, P0, S64}, {{Vgpr64}, {VgprP0, Vgpr64}}})
982 .Any({{DivS32, P1, S32}, {{Vgpr32}, {VgprP1, Vgpr32}}})
983 .Any({{DivS64, P1, S64}, {{Vgpr64}, {VgprP1, Vgpr64}}})
984 .Any({{DivS32, P3, S32}, {{Vgpr32}, {VgprP3, Vgpr32}}})
985 .Any({{DivS64, P3, S64}, {{Vgpr64}, {VgprP3, Vgpr64}}})
986 .Any({{DivV2S16, P0, V2S16}, {{VgprV2S16}, {VgprP0, VgprV2S16}}},
987 HasAtomicFlatPkAdd16Insts)
988 .Any({{DivV2S16, P1, V2S16}, {{VgprV2S16}, {VgprP1, VgprV2S16}}},
989 HasAtomicBufferGlobalPkAddF16Insts)
990 .Any({{DivV2S16, P3, V2S16}, {{VgprV2S16}, {VgprP3, VgprV2S16}}},
991 HasAtomicDsPkAdd16Insts);
992
993 addRulesForGOpcs({G_ATOMIC_CMPXCHG})
994 .Any({{DivS32, P2}, {{Vgpr32}, {VgprP2, Vgpr32, Vgpr32}}})
995 .Any({{DivS64, P2}, {{Vgpr64}, {VgprP2, Vgpr64, Vgpr64}}})
996 .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3, Vgpr32, Vgpr32}}})
997 .Any({{DivS64, P3}, {{Vgpr64}, {VgprP3, Vgpr64, Vgpr64}}});
998
999 addRulesForGOpcs({G_AMDGPU_ATOMIC_CMPXCHG})
1000 .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0, VgprV2S32}}})
1001 .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1, VgprV2S32}}})
1002 .Any({{DivS64, P0}, {{Vgpr64}, {VgprP0, VgprV2S64}}})
1003 .Any({{DivS64, P1}, {{Vgpr64}, {VgprP1, VgprV2S64}}});
1004
1005 addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_CMPSWAP}, Standard)
1006 .Div(S32, {{Vgpr32},
1008 .Div(S64, {{Vgpr64},
1010
1011 addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_ADD, G_AMDGPU_BUFFER_ATOMIC_AND,
1012 G_AMDGPU_BUFFER_ATOMIC_DEC, G_AMDGPU_BUFFER_ATOMIC_FMAX,
1013 G_AMDGPU_BUFFER_ATOMIC_FMIN, G_AMDGPU_BUFFER_ATOMIC_INC,
1014 G_AMDGPU_BUFFER_ATOMIC_OR, G_AMDGPU_BUFFER_ATOMIC_SMAX,
1015 G_AMDGPU_BUFFER_ATOMIC_SMIN, G_AMDGPU_BUFFER_ATOMIC_SUB,
1016 G_AMDGPU_BUFFER_ATOMIC_SWAP, G_AMDGPU_BUFFER_ATOMIC_UMAX,
1017 G_AMDGPU_BUFFER_ATOMIC_UMIN, G_AMDGPU_BUFFER_ATOMIC_XOR},
1018 Standard)
1021
1022 bool hasSMRDx3 = ST->hasScalarDwordx3Loads();
1023 bool hasSMRDSmall = ST->hasScalarSubwordLoads();
1024 bool usesTrue16 = ST->useRealTrue16Insts();
1025
1026 Predicate isAlign16([](const MachineInstr &MI) -> bool {
1027 return (*MI.memoperands_begin())->getAlign() >= Align(16);
1028 });
1029
1030 Predicate isAlign4([](const MachineInstr &MI) -> bool {
1031 return (*MI.memoperands_begin())->getAlign() >= Align(4);
1032 });
1033
1034 Predicate isAtomicMMO([](const MachineInstr &MI) -> bool {
1035 return (*MI.memoperands_begin())->isAtomic();
1036 });
1037
1038 Predicate isUniMMO([](const MachineInstr &MI) -> bool {
1039 return AMDGPU::isUniformMMO(*MI.memoperands_begin());
1040 });
1041
1042 Predicate isConst([](const MachineInstr &MI) -> bool {
1043 // Address space in MMO can be different than address space on pointer.
1044 const MachineMemOperand *MMO = *MI.memoperands_begin();
1045 const unsigned AS = MMO->getAddrSpace();
1046 return AS == AMDGPUAS::CONSTANT_ADDRESS ||
1048 });
1049
1050 Predicate isVolatileMMO([](const MachineInstr &MI) -> bool {
1051 return (*MI.memoperands_begin())->isVolatile();
1052 });
1053
1054 Predicate isInvMMO([](const MachineInstr &MI) -> bool {
1055 return (*MI.memoperands_begin())->isInvariant();
1056 });
1057
1058 Predicate isNoClobberMMO([](const MachineInstr &MI) -> bool {
1059 return (*MI.memoperands_begin())->getFlags() & MONoClobber;
1060 });
1061
1062 Predicate isNaturalAligned([](const MachineInstr &MI) -> bool {
1063 const MachineMemOperand *MMO = *MI.memoperands_begin();
1064 return MMO->getAlign() >= Align(MMO->getSize().getValue());
1065 });
1066
1067 Predicate is8Or16BitMMO([](const MachineInstr &MI) -> bool {
1068 const MachineMemOperand *MMO = *MI.memoperands_begin();
1069 const unsigned MemSize = 8 * MMO->getSize().getValue();
1070 return MemSize == 16 || MemSize == 8;
1071 });
1072
1073 Predicate is32BitMMO([](const MachineInstr &MI) -> bool {
1074 const MachineMemOperand *MMO = *MI.memoperands_begin();
1075 return 8 * MMO->getSize().getValue() == 32;
1076 });
1077
1078 auto isUL = !isAtomicMMO && isUniMMO && (isConst || !isVolatileMMO) &&
1079 (isConst || isInvMMO || isNoClobberMMO);
1080
1081 // clang-format off
1082 // TODO: S32Dst, 16-bit any-extending load should not appear on True16 targets
1083 addRulesForGOpcs({G_LOAD})
1084 // flat, addrspace(0), never uniform - flat_load
1085 .Any({{DivS16, P0}, {{Vgpr16}, {VgprP0}}}, usesTrue16)
1086 .Any({{DivB32, P0}, {{VgprB32}, {VgprP0}}}) // 32-bit load, 8-bit and 16-bit any-extending load
1087 .Any({{DivB64, P0}, {{VgprB64}, {VgprP0}}})
1088 .Any({{DivB96, P0}, {{VgprB96}, {VgprP0}}})
1089 .Any({{DivB128, P0}, {{VgprB128}, {VgprP0}}})
1090
1091 // global, addrspace(1)
1092 // divergent - global_load
1093 .Any({{DivS16, P1}, {{Vgpr16}, {VgprP1}}}, usesTrue16)
1094 .Any({{DivB32, P1}, {{VgprB32}, {VgprP1}}}) //32-bit load, 8-bit and 16-bit any-extending load
1095 .Any({{DivB64, P1}, {{VgprB64}, {VgprP1}}})
1096 .Any({{DivB96, P1}, {{VgprB96}, {VgprP1}}})
1097 .Any({{DivB128, P1}, {{VgprB128}, {VgprP1}}})
1098 .Any({{DivB256, P1}, {{VgprB256}, {VgprP1}, SplitLoad}})
1099 .Any({{DivB512, P1}, {{VgprB512}, {VgprP1}, SplitLoad}})
1100
1101 // uniform - s_load
1102 .Any({{{UniS16, P1}, isNaturalAligned && isUL}, {{Sgpr32Trunc}, {SgprP1}}}, usesTrue16 && hasSMRDSmall) // s16 load
1103 .Any({{{UniS16, P1}, isAlign4 && isUL}, {{Sgpr32Trunc}, {SgprP1}, WidenMMOToS32}}, usesTrue16 && !hasSMRDSmall) // s16 load to 32-bit load
1104 .Any({{{UniB32, P1}, isNaturalAligned && isUL}, {{SgprB32}, {SgprP1}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1105 // TODO: SplitLoad when !isNaturalAligned && isUL and target hasSMRDSmall
1106 .Any({{{UniB32, P1}, is8Or16BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP1}, WidenMMOToS32}}, !hasSMRDSmall) //8-bit and 16-bit any-extending load to 32-bit load
1107 .Any({{{UniB32, P1}, is32BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP1}}}) //32-bit load
1108 .Any({{{UniB64, P1}, isAlign4 && isUL}, {{SgprB64}, {SgprP1}}})
1109 .Any({{{UniB96, P1}, isAlign16 && isUL}, {{SgprB96}, {SgprP1}, WidenLoad}}, !hasSMRDx3)
1110 .Any({{{UniB96, P1}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP1}, SplitLoad}}, !hasSMRDx3)
1111 .Any({{{UniB96, P1}, isAlign4 && isUL}, {{SgprB96}, {SgprP1}}}, hasSMRDx3)
1112 .Any({{{UniB128, P1}, isAlign4 && isUL}, {{SgprB128}, {SgprP1}}})
1113 .Any({{{UniB256, P1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
1114 .Any({{{UniB512, P1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
1115
1116 // Uniform via global or buffer load, for example volatile or non-aligned
1117 // uniform load. Not using standard {{UniInVgprTy}, {VgprP1}} since it is
1118 // selected as global_load, use SgprP1 for pointer instead to match
1119 // patterns without flat-for-global, default for GFX7 and older.
1120 // -> +flat-for-global + {{UniInVgprTy}, {SgprP1}} - global_load
1121 // -> -flat-for-global + {{UniInVgprTy}, {SgprP1}} - buffer_load
1122 .Any({{{UniS16, P1}, !isNaturalAligned || !isUL}, {{UniInVgprS16}, {SgprP1}}}, usesTrue16 && hasSMRDSmall) // s16 load
1123 .Any({{{UniS16, P1}, !isAlign4 || !isUL}, {{UniInVgprS16}, {SgprP1}}}, usesTrue16 && !hasSMRDSmall) // s16 load
1124 .Any({{{UniB32, P1}, !isNaturalAligned || !isUL}, {{UniInVgprB32}, {SgprP1}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1125 .Any({{{UniB32, P1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}}, !hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1126 .Any({{{UniB64, P1}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP1}}})
1127 .Any({{{UniB96, P1}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP1}}})
1128 .Any({{{UniB128, P1}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP1}}})
1129 .Any({{{UniB256, P1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {SgprP1}, SplitLoad}})
1130 .Any({{{UniB512, P1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {SgprP1}, SplitLoad}})
1131
1132 // local, addrspace(3) - ds_load
1133 .Any({{DivS16, P3}, {{Vgpr16}, {VgprP3}}}, usesTrue16)
1134 .Any({{DivB32, P3}, {{VgprB32}, {VgprP3}}}) // 32-bit load, 8-bit and 16-bit any-extending load
1135 .Any({{DivB64, P3}, {{VgprB64}, {VgprP3}}})
1136 .Any({{DivB96, P3}, {{VgprB96}, {VgprP3}}})
1137 .Any({{DivB128, P3}, {{VgprB128}, {VgprP3}}})
1138
1139 .Any({{UniS16, P3}, {{UniInVgprS16}, {SgprP3}}}, usesTrue16) // 16-bit load
1140 .Any({{UniB32, P3}, {{UniInVgprB32}, {VgprP3}}}) // 32-bit load, 8-bit and 16-bit any-extending load
1141 .Any({{UniB64, P3}, {{UniInVgprB64}, {VgprP3}}})
1142 .Any({{UniB96, P3}, {{UniInVgprB96}, {VgprP3}}})
1143 .Any({{UniB128, P3}, {{UniInVgprB128}, {VgprP3}}})
1144
1145 // constant, addrspace(4)
1146 // divergent - global_load
1147 .Any({{DivS16, P4}, {{Vgpr16}, {VgprP4}}}, usesTrue16)
1148 .Any({{DivB32, P4}, {{VgprB32}, {VgprP4}}}) //32-bit load, 8-bit and 16-bit any-extending load
1149 .Any({{DivB64, P4}, {{VgprB64}, {VgprP4}}})
1150 .Any({{DivB96, P4}, {{VgprB96}, {VgprP4}}})
1151 .Any({{DivB128, P4}, {{VgprB128}, {VgprP4}}})
1152 .Any({{DivB256, P4}, {{VgprB256}, {VgprP4}, SplitLoad}})
1153 .Any({{DivB512, P4}, {{VgprB512}, {VgprP4}, SplitLoad}})
1154
1155 // uniform - s_load
1156 .Any({{{UniS16, P4}, isNaturalAligned && isUL}, {{Sgpr32Trunc}, {SgprP4}}}, usesTrue16 && hasSMRDSmall) // s16 load
1157 .Any({{{UniS16, P4}, isAlign4 && isUL}, {{Sgpr32Trunc}, {SgprP4}, WidenMMOToS32}}, usesTrue16 && !hasSMRDSmall) // s16 load to 32-bit load
1158 .Any({{{UniB32, P4}, isNaturalAligned && isUL}, {{SgprB32}, {SgprP4}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1159 .Any({{{UniB32, P4}, is8Or16BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP4}, WidenMMOToS32}}, !hasSMRDSmall) //8-bit and 16-bit any-extending load to 32-bit load
1160 .Any({{{UniB32, P4}, is32BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP4}}}) //32-bit load
1161 .Any({{{UniB64, P4}, isAlign4 && isUL}, {{SgprB64}, {SgprP4}}})
1162 .Any({{{UniB96, P4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasSMRDx3)
1163 .Any({{{UniB96, P4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasSMRDx3)
1164 .Any({{{UniB96, P4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasSMRDx3)
1165 .Any({{{UniB128, P4}, isAlign4 && isUL}, {{SgprB128}, {SgprP4}}})
1166 .Any({{{UniB256, P4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}})
1167 .Any({{{UniB512, P4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}})
1168
1169 // uniform in vgpr - global_load or buffer_load
1170 .Any({{{UniS16, P4}, !isNaturalAligned || !isUL}, {{UniInVgprS16}, {SgprP4}}}, usesTrue16 && hasSMRDSmall) // s16 load
1171 .Any({{{UniS16, P4}, !isAlign4 || !isUL}, {{UniInVgprS16}, {SgprP4}}}, usesTrue16 && !hasSMRDSmall) // s16 load
1172 .Any({{{UniB32, P4}, !isNaturalAligned || !isUL}, {{UniInVgprB32}, {SgprP4}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1173 .Any({{{UniB32, P4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP4}}}, !hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1174 .Any({{{UniB64, P4}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP4}}})
1175 .Any({{{UniB96, P4}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP4}}})
1176 .Any({{{UniB128, P4}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP4}}})
1177 .Any({{{UniB256, P4}, !isAlign4 || !isUL}, {{UniInVgprB256}, {SgprP4}, SplitLoad}})
1178 .Any({{{UniB512, P4}, !isAlign4 || !isUL}, {{UniInVgprB512}, {SgprP4}, SplitLoad}})
1179
1180 // private, addrspace(5), never uniform - scratch_load
1181 .Any({{DivS16, P5}, {{Vgpr16}, {VgprP5}}}, usesTrue16)
1182 .Any({{DivB32, P5}, {{VgprB32}, {VgprP5}}}) // 32-bit load, 8-bit and 16-bit any-extending load
1183 .Any({{DivB64, P5}, {{VgprB64}, {VgprP5}}})
1184 .Any({{DivB96, P5}, {{VgprB96}, {VgprP5}}})
1185 .Any({{DivB128, P5}, {{VgprB128}, {VgprP5}}})
1186
1187 .Any({{DivS32, Ptr128}, {{Vgpr32}, {VgprPtr128}}});
1188
1189
1190 addRulesForGOpcs({G_ZEXTLOAD, G_SEXTLOAD}) // i8 and i16 zeroextending loads
1191 .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0}}})
1192 .Any({{DivS16, P0}, {{Vgpr16}, {VgprP0}}}, usesTrue16)
1193
1194 .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1}}})
1195 .Any({{DivS16, P1}, {{Vgpr16}, {VgprP1}}}, usesTrue16)
1196 .Any({{{UniS32, P1}, isAlign4 && isUL}, {{Sgpr32}, {SgprP1}, WidenMMOToS32}}, !hasSMRDSmall)
1197 .Any({{{UniS32, P1}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP1}}}, hasSMRDSmall)
1198 .Any({{{UniS32, P1}, !isAlign4 || !isUL}, {{UniInVgprS32}, {SgprP1}}}, !hasSMRDSmall)
1199 .Any({{{UniS32, P1}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP1}}}, hasSMRDSmall)
1200
1201 .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3}}})
1202 .Any({{DivS16, P3}, {{Vgpr16}, {VgprP3}}}, usesTrue16)
1203 .Any({{UniS32, P3}, {{UniInVgprS32}, {VgprP3}}})
1204
1205 .Any({{DivS32, P4}, {{Vgpr32}, {VgprP4}}})
1206 .Any({{DivS16, P4}, {{Vgpr16}, {VgprP4}}}, usesTrue16)
1207 .Any({{{UniS32, P4}, isAlign4 && isUL}, {{Sgpr32}, {SgprP4}, WidenMMOToS32}}, !hasSMRDSmall)
1208 .Any({{{UniS32, P4}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP4}}}, hasSMRDSmall)
1209 .Any({{{UniS32, P4}, !isAlign4 || !isUL}, {{UniInVgprS32}, {SgprP4}}}, !hasSMRDSmall)
1210 .Any({{{UniS32, P4}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP4}}}, hasSMRDSmall)
1211
1212 .Any({{DivS32, P5}, {{Vgpr32}, {VgprP5}}})
1213 .Any({{DivS16, P5}, {{Vgpr16}, {VgprP5}}}, usesTrue16);
1214
1215 addRulesForGOpcs({G_STORE})
1216 // addrspace(0)
1217 .Any({{S16, P0}, {{}, {Vgpr16, VgprP0}}}, usesTrue16) // 16-bit store
1218 .Any({{B32, P0}, {{}, {VgprB32, VgprP0}}}) // 32-bit store, 8-bit and 16-bit truncating store
1219 .Any({{B64, P0}, {{}, {VgprB64, VgprP0}}})
1220 .Any({{B96, P0}, {{}, {VgprB96, VgprP0}}})
1221 .Any({{B128, P0}, {{}, {VgprB128, VgprP0}}})
1222
1223 // addrspace(1), there are no stores to addrspace(4)
1224 // For targets:
1225 // - with "+flat-for-global" - global_store
1226 // - without(-flat-for-global) - buffer_store addr64
1227 .Any({{S16, DivP1}, {{}, {Vgpr16, VgprP1}}}, usesTrue16) // 16-bit store
1228 .Any({{B32, DivP1}, {{}, {VgprB32, VgprP1}}}) // 32-bit store, 8-bit and 16-bit truncating store
1229 .Any({{B64, DivP1}, {{}, {VgprB64, VgprP1}}})
1230 .Any({{B96, DivP1}, {{}, {VgprB96, VgprP1}}})
1231 .Any({{B128, DivP1}, {{}, {VgprB128, VgprP1}}})
1232
1233 // For UniP1, use sgpr ptr to match flat-for-global patterns. Targets:
1234 // - with "+flat-for-global" - global_store for both sgpr and vgpr ptr
1235 // - without(-flat-for-global) - need sgpr ptr to select buffer_store
1236 .Any({{S16, UniP1}, {{}, {Vgpr16, SgprP1}}}, usesTrue16) // 16-bit store
1237 .Any({{B32, UniP1}, {{}, {VgprB32, SgprP1}}}) // 32-bit store, 8-bit and 16-bit truncating store
1238 .Any({{B64, UniP1}, {{}, {VgprB64, SgprP1}}})
1239 .Any({{B96, UniP1}, {{}, {VgprB96, SgprP1}}})
1240 .Any({{B128, UniP1}, {{}, {VgprB128, SgprP1}}})
1241
1242 // addrspace(3) and addrspace(5)
1243 .Any({{S16, Ptr32}, {{}, {Vgpr16, VgprPtr32}}}, usesTrue16) // 16-bit store
1244 .Any({{B32, Ptr32}, {{}, {VgprB32, VgprPtr32}}}) // 32-bit store, 8-bit and 16-bit truncating store
1245 .Any({{B64, Ptr32}, {{}, {VgprB64, VgprPtr32}}})
1246 .Any({{B96, Ptr32}, {{}, {VgprB96, VgprPtr32}}})
1247 .Any({{B128, Ptr32}, {{}, {VgprB128, VgprPtr32}}});
1248
1249 // clang-format on
1250
1251 addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD, G_AMDGPU_BUFFER_LOAD_FORMAT,
1252 G_AMDGPU_TBUFFER_LOAD_FORMAT},
1253 StandardB)
1262
1263 addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD_USHORT, G_AMDGPU_BUFFER_LOAD_UBYTE,
1264 G_AMDGPU_BUFFER_LOAD_SSHORT, G_AMDGPU_BUFFER_LOAD_SBYTE},
1265 StandardB)
1268
1269 addRulesForGOpcs(
1270 {G_AMDGPU_BUFFER_LOAD_UBYTE_TFE, G_AMDGPU_BUFFER_LOAD_USHORT_TFE},
1271 StandardB)
1274
1275 addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD_TFE, G_AMDGPU_BUFFER_LOAD_FORMAT_TFE},
1276 StandardB)
1284 .Any({{UniB160},
1286
1287 addRulesForGOpcs(
1288 {G_AMDGPU_BUFFER_LOAD_FORMAT_D16, G_AMDGPU_TBUFFER_LOAD_FORMAT_D16},
1289 StandardB)
1296
1297 addRulesForGOpcs({G_AMDGPU_S_BUFFER_LOAD})
1298 // waterfall expansion is part of S_BUF_to_BUF
1299 .Any({{UniB32}, {{SgprB32}, {SgprV4S32, Sgpr32}}})
1300 .Any({{DivB32, UniV4S32, DivB32},
1302 .Any({{DivB32, DivV4S32, UniB32},
1304 .Any({{DivB32, DivV4S32, DivB32},
1306
1307 .Any({{UniB64}, {{SgprB64}, {SgprV4S32, Sgpr32}}})
1308 .Any({{DivB64, UniV4S32, DivB32},
1310 .Any({{DivB64, DivV4S32, UniB32},
1312 .Any({{DivB64, DivV4S32, DivB32},
1314
1315 .Any({{UniB96}, {{SgprB96}, {SgprV4S32, Sgpr32}}})
1316 .Any({{DivB96, UniV4S32, DivB32},
1318 .Any({{DivB96, DivV4S32, UniB32},
1320 .Any({{DivB96, DivV4S32, DivB32},
1322
1323 .Any({{UniB128}, {{SgprB128}, {SgprV4S32, Sgpr32}}})
1324 .Any({{DivB128, UniV4S32, DivB32},
1326 .Any({{DivB128, DivV4S32, UniB32},
1328 .Any({{DivB128, DivV4S32, DivB32},
1330
1331 .Any({{UniB256}, {{SgprB256}, {SgprV4S32, Sgpr32}}})
1332 .Any({{DivB256, UniV4S32, DivB32},
1334 .Any({{DivB256, DivV4S32, UniB32},
1336 .Any({{DivB256, DivV4S32, DivB32},
1338
1339 .Any({{UniB512}, {{SgprB512}, {SgprV4S32, Sgpr32}}})
1340 .Any({{DivB512, UniV4S32, DivB32},
1342 .Any({{DivB512, DivV4S32, UniB32},
1344 .Any({{DivB512, DivV4S32, DivB32},
1346
1347 addRulesForGOpcs({G_AMDGPU_S_BUFFER_LOAD_SBYTE, G_AMDGPU_S_BUFFER_LOAD_UBYTE,
1348 G_AMDGPU_S_BUFFER_LOAD_SSHORT,
1349 G_AMDGPU_S_BUFFER_LOAD_USHORT})
1351 .Any({{DivS32, UniV4S32, DivS32},
1353 .Any({{DivS32, DivV4S32, UniS32},
1355 .Any({{DivS32, DivV4S32, DivS32},
1357
1358 addRulesForGOpcs({G_AMDGPU_BUFFER_STORE, G_AMDGPU_BUFFER_STORE_BYTE,
1359 G_AMDGPU_BUFFER_STORE_SHORT, G_AMDGPU_BUFFER_STORE_FORMAT,
1360 G_AMDGPU_BUFFER_STORE_FORMAT_D16,
1361 G_AMDGPU_TBUFFER_STORE_FORMAT,
1362 G_AMDGPU_TBUFFER_STORE_FORMAT_D16})
1363 .Any({{B32}, {{}, {VgprB32, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
1364 .Any({{B64}, {{}, {VgprB64, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
1365 .Any({{B96}, {{}, {VgprB96, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
1366 .Any({{B128}, {{}, {VgprB128, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}});
1367
1368 // Buffer atomics: resource descriptor + scalar offset are SGPR, data and
1369 // address components are VGPR.
1370 //
1371 // Operand order (SIInstructions.td BufferAtomicGenericInstruction):
1372 // dst = op vdata, rsrc, vindex, voffset, soffset, offset_imm, cachepolicy,
1373 // idxen_imm
1374 addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_FADD})
1375 .Any({{S32, S32, V4S32, S32, S32, S32},
1377 .Any({{S64, S64, V4S32, S32, S32, S32},
1379 .Any({{V2S16, V2S16, V4S32, S32, S32, S32},
1380 {{VgprV2S16},
1382
1383 addRulesForGOpcs({G_PTR_ADD})
1384 .Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32, Sgpr32}}})
1385 .Any({{DivPtr32}, {{VgprPtr32}, {VgprPtr32, Vgpr32}}})
1386 .Any({{UniPtr64}, {{SgprPtr64}, {SgprPtr64, Sgpr64}}})
1387 .Any({{DivPtr64}, {{VgprPtr64}, {VgprPtr64, Vgpr64}}});
1388
1389 addRulesForGOpcs({G_INTTOPTR})
1390 .Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}})
1391 .Any({{DivPtr32}, {{VgprPtr32}, {Vgpr32}}})
1392 .Any({{UniPtr64}, {{SgprPtr64}, {Sgpr64}}})
1393 .Any({{DivPtr64}, {{VgprPtr64}, {Vgpr64}}})
1394 .Any({{UniPtr128}, {{SgprPtr128}, {Sgpr128}}})
1395 .Any({{DivPtr128}, {{VgprPtr128}, {Vgpr128}}});
1396
1397 addRulesForGOpcs({G_PTRTOINT})
1398 .Any({{UniS32}, {{Sgpr32}, {SgprPtr32}}})
1399 .Any({{DivS32}, {{Vgpr32}, {VgprPtr32}}})
1400 .Any({{UniS64}, {{Sgpr64}, {SgprPtr64}}})
1401 .Any({{DivS64}, {{Vgpr64}, {VgprPtr64}}})
1402 .Any({{UniS128}, {{Sgpr128}, {SgprPtr128}}})
1403 .Any({{DivS128}, {{Vgpr128}, {VgprPtr128}}});
1404
1405 // FIXME: Update llvm/test/CodeGen/AMDGPU/ptrmask.ll to use GlobalISel.
1406 // Currently crashes on P8 (buffer resource) tests due to legalizer issue.
1407 addRulesForGOpcs({G_PTRMASK})
1408 .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}})
1409 .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}})
1410 .Any({{UniP3}, {{SgprP3}, {SgprP3, Sgpr32}}})
1411 .Any({{DivP3}, {{VgprP3}, {VgprP3, Vgpr32}}});
1412
1413 addRulesForGOpcs({G_DYN_STACKALLOC})
1414 .Any({{UniP5, UniS32}, {{SgprP5}, {Sgpr32}, DynStackAlloc}})
1415 .Any({{UniP5, DivS32}, {{SgprP5}, {Vgpr32}, DynStackAlloc}});
1416
1417 addRulesForGOpcs({G_ABS}, Standard)
1418 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}})
1419 .Div(S16, {{Vgpr16}, {Vgpr16}, AbsToNegMax})
1420 .Uni(S32, {{Sgpr32}, {Sgpr32}})
1421 .Div(S32, {{Vgpr32}, {Vgpr32}, AbsToNegMax})
1422 .Uni(V2S16, {{SgprV2S16}, {SgprV2S16}, AbsToS32})
1423 .Div(V2S16, {{VgprV2S16}, {VgprV2S16}, AbsToNegMax});
1424
1425 addRulesForGOpcs({G_BITREVERSE}, Standard)
1426 .Uni(S32, {{Sgpr32}, {Sgpr32}})
1427 .Div(S32, {{Vgpr32}, {Vgpr32}})
1428 .Uni(S64, {{Sgpr64}, {Sgpr64}})
1429 .Div(S64, {{Vgpr64}, {Vgpr64}});
1430
1431 addRulesForGOpcs({G_AMDGPU_FFBH_U32, G_AMDGPU_FFBL_B32, G_CTLZ_ZERO_POISON,
1432 G_CTTZ_ZERO_POISON})
1433 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}})
1434 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
1435 .Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}})
1437
1438 addRulesForGOpcs({G_CTPOP})
1439 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}})
1440 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
1441 .Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}})
1442 .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}, CtPop64To32}});
1443
1444 addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}});
1445
1446 addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard)
1447 .Uni(S64, {{Sgpr64}, {}});
1448
1449 addRulesForGOpcs({G_GET_ROUNDING}, Standard)
1450 .Uni(S32, {{Sgpr32}, {}, LowerGetRounding});
1451
1452 addRulesForGOpcs({G_SET_ROUNDING}, Standard)
1455
1456 addRulesForGOpcs({G_BLOCK_ADDR}).Any({{UniP0}, {{SgprP0}, {}}});
1457
1458 addRulesForGOpcs({G_GLOBAL_VALUE})
1459 .Any({{UniP0}, {{SgprP0}, {}}})
1460 .Any({{UniP1}, {{SgprP1}, {}}})
1461 .Any({{UniP3}, {{SgprP3}, {}}})
1462 .Any({{UniP4}, {{SgprP4}, {}}})
1463 .Any({{UniP8}, {{SgprP8}, {}}});
1464
1465 addRulesForGOpcs({G_AMDGPU_WAVE_ADDRESS}).Any({{UniP5}, {{SgprP5}, {}}});
1466
1467 addRulesForGOpcs({G_AMDGPU_SPONENTRY}, Standard).Uni(S32, {{Sgpr32}, {}});
1468
1469 addRulesForGOpcs({G_SI_CALL})
1470 .Any({{_, UniP0}, {{None}, {SgprP0}}})
1471 .Any({{_, DivP0}, {{None}, {SgprP0Call_WF}}})
1472 .Any({{_, UniP4}, {{None}, {SgprP4}}})
1473 .Any({{_, DivP4}, {{None}, {SgprP4Call_WF}}});
1474
1475 bool hasSALUFloat = ST->hasSALUFloatInsts();
1476
1477 addRulesForGOpcs({G_FADD, G_FMUL, G_STRICT_FADD, G_STRICT_FMUL}, Standard)
1478 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
1479 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
1480 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1481 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
1482 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat)
1483 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1484 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
1485 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
1486 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat)
1488 hasSALUFloat)
1489 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
1493 .Any({{DivV2S64}, {{VgprV2S64}, {VgprV2S64, VgprV2S64}}});
1494
1495 addRulesForGOpcs({G_FSUB, G_STRICT_FSUB}, Standard)
1496 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1497 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1498 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
1499 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
1500 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
1501 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat);
1502
1503 addRulesForGOpcs({G_FMAD}, Standard)
1504 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}})
1505 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
1506 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
1507 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
1508
1509 addRulesForGOpcs({G_FLDEXP, G_STRICT_FLDEXP}, Standard)
1510 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
1511 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1512 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}})
1513 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1514 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr32}})
1515 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
1516
1517 addRulesForGOpcs({G_FMA, G_STRICT_FMA}, Standard)
1518 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
1519 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}})
1520 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64, Vgpr64}})
1521 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64, Vgpr64}})
1525 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16, Sgpr16}}, hasSALUFloat)
1526 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}}, !hasSALUFloat)
1527 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}}, hasSALUFloat)
1528 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}, !hasSALUFloat)
1529 .Uni(V2S16,
1531 hasSALUFloat)
1533 !hasSALUFloat)
1536
1537 addRulesForGOpcs({G_AMDGPU_FMED3}, Standard)
1538 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}})
1539 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
1540 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
1541 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
1542
1543 // TODO: This opcode is generated from the i64->i16 signed clamped pattern in
1544 // the PreLegalizerCombiner. Move the combine to RegBankCombiner to keep more
1545 // instructions on SALU.
1546 addRulesForGOpcs({G_AMDGPU_SMED3}, Standard)
1547 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
1548 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
1549
1550 // FNEG and FABS are either folded as source modifiers or can be selected as
1551 // bitwise XOR and AND with Mask. XOR and AND are available on SALU but for
1552 // targets without SALU float we still select them as VGPR since there would
1553 // be no real sgpr use.
1554 addRulesForGOpcs({G_FNEG, G_FABS}, Standard)
1555 .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasSALUFloat)
1556 .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasSALUFloat)
1557 .Div(S16, {{Vgpr16}, {Vgpr16}})
1558 .Uni(S32, {{UniInVgprS32}, {Vgpr32}}, !hasSALUFloat)
1559 .Uni(S32, {{Sgpr32}, {Sgpr32}}, hasSALUFloat)
1560 .Div(S32, {{Vgpr32}, {Vgpr32}})
1561 .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
1562 .Div(S64, {{Vgpr64}, {Vgpr64}})
1563 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}}, !hasSALUFloat)
1564 .Uni(V2S16, {{SgprV2S16}, {SgprV2S16}, ScalarizeToS16}, hasSALUFloat)
1565 .Div(V2S16, {{VgprV2S16}, {VgprV2S16}})
1566 .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}})
1567 .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}});
1568
1569 addRulesForGOpcs({G_FCANONICALIZE}, Standard)
1570 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
1571 .Div(S32, {{Vgpr32}, {Vgpr32}})
1572 .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
1573 .Div(S16, {{Vgpr16}, {Vgpr16}})
1574 .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
1575 .Div(S64, {{Vgpr64}, {Vgpr64}})
1576 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}})
1577 .Div(V2S16, {{VgprV2S16}, {VgprV2S16}})
1578 .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}})
1579 .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}})
1580 .Any({{UniV2S64}, {{UniInVgprV2S64}, {VgprV2S64}}})
1581 .Any({{DivV2S64}, {{VgprV2S64}, {VgprV2S64}}});
1582
1583 bool hasPST = ST->hasPseudoScalarTrans();
1584 addRulesForGOpcs({G_FSQRT}, Standard)
1585 .Div(S16, {{Vgpr16}, {Vgpr16}})
1586 .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasPST)
1587 .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasPST);
1588
1589 addRulesForGOpcs({G_FPTOUI, G_FPTOSI, G_FPTOUI_SAT, G_FPTOSI_SAT})
1590 .Any({{UniS16, S16}, {{UniInVgprS16}, {Vgpr16}}})
1591 .Any({{DivS16, S16}, {{Vgpr16}, {Vgpr16}}})
1592 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}, hasSALUFloat)
1593 .Any({{UniS32, S16}, {{UniInVgprS32}, {Vgpr16}}}, !hasSALUFloat)
1594 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}})
1595 .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat)
1596 .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat)
1597 .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
1598 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
1599 .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat)
1600 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
1601 .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}})
1602 .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}})
1604 .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}});
1605
1606 addRulesForGOpcs({G_UITOFP, G_SITOFP})
1607 .Any({{UniS16, S16}, {{UniInVgprS16}, {Vgpr16}}})
1608 .Any({{DivS16, S16}, {{Vgpr16}, {Vgpr16}}})
1609 .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat)
1610 .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat)
1611 .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
1612 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
1613 .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat)
1614 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
1615 .Any({{UniS64, S32}, {{UniInVgprS64}, {Vgpr32}}})
1616 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}}});
1617
1618 addRulesForGOpcs({G_AMDGPU_S_BUFFER_PREFETCH})
1620
1621 Predicate IsDataPF([](const MachineInstr &MI) -> bool {
1622 // prefetch cache type: 0 == instruction (I$) prefetch, 1 == data prefetch.
1623 return MI.getOperand(3).getImm() != 0;
1624 });
1625
1626 bool HasSMemPF = ST->hasSafeSmemPrefetch();
1627 bool HasVMemPF = ST->hasVmemPrefInsts();
1628 addRulesForGOpcs({G_PREFETCH})
1629 // Safe smem prefetch keeps both data and instruction prefetch.
1630 .Any({{UniPtr64}, {{}, {SgprPtr64}}}, HasSMemPF)
1631 // Vmem prefetch keeps data prefetch only.
1632 .Any({{{UniPtr64}, IsDataPF}, {{}, {SgprPtr64}}}, !HasSMemPF && HasVMemPF)
1633 .Any({{{UniPtr64}, IsDataPF}, {{}, {}, DeletePrefetch}},
1634 !HasSMemPF && !HasVMemPF)
1635 .Any({{{UniPtr64}, !IsDataPF}, {{}, {}, DeletePrefetch}}, !HasSMemPF)
1636
1637 .Any({{{DivPtr64}, IsDataPF}, {{}, {VgprPtr64}}}, HasVMemPF)
1638 .Any({{{DivPtr64}, IsDataPF}, {{}, {}, DeletePrefetch}}, !HasVMemPF)
1639 .Any({{{DivPtr64}, !IsDataPF}, {{}, {}, DeletePrefetch}})
1640
1641 .Any({{P3}, {{}, {}, DeletePrefetch}})
1642 .Any({{P5}, {{}, {}, DeletePrefetch}})
1643 .Any({{UniP6}, {{}, {SgprP6}}}, HasSMemPF)
1644 .Any({{UniP6}, {{}, {}, DeletePrefetch}}, !HasSMemPF);
1645
1646 addRulesForGOpcs({G_FPEXT})
1647 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}})
1648 .Any({{UniS64, S32}, {{UniInVgprS64}, {Vgpr32}}})
1649 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}}})
1650 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}, hasSALUFloat)
1651 .Any({{UniS32, S16}, {{UniInVgprS32}, {Vgpr16}}}, !hasSALUFloat);
1652
1653 addRulesForGOpcs({G_AMDGPU_CVT_PK_I16_I32}, Standard)
1654 .Uni(V2S16, {{UniInVgprV2S16}, {Vgpr32, Vgpr32}})
1655 .Div(V2S16, {{VgprV2S16}, {Vgpr32, Vgpr32}});
1656
1657 addRulesForGOpcs({G_AMDGPU_FMIN_LEGACY, G_AMDGPU_FMAX_LEGACY}, Standard)
1658 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
1659 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
1660
1661 bool hasSALUMinimumMaximumInsts = ST->hasSALUMinimumMaximumInsts();
1662
1663 addRulesForGOpcs({G_FMINIMUM, G_FMAXIMUM}, Standard)
1664 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUMinimumMaximumInsts)
1665 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUMinimumMaximumInsts)
1666 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1667 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUMinimumMaximumInsts)
1668 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUMinimumMaximumInsts)
1669 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1670 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
1671 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
1673 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
1674
1675 addRulesForGOpcs({G_FMINNUM_IEEE, G_FMAXNUM_IEEE, G_FMINNUM, G_FMAXNUM,
1676 G_FMINIMUMNUM, G_FMAXIMUMNUM},
1677 Standard)
1678 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1679 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1680 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
1681 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
1683 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
1684 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
1685 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
1686 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
1687 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat);
1688
1689 addRulesForGOpcs({G_FPTRUNC})
1690 .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
1691 .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}})
1692 .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}})
1694 .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}})
1695 .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat)
1696 .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat);
1697
1698 addRulesForGOpcs({G_INTRINSIC_FPTRUNC_ROUND})
1699 .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat)
1700 .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat)
1701 .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
1702 .Any({{UniS16, S64}, {{UniInVgprS16}, {Vgpr64}}})
1703 .Any({{DivS16, S64}, {{Vgpr16}, {Vgpr64}}})
1704 .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}})
1705 .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}});
1706
1707 addRulesForGOpcs({G_IS_FPCLASS})
1708 .Any({{DivS1, S16}, {{Vcc}, {Vgpr16}}})
1709 .Any({{UniS1, S16}, {{UniInVcc}, {Vgpr16}}})
1710 .Any({{DivS1, S32}, {{Vcc}, {Vgpr32}}})
1711 .Any({{UniS1, S32}, {{UniInVcc}, {Vgpr32}}})
1712 .Any({{DivS1, S64}, {{Vcc}, {Vgpr64}}})
1713 .Any({{UniS1, S64}, {{UniInVcc}, {Vgpr64}}});
1714
1715 addRulesForGOpcs({G_FCMP}, Standard)
1716 .Any({{UniS1, _, S16}, {{Sgpr32Trunc}, {None, Sgpr16, Sgpr16}}},
1717 hasSALUFloat)
1718 .Any({{UniS1, _, S16}, {{UniInVcc}, {None, Vgpr16, Vgpr16}}},
1719 !hasSALUFloat)
1720 .Any({{DivS1, _, S16}, {{Vcc}, {None, Vgpr16, Vgpr16}}})
1721 .Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}},
1722 hasSALUFloat)
1723 .Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}},
1724 !hasSALUFloat)
1725 .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
1726 .Any({{UniS1, _, S64}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}})
1727 .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}});
1728
1729 addRulesForGOpcs({G_INTRINSIC_ROUNDEVEN, G_FEXP2, G_FLOG2}, Standard)
1730 .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
1731 .Div(S16, {{Vgpr16}, {Vgpr16}})
1732 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
1733 .Div(S32, {{Vgpr32}, {Vgpr32}})
1734 .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
1735 .Div(S64, {{Vgpr64}, {Vgpr64}});
1736
1737 addRulesForGOpcs({G_INTRINSIC_TRUNC, G_FFLOOR, G_FCEIL}, Standard)
1738 .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
1739 .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasSALUFloat)
1740 .Div(S16, {{Vgpr16}, {Vgpr16}})
1741 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
1742 .Uni(S32, {{Sgpr32}, {Sgpr32}}, hasSALUFloat)
1743 .Div(S32, {{Vgpr32}, {Vgpr32}})
1744 .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
1745 .Div(S64, {{Vgpr64}, {Vgpr64}});
1746
1747 addRulesForGOpcs({G_AMDGPU_GLOBAL_LOAD_MONITOR, G_AMDGPU_FLAT_LOAD_MONITOR},
1748 StandardB)
1749 .Uni(B32, {{UniInVgprB32}, {SgprPtr64}})
1750 .Div(B32, {{VgprB32}, {VgprPtr64}})
1751 .Uni(B64, {{UniInVgprB64}, {SgprPtr64}})
1752 .Div(B64, {{VgprB64}, {VgprPtr64}})
1753 .Uni(B128, {{UniInVgprB128}, {SgprPtr64}})
1754 .Div(B128, {{VgprB128}, {VgprPtr64}});
1755
1756 addRulesForGOpcs({G_AMDGPU_WHOLE_WAVE_FUNC_SETUP})
1757 .Any({{DivS1}, {{Vcc}, {}}});
1758
1759 addRulesForGOpcs({G_AMDGPU_WHOLE_WAVE_FUNC_RETURN}).Any({{}, {{}, {Vcc}}});
1760
1761 using namespace Intrinsic;
1762
1763 addRulesForIOpcs({returnaddress}).Any({{UniP0}, {{SgprP0}, {}}});
1764
1765 // Note: amdgcn.icmp with i1 inputs is legalized to ballot in the legalizer,
1766 // so no S1 rules are needed here.
1767 addRulesForIOpcs({amdgcn_icmp})
1768 .Any({{UniS64, _, S16}, {{Sgpr64}, {IntrId, Vgpr16, Vgpr16}}})
1769 .Any({{UniS64, _, S32}, {{Sgpr64}, {IntrId, Vgpr32, Vgpr32}}})
1770 .Any({{UniS64, _, S64}, {{Sgpr64}, {IntrId, Vgpr64, Vgpr64}}})
1771
1772 .Any({{UniS32, _, S16}, {{Sgpr32}, {IntrId, Vgpr16, Vgpr16}}})
1773 .Any({{UniS32, _, S32}, {{Sgpr32}, {IntrId, Vgpr32, Vgpr32}}})
1774 .Any({{UniS32, _, S64}, {{Sgpr32}, {IntrId, Vgpr64, Vgpr64}}});
1775
1776 addRulesForIOpcs({amdgcn_fcmp})
1777 .Any({{UniS64, _, S16}, {{Sgpr64}, {IntrId, Vgpr16, Vgpr16}}})
1778 .Any({{UniS64, _, S32}, {{Sgpr64}, {IntrId, Vgpr32, Vgpr32}}})
1779 .Any({{UniS64, _, S64}, {{Sgpr64}, {IntrId, Vgpr64, Vgpr64}}})
1780
1781 .Any({{UniS32, _, S16}, {{Sgpr32}, {IntrId, Vgpr16, Vgpr16}}})
1782 .Any({{UniS32, _, S32}, {{Sgpr32}, {IntrId, Vgpr32, Vgpr32}}})
1783 .Any({{UniS32, _, S64}, {{Sgpr32}, {IntrId, Vgpr64, Vgpr64}}});
1784
1785 addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {}}});
1786
1787 addRulesForIOpcs({amdgcn_s_getreg}).Any({{}, {{Sgpr32}, {IntrId}}});
1788
1789 addRulesForIOpcs({amdgcn_s_setreg})
1790 .Any({{_, _, S32}, {{}, {IntrId, Imm, SgprB32_ReadFirstLane}}});
1791
1792 addRulesForIOpcs({amdgcn_s_sendmsg, amdgcn_s_sendmsghalt})
1793 .Any({{}, {{}, {IntrId, Imm, SgprB32_M0}}});
1794
1795 addRulesForIOpcs({amdgcn_s_sendmsg_rtn})
1796 .Any({{S32}, {{Sgpr32}, {}}})
1797 .Any({{S64}, {{Sgpr64}, {}}});
1798
1799 addRulesForIOpcs({amdgcn_s_memrealtime, amdgcn_s_memtime}, Standard)
1800 .Uni(S64, {{Sgpr64}, {IntrId}});
1801
1802 addRulesForIOpcs({amdgcn_groupstaticsize, amdgcn_pops_exiting_wave_id,
1803 amdgcn_reloc_constant, amdgcn_s_get_waveid_in_workgroup},
1804 Standard)
1805 .Uni(S32, {{Sgpr32}, {IntrId}});
1806
1807 // Intrinsics with no register operands.
1808 addRulesForIOpcs({amdgcn_asyncmark,
1809 amdgcn_endpgm,
1810 amdgcn_iglp_opt,
1811 amdgcn_init_exec,
1812 amdgcn_s_barrier,
1813 amdgcn_s_barrier_leave,
1814 amdgcn_s_barrier_signal,
1815 amdgcn_s_barrier_wait,
1816 amdgcn_s_monitor_sleep,
1817 amdgcn_s_nop,
1818 amdgcn_s_sethalt,
1819 amdgcn_s_setprio,
1820 amdgcn_s_setprio_inc_wg,
1821 amdgcn_s_sleep,
1822 amdgcn_s_ttracedata_imm,
1823 amdgcn_s_wait_asynccnt,
1824 amdgcn_s_wait_bvhcnt,
1825 amdgcn_s_wait_dscnt,
1826 amdgcn_s_wait_event,
1827 amdgcn_s_wait_event_export_ready,
1828 amdgcn_s_wait_expcnt,
1829 amdgcn_s_wait_kmcnt,
1830 amdgcn_s_wait_loadcnt,
1831 amdgcn_s_wait_samplecnt,
1832 amdgcn_s_wait_storecnt,
1833 amdgcn_s_wait_tensorcnt,
1834 amdgcn_s_waitcnt,
1835 amdgcn_sched_barrier,
1836 amdgcn_sched_group_barrier,
1837 amdgcn_unreachable,
1838 amdgcn_wait_asyncmark,
1839 amdgcn_wave_barrier})
1840 .Any({{}, {{}, {}}});
1841
1842 addRulesForIOpcs({amdgcn_init_exec_from_input})
1843 .Any({{}, {{}, {IntrId, Sgpr32}}});
1844
1845 addRulesForIOpcs({amdgcn_s_ttracedata}).Any({{}, {{}, {IntrId, SgprB32_M0}}});
1846
1847 addRulesForIOpcs({amdgcn_s_sleep_var})
1848 .Any({{}, {{}, {IntrId, SgprB32_ReadFirstLane}}});
1849
1850 addRulesForIOpcs({amdgcn_s_barrier_join, amdgcn_s_wakeup_barrier})
1851 .Any({{}, {{}, {IntrId, SgprB32_M0}}});
1852
1853 addRulesForIOpcs({amdgcn_s_barrier_signal_var, amdgcn_s_barrier_init})
1854 .Any({{}, {{}, {IntrId, SgprB32_M0, SgprB32_M0}}});
1855
1856 addRulesForIOpcs({amdgcn_s_barrier_signal_isfirst})
1857 .Any({{UniS1}, {{Sgpr32Trunc}, {}}});
1858
1859 addRulesForIOpcs(
1860 {amdgcn_s_get_named_barrier_state, amdgcn_s_get_barrier_state}, Standard)
1861 .Uni(S32, {{Sgpr32}, {IntrId, SgprB32_M0}});
1862
1863 addRulesForIOpcs({amdgcn_flat_prefetch}).Any({{}, {{}, {IntrId, VgprP0}}});
1864
1865 addRulesForIOpcs({amdgcn_global_prefetch}).Any({{}, {{}, {IntrId, VgprP1}}});
1866
1867 addRulesForIOpcs({amdgcn_s_prefetch_data, amdgcn_s_prefetch_inst})
1869
1870 addRulesForIOpcs({amdgcn_class})
1871 .Any({{UniS1, _, S16}, {{UniInVcc}, {IntrId, Vgpr16, Vgpr32}}})
1872 .Any({{DivS1, _, S16}, {{Vcc}, {IntrId, Vgpr16, Vgpr32}}})
1873 .Any({{UniS1, _, S32}, {{UniInVcc}, {IntrId, Vgpr32, Vgpr32}}})
1874 .Any({{DivS1, _, S32}, {{Vcc}, {IntrId, Vgpr32, Vgpr32}}})
1875 .Any({{UniS1, _, S64}, {{UniInVcc}, {IntrId, Vgpr64, Vgpr32}}})
1876 .Any({{DivS1, _, S64}, {{Vcc}, {IntrId, Vgpr64, Vgpr32}}});
1877
1878 // This is the "intrinsic lane mask"; it was set to i32/i64 in LLVM IR.
1879 addRulesForIOpcs({amdgcn_end_cf})
1880 .Any({{_, UniS32}, {{}, {IntrId, Sgpr32}}})
1881 .Any({{_, UniS64}, {{}, {IntrId, Sgpr64}}});
1882
1883 addRulesForIOpcs({amdgcn_if_break}, Standard)
1884 .Uni(S64, {{Sgpr64}, {IntrId, Vcc, Sgpr64}})
1885 .Uni(S32, {{Sgpr32}, {IntrId, Vcc, Sgpr32}});
1886
1887 addRulesForIOpcs({amdgcn_exp})
1888 .Any({{_, _, _, S32, S32, S32, S32},
1889 {{}, {IntrId, Imm, Imm, Vgpr32, Vgpr32, Vgpr32, Vgpr32}}});
1890
1891 addRulesForIOpcs({amdgcn_exp_compr})
1892 .Any({{_, _, _, V2S16}, {{}, {IntrId, Imm, Imm, VgprV2S16, VgprV2S16}}});
1893
1894 addRulesForIOpcs({amdgcn_exp_row})
1895 .Any({{_, _, _, S32, S32, S32, S32, _, S32},
1896 {{},
1898 SgprB32_M0}}});
1899
1900 addRulesForIOpcs({amdgcn_lds_direct_load}, StandardB)
1901 .Div(B32, {{VgprB32}, {IntrId, SgprB32_M0}});
1902
1903 addRulesForIOpcs({amdgcn_lds_param_load}, Standard)
1904 .Div(S32, {{Vgpr32}, {IntrId, Imm, Imm, SgprB32_M0}});
1905
1906 addRulesForIOpcs({amdgcn_mbcnt_lo, amdgcn_mbcnt_hi}, Standard)
1907 .Div(S32, {{}, {Vgpr32, None, Vgpr32, Vgpr32}});
1908
1909 addRulesForIOpcs({amdgcn_readfirstlane})
1910 .Any({{UniB32, _, DivB32}, {{}, {SgprB32, None, VgprB32}}})
1911 // This should not exist in the first place; it comes from call lowering
1912 // applying readfirstlane just in case the register is not in an SGPR.
1913 .Any({{UniS32, _, UniS32}, {{}, {Sgpr32, None, Vgpr32}}});
1914
1915 addRulesForIOpcs({amdgcn_readlane}, StandardB)
1917
1918 addRulesForIOpcs({amdgcn_s_quadmask, amdgcn_s_wqm}, StandardB)
1920 .Uni(B64, {{SgprB64}, {IntrId, SgprB64_ReadFirstLane}});
1921
1922 addRulesForIOpcs({amdgcn_writelane}, StandardB)
1923 .Div(B32,
1924 {{VgprB32},
1926
1927 addRulesForIOpcs({amdgcn_add_max_i32, amdgcn_add_max_u32, amdgcn_add_min_i32,
1928 amdgcn_add_min_u32},
1929 Standard)
1930 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1931 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1932
1933 addRulesForIOpcs({amdgcn_pk_add_max_i16, amdgcn_pk_add_max_u16,
1934 amdgcn_pk_add_min_i16, amdgcn_pk_add_min_u16},
1935 Standard)
1938
1939 addRulesForIOpcs({amdgcn_permlane16, amdgcn_permlanex16}, Standard)
1940 .Div(S32, {{Vgpr32},
1943
1944 addRulesForIOpcs({amdgcn_permlane_bcast, amdgcn_permlane_up,
1945 amdgcn_permlane_down, amdgcn_permlane_xor},
1946 StandardB)
1947 .Div(B32,
1948 {{VgprB32},
1950
1951 addRulesForIOpcs({amdgcn_permlane_idx_gen}, Standard)
1953
1954 addRulesForIOpcs({amdgcn_perm}, Standard)
1955 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1956 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1957
1958 addRulesForIOpcs(
1959 {amdgcn_wave_reduce_add, amdgcn_wave_reduce_and, amdgcn_wave_reduce_fadd,
1960 amdgcn_wave_reduce_fmax, amdgcn_wave_reduce_fmin,
1961 amdgcn_wave_reduce_fsub, amdgcn_wave_reduce_max, amdgcn_wave_reduce_min,
1962 amdgcn_wave_reduce_or, amdgcn_wave_reduce_sub, amdgcn_wave_reduce_umax,
1963 amdgcn_wave_reduce_umin, amdgcn_wave_reduce_xor},
1964 Standard)
1965 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}})
1966 .Div(S32, {{Sgpr32ToVgprDst}, {IntrId, VgprB32}})
1967 .Uni(S64, {{Sgpr64}, {IntrId, Sgpr64}})
1968 .Div(S64, {{Sgpr64ToVgprDst}, {IntrId, VgprB64}});
1969
1970 addRulesForIOpcs({amdgcn_wave_shuffle}, Standard)
1971 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}})
1972 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}});
1973
1974 addRulesForIOpcs({amdgcn_bitop3, amdgcn_fmad_ftz}, Standard)
1975 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
1976 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
1977 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1978 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1979
1980 addRulesForIOpcs({amdgcn_udot4, amdgcn_sdot4, amdgcn_udot8, amdgcn_sdot8,
1981 amdgcn_dot4_f32_bf8_bf8, amdgcn_dot4_f32_bf8_fp8,
1982 amdgcn_dot4_f32_fp8_fp8, amdgcn_dot4_f32_fp8_bf8},
1983 Standard)
1984 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1985 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1986
1987 addRulesForIOpcs({amdgcn_rsq, amdgcn_rsq_clamp}, Standard)
1988 .Uni(S16, {{Sgpr16}, {IntrId, Sgpr16}}, hasPST)
1989 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}}, !hasPST)
1990 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
1991 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}}, hasPST)
1992 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}}, !hasPST)
1993 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
1994 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64}})
1995 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64}});
1996
1997 addRulesForIOpcs({amdgcn_mul_u24, amdgcn_mul_i24}, Standard)
1998 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}})
1999 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}})
2000 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr32, Vgpr32}})
2001 .Div(S64, {{Vgpr64}, {IntrId, Vgpr32, Vgpr32}});
2002
2003 addRulesForIOpcs({amdgcn_ds_bpermute, amdgcn_ds_bpermute_fi_b32,
2004 amdgcn_ds_permute, amdgcn_fmul_legacy, amdgcn_mulhi_i24,
2005 amdgcn_mulhi_u24},
2006 Standard)
2007 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}})
2008 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}});
2009
2010 addRulesForIOpcs({amdgcn_cvt_sr_bf8_f32, amdgcn_cvt_sr_fp8_f32,
2011 amdgcn_cvt_sr_fp8_f32_e5m3, amdgcn_cvt_pk_bf8_f32,
2012 amdgcn_cvt_pk_fp8_f32, amdgcn_cvt_pk_fp8_f32_e5m3},
2013 Standard)
2014 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2015 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
2016
2017 addRulesForIOpcs({amdgcn_cvt_off_f32_i4, amdgcn_cvt_f32_bf8,
2018 amdgcn_cvt_f32_fp8, amdgcn_cvt_f32_fp8_e5m3},
2019 Standard)
2020 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}})
2021 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}});
2022
2023 addRulesForIOpcs({amdgcn_cvt_pk_f32_bf8, amdgcn_cvt_pk_f32_fp8})
2024 .Any({{UniV2S32}, {{UniInVgprV2S32}, {IntrId, Vgpr32}}})
2025 .Any({{DivV2S32}, {{VgprV2S32}, {IntrId, Vgpr32}}});
2026
2027 addRulesForIOpcs({amdgcn_cvt_f16_bf8, amdgcn_cvt_f16_fp8}, Standard)
2028 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr32}})
2029 .Div(S16, {{Vgpr16}, {IntrId, Vgpr32}});
2030
2031 addRulesForIOpcs({amdgcn_cvt_pk_f16_bf8, amdgcn_cvt_pk_f16_fp8}, Standard)
2032 .Uni(V2S16, {{UniInVgprV2S16}, {IntrId, Vgpr16}})
2033 .Div(V2S16, {{VgprV2S16}, {IntrId, Vgpr16}});
2034
2035 addRulesForIOpcs({amdgcn_cvt_pk_bf8_f16, amdgcn_cvt_pk_fp8_f16}, Standard)
2036 .Uni(S16, {{UniInVgprS16}, {IntrId, VgprV2S16}})
2037 .Div(S16, {{Vgpr16}, {IntrId, VgprV2S16}});
2038
2039 addRulesForIOpcs({amdgcn_cvt_sr_bf8_f16, amdgcn_cvt_sr_fp8_f16}, Standard)
2040 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr16, Vgpr32, Vgpr32}})
2041 .Div(S32, {{Vgpr32}, {IntrId, Vgpr16, Vgpr32, Vgpr32}});
2042
2043 addRulesForIOpcs({amdgcn_cvt_sr_pk_f16_f32}, Standard)
2045 .Div(V2S16, {{VgprV2S16}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
2046
2047 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_fp8_f16})
2048 .Any({{DivS32}, {{Vgpr32}, {IntrId, Vgpr32, Vgpr16, Vgpr32, Vgpr32}}});
2049
2050 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_fp8_f32})
2051 .Any({{DivS32}, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32, Vgpr32}}});
2052
2053 addRulesForIOpcs({amdgcn_cubesc, amdgcn_cubetc, amdgcn_cubema, amdgcn_cubeid,
2054 amdgcn_fma_legacy},
2055 Standard)
2056 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2057 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
2058
2059 addRulesForIOpcs({amdgcn_frexp_mant, amdgcn_fract}, Standard)
2060 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}})
2061 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
2062 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}})
2063 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
2064 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64}})
2065 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64}});
2066
2067 addRulesForIOpcs({amdgcn_prng_b32})
2068 .Any({{UniS32}, {{UniInVgprS32}, {IntrId, Vgpr32}}})
2069 .Any({{DivS32}, {{Vgpr32}, {IntrId, Vgpr32}}});
2070
2071 addRulesForIOpcs({amdgcn_sffbh}, Standard)
2072 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}})
2073 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}});
2074
2075 addRulesForIOpcs({amdgcn_ubfe, amdgcn_sbfe}, Standard)
2076 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2077 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32, Sgpr32, Sgpr32}, S_BFE})
2078 .Uni(S64, {{Sgpr64}, {IntrId, Sgpr64, Sgpr32, Sgpr32}, S_BFE})
2079 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr32, Vgpr32}, V_BFE});
2080
2081 addRulesForIOpcs({amdgcn_cvt_pk_i16, amdgcn_cvt_pk_u16, amdgcn_cvt_pknorm_i16,
2082 amdgcn_cvt_pknorm_u16},
2083 Standard)
2084 .Uni(V2S16, {{UniInVgprV2S16}, {IntrId, Vgpr32, Vgpr32}})
2085 .Div(V2S16, {{VgprV2S16}, {IntrId, Vgpr32, Vgpr32}});
2086
2087 addRulesForIOpcs({amdgcn_cvt_pkrtz}, Standard)
2088 .Uni(V2S16, {{SgprV2S16}, {IntrId, Sgpr32, Sgpr32}}, hasSALUFloat)
2089 .Uni(V2S16, {{UniInVgprV2S16}, {IntrId, Vgpr32, Vgpr32}}, !hasSALUFloat)
2090 .Div(V2S16, {{VgprV2S16}, {IntrId, Vgpr32, Vgpr32}});
2091
2092 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk32_bf6_f16,
2093 amdgcn_cvt_scalef32_sr_pk32_fp6_f16,
2094 amdgcn_cvt_scalef32_sr_pk32_bf6_bf16,
2095 amdgcn_cvt_scalef32_sr_pk32_fp6_bf16},
2096 Standard)
2098
2099 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk32_bf6_f32,
2100 amdgcn_cvt_scalef32_sr_pk32_fp6_f32},
2101 Standard)
2103
2104 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk_fp4_f16}, Standard)
2106 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, VgprV2S16, Vgpr32, Vgpr32}});
2107
2108 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk_fp4_f32}, Standard)
2110 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, VgprV2S32, Vgpr32, Vgpr32}});
2111
2112 addRulesForIOpcs(
2113 {amdgcn_cvt_scalef32_2xpk16_fp6_f32, amdgcn_cvt_scalef32_2xpk16_bf6_f32})
2114 .Any(
2116 .Any({{UniV6S32},
2118
2119 addRulesForIOpcs({amdgcn_cvt_scalef32_f16_fp8, amdgcn_cvt_scalef32_f16_bf8},
2120 Standard)
2121 .Div(V2S16, {{VgprV2S16}, {IntrId, VgprV2S16, Vgpr32, Vgpr32}})
2123
2124 addRulesForIOpcs({amdgcn_cvt_scalef32_f32_fp8, amdgcn_cvt_scalef32_f32_bf8},
2125 Standard)
2126 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}})
2127 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}});
2128
2129 addRulesForIOpcs(
2130 {amdgcn_cvt_scalef32_pk16_bf6_f16, amdgcn_cvt_scalef32_pk16_fp6_f16},
2131 Standard)
2134
2135 addRulesForIOpcs(
2136 {amdgcn_cvt_scalef32_pk16_bf6_f32, amdgcn_cvt_scalef32_pk16_fp6_f32},
2137 Standard)
2140
2141 addRulesForIOpcs(
2142 {amdgcn_cvt_scalef32_pk8_bf8_f16, amdgcn_cvt_scalef32_pk8_fp8_f16},
2143 Standard)
2146
2147 addRulesForIOpcs(
2148 {amdgcn_cvt_scalef32_pk8_bf8_f32, amdgcn_cvt_scalef32_pk8_fp8_f32},
2149 Standard)
2152
2153 addRulesForIOpcs({amdgcn_cvt_scalef32_pk8_fp4_f16}, Standard)
2154 .Div(S32, {{Vgpr32}, {IntrId, VgprV8S16, Vgpr32}})
2155 .Uni(S32, {{UniInVgprS32}, {IntrId, VgprV8S16, Vgpr32}});
2156
2157 addRulesForIOpcs({amdgcn_cvt_scalef32_pk8_fp4_f32}, Standard)
2158 .Div(S32, {{Vgpr32}, {IntrId, VgprV8S32, Vgpr32}})
2159 .Uni(S32, {{UniInVgprS32}, {IntrId, VgprV8S32, Vgpr32}});
2160
2161 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk16_bf6_f16,
2162 amdgcn_cvt_scalef32_sr_pk16_fp6_f16},
2163 Standard)
2165 .Any({{UniV3S32},
2167
2168 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk16_bf6_f32,
2169 amdgcn_cvt_scalef32_sr_pk16_fp6_f32},
2170 Standard)
2172 .Any({{UniV3S32},
2174
2175 addRulesForIOpcs(
2176 {amdgcn_cvt_scalef32_sr_pk8_bf8_f16, amdgcn_cvt_scalef32_sr_pk8_fp8_f16},
2177 Standard)
2179 .Any({{UniV2S32},
2181
2182 addRulesForIOpcs(
2183 {amdgcn_cvt_scalef32_sr_pk8_bf8_f32, amdgcn_cvt_scalef32_sr_pk8_fp8_f32},
2184 Standard)
2186 .Any({{UniV2S32},
2188
2189 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk8_fp4_f16}, Standard)
2190 .Div(S32, {{Vgpr32}, {IntrId, VgprV8S16, Vgpr32, Vgpr32}})
2191 .Uni(S32, {{UniInVgprS32}, {IntrId, VgprV8S16, Vgpr32, Vgpr32}});
2192
2193 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk8_fp4_f32}, Standard)
2194 .Div(S32, {{Vgpr32}, {IntrId, VgprV8S32, Vgpr32, Vgpr32}})
2195 .Uni(S32, {{UniInVgprS32}, {IntrId, VgprV8S32, Vgpr32, Vgpr32}});
2196
2197 addRulesForIOpcs(
2198 {amdgcn_cvt_scale_pk16_f16_bf6, amdgcn_cvt_scale_pk16_f16_fp6}, Standard)
2201
2202 addRulesForIOpcs(
2203 {amdgcn_cvt_scale_pk16_f32_bf6, amdgcn_cvt_scale_pk16_f32_fp6}, Standard)
2206
2207 addRulesForIOpcs({amdgcn_cvt_scale_pk8_f16_bf8, amdgcn_cvt_scale_pk8_f16_fp8},
2208 Standard)
2211
2212 addRulesForIOpcs({amdgcn_cvt_scale_pk8_f16_fp4}, Standard)
2213 .Any({{DivV8S16}, {{VgprV8S16}, {IntrId, Vgpr32, Vgpr32}}})
2215
2216 addRulesForIOpcs({amdgcn_cvt_scale_pk8_f32_bf8, amdgcn_cvt_scale_pk8_f32_fp8},
2217 Standard)
2220
2221 addRulesForIOpcs({amdgcn_cvt_scale_pk8_f32_fp4}, Standard)
2222 .Any({{DivV8S32}, {{VgprV8S32}, {IntrId, Vgpr32, Vgpr32}}})
2224
2225 addRulesForIOpcs(
2226 {amdgcn_cvt_scalef32_pk32_bf6_f16, amdgcn_cvt_scalef32_pk32_fp6_f16},
2227 Standard)
2230
2231 addRulesForIOpcs(
2232 {amdgcn_cvt_scalef32_pk_fp8_f32, amdgcn_cvt_scalef32_pk_bf8_f32},
2233 Standard)
2235 .Uni(V2S16,
2237
2238 addRulesForIOpcs(
2239 {amdgcn_cvt_scalef32_pk_f32_fp8, amdgcn_cvt_scalef32_pk_f32_bf8},
2240 Standard)
2241 .Any({{DivV2S32}, {{VgprV2S32}, {IntrId, Vgpr32, Vgpr32}}})
2243
2244 addRulesForIOpcs(
2245 {amdgcn_cvt_scalef32_pk_fp8_f16, amdgcn_cvt_scalef32_pk_bf8_f16},
2246 Standard)
2249
2250 addRulesForIOpcs({amdgcn_cvt_scalef32_pk_f32_fp4}, Standard)
2251 .Any({{DivV2S32}, {{VgprV2S32}, {IntrId, Vgpr32, Vgpr32}}})
2253
2254 addRulesForIOpcs({amdgcn_cvt_scalef32_pk_fp4_f32}, Standard)
2255 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32, Vgpr32}})
2256 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32, Vgpr32}});
2257
2258 addRulesForIOpcs({amdgcn_cvt_scalef32_pk_f16_fp4,
2259 amdgcn_cvt_scalef32_pk_f16_fp8,
2260 amdgcn_cvt_scalef32_pk_f16_bf8},
2261 Standard)
2262 .Div(V2S16, {{VgprV2S16}, {IntrId, Vgpr32, Vgpr32}})
2263 .Uni(V2S16, {{UniInVgprV2S16}, {IntrId, Vgpr32, Vgpr32}});
2264
2265 addRulesForIOpcs(
2266 {amdgcn_cvt_scalef32_pk32_f32_fp6, amdgcn_cvt_scalef32_pk32_f32_bf6},
2267 Standard)
2270
2271 addRulesForIOpcs(
2272 {amdgcn_cvt_scalef32_pk32_f16_fp6, amdgcn_cvt_scalef32_pk32_f16_bf6},
2273 Standard)
2276
2277 addRulesForIOpcs({amdgcn_cvt_scalef32_pk_fp4_f16}, Standard)
2278 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, VgprV2S16, Vgpr32}})
2279 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, VgprV2S16, Vgpr32}});
2280
2281 addRulesForIOpcs({amdgcn_global_load_tr_b64})
2282 .Any({{DivB64, _, UniP1}, {{VgprB64}, {IntrId, SgprP1}}})
2283 .Any({{DivB64, _, DivP1}, {{VgprB64}, {IntrId, VgprP1}}})
2284 .Any({{DivB32, _, UniP1}, {{VgprB32}, {IntrId, SgprP1}}})
2285 .Any({{DivB32, _, DivP1}, {{VgprB32}, {IntrId, VgprP1}}});
2286
2287 addRulesForIOpcs({amdgcn_global_load_tr_b128})
2288 .Any({{DivB64, _, UniP1}, {{VgprB64}, {IntrId, SgprP1}}})
2289 .Any({{DivB64, _, DivP1}, {{VgprB64}, {IntrId, VgprP1}}})
2290 .Any({{DivB128, _, UniP1}, {{VgprB128}, {IntrId, SgprP1}}})
2291 .Any({{DivB128, _, DivP1}, {{VgprB128}, {IntrId, VgprP1}}});
2292
2293 addRulesForIOpcs({amdgcn_global_load_tr4_b64})
2294 .Any({{DivV2S32, _, UniP1}, {{VgprV2S32}, {IntrId, SgprP1}}})
2295 .Any({{DivV2S32, _, DivP1}, {{VgprV2S32}, {IntrId, VgprP1}}});
2296
2297 addRulesForIOpcs({amdgcn_global_load_tr6_b96})
2298 .Any({{DivV3S32, _, UniP1}, {{VgprV3S32}, {IntrId, SgprP1}}})
2299 .Any({{DivV3S32, _, DivP1}, {{VgprV3S32}, {IntrId, VgprP1}}});
2300
2301 addRulesForIOpcs({amdgcn_ds_load_tr4_b64, amdgcn_ds_load_tr8_b64})
2302 .Any({{DivV2S32}, {{VgprV2S32}, {IntrId, VgprP3}}});
2303
2304 addRulesForIOpcs({amdgcn_ds_load_tr6_b96})
2305 .Any({{DivV3S32}, {{VgprV3S32}, {IntrId, VgprP3}}});
2306
2307 addRulesForIOpcs({amdgcn_ds_load_tr16_b128})
2308 .Any({{DivB128}, {{VgprB128}, {IntrId, VgprP3}}});
2309
2310 addRulesForIOpcs({amdgcn_global_atomic_ordered_add_b64})
2311 .Any({{DivS64}, {{Vgpr64}, {IntrId, VgprP1, Vgpr64}}});
2312
2313 addRulesForIOpcs(
2314 {amdgcn_global_atomic_fmin_num, amdgcn_global_atomic_fmax_num}, Standard)
2315 .Div(S32, {{Vgpr32}, {IntrId, VgprP1, Vgpr32}});
2316
2317 addRulesForIOpcs({amdgcn_flat_atomic_fmin_num, amdgcn_flat_atomic_fmax_num},
2318 Standard)
2319 .Div(S32, {{Vgpr32}, {IntrId, VgprP0, Vgpr32}});
2320
2321 addRulesForIOpcs({amdgcn_raw_buffer_load_lds})
2322 .Any({{_}, {{}, {IntrId, SgprV4S32, SgprP3, Imm, Vgpr32, Sgpr32}}});
2323
2324 addRulesForIOpcs({amdgcn_raw_buffer_load_async_lds})
2325 .Any({{_}, {{}, {IntrId, SgprV4S32, SgprB32_M0, Imm, Vgpr32, Sgpr32}}});
2326
2327 addRulesForIOpcs({amdgcn_struct_buffer_load_async_lds})
2328 .Any(
2329 {{_},
2331
2332 addRulesForIOpcs({amdgcn_struct_buffer_load_lds})
2333 .Any({{_},
2334 {{}, {IntrId, SgprV4S32, SgprP3, Imm, Vgpr32, Vgpr32, Sgpr32}}});
2335
2336 addRulesForIOpcs({amdgcn_raw_ptr_buffer_load_lds})
2337 .Any({{_}, {{}, {IntrId, SgprP8, SgprP3, Imm, Vgpr32, Sgpr32}}});
2338
2339 addRulesForIOpcs({amdgcn_raw_ptr_buffer_load_async_lds})
2340 .Any({{}, {{}, {IntrId, SgprP8, SgprB32_M0, Imm, VgprB32, SgprB32}}});
2341
2342 addRulesForIOpcs({amdgcn_struct_ptr_buffer_load_async_lds})
2343 .Any({{_},
2344 {{}, {IntrId, SgprP8, SgprB32_M0, Imm, Vgpr32, Vgpr32, Sgpr32}}});
2345
2346 addRulesForIOpcs({amdgcn_struct_ptr_buffer_load_lds})
2347 .Any({{_}, {{}, {IntrId, SgprP8, SgprP3, Imm, Vgpr32, Vgpr32, Sgpr32}}});
2348
2349 addRulesForIOpcs(
2350 {amdgcn_global_load_lds, amdgcn_load_to_lds, amdgcn_load_async_to_lds})
2351 .Any({{}, {{}, {IntrId, VgprP1, SgprB32_M0}}});
2352
2353 addRulesForIOpcs({amdgcn_global_load_async_to_lds_b8,
2354 amdgcn_global_load_async_to_lds_b32,
2355 amdgcn_global_load_async_to_lds_b64,
2356 amdgcn_global_load_async_to_lds_b128,
2357 amdgcn_global_store_async_from_lds_b8,
2358 amdgcn_global_store_async_from_lds_b32,
2359 amdgcn_global_store_async_from_lds_b64,
2360 amdgcn_global_store_async_from_lds_b128})
2361 .Any({{}, {{}, {IntrId, VgprP1, VgprP3}}});
2362
2363 addRulesForIOpcs({amdgcn_global_load_async_lds})
2364 .Any({{}, {{}, {IntrId, VgprP1, SgprB32_M0}}});
2365
2366 addRulesForIOpcs({amdgcn_tensor_load_to_lds, amdgcn_tensor_store_from_lds})
2367 .Any({{},
2368 {{},
2372
2373 addRulesForIOpcs({amdgcn_cluster_load_b32})
2375 .Any({{DivB32, _, UniP1}, {{VgprB32}, {IntrId, SgprP1, Imm, SgprB32_M0}}})
2376 .Any(
2377 {{DivB32, _, DivP1}, {{VgprB32}, {IntrId, VgprP1, Imm, SgprB32_M0}}});
2378
2379 addRulesForIOpcs({amdgcn_cluster_load_b64})
2381 .Any({{DivB64, _, UniP1}, {{VgprB64}, {IntrId, SgprP1, Imm, SgprB32_M0}}})
2382 .Any(
2383 {{DivB64, _, DivP1}, {{VgprB64}, {IntrId, VgprP1, Imm, SgprB32_M0}}});
2384
2385 addRulesForIOpcs({amdgcn_cluster_load_b128})
2387 .Any({{DivB128, _, UniP1},
2388 {{VgprB128}, {IntrId, SgprP1, Imm, SgprB32_M0}}})
2389 .Any({{DivB128, _, DivP1},
2390 {{VgprB128}, {IntrId, VgprP1, Imm, SgprB32_M0}}});
2391
2392 addRulesForIOpcs({amdgcn_cluster_load_async_to_lds_b8,
2393 amdgcn_cluster_load_async_to_lds_b32,
2394 amdgcn_cluster_load_async_to_lds_b64,
2395 amdgcn_cluster_load_async_to_lds_b128})
2396 .Any({{}, {{}, {IntrId, VgprP1, VgprP3, Imm, Imm, SgprB32_M0}}});
2397
2398 addRulesForIOpcs({amdgcn_perm_pk16_b4_u4}, StandardB)
2399 .Uni(B64, {{UniInVgprB64}, {IntrId, Vgpr32, Vgpr32, VgprV2S32}})
2400 .Div(B64, {{VgprB64}, {IntrId, Vgpr32, Vgpr32, VgprV2S32}});
2401
2402 addRulesForIOpcs({amdgcn_perm_pk16_b6_u4}, StandardB)
2404 .Div(B96, {{VgprB96}, {IntrId, Vgpr32, VgprB64, VgprV2S32}});
2405
2406 addRulesForIOpcs({amdgcn_perm_pk16_b8_u4}, StandardB)
2408 .Div(B128, {{VgprB128}, {IntrId, VgprB64, VgprB64, VgprV2S32}});
2409
2410 addRulesForIOpcs({amdgcn_wwm, amdgcn_strict_wwm, amdgcn_wqm, amdgcn_softwqm,
2411 amdgcn_strict_wqm},
2412 StandardB)
2413 .Div(B32, {{VgprB32}, {IntrId, VgprB32}})
2414 .Uni(B32, {{SgprB32}, {IntrId, SgprB32}})
2415 .Div(B64, {{VgprB64}, {IntrId, VgprB64}})
2416 .Uni(B64, {{SgprB64}, {IntrId, SgprB64}})
2417 .Div(B96, {{VgprB96}, {IntrId, VgprB96}})
2418 .Uni(B96, {{SgprB96}, {IntrId, SgprB96}})
2419 .Div(B128, {{VgprB128}, {IntrId, VgprB128}})
2420 .Uni(B128, {{SgprB128}, {IntrId, SgprB128}})
2421 .Any({{UniB256}, {{SgprB256}, {IntrId, SgprB256}}})
2422 .Any({{DivB256}, {{VgprB256}, {IntrId, VgprB256}}})
2423 .Any({{UniB512}, {{SgprB512}, {IntrId, SgprB512}}})
2424 .Any({{DivB512}, {{VgprB512}, {IntrId, VgprB512}}});
2425
2426 addRulesForIOpcs({amdgcn_init_whole_wave}).Any({{DivS1}, {{Vcc}, {IntrId}}});
2427
2428 addRulesForIOpcs({amdgcn_kill, amdgcn_wqm_demote})
2429 .Any({{}, {{}, {IntrId, Vcc}}});
2430
2431 addRulesForIOpcs({amdgcn_set_inactive}, StandardB)
2432 .Div(B32, {{VgprB32}, {IntrId, VgprB32, VgprB32}});
2433
2434 addRulesForIOpcs({amdgcn_set_inactive_chain_arg}, Standard)
2435 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}});
2436
2437 addRulesForIOpcs({amdgcn_cvt_sr_bf16_f32, amdgcn_cvt_sr_f16_f32}, Standard)
2438 .Div(V2S16, {{VgprV2S16}, {IntrId, VgprV2S16, Vgpr32, Vgpr32}});
2439
2440 addRulesForIOpcs({amdgcn_ballot}, Standard)
2441 .Uni(S64, {{Sgpr64}, {IntrId, Vcc}})
2442 .Uni(S32, {{Sgpr32}, {IntrId, Vcc}});
2443
2444 addRulesForIOpcs({amdgcn_inverse_ballot})
2445 .Any({{DivS1, _, S32}, {{Vcc}, {IntrId, SgprB32_ReadFirstLane}}})
2446 .Any({{DivS1, _, S64}, {{Vcc}, {IntrId, SgprB64_ReadFirstLane}}});
2447
2448 addRulesForIOpcs({amdgcn_live_mask, amdgcn_ps_live})
2449 .Any({{DivS1}, {{Vcc}, {}}});
2450
2451 addRulesForIOpcs({amdgcn_mov_dpp, amdgcn_mov_dpp8}, StandardB)
2452 .Div(B32, {{VgprB32}, {IntrId, VgprB32}})
2453 .Div(B64, {{VgprB64}, {IntrId, VgprB64}});
2454
2455 addRulesForIOpcs({amdgcn_update_dpp}, StandardB)
2456 .Div(B32, {{VgprB32}, {IntrId, VgprB32, VgprB32}})
2457 .Div(B64, {{VgprB64}, {IntrId, VgprB64, VgprB64}});
2458
2459 addRulesForIOpcs({amdgcn_sin, amdgcn_cos}, Standard)
2460 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
2461 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}})
2462 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
2463 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}});
2464
2465 addRulesForIOpcs({amdgcn_trig_preop}, Standard)
2466 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr32}})
2467 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64, Vgpr32}});
2468
2469 addRulesForIOpcs({amdgcn_exp2}, Standard)
2470 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
2471 .Uni(S16, {{Sgpr16}, {IntrId, Sgpr16}}, hasPST)
2472 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}}, !hasPST)
2473 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
2474 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}}, hasPST)
2475 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}}, !hasPST);
2476
2477 addRulesForIOpcs({amdgcn_rcp, amdgcn_sqrt}, Standard)
2478 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
2479 .Uni(S16, {{Sgpr16}, {IntrId, Sgpr16}}, hasPST)
2480 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}}, !hasPST)
2481 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
2482 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}}, hasPST)
2483 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}}, !hasPST)
2484 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64}})
2485 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64}});
2486
2487 addRulesForIOpcs({amdgcn_log}, Standard)
2488 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
2489 .Uni(S16, {{Sgpr16}, {IntrId, Sgpr16}}, hasPST)
2490 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}}, !hasPST)
2491 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
2492 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}}, hasPST)
2493 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}}, !hasPST);
2494
2495 addRulesForIOpcs({amdgcn_ds_atomic_async_barrier_arrive_b64})
2496 .Any({{}, {{}, {IntrId, VgprP3}}});
2497
2498 addRulesForIOpcs({amdgcn_ds_atomic_barrier_arrive_rtn_b64}, Standard)
2499 .Div(S64, {{Vgpr64}, {IntrId, VgprP3, Vgpr64}});
2500
2501 addRulesForIOpcs({amdgcn_ds_add_gs_reg_rtn, amdgcn_ds_sub_gs_reg_rtn},
2502 Standard)
2503 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
2504 .Div(S64, {{Vgpr64}, {IntrId, Vgpr32}});
2505
2506 addRulesForIOpcs({amdgcn_ds_append, amdgcn_ds_consume}, Standard)
2507 .Uni(S32, {{UniInVgprS32}, {IntrId, SgprB32_M0}})
2508 .Div(S32, {{Vgpr32}, {IntrId, SgprB32_M0}});
2509
2510 addRulesForIOpcs(
2511 {amdgcn_ds_bvh_stack_rtn, amdgcn_ds_bvh_stack_push4_pop1_rtn}, Standard)
2512 .Div(S32, {{Vgpr32, Vgpr32}, {IntrId, Vgpr32, Vgpr32, VgprV4S32}});
2513
2514 addRulesForIOpcs({amdgcn_ds_bvh_stack_push8_pop1_rtn}, Standard)
2515 .Div(S32, {{Vgpr32, Vgpr32}, {IntrId, Vgpr32, Vgpr32, VgprV8S32}});
2516
2517 addRulesForIOpcs({amdgcn_ds_bvh_stack_push8_pop2_rtn}, Standard)
2518 .Div(S64, {{Vgpr64, Vgpr32}, {IntrId, Vgpr32, Vgpr32, VgprV8S32}});
2519
2520 addRulesForIOpcs({amdgcn_ds_gws_sema_p, amdgcn_ds_gws_sema_v,
2521 amdgcn_ds_gws_sema_release_all})
2522 .Any({{}, {{}, {IntrId, SgprB32_M0}}});
2523
2524 addRulesForIOpcs(
2525 {amdgcn_ds_gws_barrier, amdgcn_ds_gws_init, amdgcn_ds_gws_sema_br})
2526 .Any({{}, {{}, {IntrId, Vgpr32, SgprB32_M0}}});
2527
2528 addRulesForIOpcs({amdgcn_ds_ordered_add, amdgcn_ds_ordered_swap}, Standard)
2529 .Div(S32, {{Vgpr32}, {IntrId, SgprB32_M0, Vgpr32}});
2530
2531 addRulesForIOpcs({amdgcn_ds_swizzle}, Standard)
2532 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}})
2533 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}});
2534
2535 addRulesForIOpcs({amdgcn_permlane16_var, amdgcn_permlanex16_var}, Standard)
2536 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
2537
2538 addRulesForIOpcs({amdgcn_permlane16_swap, amdgcn_permlane32_swap}, Standard)
2539 .Div(S32, {{Vgpr32, Vgpr32}, {IntrId, Vgpr32, Vgpr32}});
2540
2541 addRulesForIOpcs({amdgcn_permlane64}, StandardB)
2542 .Div(B32, {{VgprB32}, {IntrId, VgprB32}});
2543
2544 addRulesForIOpcs({amdgcn_ds_read_tr4_b64, amdgcn_ds_read_tr8_b64})
2545 .Any({{DivV2S32}, {{VgprV2S32}, {IntrId, VgprP3}}});
2546
2547 addRulesForIOpcs({amdgcn_ds_read_tr6_b96})
2548 .Any({{DivV3S32}, {{VgprV3S32}, {IntrId, VgprP3}}});
2549
2550 addRulesForIOpcs({amdgcn_ds_read_tr16_b64})
2551 .Any({{DivV4S16}, {{VgprV4S16}, {IntrId, VgprP3}}});
2552
2553 addRulesForIOpcs({amdgcn_interp_p1}, Standard)
2554 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Imm, Imm, SgprB32_M0}});
2555
2556 addRulesForIOpcs({amdgcn_interp_p1_f16}, Standard)
2557 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Imm, Imm, Imm, SgprB32_M0}});
2558
2559 addRulesForIOpcs({amdgcn_interp_p2}, Standard)
2560 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Imm, Imm, SgprB32_M0}});
2561
2562 addRulesForIOpcs({amdgcn_interp_p2_f16}, Standard)
2563 .Div(S16,
2565
2566 addRulesForIOpcs({amdgcn_interp_mov}, Standard)
2567 .Div(S32, {{Vgpr32}, {IntrId, Imm, Imm, Imm, SgprB32_M0}});
2568
2569 addRulesForIOpcs({amdgcn_interp_inreg_p10, amdgcn_interp_inreg_p2,
2570 amdgcn_interp_inreg_p10_f16, amdgcn_interp_p10_rtz_f16},
2571 Standard)
2572 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2573 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
2574
2575 addRulesForIOpcs({amdgcn_interp_inreg_p2_f16, amdgcn_interp_p2_rtz_f16},
2576 Standard)
2577 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2578 .Div(S16, {{Vgpr16}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
2579
2580 addRulesForIOpcs({amdgcn_frexp_exp})
2581 .Any({{UniS16}, {{UniInVgprS16}, {IntrId, Vgpr16}}})
2582 .Any({{DivS16}, {{Vgpr16}, {IntrId, Vgpr16}}})
2583 .Any({{UniS32, _, S32}, {{UniInVgprS32}, {IntrId, Vgpr32}}})
2584 .Any({{DivS32, _, S32}, {{Vgpr32}, {IntrId, Vgpr32}}})
2585 .Any({{UniS32, _, S64}, {{UniInVgprS32}, {IntrId, Vgpr64}}})
2586 .Any({{DivS32, _, S64}, {{Vgpr32}, {IntrId, Vgpr64}}});
2587
2588 addRulesForIOpcs({amdgcn_div_fmas}, Standard)
2589 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32, Vcc}})
2590 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32, Vcc}})
2591 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr64, Vgpr64, Vcc}})
2592 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64, Vgpr64, Vgpr64, Vcc}});
2593
2594 addRulesForIOpcs({amdgcn_div_fixup}, Standard)
2595 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
2596 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
2597 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2598 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2599 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr64, Vgpr64}})
2600 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64, Vgpr64, Vgpr64}});
2601
2602 addRulesForIOpcs({amdgcn_div_scale}, Standard)
2603 .Div(S32, {{Vgpr32, Vcc}, {IntrId, Vgpr32, Vgpr32}})
2604 .Uni(S32, {{UniInVgprS32, UniInVcc}, {IntrId, Vgpr32, Vgpr32}})
2605 .Div(S64, {{Vgpr64, Vcc}, {IntrId, Vgpr64, Vgpr64}})
2606 .Uni(S64, {{UniInVgprS64, UniInVcc}, {IntrId, Vgpr64, Vgpr64}});
2607
2608 addRulesForIOpcs({amdgcn_fdot2, amdgcn_sdot2, amdgcn_udot2}, Standard)
2610 .Div(S32, {{Vgpr32}, {IntrId, VgprV2S16, VgprV2S16, Vgpr32}});
2611
2612 addRulesForIOpcs({amdgcn_fdot2_f16_f16}, Standard)
2614 .Div(S16, {{Vgpr16}, {IntrId, VgprV2S16, VgprV2S16, Vgpr16}});
2615
2616 addRulesForIOpcs({amdgcn_sudot4, amdgcn_sudot8}, Standard)
2617 .Uni(S32, {{UniInVgprS32}, {IntrId, Imm, Vgpr32, Imm, Vgpr32, Vgpr32}})
2618 .Div(S32, {{Vgpr32}, {IntrId, Imm, Vgpr32, Imm, Vgpr32, Vgpr32}});
2619
2620 addRulesForIOpcs({amdgcn_s_alloc_vgpr})
2622
2623 addRulesForIOpcs({amdgcn_sat_pk4_i4_i8, amdgcn_sat_pk4_u4_u8}, Standard)
2624 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr32}})
2625 .Div(S16, {{Vgpr16}, {IntrId, Vgpr32}});
2626
2627 bool HasGFX90AInsts = ST->hasGFX90AInsts();
2628
2629 // On gfx90a+ both AGPR-form and VGPR-form exist
2630 addRulesForIOpcs({amdgcn_mfma_f32_32x32x1f32, amdgcn_mfma_f32_16x16x1f32,
2631 amdgcn_mfma_f32_4x4x1f32, amdgcn_mfma_f32_32x32x2f32,
2632 amdgcn_mfma_f32_16x16x4f32, amdgcn_mfma_f32_32x32x4f16,
2633 amdgcn_mfma_f32_16x16x4f16, amdgcn_mfma_f32_4x4x4f16,
2634 amdgcn_mfma_f32_32x32x8f16, amdgcn_mfma_f32_16x16x16f16,
2635 amdgcn_mfma_i32_32x32x4i8, amdgcn_mfma_i32_16x16x4i8,
2636 amdgcn_mfma_i32_4x4x4i8, amdgcn_mfma_i32_32x32x8i8,
2637 amdgcn_mfma_i32_16x16x16i8, amdgcn_mfma_f32_32x32x2bf16,
2638 amdgcn_mfma_f32_16x16x2bf16, amdgcn_mfma_f32_4x4x2bf16,
2639 amdgcn_mfma_f32_32x32x4bf16, amdgcn_mfma_f32_16x16x8bf16})
2640 .Any({{DivAnyTy},
2642 !HasGFX90AInsts)
2643 .Any({{DivAnyTy},
2644 {{VgprOrAgprAnyTy},
2646 HasGFX90AInsts);
2647
2648 // gfx90a+ only MFMAs
2649 addRulesForIOpcs(
2650 {
2651 amdgcn_mfma_f32_32x32x4bf16_1k,
2652 amdgcn_mfma_f32_16x16x4bf16_1k,
2653 amdgcn_mfma_f32_4x4x4bf16_1k,
2654 amdgcn_mfma_f32_32x32x8bf16_1k,
2655 amdgcn_mfma_f32_16x16x16bf16_1k,
2656 amdgcn_mfma_f64_16x16x4f64,
2657 amdgcn_mfma_f64_4x4x4f64,
2658 amdgcn_mfma_i32_16x16x32_i8,
2659 amdgcn_mfma_i32_32x32x16_i8,
2660 amdgcn_mfma_f32_16x16x8_xf32,
2661 amdgcn_mfma_f32_32x32x4_xf32,
2662 amdgcn_mfma_f32_16x16x32_bf8_bf8,
2663 amdgcn_mfma_f32_16x16x32_bf8_fp8,
2664 amdgcn_mfma_f32_16x16x32_fp8_bf8,
2665 amdgcn_mfma_f32_16x16x32_fp8_fp8,
2666 amdgcn_mfma_f32_32x32x16_bf8_bf8,
2667 amdgcn_mfma_f32_32x32x16_bf8_fp8,
2668 amdgcn_mfma_f32_32x32x16_fp8_bf8,
2669 amdgcn_mfma_f32_32x32x16_fp8_fp8,
2670 // gfx950
2671 amdgcn_mfma_f32_16x16x32_f16,
2672 amdgcn_mfma_f32_32x32x16_f16,
2673 amdgcn_mfma_i32_16x16x64_i8,
2674 amdgcn_mfma_i32_32x32x32_i8,
2675 // TODO: bf16 variants fail in IRTranslator.
2676 // amdgcn_mfma_f32_16x16x32_bf16, amdgcn_mfma_f32_32x32x16_bf16,
2677 })
2678 .Any({{DivAnyTy},
2679 {{VgprOrAgprAnyTy},
2681
2682 addRulesForIOpcs(
2683 {// gfx942+
2684 amdgcn_smfmac_f32_16x16x32_f16, amdgcn_smfmac_f32_32x32x16_f16,
2685 amdgcn_smfmac_f32_16x16x32_bf16, amdgcn_smfmac_f32_32x32x16_bf16,
2686 amdgcn_smfmac_i32_16x16x64_i8, amdgcn_smfmac_i32_32x32x32_i8,
2687 amdgcn_smfmac_f32_16x16x64_bf8_bf8, amdgcn_smfmac_f32_16x16x64_bf8_fp8,
2688 amdgcn_smfmac_f32_16x16x64_fp8_bf8, amdgcn_smfmac_f32_16x16x64_fp8_fp8,
2689 amdgcn_smfmac_f32_32x32x32_bf8_bf8, amdgcn_smfmac_f32_32x32x32_bf8_fp8,
2690 amdgcn_smfmac_f32_32x32x32_fp8_bf8, amdgcn_smfmac_f32_32x32x32_fp8_fp8,
2691 // gfx950+
2692 amdgcn_smfmac_f32_16x16x64_f16, amdgcn_smfmac_f32_32x32x32_f16,
2693 amdgcn_smfmac_i32_16x16x128_i8, amdgcn_smfmac_i32_32x32x64_i8,
2694 amdgcn_smfmac_f32_16x16x128_bf8_bf8, amdgcn_smfmac_f32_16x16x128_bf8_fp8,
2695 amdgcn_smfmac_f32_16x16x128_fp8_bf8, amdgcn_smfmac_f32_16x16x128_fp8_fp8,
2696 amdgcn_smfmac_f32_32x32x64_bf8_bf8, amdgcn_smfmac_f32_32x32x64_bf8_fp8,
2697 amdgcn_smfmac_f32_32x32x64_fp8_bf8, amdgcn_smfmac_f32_32x32x64_fp8_fp8})
2698 .Any({{DivAnyTy},
2699 {{VgprOrAgprAnyTy},
2701
2702 addRulesForIOpcs({amdgcn_mfma_scale_f32_32x32x64_f8f6f4,
2703 amdgcn_mfma_scale_f32_16x16x128_f8f6f4})
2704 .Any({{DivAnyTy},
2705 {{VgprOrAgprAnyTy},
2707 Vgpr32, Imm, Vgpr32}}});
2708
2709 // WMMA/SWMMAC intrinsics: all register operands map to VGPR.
2710 addRulesForIOpcs(
2711 {// WMMA GFX11+
2712 amdgcn_wmma_f32_16x16x16_f16, amdgcn_wmma_f32_16x16x16_bf16,
2713 amdgcn_wmma_f16_16x16x16_f16, amdgcn_wmma_bf16_16x16x16_bf16,
2714 amdgcn_wmma_f16_16x16x16_f16_tied, amdgcn_wmma_bf16_16x16x16_bf16_tied,
2715 amdgcn_wmma_i32_16x16x16_iu8, amdgcn_wmma_i32_16x16x16_iu4,
2716 // WMMA GFX12
2717 amdgcn_wmma_f32_16x16x16_fp8_fp8, amdgcn_wmma_f32_16x16x16_fp8_bf8,
2718 amdgcn_wmma_f32_16x16x16_bf8_fp8, amdgcn_wmma_f32_16x16x16_bf8_bf8,
2719 amdgcn_wmma_i32_16x16x32_iu4,
2720 // WMMA GFX1250
2721 amdgcn_wmma_f32_16x16x4_f32, amdgcn_wmma_f32_16x16x32_bf16,
2722 amdgcn_wmma_f32_16x16x32_f16, amdgcn_wmma_f16_16x16x32_f16,
2723 amdgcn_wmma_bf16_16x16x32_bf16, amdgcn_wmma_bf16f32_16x16x32_bf16,
2724 amdgcn_wmma_f32_16x16x64_fp8_fp8, amdgcn_wmma_f32_16x16x64_fp8_bf8,
2725 amdgcn_wmma_f32_16x16x64_bf8_fp8, amdgcn_wmma_f32_16x16x64_bf8_bf8,
2726 amdgcn_wmma_f16_16x16x64_fp8_fp8, amdgcn_wmma_f16_16x16x64_fp8_bf8,
2727 amdgcn_wmma_f16_16x16x64_bf8_fp8, amdgcn_wmma_f16_16x16x64_bf8_bf8,
2728 amdgcn_wmma_f16_16x16x128_fp8_fp8, amdgcn_wmma_f16_16x16x128_fp8_bf8,
2729 amdgcn_wmma_f16_16x16x128_bf8_fp8, amdgcn_wmma_f16_16x16x128_bf8_bf8,
2730 amdgcn_wmma_f32_16x16x128_fp8_fp8, amdgcn_wmma_f32_16x16x128_fp8_bf8,
2731 amdgcn_wmma_f32_16x16x128_bf8_fp8, amdgcn_wmma_f32_16x16x128_bf8_bf8,
2732 amdgcn_wmma_i32_16x16x64_iu8, amdgcn_wmma_f32_16x16x128_f8f6f4,
2733 amdgcn_wmma_scale_f32_16x16x128_f8f6f4,
2734 amdgcn_wmma_scale16_f32_16x16x128_f8f6f4, amdgcn_wmma_f32_32x16x128_f4,
2735 amdgcn_wmma_scale_f32_32x16x128_f4, amdgcn_wmma_scale16_f32_32x16x128_f4,
2736 // WMMA GFX1251
2737 amdgcn_wmma_f64_16x16x4_f64,
2738 // SWMMAC GFX12
2739 amdgcn_swmmac_f32_16x16x32_f16, amdgcn_swmmac_f32_16x16x32_bf16,
2740 amdgcn_swmmac_f16_16x16x32_f16, amdgcn_swmmac_bf16_16x16x32_bf16,
2741 amdgcn_swmmac_i32_16x16x32_iu8, amdgcn_swmmac_i32_16x16x32_iu4,
2742 amdgcn_swmmac_i32_16x16x64_iu4, amdgcn_swmmac_f32_16x16x32_fp8_fp8,
2743 amdgcn_swmmac_f32_16x16x32_fp8_bf8, amdgcn_swmmac_f32_16x16x32_bf8_fp8,
2744 amdgcn_swmmac_f32_16x16x32_bf8_bf8,
2745 // SWMMAC GFX1250
2746 amdgcn_swmmac_f32_16x16x64_f16, amdgcn_swmmac_f32_16x16x64_bf16,
2747 amdgcn_swmmac_f16_16x16x64_f16, amdgcn_swmmac_bf16_16x16x64_bf16,
2748 amdgcn_swmmac_bf16f32_16x16x64_bf16, amdgcn_swmmac_f32_16x16x128_fp8_fp8,
2749 amdgcn_swmmac_f32_16x16x128_fp8_bf8, amdgcn_swmmac_f32_16x16x128_bf8_fp8,
2750 amdgcn_swmmac_f32_16x16x128_bf8_bf8, amdgcn_swmmac_f16_16x16x128_fp8_fp8,
2751 amdgcn_swmmac_f16_16x16x128_fp8_bf8, amdgcn_swmmac_f16_16x16x128_bf8_fp8,
2752 amdgcn_swmmac_f16_16x16x128_bf8_bf8, amdgcn_swmmac_i32_16x16x128_iu8})
2753 .Any({{}, {{}, {}, ApplyAllVgpr}});
2754
2755} // end initialize rules
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU address space definition.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
constexpr LLT S16
constexpr LLT S1
constexpr LLT V2S16
constexpr LLT S32
constexpr LLT V4S32
constexpr LLT V3S32
constexpr LLT S64
constexpr LLT V2S32
constexpr LLT S128
UniformityLLTOpPredicateID LLTToBId(LLT Ty)
bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, const MachineUniformityInfo &MUI, const MachineRegisterInfo &MRI)
UniformityLLTOpPredicateID LLTToId(LLT Ty)
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
#define _
IRTranslator LLVM IR MI
Register Reg
Register const TargetRegisterInfo * TRI
Machine IR instance of the generic uniformity analysis.
bool operator()(const MachineInstr &MI) const
Predicate operator||(const Predicate &RHS) const
Predicate operator&&(const Predicate &RHS) const
Predicate(std::function< bool(const MachineInstr &)> Pred)
Predicate operator!() const
RegBankLegalizeRules(const GCNSubtarget &ST, MachineRegisterInfo &MRI)
const SetOfRulesForOpcode * getRulesForOpc(MachineInstr &MI) const
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
void addFastRuleDivergent(UniformityLLTOpPredicateID Ty, RegBankLLTMapping RuleApplyIDs)
void addFastRuleUniform(UniformityLLTOpPredicateID Ty, RegBankLLTMapping RuleApplyIDs)
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
bool isSigned() const
Definition InstrTypes.h:993
bool isDivergentAtDef(ConstValueRefT V) const
Whether V is divergent at its definition.
bool isUniformAtDef(ConstValueRefT V) const
Whether V is uniform/non-divergent at its definition.
bool isEquality() const
Return true if this predicate is either EQ or NE.
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
TypeSize getValue() const
Representation of each machine instruction.
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
unsigned getAddrSpace() const
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const TargetRegisterInfo * getTargetRegisterInfo() const
Wrapper class representing virtual and physical registers.
Definition Register.h:20
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
bool isAnyPtr(LLT Ty, unsigned Width)
bool isUniformMMO(const MachineMemOperand *MMO)
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
SmallVector< UniformityLLTOpPredicateID, 4 > OpUniformityAndTypes
PredicateMapping(std::initializer_list< UniformityLLTOpPredicateID > OpList, std::function< bool(const MachineInstr &)> TestFunc=nullptr)
bool match(const MachineInstr &MI, const MachineUniformityInfo &MUI, const MachineRegisterInfo &MRI) const
std::function< bool(const MachineInstr &)> TestFunc
RegBankLLTMapping(std::initializer_list< RegBankLLTMappingApplyID > DstOpMappingList, std::initializer_list< RegBankLLTMappingApplyID > SrcOpMappingList, LoweringMethodID LoweringMethod=DoNotLower)
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39