LLVM 23.0.0git
AMDGPURegBankLegalizeRules.cpp
Go to the documentation of this file.
1//===-- AMDGPURegBankLegalizeRules.cpp ------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Definitions of RegBankLegalize Rules for all opcodes.
10/// Implementation of container for all the Rules and search.
11/// Fast search for most common case when Rule.Predicate checks LLT and
12/// uniformity of register in operand 0.
13//
14//===----------------------------------------------------------------------===//
15
17#include "AMDGPUInstrInfo.h"
18#include "GCNSubtarget.h"
21#include "llvm/IR/IntrinsicsAMDGPU.h"
23
24#define DEBUG_TYPE "amdgpu-regbanklegalize"
25
26using namespace llvm;
27using namespace AMDGPU;
28
29bool AMDGPU::isAnyPtr(LLT Ty, unsigned Width) {
30 return Ty.isPointer() && Ty.getSizeInBits() == Width;
31}
32
34 std::initializer_list<RegBankLLTMappingApplyID> DstOpMappingList,
35 std::initializer_list<RegBankLLTMappingApplyID> SrcOpMappingList,
37 : DstOpMapping(DstOpMappingList), SrcOpMapping(SrcOpMappingList),
39
41 std::initializer_list<UniformityLLTOpPredicateID> OpList,
42 std::function<bool(const MachineInstr &)> TestFunc)
44
46 const MachineUniformityInfo &MUI,
47 const MachineRegisterInfo &MRI) {
48 switch (UniID) {
49 case S1:
50 return MRI.getType(Reg) == LLT::scalar(1);
51 case S16:
52 return MRI.getType(Reg) == LLT::scalar(16);
53 case S32:
54 return MRI.getType(Reg) == LLT::scalar(32);
55 case S64:
56 return MRI.getType(Reg) == LLT::scalar(64);
57 case S128:
58 return MRI.getType(Reg) == LLT::scalar(128);
59 case P0:
60 return MRI.getType(Reg) == LLT::pointer(0, 64);
61 case P1:
62 return MRI.getType(Reg) == LLT::pointer(1, 64);
63 case P2:
64 return MRI.getType(Reg) == LLT::pointer(2, 32);
65 case P3:
66 return MRI.getType(Reg) == LLT::pointer(3, 32);
67 case P4:
68 return MRI.getType(Reg) == LLT::pointer(4, 64);
69 case P5:
70 return MRI.getType(Reg) == LLT::pointer(5, 32);
71 case P8:
72 return MRI.getType(Reg) == LLT::pointer(8, 128);
73 case Ptr32:
74 return isAnyPtr(MRI.getType(Reg), 32);
75 case Ptr64:
76 return isAnyPtr(MRI.getType(Reg), 64);
77 case Ptr128:
78 return isAnyPtr(MRI.getType(Reg), 128);
79 case V2S16:
80 return MRI.getType(Reg) == LLT::fixed_vector(2, 16);
81 case V2S32:
82 return MRI.getType(Reg) == LLT::fixed_vector(2, 32);
83 case V3S32:
84 return MRI.getType(Reg) == LLT::fixed_vector(3, 32);
85 case V4S32:
86 return MRI.getType(Reg) == LLT::fixed_vector(4, 32);
87 case B32:
88 return MRI.getType(Reg).getSizeInBits() == 32;
89 case B64:
90 return MRI.getType(Reg).getSizeInBits() == 64;
91 case B96:
92 return MRI.getType(Reg).getSizeInBits() == 96;
93 case B128:
94 return MRI.getType(Reg).getSizeInBits() == 128;
95 case B160:
96 return MRI.getType(Reg).getSizeInBits() == 160;
97 case B256:
98 return MRI.getType(Reg).getSizeInBits() == 256;
99 case B512:
100 return MRI.getType(Reg).getSizeInBits() == 512;
101 case DivAnyTy:
102 return MUI.isDivergentAtDef(Reg);
103 case UniS1:
104 return MRI.getType(Reg) == LLT::scalar(1) && MUI.isUniformAtDef(Reg);
105 case UniS16:
106 return MRI.getType(Reg) == LLT::scalar(16) && MUI.isUniformAtDef(Reg);
107 case UniS32:
108 return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniformAtDef(Reg);
109 case UniS64:
110 return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniformAtDef(Reg);
111 case UniS128:
112 return MRI.getType(Reg) == LLT::scalar(128) && MUI.isUniformAtDef(Reg);
113 case UniP0:
114 return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isUniformAtDef(Reg);
115 case UniP1:
116 return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isUniformAtDef(Reg);
117 case UniP2:
118 return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isUniformAtDef(Reg);
119 case UniP3:
120 return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isUniformAtDef(Reg);
121 case UniP4:
122 return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniformAtDef(Reg);
123 case UniP5:
124 return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniformAtDef(Reg);
125 case UniP6:
126 return MRI.getType(Reg) == LLT::pointer(6, 32) && MUI.isUniformAtDef(Reg);
127 case UniP8:
128 return MRI.getType(Reg) == LLT::pointer(8, 128) && MUI.isUniformAtDef(Reg);
129 case UniPtr32:
130 return isAnyPtr(MRI.getType(Reg), 32) && MUI.isUniformAtDef(Reg);
131 case UniPtr64:
132 return isAnyPtr(MRI.getType(Reg), 64) && MUI.isUniformAtDef(Reg);
133 case UniPtr128:
134 return isAnyPtr(MRI.getType(Reg), 128) && MUI.isUniformAtDef(Reg);
135 case UniV2S16:
136 return MRI.getType(Reg) == LLT::fixed_vector(2, 16) &&
137 MUI.isUniformAtDef(Reg);
138 case UniV2S32:
139 return MRI.getType(Reg) == LLT::fixed_vector(2, 32) &&
140 MUI.isUniformAtDef(Reg);
141 case UniV3S32:
142 return MRI.getType(Reg) == LLT::fixed_vector(3, 32) &&
143 MUI.isUniformAtDef(Reg);
144 case UniV4S32:
145 return MRI.getType(Reg) == LLT::fixed_vector(4, 32) &&
146 MUI.isUniformAtDef(Reg);
147 case UniV6S32:
148 return MRI.getType(Reg) == LLT::fixed_vector(6, 32) &&
149 MUI.isUniformAtDef(Reg);
150 case UniV8S16:
151 return MRI.getType(Reg) == LLT::fixed_vector(8, 16) &&
152 MUI.isUniformAtDef(Reg);
153 case UniV8S32:
154 return MRI.getType(Reg) == LLT::fixed_vector(8, 32) &&
155 MUI.isUniformAtDef(Reg);
156 case UniV16S16:
157 return MRI.getType(Reg) == LLT::fixed_vector(16, 16) &&
158 MUI.isUniformAtDef(Reg);
159 case UniV16S32:
160 return MRI.getType(Reg) == LLT::fixed_vector(16, 32) &&
161 MUI.isUniformAtDef(Reg);
162 case UniV32S16:
163 return MRI.getType(Reg) == LLT::fixed_vector(32, 16) &&
164 MUI.isUniformAtDef(Reg);
165 case UniV32S32:
166 return MRI.getType(Reg) == LLT::fixed_vector(32, 32) &&
167 MUI.isUniformAtDef(Reg);
168 case UniB32:
169 return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniformAtDef(Reg);
170 case UniB64:
171 return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isUniformAtDef(Reg);
172 case UniB96:
173 return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isUniformAtDef(Reg);
174 case UniB128:
175 return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isUniformAtDef(Reg);
176 case UniB160:
177 return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isUniformAtDef(Reg);
178 case UniB256:
179 return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isUniformAtDef(Reg);
180 case UniB512:
181 return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isUniformAtDef(Reg);
182 case UniBRC: {
183 if (MUI.isDivergentAtDef(Reg))
184 return false;
185 // Check if there is SGPR register class of same size as the LLT.
186 const SIRegisterInfo *TRI =
187 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
188 // There is no 16 bit SGPR register class. Extra size check is required
189 // since getSGPRClassForBitWidth returns SReg_32RegClass for Size 16.
190 unsigned LLTSize = MRI.getType(Reg).getSizeInBits();
191 return LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize);
192 }
193 case DivS1:
194 return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergentAtDef(Reg);
195 case DivS16:
196 return MRI.getType(Reg) == LLT::scalar(16) && MUI.isDivergentAtDef(Reg);
197 case DivS32:
198 return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergentAtDef(Reg);
199 case DivS64:
200 return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergentAtDef(Reg);
201 case DivS128:
202 return MRI.getType(Reg) == LLT::scalar(128) && MUI.isDivergentAtDef(Reg);
203 case DivP0:
204 return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isDivergentAtDef(Reg);
205 case DivP1:
206 return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergentAtDef(Reg);
207 case DivP2:
208 return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isDivergentAtDef(Reg);
209 case DivP3:
210 return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isDivergentAtDef(Reg);
211 case DivP4:
212 return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isDivergentAtDef(Reg);
213 case DivP5:
214 return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isDivergentAtDef(Reg);
215 case DivPtr32:
216 return isAnyPtr(MRI.getType(Reg), 32) && MUI.isDivergentAtDef(Reg);
217 case DivPtr64:
218 return isAnyPtr(MRI.getType(Reg), 64) && MUI.isDivergentAtDef(Reg);
219 case DivPtr128:
220 return isAnyPtr(MRI.getType(Reg), 128) && MUI.isDivergentAtDef(Reg);
221 case DivV2S16:
222 return MRI.getType(Reg) == LLT::fixed_vector(2, 16) &&
224 case DivV2S32:
225 return MRI.getType(Reg) == LLT::fixed_vector(2, 32) &&
227 case DivV4S32:
228 return MRI.getType(Reg) == LLT::fixed_vector(4, 32) &&
230 case DivV2S64:
231 return MRI.getType(Reg) == LLT::fixed_vector(2, 64) &&
233 case DivV3S32:
234 return MRI.getType(Reg) == LLT::fixed_vector(3, 32) &&
236 case DivV4S16:
237 return MRI.getType(Reg) == LLT::fixed_vector(4, 16) &&
239 case DivV8S16:
240 return MRI.getType(Reg) == LLT::fixed_vector(8, 16) &&
242 case DivV8S32:
243 return MRI.getType(Reg) == LLT::fixed_vector(8, 32) &&
245 case DivV16S16:
246 return MRI.getType(Reg) == LLT::fixed_vector(16, 16) &&
248 case DivV16S32:
249 return MRI.getType(Reg) == LLT::fixed_vector(16, 32) &&
251 case DivV6S32:
252 return MRI.getType(Reg) == LLT::fixed_vector(6, 32) &&
254 case DivV32S16:
255 return MRI.getType(Reg) == LLT::fixed_vector(32, 16) &&
257 case DivV32S32:
258 return MRI.getType(Reg) == LLT::fixed_vector(32, 32) &&
260 case DivB32:
261 return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergentAtDef(Reg);
262 case DivB64:
263 return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isDivergentAtDef(Reg);
264 case DivB96:
265 return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isDivergentAtDef(Reg);
266 case DivB128:
267 return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isDivergentAtDef(Reg);
268 case DivB160:
269 return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isDivergentAtDef(Reg);
270 case DivB256:
271 return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isDivergentAtDef(Reg);
272 case DivB512:
273 return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isDivergentAtDef(Reg);
274 case DivBRC: {
275 if (MUI.isUniformAtDef(Reg))
276 return false;
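277 // Check if there is VGPR register class of same size as the LLT.
    // NOTE(review): the call below is getSGPRClassForBitWidth, while this
    // comment talks about VGPR and the BRC case uses getVGPRClassForBitWidth
    // for its VGPR check. Confirm which register file is intended here.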
278 const SIRegisterInfo *TRI =
279 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
280 return TRI->getSGPRClassForBitWidth(MRI.getType(Reg).getSizeInBits());
281 }
282 case BRC: {
283 // Check if there is SGPR and VGPR register class of same size as the LLT.
284 const SIRegisterInfo *TRI =
285 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
286 unsigned LLTSize = MRI.getType(Reg).getSizeInBits();
287 return LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize) &&
288 TRI->getVGPRClassForBitWidth(LLTSize);
289 }
290 case _:
291 return true;
292 default:
293 llvm_unreachable("missing matchUniformityAndLLT");
294 }
295}
296
298 const MachineUniformityInfo &MUI,
299 const MachineRegisterInfo &MRI) const {
    // Returns true when MI satisfies this predicate: each entry of
    // OpUniformityAndTypes must accept the operand at the same index and, if
    // TestFunc is set, TestFunc must accept MI as well.
    // NOTE(review): operands are indexed by the position in
    // OpUniformityAndTypes without comparing against the operand count of MI;
    // presumably rules never list more IDs than MI has operands - confirm.
300 // Check LLT signature.
301 for (unsigned i = 0; i < OpUniformityAndTypes.size(); ++i) {
302 const MachineOperand &MO = MI.getOperand(i);
303 if (OpUniformityAndTypes[i] == _) {
    // '_' performs no check on this operand. The assert only enforces that
    // it is not used for virtual register operands.
304 assert((!MI.getOperand(i).isReg() ||
305 !MI.getOperand(i).getReg().isVirtual()) &&
306 "_ is for non-register and physical register operands only");
307 continue;
308 }
309
310 // Remaining IDs check registers.
311 if (!MO.isReg())
312 return false;
313
314 if (!matchUniformityAndLLT(MO.getReg(), OpUniformityAndTypes[i], MUI, MRI))
315 return false;
316 }
317
318 // More complex check.
    // TestFunc is optional; when it is empty the operand checks above decide.
319 if (TestFunc)
320 return TestFunc(MI);
321
322 return true;
323}
324
326
328 : FastTypes(FastTypes) {}
329
// Maps Ty to the matching scalar/vector ID (S16, S32, S64, V2S16, V2S32,
// V3S32 or V4S32) by exact LLT comparison.
331 if (Ty == LLT::scalar(16))
332 return S16;
333 if (Ty == LLT::scalar(32))
334 return S32;
335 if (Ty == LLT::scalar(64))
336 return S64;
337 if (Ty == LLT::fixed_vector(2, 16))
338 return V2S16;
339 if (Ty == LLT::fixed_vector(2, 32))
340 return V2S32;
341 if (Ty == LLT::fixed_vector(3, 32))
342 return V3S32;
343 if (Ty == LLT::fixed_vector(4, 32))
344 return V4S32;
    // Any other type (for example pointers or other sizes) has no ID here.
345 return _;
346}
347
// Maps Ty to a size-based ID (B32, B64, B96 or B128). Scalars, fixed vectors
// and pointers of any address space with the listed shapes share one ID per
// total bit width.
349 if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
350 isAnyPtr(Ty, 32))
351 return B32;
352 if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
353 Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
354 return B64;
355 if (Ty == LLT::fixed_vector(3, 32))
356 return B96;
357 if (Ty == LLT::fixed_vector(4, 32) || Ty == LLT::fixed_vector(2, 64) ||
358 Ty == LLT::fixed_vector(8, 16) || isAnyPtr(Ty, 128))
359 return B128;
    // Types not listed above have no B-ID, even if their size would match.
360 return _;
361}
362
// Looks up the operand mapping that applies to MI. The fast path is tried
// first when enabled; otherwise, or when the type of operand 0 has no fast
// slot, Rules is scanned in order and the first matching rule wins.
// Returns nullptr when no rule matches.
363const RegBankLLTMapping *
365 const MachineRegisterInfo &MRI,
366 const MachineUniformityInfo &MUI) const {
367 // Search in "Fast Rules".
368 // Note: if fast rules are enabled, RegBankLLTMapping must be added in each
369 // slot that could "match fast Predicate". If not, InvalidMapping is
370 // returned which results in failure, does not search "Slow Rules".
371 if (FastTypes != NoFastRules) {
    // The fast path keys only on operand 0: its LLT selects the slot and its
    // uniformity selects between the Uni and Div tables.
372 Register Reg = MI.getOperand(0).getReg();
373 int Slot;
374 if (FastTypes == StandardB)
375 Slot = getFastPredicateSlot(LLTToBId(MRI.getType(Reg)));
376 else
377 Slot = getFastPredicateSlot(LLTToId(MRI.getType(Reg)));
378
    // Slot == -1 means the type is not covered by the fast tables, so the
    // slow search below is used instead.
379 if (Slot != -1)
380 return MUI.isUniformAtDef(Reg) ? &Uni[Slot] : &Div[Slot];
381 }
382
383 // Slow search for more complex rules.
384 for (const RegBankLegalizeRule &Rule : Rules) {
385 if (Rule.Predicate.match(MI, MUI, MRI))
386 return &Rule.OperandMapping;
387 }
388
389 return nullptr;
390}
391
    // Appends Rule to Rules; insertion order is preserved.
393 Rules.push_back(Rule);
394}
395
397 RegBankLLTMapping RuleApplyIDs) {
    // Stores RuleApplyIDs in the Div table at the slot that
    // getFastPredicateSlot assigns to Ty.
    // NOTE(review): the assert is the only guard; with asserts disabled a
    // Slot of -1 would be used as the index - confirm callers never pass an
    // unsupported Ty.
398 int Slot = getFastPredicateSlot(Ty);
399 assert(Slot != -1 && "Ty unsupported in this FastRulesTypes");
400 Div[Slot] = std::move(RuleApplyIDs);
401}
402
404 RegBankLLTMapping RuleApplyIDs) {
    // Stores RuleApplyIDs in the Uni table at the slot that
    // getFastPredicateSlot assigns to Ty.
    // NOTE(review): the assert is the only guard; with asserts disabled a
    // Slot of -1 would be used as the index - confirm callers never pass an
    // unsupported Ty.
405 int Slot = getFastPredicateSlot(Ty);
406 assert(Slot != -1 && "Ty unsupported in this FastRulesTypes");
407 Uni[Slot] = std::move(RuleApplyIDs);
408}
409
// Translates Ty into an index (0..3) of the fast-rule tables for the
// FastTypes flavour of this rule set. Returns -1 when Ty has no slot in that
// flavour, or when FastTypes is not one of Standard, StandardB or Vector.
410int SetOfRulesForOpcode::getFastPredicateSlot(
412 switch (FastTypes) {
    // Standard: S32, S16, S64, V2S16.
413 case Standard: {
414 switch (Ty) {
415 case S32:
416 return 0;
417 case S16:
418 return 1;
419 case S64:
420 return 2;
421 case V2S16:
422 return 3;
423 default:
424 return -1;
425 }
426 }
    // StandardB: size-based IDs B32, B64, B96, B128.
427 case StandardB: {
428 switch (Ty) {
429 case B32:
430 return 0;
431 case B64:
432 return 1;
433 case B96:
434 return 2;
435 case B128:
436 return 3;
437 default:
438 return -1;
439 }
440 }
    // Vector: S32 and the 2/3/4-element vectors of S32.
441 case Vector: {
442 switch (Ty) {
443 case S32:
444 return 0;
445 case V2S32:
446 return 1;
447 case V3S32:
448 return 2;
449 case V4S32:
450 return 3;
451 default:
452 return -1;
453 }
454 }
455 default:
456 return -1;
457 }
458}
459
// Creates a RuleSetInitializer for the opcodes in OpcList, bound to the
// GRulesAlias and GRules members and using the given FastTypes flavour.
// GRules is the table that getRulesForOpc consults for non-intrinsic opcodes.
460RegBankLegalizeRules::RuleSetInitializer
461RegBankLegalizeRules::addRulesForGOpcs(std::initializer_list<unsigned> OpcList,
462 FastRulesTypes FastTypes) {
463 return RuleSetInitializer(OpcList, GRulesAlias, GRules, FastTypes);
464}
465
// Creates a RuleSetInitializer for the IDs in OpcList, bound to the
// IRulesAlias and IRules members and using the given FastTypes flavour.
// IRules is the table that getRulesForOpc consults by intrinsic ID.
466RegBankLegalizeRules::RuleSetInitializer
467RegBankLegalizeRules::addRulesForIOpcs(std::initializer_list<unsigned> OpcList,
468 FastRulesTypes FastTypes) {
469 return RuleSetInitializer(OpcList, IRulesAlias, IRules, FastTypes);
470}
471
    // Returns the rule set registered for MI, or nullptr when none exists.
    // The four G_INTRINSIC* opcodes are resolved by intrinsic ID through
    // IRulesAlias/IRules; every other opcode goes through GRulesAlias/GRules.
474 unsigned Opc = MI.getOpcode();
475 if (Opc == AMDGPU::G_INTRINSIC || Opc == AMDGPU::G_INTRINSIC_CONVERGENT ||
476 Opc == AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS ||
477 Opc == AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS) {
478 unsigned IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
479 auto IRAIt = IRulesAlias.find(IntrID);
480 if (IRAIt == IRulesAlias.end())
481 return nullptr;
    // The alias table yields the key under which the rule set is stored.
482 return &IRules.at(IRAIt->second);
483 }
484
485 auto GRAIt = GRulesAlias.find(Opc);
486 if (GRAIt == GRulesAlias.end())
487 return nullptr;
488 return &GRules.at(GRAIt->second);
489}
490
491// Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and '!'.
492class Predicate {
493private:
494 struct Elt {
495 // Save formula composed of Pred, '&&', '||' and '!' as a jump table.
496 // Sink ! to Pred. For example !((A && !B) || C) -> (!A || B) && !C
497 // Sequences of && and || will be represented by jumps, for example:
498 // (A && B && ... X) or (A && B && ... X) || Y
499 // A == true jump to B
500 // A == false jump to end or Y, result is A(false) or Y
501 // (A || B || ... X) or (A || B || ... X) && Y
502 // A == true jump to end or Y, result is A(true) or Y
503 // A == false jump to B
504 // Notice that when negating expression, we simply flip Neg on each Pred
505 // and swap TJumpOffset and FJumpOffset (&& becomes ||, || becomes &&).
506 std::function<bool(const MachineInstr &)> Pred;
507 bool Neg; // Neg of Pred is calculated before jump
508 unsigned TJumpOffset;
509 unsigned FJumpOffset;
510 };
511
512 SmallVector<Elt, 8> Expression;
513
514 Predicate(SmallVectorImpl<Elt> &&Expr) { Expression.swap(Expr); };
515
516public:
517 Predicate(std::function<bool(const MachineInstr &)> Pred) {
518 Expression.push_back({Pred, false, 1, 1});
519 };
520
521 bool operator()(const MachineInstr &MI) const {
522 unsigned Idx = 0;
523 unsigned ResultIdx = Expression.size();
524 bool Result;
525 do {
526 Result = Expression[Idx].Pred(MI);
527 Result = Expression[Idx].Neg ? !Result : Result;
528 if (Result) {
529 Idx += Expression[Idx].TJumpOffset;
530 } else {
531 Idx += Expression[Idx].FJumpOffset;
532 }
533 } while ((Idx != ResultIdx));
534
535 return Result;
536 };
537
538 Predicate operator!() const {
539 SmallVector<Elt, 8> NegExpression;
540 for (const Elt &ExprElt : Expression) {
541 NegExpression.push_back({ExprElt.Pred, !ExprElt.Neg, ExprElt.FJumpOffset,
542 ExprElt.TJumpOffset});
543 }
544 return Predicate(std::move(NegExpression));
545 };
546
547 Predicate operator&&(const Predicate &RHS) const {
548 SmallVector<Elt, 8> AndExpression = Expression;
549
550 unsigned RHSSize = RHS.Expression.size();
551 unsigned ResultIdx = Expression.size();
552 for (unsigned i = 0; i < ResultIdx; ++i) {
553 // LHS results in false, whole expression results in false.
554 if (i + AndExpression[i].FJumpOffset == ResultIdx)
555 AndExpression[i].FJumpOffset += RHSSize;
556 }
557
558 AndExpression.append(RHS.Expression);
559
560 return Predicate(std::move(AndExpression));
561 }
562
563 Predicate operator||(const Predicate &RHS) const {
564 SmallVector<Elt, 8> OrExpression = Expression;
565
566 unsigned RHSSize = RHS.Expression.size();
567 unsigned ResultIdx = Expression.size();
568 for (unsigned i = 0; i < ResultIdx; ++i) {
569 // LHS results in true, whole expression results in true.
570 if (i + OrExpression[i].TJumpOffset == ResultIdx)
571 OrExpression[i].TJumpOffset += RHSSize;
572 }
573
574 OrExpression.append(RHS.Expression);
575
576 return Predicate(std::move(OrExpression));
577 }
578};
579
580// Initialize rules
583 : ST(&_ST), MRI(&_MRI) {
584
585 addRulesForGOpcs({G_ADD, G_SUB}, Standard)
586 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})
587 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
588 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
589 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
591 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
592 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}})
593 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}});
594
595 addRulesForGOpcs({G_UADDO, G_USUBO}, Standard)
596 .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32}})
597 .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32}});
598
599 addRulesForGOpcs({G_UADDE, G_USUBE, G_SADDE, G_SSUBE}, Standard)
601 .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32, Vcc}});
602
603 addRulesForGOpcs({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT}, Standard)
604 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}})
605 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
606 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
607 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
609 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
610
611 bool HasVecMulU64 = ST->hasVMulU64Inst();
612 addRulesForGOpcs({G_MUL}, Standard)
613 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
614 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
615 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
616 .Uni(S64, {{SgprB64}, {SgprB64, SgprB64}})
618 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
619 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})
620 .Div(S64, {{VgprB64}, {VgprB64, VgprB64}}, HasVecMulU64)
621 .Div(S64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32Mul}, !HasVecMulU64);
622
623 bool hasMulHi = ST->hasScalarMulHiInsts();
624 addRulesForGOpcs({G_UMULH, G_SMULH}, Standard)
625 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
626 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasMulHi)
627 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasMulHi);
628
629 addRulesForGOpcs({G_AMDGPU_MAD_U64_U32}, Standard)
630 .Div(S64, {{Vgpr64, Vcc}, {Vgpr32, Vgpr32, Vgpr64}})
632
633 bool HasScalarSMulU64 = ST->hasScalarSMulU64();
634 addRulesForGOpcs({G_AMDGPU_S_MUL_U64_U32, G_AMDGPU_S_MUL_I64_I32}, Standard)
635 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}, UniMul64}, HasScalarSMulU64)
636 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}, DivSMulToMAD});
637
638 addRulesForGOpcs({G_XOR, G_OR, G_AND}, StandardB)
640 .Any({{DivS1}, {{Vcc}, {Vcc, Vcc}}})
641 .Any({{UniS16}, {{Sgpr16}, {Sgpr16, Sgpr16}}})
642 .Any({{DivS16}, {{Vgpr16}, {Vgpr16, Vgpr16}}})
643 .Uni(B32, {{SgprB32}, {SgprB32, SgprB32}})
644 .Div(B32, {{VgprB32}, {VgprB32, VgprB32}})
645 .Uni(B64, {{SgprB64}, {SgprB64, SgprB64}})
646 .Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32});
647
648 addRulesForGOpcs({G_SHL}, Standard)
649 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32ZExt}})
650 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
652 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
653 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
654 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
655 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
656 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
657
658 addRulesForGOpcs({G_LSHR}, Standard)
659 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}})
660 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
662 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
663 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
664 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
665 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
666 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
667
668 addRulesForGOpcs({G_ASHR}, Standard)
669 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32ZExt}})
670 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
672 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
673 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
674 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
675 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
676 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
677
678 addRulesForGOpcs({G_FSHR}, Standard)
679 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
680 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
681
682 addRulesForGOpcs({G_BSWAP}, Standard)
683 .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
684 .Div(S16, {{Vgpr16}, {Vgpr16}})
685 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
686 .Div(S32, {{Vgpr32}, {Vgpr32}})
687 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}})
688 .Div(V2S16, {{VgprV2S16}, {VgprV2S16}});
689
690 addRulesForGOpcs({G_AMDGPU_CVT_F32_UBYTE0, G_AMDGPU_CVT_F32_UBYTE1,
691 G_AMDGPU_CVT_F32_UBYTE2, G_AMDGPU_CVT_F32_UBYTE3,
692 G_AMDGPU_RCP_IFLAG},
693 Standard)
694 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
695 .Div(S32, {{Vgpr32}, {Vgpr32}});
696
697 addRulesForGOpcs({G_FRAME_INDEX}).Any({{UniP5, _}, {{SgprP5}, {None}}});
698
699 addRulesForGOpcs({G_UBFX, G_SBFX}, Standard)
700 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}, S_BFE})
701 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}})
702 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32, Sgpr32}, S_BFE})
703 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32, Vgpr32}, V_BFE});
704
705 addRulesForGOpcs({G_SMIN, G_SMAX}, Standard)
706 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32SExt}})
707 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
708 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
709 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
711 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
712 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
713 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}});
714
715 addRulesForGOpcs({G_UMIN, G_UMAX}, Standard)
716 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}})
717 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
718 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
719 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
721 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
722 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
723 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}});
724
725 addRulesForGOpcs({G_IMPLICIT_DEF})
726 .Any({{UniS1}, {{Sgpr32Trunc}, {}}})
727 .Any({{UniS16}, {{Sgpr16}, {}}})
728 .Any({{UniBRC}, {{SgprBRC}, {}}});
729
730 addRulesForGOpcs({G_CONSTANT}, Standard)
731 .Any({{UniS1, _}, {{Sgpr32Trunc}, {}, UniCstExt}})
732 .Uni(S16, {{Sgpr16}, {}})
733 .Uni(S32, {{Sgpr32}, {}})
734 .Uni(S64, {{Sgpr64}, {}})
735 .Any({{UniPtr32, _}, {{SgprPtr32}, {}}})
736 .Any({{UniPtr64, _}, {{SgprPtr64}, {}}});
737
738 addRulesForGOpcs({G_FCONSTANT}, Standard)
739 .Uni(S16, {{Sgpr16}, {}})
740 .Uni(S32, {{Sgpr32}, {}})
741 .Uni(S64, {{Sgpr64}, {}});
742
743 addRulesForGOpcs({G_FREEZE})
744 .Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt}}})
745 .Any({{DivS1}, {{Vcc}, {Vcc}}})
746 .Any({{UniS16}, {{Sgpr16}, {Sgpr16}}})
747 .Any({{UniBRC}, {{SgprBRC}, {SgprBRC}}})
748 .Any({{DivBRC}, {{VgprBRC}, {VgprBRC}}});
749
750 addRulesForGOpcs({G_BITCAST})
751 .Any({{UniBRC}, {{SgprBRC}, {SgprBRC}}})
752 .Any({{DivBRC}, {{VgprBRC}, {VgprBRC}}});
753
754 addRulesForGOpcs({G_UNMERGE_VALUES})
755 .Any({{UniS16}, {{}, {}, UnmergeToShiftTrunc}})
756 .Any({{UniBRC}, {{}, {}, VerifyAllSgpr}})
757 .Any({{DivBRC}, {{}, {}, ApplyAllVgpr}});
758
759 addRulesForGOpcs({G_BUILD_VECTOR, G_MERGE_VALUES})
760 .Any({{UniBRC, S16}, {{}, {}, VerifyAllSgpr}})
761 .Any({{UniBRC, BRC}, {{}, {}, VerifyAllSgpr}})
762 .Any({{DivBRC, S16}, {{}, {}, ApplyAllVgpr}})
763 .Any({{DivBRC, BRC}, {{}, {}, ApplyAllVgpr}});
764
765 addRulesForGOpcs({G_CONCAT_VECTORS})
766 .Any({{UniBRC, BRC}, {{}, {}, VerifyAllSgpr}})
767 .Any({{DivBRC, BRC}, {{}, {}, ApplyAllVgpr}});
768
769 addRulesForGOpcs({G_PHI})
770 .Any({{UniS1}, {{}, {}, AextToS32InIncomingBlockGPHI}})
771 .Any({{UniS16}, {{}, {}, VerifyAllSgprGPHI}})
772 .Any({{UniBRC}, {{}, {}, VerifyAllSgprGPHI}})
773 .Any({{DivBRC}, {{}, {}, VerifyAllSgprOrVgprGPHI}});
774
775 addRulesForGOpcs({G_EXTRACT_VECTOR_ELT})
776 .Any({{UniB32, UniBRC, UniS32}, {{SgprB32}, {SgprBRC, Sgpr32}}})
777 .Any({{DivB32, DivBRC, UniS32}, {{VgprB32}, {VgprBRC, Sgpr32}}})
778 .Any({{DivB32, BRC, DivS32},
780 .Any({{UniB64, UniBRC, UniS32}, {{SgprB64}, {SgprBRC, Sgpr32}}})
781 .Any({{DivB64, DivBRC, UniS32},
783 .Any({{DivB64, BRC, DivS32},
785
786 addRulesForGOpcs({G_INSERT_VECTOR_ELT})
788 {{SgprBRC}, {SgprBRC, SgprB32, Sgpr32}}})
789 .Any(
790 {{DivBRC, BRC, B32, UniS32}, {{VgprBRC}, {VgprBRC, VgprB32, Sgpr32}}})
791 .Any({{DivBRC, BRC, B32, DivS32},
795 .Any({{DivBRC, BRC, B64, UniS32},
797 .Any({{DivBRC, BRC, B64, DivS32},
799
800 // INTERSECT_RAY {Div}, {{VgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
801 // INTERSECT_RAY {Uni}, {{UniInVgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
802 addRulesForGOpcs({G_AMDGPU_BVH_INTERSECT_RAY, G_AMDGPU_BVH_DUAL_INTERSECT_RAY,
803 G_AMDGPU_BVH8_INTERSECT_RAY})
804 .Any({{}, {{}, {}, ApplyBVH_INTERSECT_RAY}});
805
806 // LOAD {Div}, {{VgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
807 // LOAD {Uni}, {{UniInVgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
808 // LOAD_NORET {}, {{}, {Imm, VgprSrc, ..., Sgpr_WF_RsrcIdx}}
809 // STORE {}, {{}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
810 addRulesForGOpcs({G_AMDGPU_INTRIN_IMAGE_LOAD, G_AMDGPU_INTRIN_IMAGE_LOAD_D16,
811 G_AMDGPU_INTRIN_IMAGE_LOAD_NORET,
812 G_AMDGPU_INTRIN_IMAGE_STORE,
813 G_AMDGPU_INTRIN_IMAGE_STORE_D16})
814 .Any({{}, {{}, {}, ApplyINTRIN_IMAGE}});
815
816 Predicate isSignedICmp([](const MachineInstr &MI) -> bool {
817 auto Pred =
818 static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
819 return CmpInst::isSigned(Pred);
820 });
821
822 Predicate isEqualityICmp([](const MachineInstr &MI) -> bool {
823 auto Pred =
824 static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
825 return ICmpInst::isEquality(Pred);
826 });
827
828 bool HasScalarCompareEq64 = ST->hasScalarCompareEq64();
829 // clang-format off
830 addRulesForGOpcs({G_ICMP})
831 .Any({{{UniS1, _, S16}, isEqualityICmp}, {{Sgpr32Trunc}, {None, Sgpr32ZExt, Sgpr32ZExt}}})
832 .Any({{{UniS1, _, S16}, !isEqualityICmp && isSignedICmp}, {{Sgpr32Trunc}, {None, Sgpr32SExt, Sgpr32SExt}}})
833 .Any({{{UniS1, _, S16}, !isEqualityICmp && !isSignedICmp}, {{Sgpr32Trunc}, {None, Sgpr32ZExt, Sgpr32ZExt}}})
834 .Any({{{DivS1, _, S16}}, {{Vcc}, {None, Vgpr16, Vgpr16}}})
835 .Any({{{UniS1, _, S32}}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
836 .Any({{{DivS1, _, S32}}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
837 .Any({{{UniS1, _, S64}, isEqualityICmp}, {{Sgpr32Trunc}, {None, Sgpr64, Sgpr64}}}, HasScalarCompareEq64)
838 .Any({{{UniS1, _, S64}, isEqualityICmp}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}}, !HasScalarCompareEq64)
839 .Any({{{UniS1, _, S64}, !isEqualityICmp}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}})
840 .Any({{{DivS1, _, S64}}, {{Vcc}, {None, Vgpr64, Vgpr64}}})
841 .Any({{{UniS1, _, Ptr32}}, {{Sgpr32Trunc}, {None, SgprPtr32, SgprPtr32}}})
842 .Any({{{DivS1, _, Ptr32}}, {{Vcc}, {None, VgprPtr32, VgprPtr32}}})
843 .Any({{{UniS1, _, Ptr64}, isEqualityICmp}, {{Sgpr32Trunc}, {None, SgprPtr64, SgprPtr64}}}, HasScalarCompareEq64)
844 .Any({{{UniS1, _, Ptr64}, isEqualityICmp}, {{UniInVcc}, {None, VgprPtr64, VgprPtr64}}}, !HasScalarCompareEq64)
845 .Any({{{UniS1, _, Ptr64}, !isEqualityICmp}, {{UniInVcc}, {None, VgprPtr64, VgprPtr64}}})
846 .Any({{{DivS1, _, Ptr64}}, {{Vcc}, {None, VgprPtr64, VgprPtr64}}});
847 // clang-format on
848
849 addRulesForGOpcs({G_BRCOND})
850 .Any({{UniS1}, {{}, {Sgpr32AExtBoolInReg}}})
851 .Any({{DivS1}, {{}, {Vcc}}});
852
853 addRulesForGOpcs({G_BR}).Any({{_}, {{}, {None}}});
854
855 addRulesForGOpcs({G_SELECT}, StandardB)
856 .Any({{DivS16}, {{Vgpr16}, {Vcc, Vgpr16, Vgpr16}}})
858 .Div(B32, {{VgprB32}, {Vcc, VgprB32, VgprB32}})
862
863 addRulesForGOpcs({G_ANYEXT})
864 .Any({{UniS16, S1}, {{None}, {None}}}) // should be combined away
865 .Any({{UniS32, S1}, {{None}, {None}}}) // should be combined away
866 .Any({{UniS64, S1}, {{None}, {None}}}) // should be combined away
867 .Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
868 .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
869 .Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
870 .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
871 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
872 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
873 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
874
875 bool Has16bitCmp = ST->has16BitInsts();
876
877 // In global-isel G_TRUNC in-reg is treated as no-op, inst selected into COPY.
878 // It is up to user to deal with truncated bits.
879 // S1, S16, S32 and S64 results are handled with specific rules. Remaining
880 // (result, source) pairs with valid register classes are covered by the
881 // generic UniBRC/DivBRC wildcard rules.
882 addRulesForGOpcs({G_TRUNC})
883 .Any({{UniS1, UniS16}, {{None}, {None}}}) // should be combined away
884 .Any({{UniS1, UniS32}, {{None}, {None}}}) // should be combined away
885 .Any({{UniS1, UniS64}, {{None}, {None}}}) // should be combined away
886 .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}})
887 .Any({{UniBRC, UniBRC}, {{SgprBRC}, {SgprBRC}}})
888 .Any({{DivBRC, DivBRC}, {{VgprBRC}, {VgprBRC}}})
889 .Any({{UniV2S16, V2S32}, {{SgprV2S16}, {SgprV2S32}}})
890 .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}})
891 // This is non-trivial. VgprToVccCopy is done using compare instruction.
892 .Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}}, Has16bitCmp)
894 !Has16bitCmp)
895 .Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}})
896 .Any({{DivS1, DivS64}, {{Vcc}, {Vgpr64}, VgprToVccCopy}});
897
898 addRulesForGOpcs({G_ZEXT})
902 .Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
903 .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
904 .Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
905 .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
906 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
907 // not extending S16 to S32 is questionable.
908 .Any({{UniS64, S16}, {{Sgpr64}, {Sgpr32ZExt}, Ext32To64}})
909 .Any({{DivS64, S16}, {{Vgpr64}, {Vgpr32ZExt}, Ext32To64}})
910 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
911 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
912
913 addRulesForGOpcs({G_SEXT})
917 .Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
918 .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
919 .Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
920 .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
921 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
922 // not extending S16 to S32 is questionable.
923 .Any({{UniS64, S16}, {{Sgpr64}, {Sgpr32SExt}, Ext32To64}})
924 .Any({{DivS64, S16}, {{Vgpr64}, {Vgpr32SExt}, Ext32To64}})
925 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
926 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
927
928 addRulesForGOpcs({G_SEXT_INREG})
929 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}})
930 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
931 .Any({{UniS64, S64}, {{Sgpr64}, {Sgpr64}}})
933
934 addRulesForGOpcs({G_ASSERT_ZEXT, G_ASSERT_SEXT}, Standard)
935 .Uni(S32, {{Sgpr32}, {Sgpr32, Imm}})
936 .Div(S32, {{Vgpr32}, {Vgpr32, Imm}})
937 .Uni(S64, {{Sgpr64}, {Sgpr64, Imm}})
938 .Div(S64, {{Vgpr64}, {Vgpr64, Imm}});
939
940 addRulesForGOpcs({G_ASSERT_ALIGN}, Standard)
941 .Uni(S32, {{Sgpr32}, {Sgpr32}})
942 .Div(S32, {{Vgpr32}, {Vgpr32}})
943 .Uni(S64, {{Sgpr64}, {Sgpr64}})
944 .Div(S64, {{Vgpr64}, {Vgpr64}})
945 .Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32}}})
946 .Any({{DivPtr32}, {{VgprPtr32}, {VgprPtr32}}})
947 .Any({{UniPtr64}, {{SgprPtr64}, {SgprPtr64}}})
948 .Any({{DivPtr64}, {{VgprPtr64}, {VgprPtr64}}});
949
950 // Atomic read-modify-write operations: result and value are always VGPR,
951 // pointer varies by address space.
952 addRulesForGOpcs({G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_XCHG,
953 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
954 G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN,
955 G_ATOMICRMW_UMAX, G_ATOMICRMW_UINC_WRAP,
956 G_ATOMICRMW_UDEC_WRAP, G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
957 .Any({{DivS32, P0, S32}, {{Vgpr32}, {VgprP0, Vgpr32}}})
958 .Any({{DivS64, P0, S64}, {{Vgpr64}, {VgprP0, Vgpr64}}})
959 .Any({{DivS32, P1, S32}, {{Vgpr32}, {VgprP1, Vgpr32}}})
960 .Any({{DivS64, P1, S64}, {{Vgpr64}, {VgprP1, Vgpr64}}})
961 .Any({{DivS32, P3, S32}, {{Vgpr32}, {VgprP3, Vgpr32}}})
962 .Any({{DivS64, P3, S64}, {{Vgpr64}, {VgprP3, Vgpr64}}});
963
964 addRulesForGOpcs({G_ATOMICRMW_USUB_SAT, G_ATOMICRMW_USUB_COND})
965 .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0, Vgpr32}}})
966 .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1, Vgpr32}}})
967 .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3, Vgpr32}}});
968
969 bool HasAtomicFlatPkAdd16Insts = ST->hasAtomicFlatPkAdd16Insts();
970 bool HasAtomicBufferGlobalPkAddF16Insts =
971 ST->hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
972 ST->hasAtomicBufferGlobalPkAddF16Insts();
973 bool HasAtomicDsPkAdd16Insts = ST->hasAtomicDsPkAdd16Insts();
974 addRulesForGOpcs({G_ATOMICRMW_FADD})
975 .Any({{DivS32, P0, S32}, {{Vgpr32}, {VgprP0, Vgpr32}}})
976 .Any({{DivS64, P0, S64}, {{Vgpr64}, {VgprP0, Vgpr64}}})
977 .Any({{DivS32, P1, S32}, {{Vgpr32}, {VgprP1, Vgpr32}}})
978 .Any({{DivS64, P1, S64}, {{Vgpr64}, {VgprP1, Vgpr64}}})
979 .Any({{DivS32, P3, S32}, {{Vgpr32}, {VgprP3, Vgpr32}}})
980 .Any({{DivS64, P3, S64}, {{Vgpr64}, {VgprP3, Vgpr64}}})
981 .Any({{DivV2S16, P0, V2S16}, {{VgprV2S16}, {VgprP0, VgprV2S16}}},
982 HasAtomicFlatPkAdd16Insts)
983 .Any({{DivV2S16, P1, V2S16}, {{VgprV2S16}, {VgprP1, VgprV2S16}}},
984 HasAtomicBufferGlobalPkAddF16Insts)
985 .Any({{DivV2S16, P3, V2S16}, {{VgprV2S16}, {VgprP3, VgprV2S16}}},
986 HasAtomicDsPkAdd16Insts);
987
988 addRulesForGOpcs({G_ATOMIC_CMPXCHG})
989 .Any({{DivS32, P2}, {{Vgpr32}, {VgprP2, Vgpr32, Vgpr32}}})
990 .Any({{DivS64, P2}, {{Vgpr64}, {VgprP2, Vgpr64, Vgpr64}}})
991 .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3, Vgpr32, Vgpr32}}})
992 .Any({{DivS64, P3}, {{Vgpr64}, {VgprP3, Vgpr64, Vgpr64}}});
993
994 addRulesForGOpcs({G_AMDGPU_ATOMIC_CMPXCHG})
995 .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0, VgprV2S32}}})
996 .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1, VgprV2S32}}})
997 .Any({{DivS64, P0}, {{Vgpr64}, {VgprP0, VgprV2S64}}})
998 .Any({{DivS64, P1}, {{Vgpr64}, {VgprP1, VgprV2S64}}});
999
1000 addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_CMPSWAP}, Standard)
1001 .Div(S32, {{Vgpr32},
1003 .Div(S64, {{Vgpr64},
1005
1006 addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_ADD, G_AMDGPU_BUFFER_ATOMIC_AND,
1007 G_AMDGPU_BUFFER_ATOMIC_DEC, G_AMDGPU_BUFFER_ATOMIC_FMAX,
1008 G_AMDGPU_BUFFER_ATOMIC_FMIN, G_AMDGPU_BUFFER_ATOMIC_INC,
1009 G_AMDGPU_BUFFER_ATOMIC_OR, G_AMDGPU_BUFFER_ATOMIC_SMAX,
1010 G_AMDGPU_BUFFER_ATOMIC_SMIN, G_AMDGPU_BUFFER_ATOMIC_SUB,
1011 G_AMDGPU_BUFFER_ATOMIC_SWAP, G_AMDGPU_BUFFER_ATOMIC_UMAX,
1012 G_AMDGPU_BUFFER_ATOMIC_UMIN, G_AMDGPU_BUFFER_ATOMIC_XOR},
1013 Standard)
1016
1017 bool hasSMRDx3 = ST->hasScalarDwordx3Loads();
1018 bool hasSMRDSmall = ST->hasScalarSubwordLoads();
1019 bool usesTrue16 = ST->useRealTrue16Insts();
1020
// Predicate: the first memory operand of MI has alignment >= Align(16).
// memoperands_begin() is dereferenced unchecked, so this assumes MI carries
// at least one memory operand - TODO confirm for every opcode using it.
1021 Predicate isAlign16([](const MachineInstr &MI) -> bool {
1022 return (*MI.memoperands_begin())->getAlign() >= Align(16);
1023 });
1024
// Predicate: the first memory operand of MI has alignment >= Align(4).
// Only the first memory operand is inspected.
1025 Predicate isAlign4([](const MachineInstr &MI) -> bool {
1026 return (*MI.memoperands_begin())->getAlign() >= Align(4);
1027 });
1028
// Predicate: the first memory operand of MI reports isAtomic().
1029 Predicate isAtomicMMO([](const MachineInstr &MI) -> bool {
1030 return (*MI.memoperands_begin())->isAtomic();
1031 });
1032
// Predicate: forwards the first memory operand of MI to
// AMDGPU::isUniformMMO and returns its answer.
1033 Predicate isUniMMO([](const MachineInstr &MI) -> bool {
1034 return AMDGPU::isUniformMMO(*MI.memoperands_begin());
1035 });
1036
// Predicate: classifies the access by the address space recorded on the
// first memory operand of MI; AMDGPUAS::CONSTANT_ADDRESS is accepted.
1037 Predicate isConst([](const MachineInstr &MI) -> bool {
1038 // The address space in the MMO can be different than the address space of
// the pointer operand, so the MMO is queried here rather than the pointer.
1039 const MachineMemOperand *MMO = *MI.memoperands_begin();
1040 const unsigned AS = MMO->getAddrSpace();
1041 return AS == AMDGPUAS::CONSTANT_ADDRESS ||
1043 });
1044
// Predicate: the first memory operand of MI reports isVolatile().
1045 Predicate isVolatileMMO([](const MachineInstr &MI) -> bool {
1046 return (*MI.memoperands_begin())->isVolatile();
1047 });
1048
// Predicate: the first memory operand of MI reports isInvariant().
1049 Predicate isInvMMO([](const MachineInstr &MI) -> bool {
1050 return (*MI.memoperands_begin())->isInvariant();
1051 });
1052
// Predicate: the MONoClobber bit is set in the flags of the first memory
// operand of MI (the masked flag value is converted to bool on return).
1053 Predicate isNoClobberMMO([](const MachineInstr &MI) -> bool {
1054 return (*MI.memoperands_begin())->getFlags() & MONoClobber;
1055 });
1056
// Predicate: the alignment of the first memory operand of MI is at least
// the size of the access it describes.
// NOTE(review): Align(...) is constructed directly from the MMO size;
// presumably only power-of-two sizes reach this predicate - confirm.
1057 Predicate isNaturalAligned([](const MachineInstr &MI) -> bool {
1058 const MachineMemOperand *MMO = *MI.memoperands_begin();
1059 return MMO->getAlign() >= Align(MMO->getSize().getValue());
1060 });
1061
// Predicate: the first memory operand of MI describes an 8-bit or 16-bit
// access.
1062 Predicate is8Or16BitMMO([](const MachineInstr &MI) -> bool {
1063 const MachineMemOperand *MMO = *MI.memoperands_begin();
// getSize() is scaled by 8 so the comparison below is done in bits.
1064 const unsigned MemSize = 8 * MMO->getSize().getValue();
1065 return MemSize == 16 || MemSize == 8;
1066 });
1067
// Predicate: the first memory operand of MI describes exactly a 32-bit
// access (size value scaled by 8 equals 32).
1068 Predicate is32BitMMO([](const MachineInstr &MI) -> bool {
1069 const MachineMemOperand *MMO = *MI.memoperands_begin();
1070 return 8 * MMO->getSize().getValue() == 32;
1071 });
1072
// Combined predicate shared by the "uniform - s_load" rules of G_LOAD and
// by the uniform G_ZEXTLOAD/G_SEXTLOAD rules. It holds when the access is
// not atomic, its MMO is uniform, it is constant or not volatile, and it is
// constant, invariant or marked no-clobber. (Name presumably abbreviates
// "uniform load".)
1073 auto isUL = !isAtomicMMO && isUniMMO && (isConst || !isVolatileMMO) &&
1074 (isConst || isInvMMO || isNoClobberMMO);
1075
1076 // clang-format off
1077 // TODO: S32Dst, 16-bit any-extending load should not appear on True16 targets
1078 addRulesForGOpcs({G_LOAD})
1079 // flat, addrspace(0), never uniform - flat_load
1080 .Any({{DivS16, P0}, {{Vgpr16}, {VgprP0}}}, usesTrue16)
1081 .Any({{DivB32, P0}, {{VgprB32}, {VgprP0}}}) // 32-bit load, 8-bit and 16-bit any-extending load
1082 .Any({{DivB64, P0}, {{VgprB64}, {VgprP0}}})
1083 .Any({{DivB96, P0}, {{VgprB96}, {VgprP0}}})
1084 .Any({{DivB128, P0}, {{VgprB128}, {VgprP0}}})
1085
1086 // global, addrspace(1)
1087 // divergent - global_load
1088 .Any({{DivS16, P1}, {{Vgpr16}, {VgprP1}}}, usesTrue16)
1089 .Any({{DivB32, P1}, {{VgprB32}, {VgprP1}}}) //32-bit load, 8-bit and 16-bit any-extending load
1090 .Any({{DivB64, P1}, {{VgprB64}, {VgprP1}}})
1091 .Any({{DivB96, P1}, {{VgprB96}, {VgprP1}}})
1092 .Any({{DivB128, P1}, {{VgprB128}, {VgprP1}}})
1093 .Any({{DivB256, P1}, {{VgprB256}, {VgprP1}, SplitLoad}})
1094 .Any({{DivB512, P1}, {{VgprB512}, {VgprP1}, SplitLoad}})
1095
1096 // uniform - s_load
1097 .Any({{{UniS16, P1}, isNaturalAligned && isUL}, {{Sgpr32Trunc}, {SgprP1}}}, usesTrue16 && hasSMRDSmall) // s16 load
1098 .Any({{{UniS16, P1}, isAlign4 && isUL}, {{Sgpr32Trunc}, {SgprP1}, WidenMMOToS32}}, usesTrue16 && !hasSMRDSmall) // s16 load to 32-bit load
1099 .Any({{{UniB32, P1}, isNaturalAligned && isUL}, {{SgprB32}, {SgprP1}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1100 // TODO: SplitLoad when !isNaturalAligned && isUL and target hasSMRDSmall
1101 .Any({{{UniB32, P1}, is8Or16BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP1}, WidenMMOToS32}}, !hasSMRDSmall) //8-bit and 16-bit any-extending load to 32-bit load
1102 .Any({{{UniB32, P1}, is32BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP1}}}) //32-bit load
1103 .Any({{{UniB64, P1}, isAlign4 && isUL}, {{SgprB64}, {SgprP1}}})
1104 .Any({{{UniB96, P1}, isAlign16 && isUL}, {{SgprB96}, {SgprP1}, WidenLoad}}, !hasSMRDx3)
1105 .Any({{{UniB96, P1}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP1}, SplitLoad}}, !hasSMRDx3)
1106 .Any({{{UniB96, P1}, isAlign4 && isUL}, {{SgprB96}, {SgprP1}}}, hasSMRDx3)
1107 .Any({{{UniB128, P1}, isAlign4 && isUL}, {{SgprB128}, {SgprP1}}})
1108 .Any({{{UniB256, P1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
1109 .Any({{{UniB512, P1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
1110
1111 // Uniform via global or buffer load, for example volatile or non-aligned
1112 // uniform load. Not using standard {{UniInVgprTy}, {VgprP1}} since it is
1113 // selected as global_load, use SgprP1 for pointer instead to match
1114 // patterns without flat-for-global, default for GFX7 and older.
1115 // -> +flat-for-global + {{UniInVgprTy}, {SgprP1}} - global_load
1116 // -> -flat-for-global + {{UniInVgprTy}, {SgprP1}} - buffer_load
1117 .Any({{{UniS16, P1}, !isNaturalAligned || !isUL}, {{UniInVgprS16}, {SgprP1}}}, usesTrue16 && hasSMRDSmall) // s16 load
1118 .Any({{{UniS16, P1}, !isAlign4 || !isUL}, {{UniInVgprS16}, {SgprP1}}}, usesTrue16 && !hasSMRDSmall) // s16 load
1119 .Any({{{UniB32, P1}, !isNaturalAligned || !isUL}, {{UniInVgprB32}, {SgprP1}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1120 .Any({{{UniB32, P1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}}, !hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1121 .Any({{{UniB64, P1}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP1}}})
1122 .Any({{{UniB96, P1}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP1}}})
1123 .Any({{{UniB128, P1}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP1}}})
1124 .Any({{{UniB256, P1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {SgprP1}, SplitLoad}})
1125 .Any({{{UniB512, P1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {SgprP1}, SplitLoad}})
1126
1127 // local, addrspace(3) - ds_load
1128 .Any({{DivS16, P3}, {{Vgpr16}, {VgprP3}}}, usesTrue16)
1129 .Any({{DivB32, P3}, {{VgprB32}, {VgprP3}}}) // 32-bit load, 8-bit and 16-bit any-extending load
1130 .Any({{DivB64, P3}, {{VgprB64}, {VgprP3}}})
1131 .Any({{DivB96, P3}, {{VgprB96}, {VgprP3}}})
1132 .Any({{DivB128, P3}, {{VgprB128}, {VgprP3}}})
1133
1134 .Any({{UniS16, P3}, {{UniInVgprS16}, {SgprP3}}}, usesTrue16) // 16-bit load
1135 .Any({{UniB32, P3}, {{UniInVgprB32}, {VgprP3}}}) // 32-bit load, 8-bit and 16-bit any-extending load
1136 .Any({{UniB64, P3}, {{UniInVgprB64}, {VgprP3}}})
1137 .Any({{UniB96, P3}, {{UniInVgprB96}, {VgprP3}}})
1138 .Any({{UniB128, P3}, {{UniInVgprB128}, {VgprP3}}})
1139
1140 // constant, addrspace(4)
1141 // divergent - global_load
1142 .Any({{DivS16, P4}, {{Vgpr16}, {VgprP4}}}, usesTrue16)
1143 .Any({{DivB32, P4}, {{VgprB32}, {VgprP4}}}) //32-bit load, 8-bit and 16-bit any-extending load
1144 .Any({{DivB64, P4}, {{VgprB64}, {VgprP4}}})
1145 .Any({{DivB96, P4}, {{VgprB96}, {VgprP4}}})
1146 .Any({{DivB128, P4}, {{VgprB128}, {VgprP4}}})
1147 .Any({{DivB256, P4}, {{VgprB256}, {VgprP4}, SplitLoad}})
1148 .Any({{DivB512, P4}, {{VgprB512}, {VgprP4}, SplitLoad}})
1149
1150 // uniform - s_load
1151 .Any({{{UniS16, P4}, isNaturalAligned && isUL}, {{Sgpr32Trunc}, {SgprP4}}}, usesTrue16 && hasSMRDSmall) // s16 load
1152 .Any({{{UniS16, P4}, isAlign4 && isUL}, {{Sgpr32Trunc}, {SgprP4}, WidenMMOToS32}}, usesTrue16 && !hasSMRDSmall) // s16 load to 32-bit load
1153 .Any({{{UniB32, P4}, isNaturalAligned && isUL}, {{SgprB32}, {SgprP4}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1154 .Any({{{UniB32, P4}, is8Or16BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP4}, WidenMMOToS32}}, !hasSMRDSmall) //8-bit and 16-bit any-extending load to 32-bit load
1155 .Any({{{UniB32, P4}, is32BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP4}}}) //32-bit load
1156 .Any({{{UniB64, P4}, isAlign4 && isUL}, {{SgprB64}, {SgprP4}}})
1157 .Any({{{UniB96, P4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasSMRDx3)
1158 .Any({{{UniB96, P4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasSMRDx3)
1159 .Any({{{UniB96, P4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasSMRDx3)
1160 .Any({{{UniB128, P4}, isAlign4 && isUL}, {{SgprB128}, {SgprP4}}})
1161 .Any({{{UniB256, P4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}})
1162 .Any({{{UniB512, P4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}})
1163
1164 // uniform in vgpr - global_load or buffer_load
1165 .Any({{{UniS16, P4}, !isNaturalAligned || !isUL}, {{UniInVgprS16}, {SgprP4}}}, usesTrue16 && hasSMRDSmall) // s16 load
1166 .Any({{{UniS16, P4}, !isAlign4 || !isUL}, {{UniInVgprS16}, {SgprP4}}}, usesTrue16 && !hasSMRDSmall) // s16 load
1167 .Any({{{UniB32, P4}, !isNaturalAligned || !isUL}, {{UniInVgprB32}, {SgprP4}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1168 .Any({{{UniB32, P4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP4}}}, !hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1169 .Any({{{UniB64, P4}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP4}}})
1170 .Any({{{UniB96, P4}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP4}}})
1171 .Any({{{UniB128, P4}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP4}}})
1172 .Any({{{UniB256, P4}, !isAlign4 || !isUL}, {{UniInVgprB256}, {SgprP4}, SplitLoad}})
1173 .Any({{{UniB512, P4}, !isAlign4 || !isUL}, {{UniInVgprB512}, {SgprP4}, SplitLoad}})
1174
1175 // private, addrspace(5), never uniform - scratch_load
1176 .Any({{DivS16, P5}, {{Vgpr16}, {VgprP5}}}, usesTrue16)
1177 .Any({{DivB32, P5}, {{VgprB32}, {VgprP5}}}) // 32-bit load, 8-bit and 16-bit any-extending load
1178 .Any({{DivB64, P5}, {{VgprB64}, {VgprP5}}})
1179 .Any({{DivB96, P5}, {{VgprB96}, {VgprP5}}})
1180 .Any({{DivB128, P5}, {{VgprB128}, {VgprP5}}})
1181
1182 .Any({{DivS32, Ptr128}, {{Vgpr32}, {VgprPtr128}}});
1183
1184
// Rules for extending loads, grouped by the address space of the pointer
// operand. Rules with an S16 result are only added when usesTrue16 is set.
// For uniform P1/P4 loads the Sgpr rules require isUL plus an alignment
// check; the complementary predicates fall back to a UniInVgprS32 result.
1185 addRulesForGOpcs({G_ZEXTLOAD, G_SEXTLOAD}) // i8 and i16 zero- and sign-extending loads
1186 .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0}}})
1187 .Any({{DivS16, P0}, {{Vgpr16}, {VgprP0}}}, usesTrue16)
1188
1189 .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1}}})
1190 .Any({{DivS16, P1}, {{Vgpr16}, {VgprP1}}}, usesTrue16)
1191 .Any({{{UniS32, P1}, isAlign4 && isUL}, {{Sgpr32}, {SgprP1}, WidenMMOToS32}}, !hasSMRDSmall)
1192 .Any({{{UniS32, P1}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP1}}}, hasSMRDSmall)
1193 .Any({{{UniS32, P1}, !isAlign4 || !isUL}, {{UniInVgprS32}, {SgprP1}}}, !hasSMRDSmall)
1194 .Any({{{UniS32, P1}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP1}}}, hasSMRDSmall)
1195
1196 .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3}}})
1197 .Any({{DivS16, P3}, {{Vgpr16}, {VgprP3}}}, usesTrue16)
1198 .Any({{UniS32, P3}, {{UniInVgprS32}, {VgprP3}}})
1199
1200 .Any({{DivS32, P4}, {{Vgpr32}, {VgprP4}}})
1201 .Any({{DivS16, P4}, {{Vgpr16}, {VgprP4}}}, usesTrue16)
1202 .Any({{{UniS32, P4}, isAlign4 && isUL}, {{Sgpr32}, {SgprP4}, WidenMMOToS32}}, !hasSMRDSmall)
1203 .Any({{{UniS32, P4}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP4}}}, hasSMRDSmall)
1204 .Any({{{UniS32, P4}, !isAlign4 || !isUL}, {{UniInVgprS32}, {SgprP4}}}, !hasSMRDSmall)
1205 .Any({{{UniS32, P4}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP4}}}, hasSMRDSmall)
1206
1207 .Any({{DivS32, P5}, {{Vgpr32}, {VgprP5}}})
1208 .Any({{DivS16, P5}, {{Vgpr16}, {VgprP5}}}, usesTrue16);
1209
1210 addRulesForGOpcs({G_STORE})
1211 // addrspace(0)
1212 .Any({{S16, P0}, {{}, {Vgpr16, VgprP0}}}, usesTrue16) // 16-bit store
1213 .Any({{B32, P0}, {{}, {VgprB32, VgprP0}}}) // 32-bit store, 8-bit and 16-bit truncating store
1214 .Any({{B64, P0}, {{}, {VgprB64, VgprP0}}})
1215 .Any({{B96, P0}, {{}, {VgprB96, VgprP0}}})
1216 .Any({{B128, P0}, {{}, {VgprB128, VgprP0}}})
1217
1218 // addrspace(1), there are no stores to addrspace(4)
1219 // For targets:
1220 // - with "+flat-for-global" - global_store
1221 // - without(-flat-for-global) - buffer_store addr64
1222 .Any({{S16, DivP1}, {{}, {Vgpr16, VgprP1}}}, usesTrue16) // 16-bit store
1223 .Any({{B32, DivP1}, {{}, {VgprB32, VgprP1}}}) // 32-bit store, 8-bit and 16-bit truncating store
1224 .Any({{B64, DivP1}, {{}, {VgprB64, VgprP1}}})
1225 .Any({{B96, DivP1}, {{}, {VgprB96, VgprP1}}})
1226 .Any({{B128, DivP1}, {{}, {VgprB128, VgprP1}}})
1227
1228 // For UniP1, use sgpr ptr to match flat-for-global patterns. Targets:
1229 // - with "+flat-for-global" - global_store for both sgpr and vgpr ptr
1230 // - without(-flat-for-global) - need sgpr ptr to select buffer_store
1231 .Any({{S16, UniP1}, {{}, {Vgpr16, SgprP1}}}, usesTrue16) // 16-bit store
1232 .Any({{B32, UniP1}, {{}, {VgprB32, SgprP1}}}) // 32-bit store, 8-bit and 16-bit truncating store
1233 .Any({{B64, UniP1}, {{}, {VgprB64, SgprP1}}})
1234 .Any({{B96, UniP1}, {{}, {VgprB96, SgprP1}}})
1235 .Any({{B128, UniP1}, {{}, {VgprB128, SgprP1}}})
1236
1237 // addrspace(3) and addrspace(5)
1238 .Any({{S16, Ptr32}, {{}, {Vgpr16, VgprPtr32}}}, usesTrue16) // 16-bit store
1239 .Any({{B32, Ptr32}, {{}, {VgprB32, VgprPtr32}}}) // 32-bit store, 8-bit and 16-bit truncating store
1240 .Any({{B64, Ptr32}, {{}, {VgprB64, VgprPtr32}}})
1241 .Any({{B96, Ptr32}, {{}, {VgprB96, VgprPtr32}}})
1242 .Any({{B128, Ptr32}, {{}, {VgprB128, VgprPtr32}}});
1243
1244 // clang-format on
1245
1246 addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD, G_AMDGPU_BUFFER_LOAD_FORMAT,
1247 G_AMDGPU_TBUFFER_LOAD_FORMAT},
1248 StandardB)
1257
1258 addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD_USHORT, G_AMDGPU_BUFFER_LOAD_UBYTE,
1259 G_AMDGPU_BUFFER_LOAD_SSHORT, G_AMDGPU_BUFFER_LOAD_SBYTE},
1260 StandardB)
1263
1264 addRulesForGOpcs(
1265 {G_AMDGPU_BUFFER_LOAD_UBYTE_TFE, G_AMDGPU_BUFFER_LOAD_USHORT_TFE},
1266 StandardB)
1269
1270 addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD_TFE, G_AMDGPU_BUFFER_LOAD_FORMAT_TFE},
1271 StandardB)
1279 .Any({{UniB160},
1281
1282 addRulesForGOpcs(
1283 {G_AMDGPU_BUFFER_LOAD_FORMAT_D16, G_AMDGPU_TBUFFER_LOAD_FORMAT_D16},
1284 StandardB)
1291
1292 addRulesForGOpcs({G_AMDGPU_S_BUFFER_LOAD})
1293 // waterfall expansion is part of S_BUF_to_BUF
1294 .Any({{UniB32}, {{SgprB32}, {SgprV4S32, Sgpr32}}})
1295 .Any({{DivB32, UniV4S32, DivB32},
1297 .Any({{DivB32, DivV4S32, UniB32},
1299 .Any({{DivB32, DivV4S32, DivB32},
1301
1302 .Any({{UniB64}, {{SgprB64}, {SgprV4S32, Sgpr32}}})
1303 .Any({{DivB64, UniV4S32, DivB32},
1305 .Any({{DivB64, DivV4S32, UniB32},
1307 .Any({{DivB64, DivV4S32, DivB32},
1309
1310 .Any({{UniB96}, {{SgprB96}, {SgprV4S32, Sgpr32}}})
1311 .Any({{DivB96, UniV4S32, DivB32},
1313 .Any({{DivB96, DivV4S32, UniB32},
1315 .Any({{DivB96, DivV4S32, DivB32},
1317
1318 .Any({{UniB128}, {{SgprB128}, {SgprV4S32, Sgpr32}}})
1319 .Any({{DivB128, UniV4S32, DivB32},
1321 .Any({{DivB128, DivV4S32, UniB32},
1323 .Any({{DivB128, DivV4S32, DivB32},
1325
1326 .Any({{UniB256}, {{SgprB256}, {SgprV4S32, Sgpr32}}})
1327 .Any({{DivB256, UniV4S32, DivB32},
1329 .Any({{DivB256, DivV4S32, UniB32},
1331 .Any({{DivB256, DivV4S32, DivB32},
1333
1334 .Any({{UniB512}, {{SgprB512}, {SgprV4S32, Sgpr32}}})
1335 .Any({{DivB512, UniV4S32, DivB32},
1337 .Any({{DivB512, DivV4S32, UniB32},
1339 .Any({{DivB512, DivV4S32, DivB32},
1341
1342 addRulesForGOpcs({G_AMDGPU_S_BUFFER_LOAD_SBYTE, G_AMDGPU_S_BUFFER_LOAD_UBYTE,
1343 G_AMDGPU_S_BUFFER_LOAD_SSHORT,
1344 G_AMDGPU_S_BUFFER_LOAD_USHORT})
1346 .Any({{DivS32, UniV4S32, DivS32},
1348 .Any({{DivS32, DivV4S32, UniS32},
1350 .Any({{DivS32, DivV4S32, DivS32},
1352
1353 addRulesForGOpcs({G_AMDGPU_BUFFER_STORE, G_AMDGPU_BUFFER_STORE_BYTE,
1354 G_AMDGPU_BUFFER_STORE_SHORT, G_AMDGPU_BUFFER_STORE_FORMAT,
1355 G_AMDGPU_BUFFER_STORE_FORMAT_D16,
1356 G_AMDGPU_TBUFFER_STORE_FORMAT,
1357 G_AMDGPU_TBUFFER_STORE_FORMAT_D16})
1358 .Any({{B32}, {{}, {VgprB32, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
1359 .Any({{B64}, {{}, {VgprB64, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
1360 .Any({{B96}, {{}, {VgprB96, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
1361 .Any({{B128}, {{}, {VgprB128, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}});
1362
1363 // Buffer atomics: resource descriptor + scalar offset are SGPR, data and
1364 // address components are VGPR.
1365 //
1366 // Operand order (SIInstructions.td BufferAtomicGenericInstruction):
1367 // dst = op vdata, rsrc, vindex, voffset, soffset, offset_imm, cachepolicy,
1368 // idxen_imm
1369 addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_FADD})
1370 .Any({{S32, S32, V4S32, S32, S32, S32},
1372 .Any({{S64, S64, V4S32, S32, S32, S32},
1374 .Any({{V2S16, V2S16, V4S32, S32, S32, S32},
1375 {{VgprV2S16},
1377
1378 addRulesForGOpcs({G_PTR_ADD})
1379 .Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32, Sgpr32}}})
1380 .Any({{DivPtr32}, {{VgprPtr32}, {VgprPtr32, Vgpr32}}})
1381 .Any({{UniPtr64}, {{SgprPtr64}, {SgprPtr64, Sgpr64}}})
1382 .Any({{DivPtr64}, {{VgprPtr64}, {VgprPtr64, Vgpr64}}});
1383
1384 addRulesForGOpcs({G_INTTOPTR})
1385 .Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}})
1386 .Any({{DivPtr32}, {{VgprPtr32}, {Vgpr32}}})
1387 .Any({{UniPtr64}, {{SgprPtr64}, {Sgpr64}}})
1388 .Any({{DivPtr64}, {{VgprPtr64}, {Vgpr64}}})
1389 .Any({{UniPtr128}, {{SgprPtr128}, {Sgpr128}}})
1390 .Any({{DivPtr128}, {{VgprPtr128}, {Vgpr128}}});
1391
1392 addRulesForGOpcs({G_PTRTOINT})
1393 .Any({{UniS32}, {{Sgpr32}, {SgprPtr32}}})
1394 .Any({{DivS32}, {{Vgpr32}, {VgprPtr32}}})
1395 .Any({{UniS64}, {{Sgpr64}, {SgprPtr64}}})
1396 .Any({{DivS64}, {{Vgpr64}, {VgprPtr64}}})
1397 .Any({{UniS128}, {{Sgpr128}, {SgprPtr128}}})
1398 .Any({{DivS128}, {{Vgpr128}, {VgprPtr128}}});
1399
1400 // FIXME: Update llvm/test/CodeGen/AMDGPU/ptrmask.ll to use GlobalISel.
1401 // Currently crashes on P8 (buffer resource) tests due to legalizer issue.
1402 addRulesForGOpcs({G_PTRMASK})
1403 .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}})
1404 .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}})
1405 .Any({{UniP3}, {{SgprP3}, {SgprP3, Sgpr32}}})
1406 .Any({{DivP3}, {{VgprP3}, {VgprP3, Vgpr32}}});
1407
1408 addRulesForGOpcs({G_DYN_STACKALLOC})
1409 .Any({{UniP5, UniS32}, {{SgprP5}, {Sgpr32}, DynStackAlloc}})
1410 .Any({{UniP5, DivS32}, {{SgprP5}, {Vgpr32}, DynStackAlloc}});
1411
1412 addRulesForGOpcs({G_ABS}, Standard)
1413 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}})
1414 .Div(S16, {{Vgpr16}, {Vgpr16}, AbsToNegMax})
1415 .Uni(S32, {{Sgpr32}, {Sgpr32}})
1416 .Div(S32, {{Vgpr32}, {Vgpr32}, AbsToNegMax})
1417 .Uni(V2S16, {{SgprV2S16}, {SgprV2S16}, AbsToS32})
1418 .Div(V2S16, {{VgprV2S16}, {VgprV2S16}, AbsToNegMax});
1419
1420 addRulesForGOpcs({G_BITREVERSE}, Standard)
1421 .Uni(S32, {{Sgpr32}, {Sgpr32}})
1422 .Div(S32, {{Vgpr32}, {Vgpr32}})
1423 .Uni(S64, {{Sgpr64}, {Sgpr64}})
1424 .Div(S64, {{Vgpr64}, {Vgpr64}});
1425
1426 addRulesForGOpcs({G_AMDGPU_FFBH_U32, G_AMDGPU_FFBL_B32, G_CTLZ_ZERO_POISON,
1427 G_CTTZ_ZERO_POISON})
1428 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}})
1429 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
1430 .Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}})
1432
1433 addRulesForGOpcs({G_CTPOP})
1434 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}})
1435 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
1436 .Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}})
1437 .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}, CtPop64To32}});
1438
1439 addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}});
1440
1441 addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard)
1442 .Uni(S64, {{Sgpr64}, {}});
1443
1444 addRulesForGOpcs({G_BLOCK_ADDR}).Any({{UniP0}, {{SgprP0}, {}}});
1445
1446 addRulesForGOpcs({G_GLOBAL_VALUE})
1447 .Any({{UniP0}, {{SgprP0}, {}}})
1448 .Any({{UniP1}, {{SgprP1}, {}}})
1449 .Any({{UniP3}, {{SgprP3}, {}}})
1450 .Any({{UniP4}, {{SgprP4}, {}}})
1451 .Any({{UniP8}, {{SgprP8}, {}}});
1452
1453 addRulesForGOpcs({G_AMDGPU_WAVE_ADDRESS}).Any({{UniP5}, {{SgprP5}, {}}});
1454
1455 addRulesForGOpcs({G_AMDGPU_SPONENTRY}, Standard).Uni(S32, {{Sgpr32}, {}});
1456
1457 addRulesForGOpcs({G_SI_CALL})
1458 .Any({{_, UniP0}, {{None}, {SgprP0}}})
1459 .Any({{_, DivP0}, {{None}, {SgprP0Call_WF}}})
1460 .Any({{_, UniP4}, {{None}, {SgprP4}}})
1461 .Any({{_, DivP4}, {{None}, {SgprP4Call_WF}}});
1462
1463 bool hasSALUFloat = ST->hasSALUFloatInsts();
1464
1465 addRulesForGOpcs({G_FADD, G_FMUL, G_STRICT_FADD, G_STRICT_FMUL}, Standard)
1466 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
1467 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
1468 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1469 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
1470 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat)
1471 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1472 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
1473 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
1474 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat)
1476 hasSALUFloat)
1477 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
1479 .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}});
1480
1481 addRulesForGOpcs({G_FSUB, G_STRICT_FSUB}, Standard)
1482 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1483 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1484 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
1485 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
1486 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
1487 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat);
1488
1489 addRulesForGOpcs({G_FMAD}, Standard)
1490 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}})
1491 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
1492 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
1493 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
1494
1495 addRulesForGOpcs({G_FLDEXP, G_STRICT_FLDEXP}, Standard)
1496 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
1497 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1498 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}})
1499 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1500 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr32}})
1501 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
1502
1503 addRulesForGOpcs({G_FMA, G_STRICT_FMA}, Standard)
1504 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
1505 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}})
1506 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64, Vgpr64}})
1507 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64, Vgpr64}})
1511 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16, Sgpr16}}, hasSALUFloat)
1512 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}}, !hasSALUFloat)
1513 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}}, hasSALUFloat)
1514 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}, !hasSALUFloat)
1515 .Uni(V2S16,
1517 hasSALUFloat)
1519 !hasSALUFloat);
1520
1521 addRulesForGOpcs({G_AMDGPU_FMED3}, Standard)
1522 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}})
1523 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
1524 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
1525 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
1526
1527 // TODO: This opcode is generated from the i64->i16 signed clamped pattern in
1528 // the PreLegalizerCombiner. Move the combine to RegBankCombiner to keep more
1529 // instructions on SALU.
1530 addRulesForGOpcs({G_AMDGPU_SMED3}, Standard)
1531 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
1532 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
1533
1534 // FNEG and FABS are either folded as source modifiers or can be selected as
1535 // bitwise XOR and AND with Mask. XOR and AND are available on SALU but for
1536 // targets without SALU float we still select them as VGPR since there would
1537 // be no real sgpr use.
1538 addRulesForGOpcs({G_FNEG, G_FABS}, Standard)
1539 .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasSALUFloat)
1540 .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasSALUFloat)
1541 .Div(S16, {{Vgpr16}, {Vgpr16}})
1542 .Uni(S32, {{UniInVgprS32}, {Vgpr32}}, !hasSALUFloat)
1543 .Uni(S32, {{Sgpr32}, {Sgpr32}}, hasSALUFloat)
1544 .Div(S32, {{Vgpr32}, {Vgpr32}})
1545 .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
1546 .Div(S64, {{Vgpr64}, {Vgpr64}})
1547 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}}, !hasSALUFloat)
1548 .Uni(V2S16, {{SgprV2S16}, {SgprV2S16}, ScalarizeToS16}, hasSALUFloat)
1549 .Div(V2S16, {{VgprV2S16}, {VgprV2S16}})
1550 .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}})
1551 .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}});
1552
1553 addRulesForGOpcs({G_FCANONICALIZE}, Standard)
1554 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
1555 .Div(S32, {{Vgpr32}, {Vgpr32}})
1556 .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
1557 .Div(S16, {{Vgpr16}, {Vgpr16}})
1558 .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
1559 .Div(S64, {{Vgpr64}, {Vgpr64}})
1560 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}})
1561 .Div(V2S16, {{VgprV2S16}, {VgprV2S16}})
1562 .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}})
1563 .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}});
1564
1565 bool hasPST = ST->hasPseudoScalarTrans();
1566 addRulesForGOpcs({G_FSQRT}, Standard)
1567 .Div(S16, {{Vgpr16}, {Vgpr16}})
1568 .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasPST)
1569 .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasPST);
1570
1571 addRulesForGOpcs({G_FPTOUI, G_FPTOSI, G_FPTOUI_SAT, G_FPTOSI_SAT})
1572 .Any({{UniS16, S16}, {{UniInVgprS16}, {Vgpr16}}})
1573 .Any({{DivS16, S16}, {{Vgpr16}, {Vgpr16}}})
1574 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}, hasSALUFloat)
1575 .Any({{UniS32, S16}, {{UniInVgprS32}, {Vgpr16}}}, !hasSALUFloat)
1576 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}})
1577 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
1578 .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat)
1579 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
1580 .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}})
1581 .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}});
1582
1583 addRulesForGOpcs({G_UITOFP, G_SITOFP})
1584 .Any({{UniS16, S16}, {{UniInVgprS16}, {Vgpr16}}})
1585 .Any({{DivS16, S16}, {{Vgpr16}, {Vgpr16}}})
1586 .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat)
1587 .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat)
1588 .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
1589 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
1590 .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat)
1591 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
1592 .Any({{UniS64, S32}, {{UniInVgprS64}, {Vgpr32}}})
1593 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}}});
1594
1595 addRulesForGOpcs({G_AMDGPU_S_BUFFER_PREFETCH})
1597
// Predicate: the immediate in operand 3 of MI is non-zero, which per the
// encoding noted below means a data prefetch rather than an instruction
// prefetch. Used by the G_PREFETCH rules.
1598 Predicate IsDataPF([](const MachineInstr &MI) -> bool {
1599 // prefetch cache type: 0 == instruction (I$) prefetch, 1 == data prefetch.
1600 return MI.getOperand(3).getImm() != 0;
1601 });
1602
1603 bool HasSMemPF = ST->hasSafeSmemPrefetch();
1604 bool HasVMemPF = ST->hasVmemPrefInsts();
1605 addRulesForGOpcs({G_PREFETCH})
1606 // Safe smem prefetch keeps both data and instruction prefetch.
1607 .Any({{UniPtr64}, {{}, {SgprPtr64}}}, HasSMemPF)
1608 // Vmem prefetch keeps data prefetch only.
1609 .Any({{{UniPtr64}, IsDataPF}, {{}, {SgprPtr64}}}, !HasSMemPF && HasVMemPF)
1610 .Any({{{UniPtr64}, IsDataPF}, {{}, {}, DeletePrefetch}},
1611 !HasSMemPF && !HasVMemPF)
1612 .Any({{{UniPtr64}, !IsDataPF}, {{}, {}, DeletePrefetch}}, !HasSMemPF)
1613
1614 .Any({{{DivPtr64}, IsDataPF}, {{}, {VgprPtr64}}}, HasVMemPF)
1615 .Any({{{DivPtr64}, IsDataPF}, {{}, {}, DeletePrefetch}}, !HasVMemPF)
1616 .Any({{{DivPtr64}, !IsDataPF}, {{}, {}, DeletePrefetch}})
1617
1618 .Any({{P3}, {{}, {}, DeletePrefetch}})
1619 .Any({{P5}, {{}, {}, DeletePrefetch}})
1620 .Any({{UniP6}, {{}, {SgprP6}}}, HasSMemPF)
1621 .Any({{UniP6}, {{}, {}, DeletePrefetch}}, !HasSMemPF);
1622
1623 addRulesForGOpcs({G_FPEXT})
1624 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}})
1625 .Any({{UniS64, S32}, {{UniInVgprS64}, {Vgpr32}}})
1626 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}}})
1627 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}, hasSALUFloat)
1628 .Any({{UniS32, S16}, {{UniInVgprS32}, {Vgpr16}}}, !hasSALUFloat);
1629
1630 addRulesForGOpcs({G_AMDGPU_CVT_PK_I16_I32}, Standard)
1631 .Uni(V2S16, {{UniInVgprV2S16}, {Vgpr32, Vgpr32}})
1632 .Div(V2S16, {{VgprV2S16}, {Vgpr32, Vgpr32}});
1633
1634 addRulesForGOpcs({G_AMDGPU_FMIN_LEGACY, G_AMDGPU_FMAX_LEGACY}, Standard)
1635 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
1636 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
1637
1638 bool hasSALUMinimumMaximumInsts = ST->hasSALUMinimumMaximumInsts();
1639
1640 addRulesForGOpcs({G_FMINIMUM, G_FMAXIMUM}, Standard)
1641 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUMinimumMaximumInsts)
1642 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUMinimumMaximumInsts)
1643 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1644 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUMinimumMaximumInsts)
1645 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUMinimumMaximumInsts)
1646 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1647 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
1648 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
1650 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
1651
1652 addRulesForGOpcs({G_FMINNUM_IEEE, G_FMAXNUM_IEEE, G_FMINNUM, G_FMAXNUM,
1653 G_FMINIMUMNUM, G_FMAXIMUMNUM},
1654 Standard)
1655 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1656 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1657 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
1658 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
1660 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
1661 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
1662 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
1663 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
1664 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat);
1665
1666 addRulesForGOpcs({G_FPTRUNC})
1667 .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
1668 .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}})
1669 .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}})
1671 .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}})
1672 .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat)
1673 .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat);
1674
1675 addRulesForGOpcs({G_IS_FPCLASS})
1676 .Any({{DivS1, S16}, {{Vcc}, {Vgpr16}}})
1677 .Any({{UniS1, S16}, {{UniInVcc}, {Vgpr16}}})
1678 .Any({{DivS1, S32}, {{Vcc}, {Vgpr32}}})
1679 .Any({{UniS1, S32}, {{UniInVcc}, {Vgpr32}}})
1680 .Any({{DivS1, S64}, {{Vcc}, {Vgpr64}}})
1681 .Any({{UniS1, S64}, {{UniInVcc}, {Vgpr64}}});
1682
1683 addRulesForGOpcs({G_FCMP}, Standard)
1684 .Any({{UniS1, _, S16}, {{Sgpr32Trunc}, {None, Sgpr16, Sgpr16}}},
1685 hasSALUFloat)
1686 .Any({{UniS1, _, S16}, {{UniInVcc}, {None, Vgpr16, Vgpr16}}},
1687 !hasSALUFloat)
1688 .Any({{DivS1, _, S16}, {{Vcc}, {None, Vgpr16, Vgpr16}}})
1689 .Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}},
1690 hasSALUFloat)
1691 .Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}},
1692 !hasSALUFloat)
1693 .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
1694 .Any({{UniS1, _, S64}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}})
1695 .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}});
1696
1697 addRulesForGOpcs({G_INTRINSIC_ROUNDEVEN, G_FEXP2, G_FLOG2}, Standard)
1698 .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
1699 .Div(S16, {{Vgpr16}, {Vgpr16}})
1700 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
1701 .Div(S32, {{Vgpr32}, {Vgpr32}})
1702 .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
1703 .Div(S64, {{Vgpr64}, {Vgpr64}});
1704
1705 addRulesForGOpcs({G_INTRINSIC_TRUNC, G_FFLOOR, G_FCEIL}, Standard)
1706 .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
1707 .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasSALUFloat)
1708 .Div(S16, {{Vgpr16}, {Vgpr16}})
1709 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
1710 .Uni(S32, {{Sgpr32}, {Sgpr32}}, hasSALUFloat)
1711 .Div(S32, {{Vgpr32}, {Vgpr32}})
1712 .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
1713 .Div(S64, {{Vgpr64}, {Vgpr64}});
1714
1715 addRulesForGOpcs({G_AMDGPU_GLOBAL_LOAD_MONITOR, G_AMDGPU_FLAT_LOAD_MONITOR},
1716 StandardB)
1717 .Uni(B32, {{UniInVgprB32}, {SgprPtr64}})
1718 .Div(B32, {{VgprB32}, {VgprPtr64}})
1719 .Uni(B64, {{UniInVgprB64}, {SgprPtr64}})
1720 .Div(B64, {{VgprB64}, {VgprPtr64}})
1721 .Uni(B128, {{UniInVgprB128}, {SgprPtr64}})
1722 .Div(B128, {{VgprB128}, {VgprPtr64}});
1723
1724 addRulesForGOpcs({G_AMDGPU_WHOLE_WAVE_FUNC_SETUP})
1725 .Any({{DivS1}, {{Vcc}, {}}});
1726
1727 addRulesForGOpcs({G_AMDGPU_WHOLE_WAVE_FUNC_RETURN}).Any({{}, {{}, {Vcc}}});
1728
1729 using namespace Intrinsic;
1730
1731 addRulesForIOpcs({returnaddress}).Any({{UniP0}, {{SgprP0}, {}}});
1732
1733 // Note: amdgcn.icmp with i1 inputs is legalized to ballot in the legalizer,
1734 // so no S1 rules are needed here.
1735 addRulesForIOpcs({amdgcn_icmp})
1736 .Any({{UniS64, _, S16}, {{Sgpr64}, {IntrId, Vgpr16, Vgpr16}}})
1737 .Any({{UniS64, _, S32}, {{Sgpr64}, {IntrId, Vgpr32, Vgpr32}}})
1738 .Any({{UniS64, _, S64}, {{Sgpr64}, {IntrId, Vgpr64, Vgpr64}}})
1739
1740 .Any({{UniS32, _, S16}, {{Sgpr32}, {IntrId, Vgpr16, Vgpr16}}})
1741 .Any({{UniS32, _, S32}, {{Sgpr32}, {IntrId, Vgpr32, Vgpr32}}})
1742 .Any({{UniS32, _, S64}, {{Sgpr32}, {IntrId, Vgpr64, Vgpr64}}});
1743
1744 addRulesForIOpcs({amdgcn_fcmp})
1745 .Any({{UniS64, _, S16}, {{Sgpr64}, {IntrId, Vgpr16, Vgpr16}}})
1746 .Any({{UniS64, _, S32}, {{Sgpr64}, {IntrId, Vgpr32, Vgpr32}}})
1747 .Any({{UniS64, _, S64}, {{Sgpr64}, {IntrId, Vgpr64, Vgpr64}}})
1748
1749 .Any({{UniS32, _, S16}, {{Sgpr32}, {IntrId, Vgpr16, Vgpr16}}})
1750 .Any({{UniS32, _, S32}, {{Sgpr32}, {IntrId, Vgpr32, Vgpr32}}})
1751 .Any({{UniS32, _, S64}, {{Sgpr32}, {IntrId, Vgpr64, Vgpr64}}});
1752
1753 addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {None}}});
1754
1755 addRulesForIOpcs({amdgcn_s_getreg}).Any({{}, {{Sgpr32}, {IntrId, Imm}}});
1756
1757 addRulesForIOpcs({amdgcn_s_setreg})
1758 .Any({{_, _, S32}, {{}, {IntrId, Imm, SgprB32_ReadFirstLane}}});
1759
1760 addRulesForIOpcs({amdgcn_s_sendmsg, amdgcn_s_sendmsghalt})
1761 .Any({{}, {{}, {IntrId, Imm, SgprB32_M0}}});
1762
1763 addRulesForIOpcs({amdgcn_s_sendmsg_rtn})
1764 .Any({{S32}, {{Sgpr32}, {}}})
1765 .Any({{S64}, {{Sgpr64}, {}}});
1766
1767 addRulesForIOpcs({amdgcn_s_memrealtime, amdgcn_s_memtime}, Standard)
1768 .Uni(S64, {{Sgpr64}, {IntrId}});
1769
1770 addRulesForIOpcs({amdgcn_groupstaticsize, amdgcn_pops_exiting_wave_id,
1771 amdgcn_reloc_constant, amdgcn_s_get_waveid_in_workgroup},
1772 Standard)
1773 .Uni(S32, {{Sgpr32}, {IntrId}});
1774
1775 // Intrinsics with no register operands.
1776 addRulesForIOpcs({amdgcn_asyncmark,
1777 amdgcn_endpgm,
1778 amdgcn_iglp_opt,
1779 amdgcn_init_exec,
1780 amdgcn_s_barrier,
1781 amdgcn_s_barrier_leave,
1782 amdgcn_s_barrier_signal,
1783 amdgcn_s_barrier_wait,
1784 amdgcn_s_monitor_sleep,
1785 amdgcn_s_nop,
1786 amdgcn_s_sethalt,
1787 amdgcn_s_setprio,
1788 amdgcn_s_setprio_inc_wg,
1789 amdgcn_s_sleep,
1790 amdgcn_s_ttracedata_imm,
1791 amdgcn_s_wait_asynccnt,
1792 amdgcn_s_wait_bvhcnt,
1793 amdgcn_s_wait_dscnt,
1794 amdgcn_s_wait_event,
1795 amdgcn_s_wait_event_export_ready,
1796 amdgcn_s_wait_expcnt,
1797 amdgcn_s_wait_kmcnt,
1798 amdgcn_s_wait_loadcnt,
1799 amdgcn_s_wait_samplecnt,
1800 amdgcn_s_wait_storecnt,
1801 amdgcn_s_wait_tensorcnt,
1802 amdgcn_s_waitcnt,
1803 amdgcn_sched_barrier,
1804 amdgcn_sched_group_barrier,
1805 amdgcn_unreachable,
1806 amdgcn_wait_asyncmark,
1807 amdgcn_wave_barrier})
1808 .Any({{}, {{}, {}}});
1809
1810 addRulesForIOpcs({amdgcn_init_exec_from_input})
1811 .Any({{}, {{}, {IntrId, Sgpr32, Imm}}});
1812
1813 addRulesForIOpcs({amdgcn_s_ttracedata}).Any({{}, {{}, {IntrId, SgprB32_M0}}});
1814
1815 addRulesForIOpcs({amdgcn_s_sleep_var})
1816 .Any({{}, {{}, {IntrId, SgprB32_ReadFirstLane}}});
1817
1818 addRulesForIOpcs({amdgcn_s_barrier_join, amdgcn_s_wakeup_barrier})
1819 .Any({{}, {{}, {IntrId, SgprB32_M0}}});
1820
1821 addRulesForIOpcs({amdgcn_s_barrier_signal_var, amdgcn_s_barrier_init})
1822 .Any({{}, {{}, {IntrId, SgprB32_M0, SgprB32_M0}}});
1823
1824 addRulesForIOpcs({amdgcn_s_barrier_signal_isfirst})
1825 .Any({{UniS1}, {{Sgpr32Trunc}, {}}});
1826
1827 addRulesForIOpcs(
1828 {amdgcn_s_get_named_barrier_state, amdgcn_s_get_barrier_state}, Standard)
1829 .Uni(S32, {{Sgpr32}, {IntrId, SgprB32_M0}});
1830
1831 addRulesForIOpcs({amdgcn_flat_prefetch}).Any({{}, {{}, {IntrId, VgprP0}}});
1832
1833 addRulesForIOpcs({amdgcn_global_prefetch}).Any({{}, {{}, {IntrId, VgprP1}}});
1834
1835 addRulesForIOpcs({amdgcn_s_prefetch_data, amdgcn_s_prefetch_inst})
1837
1838 addRulesForIOpcs({amdgcn_class})
1839 .Any({{UniS1, _, S16}, {{UniInVcc}, {IntrId, Vgpr16, Vgpr32}}})
1840 .Any({{DivS1, _, S16}, {{Vcc}, {IntrId, Vgpr16, Vgpr32}}})
1841 .Any({{UniS1, _, S32}, {{UniInVcc}, {IntrId, Vgpr32, Vgpr32}}})
1842 .Any({{DivS1, _, S32}, {{Vcc}, {IntrId, Vgpr32, Vgpr32}}})
1843 .Any({{UniS1, _, S64}, {{UniInVcc}, {IntrId, Vgpr64, Vgpr32}}})
1844 .Any({{DivS1, _, S64}, {{Vcc}, {IntrId, Vgpr64, Vgpr32}}});
1845
1846 // This is an "intrinsic lane mask"; it was set to i32/i64 in LLVM IR.
1847 addRulesForIOpcs({amdgcn_end_cf})
1848 .Any({{_, UniS32}, {{}, {IntrId, Sgpr32}}})
1849 .Any({{_, UniS64}, {{}, {IntrId, Sgpr64}}});
1850
1851 addRulesForIOpcs({amdgcn_if_break}, Standard)
1852 .Uni(S64, {{Sgpr64}, {IntrId, Vcc, Sgpr64}})
1853 .Uni(S32, {{Sgpr32}, {IntrId, Vcc, Sgpr32}});
1854
1855 addRulesForIOpcs({amdgcn_exp})
1856 .Any({{_, _, _, S32, S32, S32, S32},
1857 {{}, {IntrId, Imm, Imm, Vgpr32, Vgpr32, Vgpr32, Vgpr32}}});
1858
1859 addRulesForIOpcs({amdgcn_exp_compr})
1860 .Any({{_, _, _, V2S16}, {{}, {IntrId, Imm, Imm, VgprV2S16, VgprV2S16}}});
1861
1862 addRulesForIOpcs({amdgcn_exp_row})
1863 .Any({{_, _, _, S32, S32, S32, S32, _, S32},
1864 {{},
1866 SgprB32_M0}}});
1867
1868 addRulesForIOpcs({amdgcn_lds_direct_load}, StandardB)
1869 .Div(B32, {{VgprB32}, {IntrId, SgprB32_M0}});
1870
1871 addRulesForIOpcs({amdgcn_lds_param_load}, Standard)
1872 .Div(S32, {{Vgpr32}, {IntrId, Imm, Imm, SgprB32_M0}});
1873
1874 addRulesForIOpcs({amdgcn_mbcnt_lo, amdgcn_mbcnt_hi}, Standard)
1875 .Div(S32, {{}, {Vgpr32, None, Vgpr32, Vgpr32}});
1876
1877 addRulesForIOpcs({amdgcn_readfirstlane})
1878 .Any({{UniB32, _, DivB32}, {{}, {SgprB32, None, VgprB32}}})
1879 // This should not exist in the first place; it comes from call lowering
1880 // readfirstlaning just in case the register is not in an SGPR.
1881 .Any({{UniS32, _, UniS32}, {{}, {Sgpr32, None, Vgpr32}}});
1882
1883 addRulesForIOpcs({amdgcn_readlane}, StandardB)
1885
1886 addRulesForIOpcs({amdgcn_s_quadmask, amdgcn_s_wqm}, StandardB)
1888 .Uni(B64, {{SgprB64}, {IntrId, SgprB64_ReadFirstLane}});
1889
1890 addRulesForIOpcs({amdgcn_writelane}, StandardB)
1891 .Div(B32,
1892 {{VgprB32},
1894
1895 addRulesForIOpcs({amdgcn_add_max_i32, amdgcn_add_max_u32, amdgcn_add_min_i32,
1896 amdgcn_add_min_u32},
1897 Standard)
1898 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1899 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1900
1901 addRulesForIOpcs({amdgcn_pk_add_max_i16, amdgcn_pk_add_max_u16,
1902 amdgcn_pk_add_min_i16, amdgcn_pk_add_min_u16},
1903 Standard)
1906
1907 addRulesForIOpcs({amdgcn_permlane16, amdgcn_permlanex16}, Standard)
1908 .Div(S32, {{Vgpr32},
1911
1912 addRulesForIOpcs({amdgcn_permlane_bcast, amdgcn_permlane_up,
1913 amdgcn_permlane_down, amdgcn_permlane_xor},
1914 StandardB)
1915 .Div(B32,
1916 {{VgprB32},
1918
1919 addRulesForIOpcs({amdgcn_permlane_idx_gen}, Standard)
1921
1922 addRulesForIOpcs({amdgcn_perm}, Standard)
1923 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1924 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1925
1926 addRulesForIOpcs(
1927 {amdgcn_wave_reduce_add, amdgcn_wave_reduce_and, amdgcn_wave_reduce_fadd,
1928 amdgcn_wave_reduce_fmax, amdgcn_wave_reduce_fmin,
1929 amdgcn_wave_reduce_fsub, amdgcn_wave_reduce_max, amdgcn_wave_reduce_min,
1930 amdgcn_wave_reduce_or, amdgcn_wave_reduce_sub, amdgcn_wave_reduce_umax,
1931 amdgcn_wave_reduce_umin, amdgcn_wave_reduce_xor},
1932 Standard)
1933 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}})
1934 .Div(S32, {{Sgpr32ToVgprDst}, {IntrId, VgprB32}})
1935 .Uni(S64, {{Sgpr64}, {IntrId, Sgpr64}})
1936 .Div(S64, {{Sgpr64ToVgprDst}, {IntrId, VgprB64}});
1937
1938 addRulesForIOpcs({amdgcn_wave_shuffle}, Standard)
1939 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}})
1940 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}});
1941
1942 addRulesForIOpcs({amdgcn_bitop3, amdgcn_fmad_ftz}, Standard)
1943 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
1944 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
1945 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1946 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1947
1948 addRulesForIOpcs({amdgcn_udot4, amdgcn_sdot4, amdgcn_udot8, amdgcn_sdot8,
1949 amdgcn_dot4_f32_bf8_bf8, amdgcn_dot4_f32_bf8_fp8,
1950 amdgcn_dot4_f32_fp8_fp8, amdgcn_dot4_f32_fp8_bf8},
1951 Standard)
1952 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1953 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1954
1955 addRulesForIOpcs({amdgcn_rsq, amdgcn_rsq_clamp}, Standard)
1956 .Uni(S16, {{Sgpr16}, {IntrId, Sgpr16}}, hasPST)
1957 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}}, !hasPST)
1958 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
1959 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}}, hasPST)
1960 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}}, !hasPST)
1961 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
1962 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64}})
1963 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64}});
1964
1965 addRulesForIOpcs({amdgcn_mul_u24, amdgcn_mul_i24}, Standard)
1966 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}})
1967 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}})
1968 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr32, Vgpr32}})
1969 .Div(S64, {{Vgpr64}, {IntrId, Vgpr32, Vgpr32}});
1970
1971 addRulesForIOpcs({amdgcn_ds_bpermute, amdgcn_ds_bpermute_fi_b32,
1972 amdgcn_ds_permute, amdgcn_fmul_legacy, amdgcn_mulhi_i24,
1973 amdgcn_mulhi_u24},
1974 Standard)
1975 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}})
1976 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}});
1977
1978 addRulesForIOpcs({amdgcn_cvt_sr_bf8_f32, amdgcn_cvt_sr_fp8_f32,
1979 amdgcn_cvt_sr_fp8_f32_e5m3, amdgcn_cvt_pk_bf8_f32,
1980 amdgcn_cvt_pk_fp8_f32, amdgcn_cvt_pk_fp8_f32_e5m3},
1981 Standard)
1982 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1983 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1984
1985 addRulesForIOpcs({amdgcn_cvt_off_f32_i4, amdgcn_cvt_f32_bf8,
1986 amdgcn_cvt_f32_fp8, amdgcn_cvt_f32_fp8_e5m3},
1987 Standard)
1988 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}})
1989 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}});
1990
1991 addRulesForIOpcs({amdgcn_cvt_pk_f32_bf8, amdgcn_cvt_pk_f32_fp8})
1992 .Any({{UniV2S32}, {{UniInVgprV2S32}, {IntrId, Vgpr32}}})
1993 .Any({{DivV2S32}, {{VgprV2S32}, {IntrId, Vgpr32}}});
1994
1995 addRulesForIOpcs({amdgcn_cvt_f16_bf8, amdgcn_cvt_f16_fp8}, Standard)
1996 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr32, Imm}})
1997 .Div(S16, {{Vgpr16}, {IntrId, Vgpr32, Imm}});
1998
1999 addRulesForIOpcs({amdgcn_cvt_pk_f16_bf8, amdgcn_cvt_pk_f16_fp8}, Standard)
2000 .Uni(V2S16, {{UniInVgprV2S16}, {IntrId, Vgpr16}})
2001 .Div(V2S16, {{VgprV2S16}, {IntrId, Vgpr16}});
2002
2003 addRulesForIOpcs({amdgcn_cvt_pk_bf8_f16, amdgcn_cvt_pk_fp8_f16}, Standard)
2004 .Uni(S16, {{UniInVgprS16}, {IntrId, VgprV2S16}})
2005 .Div(S16, {{Vgpr16}, {IntrId, VgprV2S16}});
2006
2007 addRulesForIOpcs({amdgcn_cvt_sr_bf8_f16, amdgcn_cvt_sr_fp8_f16}, Standard)
2008 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr16, Vgpr32, Vgpr32, Imm}})
2009 .Div(S32, {{Vgpr32}, {IntrId, Vgpr16, Vgpr32, Vgpr32, Imm}});
2010
2011 addRulesForIOpcs({amdgcn_cvt_sr_pk_f16_f32}, Standard)
2013 .Div(V2S16, {{VgprV2S16}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
2014
2015 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_fp8_f16})
2016 .Any({{DivS32},
2017 {{Vgpr32}, {IntrId, Vgpr32, Vgpr16, Vgpr32, Vgpr32, Imm}}});
2018
2019 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_fp8_f32})
2020 .Any({{DivS32},
2021 {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32, Vgpr32, Imm}}});
2022
2023 addRulesForIOpcs({amdgcn_cubesc, amdgcn_cubetc, amdgcn_cubema, amdgcn_cubeid,
2024 amdgcn_fma_legacy},
2025 Standard)
2026 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2027 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
2028
2029 addRulesForIOpcs({amdgcn_frexp_mant, amdgcn_fract}, Standard)
2030 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}})
2031 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
2032 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}})
2033 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
2034 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64}})
2035 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64}});
2036
2037 addRulesForIOpcs({amdgcn_prng_b32})
2038 .Any({{UniS32}, {{UniInVgprS32}, {IntrId, Vgpr32}}})
2039 .Any({{DivS32}, {{Vgpr32}, {IntrId, Vgpr32}}});
2040
2041 addRulesForIOpcs({amdgcn_sffbh}, Standard)
2042 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}})
2043 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}});
2044
2045 addRulesForIOpcs({amdgcn_ubfe, amdgcn_sbfe}, Standard)
2046 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2047 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32, Sgpr32, Sgpr32}, S_BFE})
2048 .Uni(S64, {{Sgpr64}, {IntrId, Sgpr64, Sgpr32, Sgpr32}, S_BFE})
2049 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr32, Vgpr32}, V_BFE});
2050
2051 addRulesForIOpcs({amdgcn_cvt_pk_i16, amdgcn_cvt_pk_u16, amdgcn_cvt_pknorm_i16,
2052 amdgcn_cvt_pknorm_u16, amdgcn_cvt_pkrtz},
2053 Standard)
2054 .Uni(V2S16, {{UniInVgprV2S16}, {IntrId, Vgpr32, Vgpr32}})
2055 .Div(V2S16, {{VgprV2S16}, {IntrId, Vgpr32, Vgpr32}});
2056
2057 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk32_bf6_f16,
2058 amdgcn_cvt_scalef32_sr_pk32_fp6_f16,
2059 amdgcn_cvt_scalef32_sr_pk32_bf6_bf16,
2060 amdgcn_cvt_scalef32_sr_pk32_fp6_bf16},
2061 Standard)
2063
2064 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk32_bf6_f32,
2065 amdgcn_cvt_scalef32_sr_pk32_fp6_f32},
2066 Standard)
2068
2069 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk_fp4_f16}, Standard)
2071 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, VgprV2S16, Vgpr32, Vgpr32}});
2072
2073 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk_fp4_f32}, Standard)
2075 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, VgprV2S32, Vgpr32, Vgpr32}});
2076
2077 addRulesForIOpcs(
2078 {amdgcn_cvt_scalef32_2xpk16_fp6_f32, amdgcn_cvt_scalef32_2xpk16_bf6_f32})
2079 .Any(
2081 .Any({{UniV6S32},
2083
2084 addRulesForIOpcs({amdgcn_cvt_scalef32_f16_fp8, amdgcn_cvt_scalef32_f16_bf8},
2085 Standard)
2086 .Div(V2S16, {{VgprV2S16}, {IntrId, VgprV2S16, Vgpr32, Vgpr32}})
2088
2089 addRulesForIOpcs({amdgcn_cvt_scalef32_f32_fp8, amdgcn_cvt_scalef32_f32_bf8},
2090 Standard)
2091 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}})
2092 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}});
2093
2094 addRulesForIOpcs(
2095 {amdgcn_cvt_scalef32_pk16_bf6_f16, amdgcn_cvt_scalef32_pk16_fp6_f16},
2096 Standard)
2099
2100 addRulesForIOpcs(
2101 {amdgcn_cvt_scalef32_pk16_bf6_f32, amdgcn_cvt_scalef32_pk16_fp6_f32},
2102 Standard)
2105
2106 addRulesForIOpcs(
2107 {amdgcn_cvt_scalef32_pk8_bf8_f16, amdgcn_cvt_scalef32_pk8_fp8_f16},
2108 Standard)
2111
2112 addRulesForIOpcs(
2113 {amdgcn_cvt_scalef32_pk8_bf8_f32, amdgcn_cvt_scalef32_pk8_fp8_f32},
2114 Standard)
2117
2118 addRulesForIOpcs({amdgcn_cvt_scalef32_pk8_fp4_f16}, Standard)
2119 .Div(S32, {{Vgpr32}, {IntrId, VgprV8S16, Vgpr32}})
2120 .Uni(S32, {{UniInVgprS32}, {IntrId, VgprV8S16, Vgpr32}});
2121
2122 addRulesForIOpcs({amdgcn_cvt_scalef32_pk8_fp4_f32}, Standard)
2123 .Div(S32, {{Vgpr32}, {IntrId, VgprV8S32, Vgpr32}})
2124 .Uni(S32, {{UniInVgprS32}, {IntrId, VgprV8S32, Vgpr32}});
2125
2126 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk16_bf6_f16,
2127 amdgcn_cvt_scalef32_sr_pk16_fp6_f16},
2128 Standard)
2130 .Any({{UniV3S32},
2132
2133 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk16_bf6_f32,
2134 amdgcn_cvt_scalef32_sr_pk16_fp6_f32},
2135 Standard)
2137 .Any({{UniV3S32},
2139
2140 addRulesForIOpcs(
2141 {amdgcn_cvt_scalef32_sr_pk8_bf8_f16, amdgcn_cvt_scalef32_sr_pk8_fp8_f16},
2142 Standard)
2144 .Any({{UniV2S32},
2146
2147 addRulesForIOpcs(
2148 {amdgcn_cvt_scalef32_sr_pk8_bf8_f32, amdgcn_cvt_scalef32_sr_pk8_fp8_f32},
2149 Standard)
2151 .Any({{UniV2S32},
2153
2154 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk8_fp4_f16}, Standard)
2155 .Div(S32, {{Vgpr32}, {IntrId, VgprV8S16, Vgpr32, Vgpr32}})
2156 .Uni(S32, {{UniInVgprS32}, {IntrId, VgprV8S16, Vgpr32, Vgpr32}});
2157
2158 addRulesForIOpcs({amdgcn_cvt_scalef32_sr_pk8_fp4_f32}, Standard)
2159 .Div(S32, {{Vgpr32}, {IntrId, VgprV8S32, Vgpr32, Vgpr32}})
2160 .Uni(S32, {{UniInVgprS32}, {IntrId, VgprV8S32, Vgpr32, Vgpr32}});
2161
2162 addRulesForIOpcs(
2163 {amdgcn_cvt_scale_pk16_f16_bf6, amdgcn_cvt_scale_pk16_f16_fp6}, Standard)
2166
2167 addRulesForIOpcs(
2168 {amdgcn_cvt_scale_pk16_f32_bf6, amdgcn_cvt_scale_pk16_f32_fp6}, Standard)
2171
2172 addRulesForIOpcs({amdgcn_cvt_scale_pk8_f16_bf8, amdgcn_cvt_scale_pk8_f16_fp8},
2173 Standard)
2176
2177 addRulesForIOpcs({amdgcn_cvt_scale_pk8_f16_fp4}, Standard)
2178 .Any({{DivV8S16}, {{VgprV8S16}, {IntrId, Vgpr32, Vgpr32}}})
2180
2181 addRulesForIOpcs({amdgcn_cvt_scale_pk8_f32_bf8, amdgcn_cvt_scale_pk8_f32_fp8},
2182 Standard)
2185
2186 addRulesForIOpcs({amdgcn_cvt_scale_pk8_f32_fp4}, Standard)
2187 .Any({{DivV8S32}, {{VgprV8S32}, {IntrId, Vgpr32, Vgpr32}}})
2189
2190 addRulesForIOpcs(
2191 {amdgcn_cvt_scalef32_pk32_bf6_f16, amdgcn_cvt_scalef32_pk32_fp6_f16},
2192 Standard)
2195
2196 addRulesForIOpcs(
2197 {amdgcn_cvt_scalef32_pk_fp8_f32, amdgcn_cvt_scalef32_pk_bf8_f32},
2198 Standard)
2199 .Div(V2S16,
2201 .Uni(V2S16, {{UniInVgprV2S16},
2203
2204 addRulesForIOpcs(
2205 {amdgcn_cvt_scalef32_pk_f32_fp8, amdgcn_cvt_scalef32_pk_f32_bf8},
2206 Standard)
2207 .Any({{DivV2S32}, {{VgprV2S32}, {IntrId, Vgpr32, Vgpr32, Imm}}})
2209
2210 addRulesForIOpcs(
2211 {amdgcn_cvt_scalef32_pk_fp8_f16, amdgcn_cvt_scalef32_pk_bf8_f16},
2212 Standard)
2214 .Uni(V2S16,
2216
2217 addRulesForIOpcs({amdgcn_cvt_scalef32_pk_f32_fp4}, Standard)
2218 .Any({{DivV2S32}, {{VgprV2S32}, {IntrId, Vgpr32, Vgpr32, Imm}}})
2220
2221 addRulesForIOpcs({amdgcn_cvt_scalef32_pk_fp4_f32}, Standard)
2222 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32, Vgpr32, Imm}})
2223 .Uni(S32,
2225
2226 addRulesForIOpcs({amdgcn_cvt_scalef32_pk_f16_fp4,
2227 amdgcn_cvt_scalef32_pk_f16_fp8,
2228 amdgcn_cvt_scalef32_pk_f16_bf8},
2229 Standard)
2230 .Div(V2S16, {{VgprV2S16}, {IntrId, Vgpr32, Vgpr32, Imm}})
2231 .Uni(V2S16, {{UniInVgprV2S16}, {IntrId, Vgpr32, Vgpr32, Imm}});
2232
2233 addRulesForIOpcs(
2234 {amdgcn_cvt_scalef32_pk32_f32_fp6, amdgcn_cvt_scalef32_pk32_f32_bf6},
2235 Standard)
2238
2239 addRulesForIOpcs(
2240 {amdgcn_cvt_scalef32_pk32_f16_fp6, amdgcn_cvt_scalef32_pk32_f16_bf6},
2241 Standard)
2244
2245 addRulesForIOpcs({amdgcn_cvt_scalef32_pk_fp4_f16}, Standard)
2246 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, VgprV2S16, Vgpr32, Imm}})
2247 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, VgprV2S16, Vgpr32, Imm}});
2248
2249 addRulesForIOpcs({amdgcn_global_load_tr_b64})
2250 .Any({{DivB64, _, UniP1}, {{VgprB64}, {IntrId, SgprP1}}})
2251 .Any({{DivB64, _, DivP1}, {{VgprB64}, {IntrId, VgprP1}}})
2252 .Any({{DivB32, _, UniP1}, {{VgprB32}, {IntrId, SgprP1}}})
2253 .Any({{DivB32, _, DivP1}, {{VgprB32}, {IntrId, VgprP1}}});
2254
2255 addRulesForIOpcs({amdgcn_global_load_tr_b128})
2256 .Any({{DivB64, _, UniP1}, {{VgprB64}, {IntrId, SgprP1}}})
2257 .Any({{DivB64, _, DivP1}, {{VgprB64}, {IntrId, VgprP1}}})
2258 .Any({{DivB128, _, UniP1}, {{VgprB128}, {IntrId, SgprP1}}})
2259 .Any({{DivB128, _, DivP1}, {{VgprB128}, {IntrId, VgprP1}}});
2260
2261 addRulesForIOpcs({amdgcn_global_load_tr4_b64})
2262 .Any({{DivV2S32, _, UniP1}, {{VgprV2S32}, {IntrId, SgprP1}}})
2263 .Any({{DivV2S32, _, DivP1}, {{VgprV2S32}, {IntrId, VgprP1}}});
2264
2265 addRulesForIOpcs({amdgcn_global_load_tr6_b96})
2266 .Any({{DivV3S32, _, UniP1}, {{VgprV3S32}, {IntrId, SgprP1}}})
2267 .Any({{DivV3S32, _, DivP1}, {{VgprV3S32}, {IntrId, VgprP1}}});
2268
2269 addRulesForIOpcs({amdgcn_ds_load_tr4_b64, amdgcn_ds_load_tr8_b64})
2270 .Any({{DivV2S32}, {{VgprV2S32}, {IntrId, VgprP3}}});
2271
2272 addRulesForIOpcs({amdgcn_ds_load_tr6_b96})
2273 .Any({{DivV3S32}, {{VgprV3S32}, {IntrId, VgprP3}}});
2274
2275 addRulesForIOpcs({amdgcn_ds_load_tr16_b128})
2276 .Any({{DivB128}, {{VgprB128}, {IntrId, VgprP3}}});
2277
2278 addRulesForIOpcs({amdgcn_global_atomic_ordered_add_b64})
2279 .Any({{DivS64}, {{Vgpr64}, {IntrId, VgprP1, Vgpr64}}});
2280
2281 addRulesForIOpcs(
2282 {amdgcn_global_atomic_fmin_num, amdgcn_global_atomic_fmax_num}, Standard)
2283 .Div(S32, {{Vgpr32}, {IntrId, VgprP1, Vgpr32}});
2284
2285 addRulesForIOpcs({amdgcn_flat_atomic_fmin_num, amdgcn_flat_atomic_fmax_num},
2286 Standard)
2287 .Div(S32, {{Vgpr32}, {IntrId, VgprP0, Vgpr32}});
2288
2289 addRulesForIOpcs({amdgcn_raw_buffer_load_lds})
2290 .Any({{_}, {{}, {IntrId, SgprV4S32, SgprP3, Imm, Vgpr32, Sgpr32}}});
2291
2292 addRulesForIOpcs({amdgcn_raw_buffer_load_async_lds})
2293 .Any({{_}, {{}, {IntrId, SgprV4S32, SgprB32_M0, Imm, Vgpr32, Sgpr32}}});
2294
2295 addRulesForIOpcs({amdgcn_struct_buffer_load_async_lds})
2296 .Any(
2297 {{_},
2299
2300 addRulesForIOpcs({amdgcn_struct_buffer_load_lds})
2301 .Any({{_},
2302 {{}, {IntrId, SgprV4S32, SgprP3, Imm, Vgpr32, Vgpr32, Sgpr32}}});
2303
2304 addRulesForIOpcs({amdgcn_raw_ptr_buffer_load_lds})
2305 .Any({{_}, {{}, {IntrId, SgprP8, SgprP3, Imm, Vgpr32, Sgpr32}}});
2306
2307 addRulesForIOpcs({amdgcn_raw_ptr_buffer_load_async_lds})
2308 .Any({{}, {{}, {IntrId, SgprP8, SgprB32_M0, Imm, VgprB32, SgprB32}}});
2309
2310 addRulesForIOpcs({amdgcn_struct_ptr_buffer_load_async_lds})
2311 .Any({{_},
2312 {{}, {IntrId, SgprP8, SgprB32_M0, Imm, Vgpr32, Vgpr32, Sgpr32}}});
2313
2314 addRulesForIOpcs({amdgcn_struct_ptr_buffer_load_lds})
2315 .Any({{_}, {{}, {IntrId, SgprP8, SgprP3, Imm, Vgpr32, Vgpr32, Sgpr32}}});
2316
2317 addRulesForIOpcs({amdgcn_global_load_lds, amdgcn_load_to_lds})
2318 .Any({{}, {{}, {IntrId, VgprP1, SgprB32_M0}}});
2319
2320 addRulesForIOpcs({amdgcn_global_load_async_to_lds_b8,
2321 amdgcn_global_load_async_to_lds_b32,
2322 amdgcn_global_load_async_to_lds_b64,
2323 amdgcn_global_load_async_to_lds_b128,
2324 amdgcn_global_store_async_from_lds_b8,
2325 amdgcn_global_store_async_from_lds_b32,
2326 amdgcn_global_store_async_from_lds_b64,
2327 amdgcn_global_store_async_from_lds_b128})
2328 .Any({{}, {{}, {IntrId, VgprP1, VgprP3}}});
2329
2330 addRulesForIOpcs({amdgcn_global_load_async_lds})
2331 .Any({{}, {{}, {IntrId, VgprP1, SgprB32_M0}}});
2332
2333 addRulesForIOpcs({amdgcn_tensor_load_to_lds, amdgcn_tensor_store_from_lds})
2334 .Any({{},
2335 {{},
2339
2340 addRulesForIOpcs({amdgcn_cluster_load_b32})
2342 .Any({{DivB32, _, UniP1}, {{VgprB32}, {IntrId, SgprP1, Imm, SgprB32_M0}}})
2343 .Any(
2344 {{DivB32, _, DivP1}, {{VgprB32}, {IntrId, VgprP1, Imm, SgprB32_M0}}});
2345
2346 addRulesForIOpcs({amdgcn_cluster_load_b64})
2348 .Any({{DivB64, _, UniP1}, {{VgprB64}, {IntrId, SgprP1, Imm, SgprB32_M0}}})
2349 .Any(
2350 {{DivB64, _, DivP1}, {{VgprB64}, {IntrId, VgprP1, Imm, SgprB32_M0}}});
2351
2352 addRulesForIOpcs({amdgcn_cluster_load_b128})
2354 .Any({{DivB128, _, UniP1},
2355 {{VgprB128}, {IntrId, SgprP1, Imm, SgprB32_M0}}})
2356 .Any({{DivB128, _, DivP1},
2357 {{VgprB128}, {IntrId, VgprP1, Imm, SgprB32_M0}}});
2358
2359 addRulesForIOpcs({amdgcn_cluster_load_async_to_lds_b8,
2360 amdgcn_cluster_load_async_to_lds_b32,
2361 amdgcn_cluster_load_async_to_lds_b64,
2362 amdgcn_cluster_load_async_to_lds_b128})
2363 .Any({{}, {{}, {IntrId, VgprP1, VgprP3, Imm, Imm, SgprB32_M0}}});
2364
2365 addRulesForIOpcs({amdgcn_perm_pk16_b4_u4}, StandardB)
2366 .Uni(B64, {{UniInVgprB64}, {IntrId, Vgpr32, Vgpr32, VgprV2S32}})
2367 .Div(B64, {{VgprB64}, {IntrId, Vgpr32, Vgpr32, VgprV2S32}});
2368
2369 addRulesForIOpcs({amdgcn_perm_pk16_b6_u4}, StandardB)
2371 .Div(B96, {{VgprB96}, {IntrId, Vgpr32, VgprB64, VgprV2S32}});
2372
2373 addRulesForIOpcs({amdgcn_perm_pk16_b8_u4}, StandardB)
2375 .Div(B128, {{VgprB128}, {IntrId, VgprB64, VgprB64, VgprV2S32}});
2376
2377 addRulesForIOpcs({amdgcn_wwm, amdgcn_strict_wwm, amdgcn_wqm, amdgcn_softwqm,
2378 amdgcn_strict_wqm},
2379 StandardB)
2380 .Div(B32, {{VgprB32}, {IntrId, VgprB32}})
2381 .Uni(B32, {{SgprB32}, {IntrId, SgprB32}})
2382 .Div(B64, {{VgprB64}, {IntrId, VgprB64}})
2383 .Uni(B64, {{SgprB64}, {IntrId, SgprB64}})
2384 .Div(B96, {{VgprB96}, {IntrId, VgprB96}})
2385 .Uni(B96, {{SgprB96}, {IntrId, SgprB96}})
2386 .Div(B128, {{VgprB128}, {IntrId, VgprB128}})
2387 .Uni(B128, {{SgprB128}, {IntrId, SgprB128}})
2388 .Any({{UniB256}, {{SgprB256}, {IntrId, SgprB256}}})
2389 .Any({{DivB256}, {{VgprB256}, {IntrId, VgprB256}}})
2390 .Any({{UniB512}, {{SgprB512}, {IntrId, SgprB512}}})
2391 .Any({{DivB512}, {{VgprB512}, {IntrId, VgprB512}}});
2392
2393 addRulesForIOpcs({amdgcn_init_whole_wave}).Any({{DivS1}, {{Vcc}, {IntrId}}});
2394
2395 addRulesForIOpcs({amdgcn_kill, amdgcn_wqm_demote})
2396 .Any({{}, {{}, {IntrId, Vcc}}});
2397
2398 addRulesForIOpcs({amdgcn_set_inactive}, StandardB)
2399 .Div(B32, {{VgprB32}, {IntrId, VgprB32, VgprB32}});
2400
2401 addRulesForIOpcs({amdgcn_set_inactive_chain_arg}, Standard)
2402 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}});
2403
2404 addRulesForIOpcs({amdgcn_cvt_sr_bf16_f32, amdgcn_cvt_sr_f16_f32}, Standard)
2405 .Div(V2S16, {{VgprV2S16}, {IntrId, VgprV2S16, Vgpr32, Vgpr32, Imm}});
2406
2407 addRulesForIOpcs({amdgcn_ballot}, Standard)
2408 .Uni(S64, {{Sgpr64}, {IntrId, Vcc}})
2409 .Uni(S32, {{Sgpr32}, {IntrId, Vcc}});
2410
2411 addRulesForIOpcs({amdgcn_inverse_ballot})
2412 .Any({{DivS1, _, S32}, {{Vcc}, {IntrId, SgprB32_ReadFirstLane}}})
2413 .Any({{DivS1, _, S64}, {{Vcc}, {IntrId, SgprB64_ReadFirstLane}}});
2414
2415 addRulesForIOpcs({amdgcn_live_mask, amdgcn_ps_live})
2416 .Any({{DivS1}, {{Vcc}, {}}});
2417
2418 addRulesForIOpcs({amdgcn_mov_dpp, amdgcn_mov_dpp8}, StandardB)
2419 .Div(B32, {{VgprB32}, {IntrId, VgprB32}})
2420 .Div(B64, {{VgprB64}, {IntrId, VgprB64}});
2421
2422 addRulesForIOpcs({amdgcn_update_dpp}, StandardB)
2423 .Div(B32, {{VgprB32}, {IntrId, VgprB32, VgprB32}})
2424 .Div(B64, {{VgprB64}, {IntrId, VgprB64, VgprB64}});
2425
2426 addRulesForIOpcs({amdgcn_sin, amdgcn_cos}, Standard)
2427 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
2428 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}})
2429 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
2430 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}});
2431
2432 addRulesForIOpcs({amdgcn_trig_preop}, Standard)
2433 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr32}})
2434 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64, Vgpr32}});
2435
2436 addRulesForIOpcs({amdgcn_exp2}, Standard)
2437 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
2438 .Uni(S16, {{Sgpr16}, {IntrId, Sgpr16}}, hasPST)
2439 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}}, !hasPST)
2440 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
2441 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}}, hasPST)
2442 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}}, !hasPST);
2443
2444 addRulesForIOpcs({amdgcn_rcp, amdgcn_sqrt}, Standard)
2445 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
2446 .Uni(S16, {{Sgpr16}, {IntrId, Sgpr16}}, hasPST)
2447 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}}, !hasPST)
2448 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
2449 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}}, hasPST)
2450 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}}, !hasPST)
2451 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64}})
2452 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64}});
2453
2454 addRulesForIOpcs({amdgcn_log}, Standard)
2455 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
2456 .Uni(S16, {{Sgpr16}, {IntrId, Sgpr16}}, hasPST)
2457 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}}, !hasPST)
2458 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
2459 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}}, hasPST)
2460 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}}, !hasPST);
2461
2462 addRulesForIOpcs({amdgcn_ds_atomic_async_barrier_arrive_b64})
2463 .Any({{}, {{}, {IntrId, VgprP3}}});
2464
2465 addRulesForIOpcs({amdgcn_ds_atomic_barrier_arrive_rtn_b64}, Standard)
2466 .Div(S64, {{Vgpr64}, {IntrId, VgprP3, Vgpr64}});
2467
2468 addRulesForIOpcs({amdgcn_ds_add_gs_reg_rtn, amdgcn_ds_sub_gs_reg_rtn},
2469 Standard)
2470 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
2471 .Div(S64, {{Vgpr64}, {IntrId, Vgpr32}});
2472
2473 addRulesForIOpcs({amdgcn_ds_append, amdgcn_ds_consume}, Standard)
2474 .Uni(S32, {{UniInVgprS32}, {IntrId, SgprB32_M0}})
2475 .Div(S32, {{Vgpr32}, {IntrId, SgprB32_M0}});
2476
2477 addRulesForIOpcs(
2478 {amdgcn_ds_bvh_stack_rtn, amdgcn_ds_bvh_stack_push4_pop1_rtn}, Standard)
2479 .Div(S32, {{Vgpr32, Vgpr32}, {IntrId, Vgpr32, Vgpr32, VgprV4S32}});
2480
2481 addRulesForIOpcs({amdgcn_ds_bvh_stack_push8_pop1_rtn}, Standard)
2482 .Div(S32, {{Vgpr32, Vgpr32}, {IntrId, Vgpr32, Vgpr32, VgprV8S32}});
2483
2484 addRulesForIOpcs({amdgcn_ds_bvh_stack_push8_pop2_rtn}, Standard)
2485 .Div(S64, {{Vgpr64, Vgpr32}, {IntrId, Vgpr32, Vgpr32, VgprV8S32}});
2486
2487 addRulesForIOpcs({amdgcn_ds_gws_sema_p, amdgcn_ds_gws_sema_v,
2488 amdgcn_ds_gws_sema_release_all})
2489 .Any({{}, {{}, {IntrId, SgprB32_M0}}});
2490
2491 addRulesForIOpcs(
2492 {amdgcn_ds_gws_barrier, amdgcn_ds_gws_init, amdgcn_ds_gws_sema_br})
2493 .Any({{}, {{}, {IntrId, Vgpr32, SgprB32_M0}}});
2494
2495 addRulesForIOpcs({amdgcn_ds_ordered_add, amdgcn_ds_ordered_swap}, Standard)
2496 .Div(S32, {{Vgpr32}, {IntrId, SgprB32_M0, Vgpr32}});
2497
2498 addRulesForIOpcs({amdgcn_ds_swizzle}, Standard)
2499 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}})
2500 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}});
2501
2502 addRulesForIOpcs({amdgcn_permlane16_var, amdgcn_permlanex16_var}, Standard)
2503 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
2504
2505 addRulesForIOpcs({amdgcn_permlane16_swap, amdgcn_permlane32_swap}, Standard)
2506 .Div(S32, {{Vgpr32, Vgpr32}, {IntrId, Vgpr32, Vgpr32}});
2507
2508 addRulesForIOpcs({amdgcn_permlane64}, StandardB)
2509 .Div(B32, {{VgprB32}, {IntrId, VgprB32}});
2510
2511 addRulesForIOpcs({amdgcn_ds_read_tr4_b64, amdgcn_ds_read_tr8_b64})
2512 .Any({{DivV2S32}, {{VgprV2S32}, {IntrId, VgprP3}}});
2513
2514 addRulesForIOpcs({amdgcn_ds_read_tr6_b96})
2515 .Any({{DivV3S32}, {{VgprV3S32}, {IntrId, VgprP3}}});
2516
2517 addRulesForIOpcs({amdgcn_ds_read_tr16_b64})
2518 .Any({{DivV4S16}, {{VgprV4S16}, {IntrId, VgprP3}}});
2519
2520 addRulesForIOpcs({amdgcn_interp_p1}, Standard)
2521 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Imm, Imm, SgprB32_M0}});
2522
2523 addRulesForIOpcs({amdgcn_interp_p1_f16}, Standard)
2524 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Imm, Imm, Imm, SgprB32_M0}});
2525
2526 addRulesForIOpcs({amdgcn_interp_p2}, Standard)
2527 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Imm, Imm, SgprB32_M0}});
2528
2529 addRulesForIOpcs({amdgcn_interp_p2_f16}, Standard)
2530 .Div(S16,
2532
2533 addRulesForIOpcs({amdgcn_interp_mov}, Standard)
2534 .Div(S32, {{Vgpr32}, {IntrId, Imm, Imm, Imm, SgprB32_M0}});
2535
2536 addRulesForIOpcs({amdgcn_interp_inreg_p10, amdgcn_interp_inreg_p2,
2537 amdgcn_interp_inreg_p10_f16, amdgcn_interp_p10_rtz_f16},
2538 Standard)
2539 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2540 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
2541
2542 addRulesForIOpcs({amdgcn_interp_inreg_p2_f16, amdgcn_interp_p2_rtz_f16},
2543 Standard)
2544 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2545 .Div(S16, {{Vgpr16}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
2546
2547 addRulesForIOpcs({amdgcn_frexp_exp})
2548 .Any({{UniS16}, {{UniInVgprS16}, {IntrId, Vgpr16}}})
2549 .Any({{DivS16}, {{Vgpr16}, {IntrId, Vgpr16}}})
2550 .Any({{UniS32, _, S32}, {{UniInVgprS32}, {IntrId, Vgpr32}}})
2551 .Any({{DivS32, _, S32}, {{Vgpr32}, {IntrId, Vgpr32}}})
2552 .Any({{UniS32, _, S64}, {{UniInVgprS32}, {IntrId, Vgpr64}}})
2553 .Any({{DivS32, _, S64}, {{Vgpr32}, {IntrId, Vgpr64}}});
2554
2555 addRulesForIOpcs({amdgcn_div_fmas}, Standard)
2556 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32, Vcc}})
2557 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32, Vcc}})
2558 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr64, Vgpr64, Vcc}})
2559 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64, Vgpr64, Vgpr64, Vcc}});
2560
2561 addRulesForIOpcs({amdgcn_div_fixup}, Standard)
2562 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
2563 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
2564 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2565 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
2566 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr64, Vgpr64}})
2567 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64, Vgpr64, Vgpr64}});
2568
2569 addRulesForIOpcs({amdgcn_div_scale}, Standard)
2570 .Div(S32, {{Vgpr32, Vcc}, {IntrId, Vgpr32, Vgpr32}})
2571 .Uni(S32, {{UniInVgprS32, UniInVcc}, {IntrId, Vgpr32, Vgpr32}})
2572 .Div(S64, {{Vgpr64, Vcc}, {IntrId, Vgpr64, Vgpr64}})
2573 .Uni(S64, {{UniInVgprS64, UniInVcc}, {IntrId, Vgpr64, Vgpr64}});
2574
2575 addRulesForIOpcs({amdgcn_fdot2, amdgcn_sdot2, amdgcn_udot2}, Standard)
2577 .Div(S32, {{Vgpr32}, {IntrId, VgprV2S16, VgprV2S16, Vgpr32}});
2578
2579 addRulesForIOpcs({amdgcn_fdot2_f16_f16}, Standard)
2581 .Div(S16, {{Vgpr16}, {IntrId, VgprV2S16, VgprV2S16, Vgpr16}});
2582
2583 addRulesForIOpcs({amdgcn_sudot4, amdgcn_sudot8}, Standard)
2584 .Uni(S32, {{UniInVgprS32}, {IntrId, Imm, Vgpr32, Imm, Vgpr32, Vgpr32}})
2585 .Div(S32, {{Vgpr32}, {IntrId, Imm, Vgpr32, Imm, Vgpr32, Vgpr32}});
2586
2587 addRulesForIOpcs({amdgcn_s_alloc_vgpr})
2589
2590 addRulesForIOpcs({amdgcn_sat_pk4_i4_i8, amdgcn_sat_pk4_u4_u8}, Standard)
2591 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr32}})
2592 .Div(S16, {{Vgpr16}, {IntrId, Vgpr32}});
2593
2594 bool HasGFX90AInsts = ST->hasGFX90AInsts();
2595
2596 // On gfx90a+ both AGPR-form and VGPR-form exist
2597 addRulesForIOpcs({amdgcn_mfma_f32_32x32x1f32, amdgcn_mfma_f32_16x16x1f32,
2598 amdgcn_mfma_f32_4x4x1f32, amdgcn_mfma_f32_32x32x2f32,
2599 amdgcn_mfma_f32_16x16x4f32, amdgcn_mfma_f32_32x32x4f16,
2600 amdgcn_mfma_f32_16x16x4f16, amdgcn_mfma_f32_4x4x4f16,
2601 amdgcn_mfma_f32_32x32x8f16, amdgcn_mfma_f32_16x16x16f16,
2602 amdgcn_mfma_i32_32x32x4i8, amdgcn_mfma_i32_16x16x4i8,
2603 amdgcn_mfma_i32_4x4x4i8, amdgcn_mfma_i32_32x32x8i8,
2604 amdgcn_mfma_i32_16x16x16i8, amdgcn_mfma_f32_32x32x2bf16,
2605 amdgcn_mfma_f32_16x16x2bf16, amdgcn_mfma_f32_4x4x2bf16,
2606 amdgcn_mfma_f32_32x32x4bf16, amdgcn_mfma_f32_16x16x8bf16})
2607 .Any({{DivAnyTy},
2609 !HasGFX90AInsts)
2610 .Any({{DivAnyTy},
2611 {{VgprOrAgprAnyTy},
2613 HasGFX90AInsts);
2614
2615 // gfx90a+ only MFMAs
2616 addRulesForIOpcs(
2617 {amdgcn_mfma_f32_32x32x4bf16_1k, amdgcn_mfma_f32_16x16x4bf16_1k,
2618 amdgcn_mfma_f32_4x4x4bf16_1k, amdgcn_mfma_f32_32x32x8bf16_1k,
2619 amdgcn_mfma_f32_16x16x16bf16_1k, amdgcn_mfma_f64_16x16x4f64,
2620 amdgcn_mfma_f64_4x4x4f64, amdgcn_mfma_i32_16x16x32_i8,
2621 amdgcn_mfma_i32_32x32x16_i8, amdgcn_mfma_f32_16x16x8_xf32,
2622 amdgcn_mfma_f32_32x32x4_xf32, amdgcn_mfma_f32_16x16x32_bf8_bf8,
2623 amdgcn_mfma_f32_16x16x32_bf8_fp8, amdgcn_mfma_f32_16x16x32_fp8_bf8,
2624 amdgcn_mfma_f32_16x16x32_fp8_fp8, amdgcn_mfma_f32_32x32x16_bf8_bf8,
2625 amdgcn_mfma_f32_32x32x16_bf8_fp8, amdgcn_mfma_f32_32x32x16_fp8_bf8,
2626 amdgcn_mfma_f32_32x32x16_fp8_fp8})
2627 .Any({{DivAnyTy},
2628 {{VgprOrAgprAnyTy},
2630
2631 addRulesForIOpcs(
2632 {// gfx942+
2633 amdgcn_smfmac_f32_16x16x32_f16, amdgcn_smfmac_f32_32x32x16_f16,
2634 amdgcn_smfmac_f32_16x16x32_bf16, amdgcn_smfmac_f32_32x32x16_bf16,
2635 amdgcn_smfmac_i32_16x16x64_i8, amdgcn_smfmac_i32_32x32x32_i8,
2636 amdgcn_smfmac_f32_16x16x64_bf8_bf8, amdgcn_smfmac_f32_16x16x64_bf8_fp8,
2637 amdgcn_smfmac_f32_16x16x64_fp8_bf8, amdgcn_smfmac_f32_16x16x64_fp8_fp8,
2638 amdgcn_smfmac_f32_32x32x32_bf8_bf8, amdgcn_smfmac_f32_32x32x32_bf8_fp8,
2639 amdgcn_smfmac_f32_32x32x32_fp8_bf8, amdgcn_smfmac_f32_32x32x32_fp8_fp8,
2640 // gfx950+
2641 amdgcn_smfmac_f32_16x16x64_f16, amdgcn_smfmac_f32_32x32x32_f16,
2642 amdgcn_smfmac_i32_16x16x128_i8, amdgcn_smfmac_i32_32x32x64_i8,
2643 amdgcn_smfmac_f32_16x16x128_bf8_bf8, amdgcn_smfmac_f32_16x16x128_bf8_fp8,
2644 amdgcn_smfmac_f32_16x16x128_fp8_bf8, amdgcn_smfmac_f32_16x16x128_fp8_fp8,
2645 amdgcn_smfmac_f32_32x32x64_bf8_bf8, amdgcn_smfmac_f32_32x32x64_bf8_fp8,
2646 amdgcn_smfmac_f32_32x32x64_fp8_bf8, amdgcn_smfmac_f32_32x32x64_fp8_fp8})
2647 .Any({{DivAnyTy},
2648 {{VgprOrAgprAnyTy},
2650
2651 // WMMA/SWMMAC intrinsics: all register operands map to VGPR.
2652 addRulesForIOpcs(
2653 {// WMMA GFX11+
2654 amdgcn_wmma_f32_16x16x16_f16, amdgcn_wmma_f32_16x16x16_bf16,
2655 amdgcn_wmma_f16_16x16x16_f16, amdgcn_wmma_bf16_16x16x16_bf16,
2656 amdgcn_wmma_f16_16x16x16_f16_tied, amdgcn_wmma_bf16_16x16x16_bf16_tied,
2657 amdgcn_wmma_i32_16x16x16_iu8, amdgcn_wmma_i32_16x16x16_iu4,
2658 // WMMA GFX12
2659 amdgcn_wmma_f32_16x16x16_fp8_fp8, amdgcn_wmma_f32_16x16x16_fp8_bf8,
2660 amdgcn_wmma_f32_16x16x16_bf8_fp8, amdgcn_wmma_f32_16x16x16_bf8_bf8,
2661 amdgcn_wmma_i32_16x16x32_iu4,
2662 // WMMA GFX1250
2663 amdgcn_wmma_f32_16x16x4_f32, amdgcn_wmma_f32_16x16x32_bf16,
2664 amdgcn_wmma_f32_16x16x32_f16, amdgcn_wmma_f16_16x16x32_f16,
2665 amdgcn_wmma_bf16_16x16x32_bf16, amdgcn_wmma_bf16f32_16x16x32_bf16,
2666 amdgcn_wmma_f32_16x16x64_fp8_fp8, amdgcn_wmma_f32_16x16x64_fp8_bf8,
2667 amdgcn_wmma_f32_16x16x64_bf8_fp8, amdgcn_wmma_f32_16x16x64_bf8_bf8,
2668 amdgcn_wmma_f16_16x16x64_fp8_fp8, amdgcn_wmma_f16_16x16x64_fp8_bf8,
2669 amdgcn_wmma_f16_16x16x64_bf8_fp8, amdgcn_wmma_f16_16x16x64_bf8_bf8,
2670 amdgcn_wmma_f16_16x16x128_fp8_fp8, amdgcn_wmma_f16_16x16x128_fp8_bf8,
2671 amdgcn_wmma_f16_16x16x128_bf8_fp8, amdgcn_wmma_f16_16x16x128_bf8_bf8,
2672 amdgcn_wmma_f32_16x16x128_fp8_fp8, amdgcn_wmma_f32_16x16x128_fp8_bf8,
2673 amdgcn_wmma_f32_16x16x128_bf8_fp8, amdgcn_wmma_f32_16x16x128_bf8_bf8,
2674 amdgcn_wmma_i32_16x16x64_iu8, amdgcn_wmma_f32_16x16x128_f8f6f4,
2675 amdgcn_wmma_scale_f32_16x16x128_f8f6f4,
2676 amdgcn_wmma_scale16_f32_16x16x128_f8f6f4, amdgcn_wmma_f32_32x16x128_f4,
2677 amdgcn_wmma_scale_f32_32x16x128_f4, amdgcn_wmma_scale16_f32_32x16x128_f4,
2678 // SWMMAC GFX12
2679 amdgcn_swmmac_f32_16x16x32_f16, amdgcn_swmmac_f32_16x16x32_bf16,
2680 amdgcn_swmmac_f16_16x16x32_f16, amdgcn_swmmac_bf16_16x16x32_bf16,
2681 amdgcn_swmmac_i32_16x16x32_iu8, amdgcn_swmmac_i32_16x16x32_iu4,
2682 amdgcn_swmmac_i32_16x16x64_iu4, amdgcn_swmmac_f32_16x16x32_fp8_fp8,
2683 amdgcn_swmmac_f32_16x16x32_fp8_bf8, amdgcn_swmmac_f32_16x16x32_bf8_fp8,
2684 amdgcn_swmmac_f32_16x16x32_bf8_bf8,
2685 // SWMMAC GFX1250
2686 amdgcn_swmmac_f32_16x16x64_f16, amdgcn_swmmac_f32_16x16x64_bf16,
2687 amdgcn_swmmac_f16_16x16x64_f16, amdgcn_swmmac_bf16_16x16x64_bf16,
2688 amdgcn_swmmac_bf16f32_16x16x64_bf16, amdgcn_swmmac_f32_16x16x128_fp8_fp8,
2689 amdgcn_swmmac_f32_16x16x128_fp8_bf8, amdgcn_swmmac_f32_16x16x128_bf8_fp8,
2690 amdgcn_swmmac_f32_16x16x128_bf8_bf8, amdgcn_swmmac_f16_16x16x128_fp8_fp8,
2691 amdgcn_swmmac_f16_16x16x128_fp8_bf8, amdgcn_swmmac_f16_16x16x128_bf8_fp8,
2692 amdgcn_swmmac_f16_16x16x128_bf8_bf8, amdgcn_swmmac_i32_16x16x128_iu8})
2693 .Any({{}, {{}, {}, ApplyAllVgpr}});
2694
2695} // end initialize rules
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU address space definition.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
constexpr LLT S16
constexpr LLT S1
constexpr LLT V2S16
constexpr LLT S32
constexpr LLT V4S32
constexpr LLT V3S32
constexpr LLT S64
constexpr LLT V2S32
constexpr LLT S128
UniformityLLTOpPredicateID LLTToBId(LLT Ty)
bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, const MachineUniformityInfo &MUI, const MachineRegisterInfo &MRI)
UniformityLLTOpPredicateID LLTToId(LLT Ty)
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
#define _
IRTranslator LLVM IR MI
Register Reg
Register const TargetRegisterInfo * TRI
Machine IR instance of the generic uniformity analysis.
bool operator()(const MachineInstr &MI) const
Predicate operator||(const Predicate &RHS) const
Predicate operator&&(const Predicate &RHS) const
Predicate(std::function< bool(const MachineInstr &)> Pred)
Predicate operator!() const
RegBankLegalizeRules(const GCNSubtarget &ST, MachineRegisterInfo &MRI)
const SetOfRulesForOpcode * getRulesForOpc(MachineInstr &MI) const
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
void addFastRuleDivergent(UniformityLLTOpPredicateID Ty, RegBankLLTMapping RuleApplyIDs)
void addFastRuleUniform(UniformityLLTOpPredicateID Ty, RegBankLLTMapping RuleApplyIDs)
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
bool isSigned() const
Definition InstrTypes.h:993
bool isDivergentAtDef(ConstValueRefT V) const
Whether V is divergent at its definition.
bool isUniformAtDef(ConstValueRefT V) const
Whether V is uniform/non-divergent at its definition.
bool isEquality() const
Return true if this predicate is either EQ or NE.
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
TypeSize getValue() const
Representation of each machine instruction.
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
unsigned getAddrSpace() const
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const TargetRegisterInfo * getTargetRegisterInfo() const
Wrapper class representing virtual and physical registers.
Definition Register.h:20
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
bool isAnyPtr(LLT Ty, unsigned Width)
bool isUniformMMO(const MachineMemOperand *MMO)
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
SmallVector< UniformityLLTOpPredicateID, 4 > OpUniformityAndTypes
PredicateMapping(std::initializer_list< UniformityLLTOpPredicateID > OpList, std::function< bool(const MachineInstr &)> TestFunc=nullptr)
bool match(const MachineInstr &MI, const MachineUniformityInfo &MUI, const MachineRegisterInfo &MRI) const
std::function< bool(const MachineInstr &)> TestFunc
RegBankLLTMapping(std::initializer_list< RegBankLLTMappingApplyID > DstOpMappingList, std::initializer_list< RegBankLLTMappingApplyID > SrcOpMappingList, LoweringMethodID LoweringMethod=DoNotLower)
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39