LLVM 23.0.0git
AMDGPURegBankLegalizeHelper.cpp
Go to the documentation of this file.
1//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Implements actual lowering algorithms for each ID that can be used in
10/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
11//
12//===----------------------------------------------------------------------===//
13
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
20#include "GCNSubtarget.h"
27#include "llvm/IR/IntrinsicsAMDGPU.h"
28
29#define DEBUG_TYPE "amdgpu-regbanklegalize"
30
31using namespace llvm;
32using namespace AMDGPU;
33
36 const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
37 : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B),
38 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
39 RBLRules(RBLRules), IsWave32(ST.isWave32()),
40 SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
41 VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
42 AgprRB(&RBI.getRegBank(AMDGPU::AGPRRegBankID)),
43 VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
44
46 const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
47 if (!RuleSet) {
48 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
49 "No AMDGPU RegBankLegalize rules defined for opcode",
50 MI);
51 return false;
52 }
53
54 const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
55 if (!Mapping) {
56 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
57 "AMDGPU RegBankLegalize: none of the rules defined with "
58 "'Any' for MI's opcode matched MI",
59 MI);
60 return false;
61 }
62
63 WaterfallInfo WFI;
64 unsigned OpIdx = 0;
65 if (!Mapping->DstOpMapping.empty()) {
66 B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
67 if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping))
68 return false;
69 }
70 if (!Mapping->SrcOpMapping.empty()) {
71 B.setInstr(MI);
72 if (!applyMappingSrc(MI, OpIdx, Mapping->SrcOpMapping, WFI))
73 return false;
74 }
75
76 if (!lower(MI, *Mapping, WFI))
77 return false;
78
79 return true;
80}
81
/// Rewrite the instruction range described by \p WFI so that divergent (VGPR)
/// operands that must be uniform are handled by a waterfall loop: iterate,
/// picking one lane's value with READFIRSTLANE each trip, executing the body
/// only for lanes that share that value, until all lanes are covered.
/// Returns true on success. See the block diagram below for the CFG shape.
82bool RegBankLegalizeHelper::executeInWaterfallLoop(MachineIRBuilder &B,
83 const WaterfallInfo &WFI) {
84 assert(WFI.Start.isValid() && WFI.End.isValid() &&
85 "Waterfall range not initialized");
86
87 // Track use registers which have already been expanded with a readfirstlane
88 // sequence. This may have multiple uses if moving a sequence.
89 DenseMap<Register, Register> WaterfalledRegMap;
90
91 MachineBasicBlock &MBB = B.getMBB();
92 MachineFunction &MF = B.getMF();
93
96
98 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
100
101#ifndef NDEBUG
102 const int OrigRangeSize = std::distance(BeginIt, EndIt);
103#endif
104
105 MachineRegisterInfo &MRI = *B.getMRI();
106 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
107 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
108
109 // Don't bother using generic instructions/registers for the exec mask.
110 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
111
112 Register SavedExec = MRI.createVirtualRegister(WaveRC);
113
114 // To insert the loop we need to split the block. Move everything before
115 // this point to a new block, and insert a new empty block before this
116 // instruction.
119 MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
120 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
122 ++MBBI;
123 MF.insert(MBBI, LoopBB);
124 MF.insert(MBBI, BodyBB);
125 MF.insert(MBBI, RestoreExecBB);
126 MF.insert(MBBI, RemainderBB);
127
128 LoopBB->addSuccessor(BodyBB);
129 BodyBB->addSuccessor(RestoreExecBB);
130 BodyBB->addSuccessor(LoopBB);
131
132 // Move the rest of the block into a new block.
134 RemainderBB->splice(RemainderBB->begin(), &MBB, EndIt, MBB.end());
135
136 MBB.addSuccessor(LoopBB);
137 RestoreExecBB->addSuccessor(RemainderBB);
138
139 B.setInsertPt(*LoopBB, LoopBB->end());
140
141 // +-MBB:------------+
142 // | ... |
143 // | %0 = G_INST_1 |
144 // | %Dst = MI %Vgpr |
145 // | %1 = G_INST_2 |
146 // | ... |
147 // +-----------------+
148 // ->
149 // +-MBB-------------------------------+
150 // | ... |
151 // | %0 = G_INST_1 |
152 // | %SaveExecReg = S_MOV_B32 $exec_lo |
153 // +----------------|------------------+
154 // | /------------------------------|
155 // V V |
156 // +-LoopBB---------------------------------------------------------------+ |
157 // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
158 // | instead of executing for each lane, see if other lanes had | |
159 // | same value for %Vgpr and execute for them also. | |
160 // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
161 // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
162 // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
163 // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
164 // +----------------|-----------------------------------------------------+ |
165 // V |
166 // +-BodyBB------------------------------------------------------------+ |
167 // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
168 // | executed only for active lanes and written to Dst | |
169 // | $exec = S_XOR_B32 $exec, %SavedExec | |
170 // | set active lanes to 0 in SavedExec, lanes that did not write to | |
171 // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
172 // | SI_WATERFALL_LOOP LoopBB |-----|
173 // +----------------|--------------------------------------------------+
174 // V
175 // +-RestoreExecBB--------------------------+
176 // | $exec_lo = S_MOV_B32_term %SaveExecReg |
177 // +----------------|-----------------------+
178 // V
179 // +-RemainderBB:----------------------+
180 // | %1 = G_INST_2 |
181 // | ... |
182 // +---------------------------------- +
183
184 // Move the instruction into the loop body. Note we moved everything after
185 // Range.end() already into a new block, so Range.end() is no longer valid.
186 BodyBB->splice(BodyBB->end(), &MBB, BeginIt, MBB.end());
187
188 // Figure out the iterator range after splicing the instructions.
189 MachineBasicBlock::iterator NewBegin = BeginIt;
190 auto NewEnd = BodyBB->end();
191 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
192
193 B.setMBB(*LoopBB);
194 Register CondReg;
195
 // For every divergent use that must be uniform, read the current lane's
 // value and accumulate an all-parts-equal condition into CondReg.
196 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
197 for (MachineOperand &Op : MI.all_uses()) {
198 Register OldReg = Op.getReg();
199 if (!WFI.SgprWaterfallOperandRegs.count(OldReg))
200 continue;
201
202 // See if we already processed this register in another instruction in
203 // the sequence.
204 auto OldVal = WaterfalledRegMap.find(OldReg);
205 if (OldVal != WaterfalledRegMap.end()) {
206 Op.setReg(OldVal->second);
207 continue;
208 }
209
210 Register OpReg = Op.getReg();
211 LLT OpTy = MRI.getType(OpReg);
212
213 // TODO: support for agpr
214 assert(MRI.getRegBank(OpReg) == VgprRB);
215 Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
216 buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
217
218 // Build the comparison(s), CurrentLaneReg == OpReg.
 // Compare in 64-bit pieces when the size is a multiple of 64, else 32-bit.
219 unsigned OpSize = OpTy.getSizeInBits();
220 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
221 LLT PartTy = LLT::scalar(PartSize);
222 unsigned NumParts = OpSize / PartSize;
224 SmallVector<Register, 8> CurrentLaneParts;
225
226 if (NumParts == 1) {
227 OpParts.push_back(OpReg);
228 CurrentLaneParts.push_back(CurrentLaneReg);
229 } else {
230 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
231 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
232 for (unsigned i = 0; i < NumParts; ++i) {
233 OpParts.push_back(UnmergeOp.getReg(i));
234 CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
235 }
236 }
237
238 for (unsigned i = 0; i < NumParts; ++i) {
239 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
240 B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
241
242 if (!CondReg)
243 CondReg = CmpReg;
244 else
245 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
246 }
247
248 Op.setReg(CurrentLaneReg);
249
250 // Make sure we don't re-process this register again.
251 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
252 }
253 }
254
255 // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
256 Register CondRegLM =
257 MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
258 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
259
260 // Update EXEC, save the original EXEC value to SavedExec.
261 B.buildInstr(LMC.AndSaveExecOpc)
262 .addDef(SavedExec)
263 .addReg(CondRegLM, RegState::Kill);
264 MRI.setSimpleHint(SavedExec, CondRegLM);
265
266 B.setInsertPt(*BodyBB, BodyBB->end());
267
268 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
269 B.buildInstr(LMC.XorTermOpc)
270 .addDef(LMC.ExecReg)
271 .addReg(LMC.ExecReg)
272 .addReg(SavedExec);
273
274 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
275 // s_cbranch_scc0?
276
277 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
278 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
279
280 // Save the EXEC mask before the loop.
281 B.setInsertPt(MBB, MBB.end());
282 B.buildInstr(LMC.MovOpc).addDef(SaveExecReg).addReg(LMC.ExecReg);
283
284 // Restore the EXEC mask after the loop.
285 B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
286 B.buildInstr(LMC.MovTermOpc).addDef(LMC.ExecReg).addReg(SaveExecReg);
287
288 // Set the insert point after the original instruction, so any new
289 // instructions will be in the remainder.
290 B.setInsertPt(*RemainderBB, RemainderBB->begin());
291
292 return true;
293}
294
295bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
296 ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
297 MachineFunction &MF = B.getMF();
298 assert(MI.getNumMemOperands() == 1);
299 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
300 Register Dst = MI.getOperand(0).getReg();
301 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
302 Register Base = MI.getOperand(1).getReg();
303 LLT PtrTy = MRI.getType(Base);
304 const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
305 LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
306 SmallVector<Register, 4> LoadPartRegs;
307
308 unsigned ByteOffset = 0;
309 for (LLT PartTy : LLTBreakdown) {
310 Register BasePlusOffset;
311 if (ByteOffset == 0) {
312 BasePlusOffset = Base;
313 } else {
314 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
315 BasePlusOffset =
316 B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
317 }
318 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
319 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
320 LoadPartRegs.push_back(LoadPart.getReg(0));
321 ByteOffset += PartTy.getSizeInBytes();
322 }
323
324 if (!MergeTy.isValid()) {
325 // Loads are of same size, concat or merge them together.
326 B.buildMergeLikeInstr(Dst, LoadPartRegs);
327 } else {
328 // Loads are not all of same size, need to unmerge them to smaller pieces
329 // of MergeTy type, then merge pieces to Dst.
330 SmallVector<Register, 4> MergeTyParts;
331 for (Register Reg : LoadPartRegs) {
332 if (MRI.getType(Reg) == MergeTy) {
333 MergeTyParts.push_back(Reg);
334 } else {
335 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
336 for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
337 MergeTyParts.push_back(Unmerge.getReg(i));
338 }
339 }
340 B.buildMergeLikeInstr(Dst, MergeTyParts);
341 }
342 MI.eraseFromParent();
343 return true;
344}
345
346bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
347 LLT MergeTy) {
348 MachineFunction &MF = B.getMF();
349 assert(MI.getNumMemOperands() == 1);
350 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
351 Register Dst = MI.getOperand(0).getReg();
352 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
353 Register Base = MI.getOperand(1).getReg();
354
355 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
356 auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);
357
358 if (WideTy.isScalar()) {
359 B.buildTrunc(Dst, WideLoad);
360 } else {
361 SmallVector<Register, 4> MergeTyParts;
362 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
363
364 LLT DstTy = MRI.getType(Dst);
365 unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
366 for (unsigned i = 0; i < NumElts; ++i) {
367 MergeTyParts.push_back(Unmerge.getReg(i));
368 }
369 B.buildMergeLikeInstr(Dst, MergeTyParts);
370 }
371 MI.eraseFromParent();
372 return true;
373}
374
375bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
376 Register Dst = MI.getDstReg();
377 Register Ptr = MI.getPointerReg();
378 MachineMemOperand &MMO = MI.getMMO();
379 unsigned MemSize = 8 * MMO.getSize().getValue();
380
381 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
382
383 if (MI.getOpcode() == G_LOAD) {
384 B.buildLoad(Dst, Ptr, *WideMMO);
385 } else {
386 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
387
388 if (MI.getOpcode() == G_ZEXTLOAD) {
389 APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
390 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
391 B.buildAnd(Dst, Load, MaskCst);
392 } else {
393 assert(MI.getOpcode() == G_SEXTLOAD);
394 B.buildSExtInReg(Dst, Load, MemSize);
395 }
396 }
397
398 MI.eraseFromParent();
399 return true;
400}
401
/// Lower an extension (G_SEXT/G_ZEXT/G_ANYEXT) of a VCC (s1) condition into a
/// select between extension constants: sext(true) is -1, zext/anyext use 1.
/// S64 results are built from two S32 selects. Returns false (after reporting
/// a GlobalISel failure) for unsupported opcodes or result types.
402bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
403 Register Dst = MI.getOperand(0).getReg();
404 LLT Ty = MRI.getType(Dst);
405 Register Src = MI.getOperand(1).getReg();
406 unsigned Opc = MI.getOpcode();
 // Sign-extended true is all ones; zero-/any-extended true is 1.
407 int TrueExtCst = Opc == G_SEXT ? -1 : 1;
408 if (Ty == S32 || Ty == S16) {
409 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
410 auto False = B.buildConstant({VgprRB, Ty}, 0);
411 B.buildSelect(Dst, Src, True, False);
412 } else if (Ty == S64) {
413 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
414 auto False = B.buildConstant({VgprRB_S32}, 0);
415 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
416 MachineInstrBuilder Hi;
 // Choose the high 32 bits according to the extension kind:
 // sext replicates the low half (-1 or 0), zext is always 0, anyext is undef.
417 switch (Opc) {
418 case G_SEXT:
419 Hi = Lo;
420 break;
421 case G_ZEXT:
422 Hi = False;
423 break;
424 case G_ANYEXT:
425 Hi = B.buildUndef({VgprRB_S32});
426 break;
427 default:
429 MF, MORE, "amdgpu-regbanklegalize",
430 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
431 return false;
432 }
433
434 B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
435 } else {
437 MF, MORE, "amdgpu-regbanklegalize",
438 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
439 return false;
440 }
441
442 MI.eraseFromParent();
443 return true;
444}
445
446std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
447 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
448 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
449 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
450 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
451 return {Lo.getReg(0), Hi.getReg(0)};
452}
453
454std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
455 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
456 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
457 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
458 return {Lo.getReg(0), Hi.getReg(0)};
459}
460
461std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
462 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
463 auto Lo = PackedS32;
464 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
465 return {Lo.getReg(0), Hi.getReg(0)};
466}
467
468std::pair<Register, Register>
469RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
470 auto [Lo32, Hi32] = unpackAExt(Reg);
471 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
472 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
473}
474
/// Lower a uniform packed 2x16 shift by unpacking both the value and the
/// shift amount into S32 halves (zero-extended for logical shift right,
/// sign-extended for arithmetic shift right, any-extended for shift left,
/// matching each shift's semantics), shifting per half, then repacking with
/// a truncating build_vector. Returns false for unhandled opcodes.
475bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
476 Register Lo, Hi;
477 switch (MI.getOpcode()) {
478 case AMDGPU::G_SHL: {
 // Shift left: low bits shift out of the half, high garbage is harmless.
479 auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
480 auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
481 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
482 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
483 break;
484 }
485 case AMDGPU::G_LSHR: {
 // Logical shift right needs zeros above bit 15.
486 auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
487 auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
488 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
489 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
490 break;
491 }
492 case AMDGPU::G_ASHR: {
 // Arithmetic shift right needs the 16-bit sign replicated above bit 15.
493 auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
494 auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
495 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
496 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
497 break;
498 }
499 default:
501 MF, MORE, "amdgpu-regbanklegalize",
502 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
503 MI);
504 return false;
505 }
506 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
507 MI.eraseFromParent();
508 return true;
509}
510
/// Lower a uniform packed 2x16 min/max by unpacking both sources into S32
/// halves (sign-extended for signed min/max, zero-extended for unsigned so
/// the 32-bit compare orders the halves correctly), doing the op per half,
/// then repacking with a truncating build_vector. Returns false for
/// unhandled opcodes.
511bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
512 Register Lo, Hi;
513 switch (MI.getOpcode()) {
514 case AMDGPU::G_SMIN:
515 case AMDGPU::G_SMAX: {
516 // For signed operations, use sign extension
517 auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
518 auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
519 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
520 .getReg(0);
521 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
522 .getReg(0);
523 break;
524 }
525 case AMDGPU::G_UMIN:
526 case AMDGPU::G_UMAX: {
527 // For unsigned operations, use zero extension
528 auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
529 auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
530 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
531 .getReg(0);
532 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
533 .getReg(0);
534 break;
535 }
536 default:
538 MF, MORE, "amdgpu-regbanklegalize",
539 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
540 return false;
541 }
542 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
543 MI.eraseFromParent();
544 return true;
545}
546
547bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
548 auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
549 auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
550 auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
551 auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
552 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
553 {ResLo.getReg(0), ResHi.getReg(0)});
554 MI.eraseFromParent();
555 return true;
556}
557
560 return (GI->is(Intrinsic::amdgcn_sbfe));
561
562 return MI.getOpcode() == AMDGPU::G_SBFX;
563}
564
565bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
566 Register Dst = MI.getOperand(0).getReg();
567 assert(MRI.getType(Dst) == LLT::scalar(64));
568 bool Signed = isSignedBFE(MI);
569 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
570 // Extract bitfield from Src, LSBit is the least-significant bit for the
571 // extraction (field offset) and Width is size of bitfield.
572 Register Src = MI.getOperand(FirstOpnd).getReg();
573 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
574 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
575 // Comments are for signed bitfield extract, similar for unsigned. x is sign
576 // bit. s is sign, l is LSB and y are remaining bits of bitfield to extract.
577
578 // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
579 unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
580 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
581
582 auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);
583
584 // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
585 // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
586 // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
587 if (!ConstWidth) {
588 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
589 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
590 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
591 MI.eraseFromParent();
592 return true;
593 }
594
595 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
596 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
597 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
598 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
599 auto Zero = B.buildConstant({VgprRB, S32}, 0);
600 unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
601
602 if (WidthImm <= 32) {
603 // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
604 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
605 MachineInstrBuilder Hi;
606 if (Signed) {
607 // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
608 Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
609 } else {
610 // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
611 Hi = Zero;
612 }
613 B.buildMergeLikeInstr(Dst, {Lo, Hi});
614 } else {
615 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
616 // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
617 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
618 B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
619 }
620
621 MI.eraseFromParent();
622 return true;
623}
624
625bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
626 Register DstReg = MI.getOperand(0).getReg();
627 LLT Ty = MRI.getType(DstReg);
628 bool Signed = isSignedBFE(MI);
629 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
630 Register Src = MI.getOperand(FirstOpnd).getReg();
631 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
632 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
633 // For uniform bit field extract there are 4 available instructions, but
634 // LSBit(field offset) and Width(size of bitfield) need to be packed in S32,
635 // field offset in low and size in high 16 bits.
636
637 // Src1 Hi16|Lo16 = Size|FieldOffset
638 auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
639 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
640 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
641 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
642 unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
643 unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
644 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
645
646 // Select machine instruction, because of reg class constraining, insert
647 // copies from reg class to reg bank.
648 auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
649 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
650 constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
651 *ST.getRegisterInfo(), RBI);
652
653 B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
654 MI.eraseFromParent();
655 return true;
656}
657
658bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
659 Register Dst = MI.getOperand(0).getReg();
660 LLT DstTy = MRI.getType(Dst);
661 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
662 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
663 auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
664 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
665 unsigned Opc = MI.getOpcode();
666 auto Flags = MI.getFlags();
667 auto Lo =
668 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
669 auto Hi =
670 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
671 B.buildMergeLikeInstr(Dst, {Lo, Hi});
672 MI.eraseFromParent();
673 return true;
674}
675
676bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
677 Register Dst = MI.getOperand(0).getReg();
678 assert(MRI.getType(Dst) == S64);
679 auto Op1 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(1).getReg());
680 auto Op2 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(2).getReg());
681
682 // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to
683 // match GlobalISel with old regbankselect.
684 auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
685 auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
686 auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
687 auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
688 auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
689 auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
690
691 B.buildMergeLikeInstr(Dst, {Lo, Hi});
692 MI.eraseFromParent();
693 return true;
694}
695
696bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
697 Register Dst = MI.getOperand(0).getReg();
698 assert(MRI.getType(Dst) == V2S16);
699 unsigned Opc = MI.getOpcode();
700 unsigned NumOps = MI.getNumOperands();
701 auto Flags = MI.getFlags();
702
703 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
704
705 if (NumOps == 2) {
706 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
707 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
708 B.buildMergeLikeInstr(Dst, {Lo, Hi});
709 MI.eraseFromParent();
710 return true;
711 }
712
713 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());
714
715 if (NumOps == 3) {
716 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
717 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
718 B.buildMergeLikeInstr(Dst, {Lo, Hi});
719 MI.eraseFromParent();
720 return true;
721 }
722
723 assert(NumOps == 4);
724 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg());
725 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags);
726 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags);
727 B.buildMergeLikeInstr(Dst, {Lo, Hi});
728 MI.eraseFromParent();
729 return true;
730}
731
732bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
733 Register Dst0 = MI.getOperand(0).getReg();
734 Register Dst1 = MI.getOperand(1).getReg();
735 Register Src0 = MI.getOperand(2).getReg();
736 Register Src1 = MI.getOperand(3).getReg();
737 Register Src2 = MI.getOperand(4).getReg();
738
739 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
740
741 // Keep the multiplication on the SALU.
742 Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
743 Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
744 if (ST.hasScalarMulHiInsts()) {
745 B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
746 } else {
747 auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
748 auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
749 auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
750 buildReadAnyLane(B, DstHi, MulHi.getReg(0), RBI);
751 }
752
753 // Accumulate and produce the "carry-out" bit.
754
755 // The "carry-out" is defined as bit 64 of the result when computed as a
756 // big integer. For unsigned multiply-add, this matches the usual
757 // definition of carry-out.
758 if (mi_match(Src2, MRI, MIPatternMatch::m_ZeroInt())) {
759 // No accumulate: result is just the multiplication, carry is 0.
760 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
761 B.buildConstant(Dst1, 0);
762 } else {
763 // Accumulate: add Src2 to the multiplication result with carry chain.
764 Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
765 Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
766 B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
767
768 auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
769 auto AddHi =
770 B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
771 B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
772 B.buildCopy(Dst1, AddHi.getReg(1));
773 }
774
775 MI.eraseFromParent();
776 return true;
777}
778
779bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
780 Register Dst = MI.getOperand(0).getReg();
781 LLT DstTy = MRI.getType(Dst);
782 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
783 (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
784 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
785 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
786 auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
787 Register Cond = MI.getOperand(1).getReg();
788 auto Flags = MI.getFlags();
789 auto Lo =
790 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
791 auto Hi =
792 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);
793
794 B.buildMergeLikeInstr(Dst, {Lo, Hi});
795 MI.eraseFromParent();
796 return true;
797}
798
799bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
800 auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
801 int Amt = MI.getOperand(2).getImm();
802 Register Lo, Hi;
803 // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
804 if (Amt <= 32) {
805 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
806 if (Amt == 32) {
807 // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
808 Lo = Freeze.getReg(0);
809 } else {
810 // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
811 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
812 }
813
814 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
815 Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
816 } else {
817 // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
818 Lo = Op1.getReg(0);
819 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
820 }
821
822 B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
823 MI.eraseFromParent();
824 return true;
825}
826
827bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &MI) {
828 // Split 64-bit find-first-bit operations into 32-bit halves:
829 // (ffbh hi:lo) -> umin(ffbh(hi), uaddsat(ffbh(lo), 32))
830 // (ffbl hi:lo) -> umin(ffbl(lo), uaddsat(ffbl(hi), 32))
831 // (ctlz_zero_undef hi:lo) -> umin(ffbh(hi), add(ffbh(lo), 32))
832 // (cttz_zero_undef hi:lo) -> umin(ffbl(lo), add(ffbl(hi), 32))
833 unsigned Opc = MI.getOpcode();
834
835 // FFBH/FFBL return 0xFFFFFFFF on zero input, using uaddsat to avoid
836 // wrapping. CTLZ/CTTZ guarantee non-zero input (zero_undef), so plain add
837 // is fine.
838 unsigned FFBOpc;
839 unsigned AddOpc;
840 bool SearchFromMSB;
841 switch (Opc) {
842 case AMDGPU::G_AMDGPU_FFBH_U32:
843 FFBOpc = Opc;
844 AddOpc = AMDGPU::G_UADDSAT;
845 SearchFromMSB = true;
846 break;
847 case AMDGPU::G_AMDGPU_FFBL_B32:
848 FFBOpc = Opc;
849 AddOpc = AMDGPU::G_UADDSAT;
850 SearchFromMSB = false;
851 break;
852 case AMDGPU::G_CTLZ_ZERO_UNDEF:
853 FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
854 AddOpc = AMDGPU::G_ADD;
855 SearchFromMSB = true;
856 break;
857 case AMDGPU::G_CTTZ_ZERO_UNDEF:
858 FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
859 AddOpc = AMDGPU::G_ADD;
860 SearchFromMSB = false;
861 break;
862 default:
863 llvm_unreachable("unexpected opcode in lowerSplitBitCount64To32");
864 }
865
866 auto Unmerge = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
867 Register Lo = Unmerge.getReg(0);
868 Register Hi = Unmerge.getReg(1);
869
870 // MSB-first (FFBH/CTLZ) searches hi first; LSB-first (FFBL/CTTZ) searches
871 // lo first. The secondary half adds 32 to account for the primary half's
872 // width.
873 auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Hi : Lo});
874 auto Secondary =
875 B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Lo : Hi});
876
877 auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
878 {Secondary, B.buildConstant(VgprRB_S32, 32)});
879 B.buildUMin(MI.getOperand(0).getReg(), Primary, Adjusted);
880
881 MI.eraseFromParent();
882 return true;
883}
884
885bool RegBankLegalizeHelper::lowerExtrVecEltToSel(MachineInstr &MI) {
886 // Lower extract vector element to a compare-select chain:
887 // result = elt[0]
888 // for i in 1..N-1:
889 // result = (idx == i) ? elt[i] : result
890 //
891 // When the index is divergent, each lane may want a different element, so
892 // we must check every element per lane.
893 Register Dst = MI.getOperand(0).getReg();
894 Register Src = MI.getOperand(1).getReg();
895 Register Idx = MI.getOperand(2).getReg();
896
897 LLT VecTy = MRI.getType(Src);
898 LLT ScalarTy = VecTy.getScalarType();
899 unsigned NumElts = VecTy.getNumElements();
900 MachineRegisterInfo::VRegAttrs VgprRB_EltTy = {VgprRB, ScalarTy};
901
902 auto Unmerge = B.buildUnmerge(VgprRB_EltTy, Src);
903
904 if (ScalarTy.getSizeInBits() == 32) {
905 Register PrevSelect = Unmerge.getReg(0);
906 for (unsigned I = 1; I < NumElts; ++I) {
907 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)}, I);
908 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
909 PrevSelect =
910 B.buildSelect(VgprRB_EltTy, Cmp, Unmerge.getReg(I), PrevSelect)
911 .getReg(0);
912 }
913 B.buildCopy(Dst, PrevSelect);
914 } else if (ScalarTy.getSizeInBits() == 64) {
915 auto InitUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(0));
916 Register PrevLo = InitUnmerge.getReg(0);
917 Register PrevHi = InitUnmerge.getReg(1);
918 for (unsigned I = 1; I < NumElts; ++I) {
919 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)}, I);
920 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
921 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(I));
922 PrevLo = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(0), PrevLo)
923 .getReg(0);
924 PrevHi = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(1), PrevHi)
925 .getReg(0);
926 }
927 B.buildMergeLikeInstr(Dst, {PrevLo, PrevHi});
928 } else {
930 MF, MORE, "amdgpu-regbanklegalize",
931 "AMDGPU RegBankLegalize: ExtrVecEltToSel unsupported element type", MI);
932 return false;
933 }
934
935 MI.eraseFromParent();
936 return true;
937}
938
939bool RegBankLegalizeHelper::lowerExtrVecEltTo32(MachineInstr &MI) {
940 // Reduce a 64-bit element extract to two 32-bit extracts:
941 // vec32 = bitcast <N x s64> to <2N x s32>
942 // lo = vec32[idx * 2]
943 // hi = vec32[idx * 2 + 1]
944 // result = merge(lo, hi)
945 //
946 // When the index is uniform, all lanes extract the same element, so we can
947 // just split the s64 extract into two s32 extracts which lower to MOVREL.
948 Register Dst = MI.getOperand(0).getReg();
949 Register Src = MI.getOperand(1).getReg();
950 Register Idx = MI.getOperand(2).getReg();
951
952 LLT SrcTy = MRI.getType(Src);
953 LLT Vec32Ty = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
954
955 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
956 "expected VGPR src and SGPR idx");
957
958 auto CastSrc = B.buildBitcast({VgprRB, Vec32Ty}, Src);
959
960 // Calculate new Lo and Hi indices
961 auto One = B.buildConstant(SgprRB_S32, 1);
962 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
963 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
964
965 auto ExtLo = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxLo);
966 auto ExtHi = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxHi);
967
968 B.buildMergeLikeInstr(Dst, {ExtLo.getReg(0), ExtHi.getReg(0)});
969
970 MI.eraseFromParent();
971 return true;
972}
973
974bool RegBankLegalizeHelper::lowerInsVecEltToSel(MachineInstr &MI) {
975 // Lower insert vector element to a compare-select chain:
976 // for i in 0..N-1:
977 // result[i] = (idx == i) ? elt : srcVec[i]
978 // dst = merge(result[0..N-1])
979 //
980 // VGPR B64 requires splitting to lo/hi s32 pairs since there is no
981 // v_cndmask_b64. SGPR B64/B32 and VGPR B32 can be handled natively.
982 Register Dst = MI.getOperand(0).getReg();
983 Register Src = MI.getOperand(1).getReg();
984 Register Elt = MI.getOperand(2).getReg();
985 Register Idx = MI.getOperand(3).getReg();
986
987 LLT VecTy = MRI.getType(Src);
988 LLT ScalarTy = VecTy.getScalarType();
989 unsigned NumElts = VecTy.getNumElements();
990 const RegisterBank *SrcRB = MRI.getRegBank(Src);
991 bool IsSGPR = (SrcRB == SgprRB);
992 SmallVector<Register, 16> Selects;
993
994 if (!IsSGPR && ScalarTy.getSizeInBits() == 64) {
995 // VGPR B64: split to 32-bit lo/hi since there is no v_cndmask_b64.
996 auto Unmerge = B.buildUnmerge(VgprRB_S32, Src);
997 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
998 Register EltLo = EltUnmerge.getReg(0);
999 Register EltHi = EltUnmerge.getReg(1);
1000 for (unsigned I = 0; I < NumElts; ++I) {
1001 auto IdxConst = B.buildConstant(VgprRB_S32, I);
1002 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
1003 Selects.push_back(
1004 B.buildSelect(VgprRB_S32, Cmp, EltLo, Unmerge.getReg(2 * I))
1005 .getReg(0));
1006 Selects.push_back(
1007 B.buildSelect(VgprRB_S32, Cmp, EltHi, Unmerge.getReg(2 * I + 1))
1008 .getReg(0));
1009 }
1010 LLT Vec32Ty = LLT::fixed_vector(2 * NumElts, 32);
1011 auto Vec32 = B.buildBuildVector({VgprRB, Vec32Ty}, Selects);
1012 B.buildBitcast(Dst, Vec32);
1013 } else if (ScalarTy.getSizeInBits() == 32 || ScalarTy.getSizeInBits() == 64) {
1014 // B32 (any bank) and SGPR B64: element-wise select at native width.
1015 MachineRegisterInfo::VRegAttrs SrcRB_EltTy = {SrcRB, ScalarTy};
1016 MachineRegisterInfo::VRegAttrs CmpTy = IsSGPR ? SgprRB_S32 : VccRB_S1;
1017 auto Unmerge = B.buildUnmerge(SrcRB_EltTy, Src);
1018 for (unsigned I = 0; I < NumElts; ++I) {
1019 auto IdxConst = B.buildConstant(SgprRB_S32, I);
1020 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CmpTy, Idx, IdxConst);
1021 Selects.push_back(
1022 B.buildSelect(SrcRB_EltTy, Cmp, Elt, Unmerge.getReg(I)).getReg(0));
1023 }
1024 B.buildMergeLikeInstr(Dst, Selects);
1025 } else {
1027 MF, MORE, "amdgpu-regbanklegalize",
1028 "AMDGPU RegBankLegalize: InsVecEltToSel unsupported element type", MI);
1029 return false;
1030 }
1031
1032 MI.eraseFromParent();
1033 return true;
1034}
1035
1036bool RegBankLegalizeHelper::lowerInsVecEltTo32(MachineInstr &MI) {
1037 // Reduce a 64-bit element insert to two 32-bit inserts:
1038 // vec32 = bitcast <N x s64> to <2N x s32>
1039 // lo, hi = unmerge elt
1040 // vec32[idx * 2] = lo
1041 // vec32[idx * 2 + 1] = hi
1042 // dst = bitcast <2N x s32> to <N x s64>
1043 //
1044 // When the index is uniform, all lanes insert at the same position, so we
1045 // can split the s64 insert into two s32 inserts which lower to MOVREL/GPRIDX.
1046 Register Dst = MI.getOperand(0).getReg();
1047 Register Src = MI.getOperand(1).getReg();
1048 Register Elt = MI.getOperand(2).getReg();
1049 Register Idx = MI.getOperand(3).getReg();
1050
1051 LLT SrcTy = MRI.getType(Src);
1052 LLT Vec32Ty = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
1053
1054 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1055 "expected VGPR src and SGPR idx");
1056
1057 MachineRegisterInfo::VRegAttrs VgprRB_Vec32Ty = {VgprRB, Vec32Ty};
1058
1059 auto CastSrc = B.buildBitcast(VgprRB_Vec32Ty, Src);
1060 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1061
1062 // Calculate new Lo and Hi indices
1063 auto One = B.buildConstant(SgprRB_S32, 1);
1064 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
1065 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
1066
1067 auto InsLo = B.buildInsertVectorElement(VgprRB_Vec32Ty, CastSrc,
1068 EltUnmerge.getReg(0), IdxLo);
1069 auto InsHi = B.buildInsertVectorElement(VgprRB_Vec32Ty, InsLo,
1070 EltUnmerge.getReg(1), IdxHi);
1071
1072 B.buildBitcast(Dst, InsHi);
1073
1074 MI.eraseFromParent();
1075 return true;
1076}
1077
1078bool RegBankLegalizeHelper::lowerAbsToNegMax(MachineInstr &MI) {
1079 // Lower divergent G_ABS to smax(x, 0 - x) in the VGPR bank:
1080 // zero = 0
1081 // neg = G_SUB zero, x
1082 // dst = G_SMAX x, neg
1083 //
1084 // There is no integer v_abs instruction on AMDGPU, so divergent G_ABS is
1085 // expanded to this sub/smax pair.
1086 Register DstReg = MI.getOperand(0).getReg();
1087 Register SrcReg = MI.getOperand(1).getReg();
1088 LLT Ty = MRI.getType(DstReg);
1089
1090 Register Zero;
1091 if (Ty == V2S16) {
1092 // buildConstant cannot produce a V2S16 directly; pack two S16 zeros.
1093 Register Zero16 = B.buildConstant({VgprRB, S16}, 0).getReg(0);
1094 Zero = B.buildBuildVector({VgprRB, Ty}, {Zero16, Zero16}).getReg(0);
1095 } else {
1096 assert((Ty == S32 || Ty == S16) && "unexpected type for AbsToNegMax");
1097 Zero = B.buildConstant({VgprRB, Ty}, 0).getReg(0);
1098 }
1099
1100 auto Neg = B.buildSub({VgprRB, Ty}, Zero, SrcReg);
1101 B.buildSMax(DstReg, SrcReg, Neg);
1102 MI.eraseFromParent();
1103 return true;
1104}
1105
1106bool RegBankLegalizeHelper::lowerAbsToS32(MachineInstr &MI) {
1107 // Lower uniform V2S16 abs by unpacking the values to two separate SGPR
1108 // registers and re-emitting G_ABS on each:
1109 // packed = bitcast <2 x s16> src to s32
1110 // lo = sext_inreg packed, 16
1111 // hi = ashr packed, 16
1112 // dst = build_vector_trunc G_ABS(lo), G_ABS(hi)
1113 //
1114 // SALU only has s_abs_i32, with no direct uniform V2S16 abs. The
1115 // re-emitted G_ABS(SgprRB, S32) selects to s_abs_i32 on each value.
1116 auto Bitcast = B.buildBitcast({SgprRB_S32}, MI.getOperand(1).getReg());
1117 auto SextInReg = B.buildSExtInReg({SgprRB_S32}, Bitcast, 16);
1118 auto ShiftHi =
1119 B.buildAShr({SgprRB_S32}, Bitcast, B.buildConstant({SgprRB_S32}, 16));
1120
1121 auto AbsLo = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {SextInReg});
1122 auto AbsHi = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {ShiftHi});
1123 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
1124 {AbsLo.getReg(0), AbsHi.getReg(0)});
1125
1126 MI.eraseFromParent();
1127 return true;
1128}
1129
bool RegBankLegalizeHelper::lower(MachineInstr &MI,
                                  const RegBankLLTMapping &Mapping,
                                  WaterfallInfo &WFI) {

  // Dispatch on the lowering method the matched rule recorded. Most methods
  // delegate to a dedicated lower* helper; simple ones are expanded inline.
  // Helpers return false only after reporting a GISel failure.
  switch (Mapping.LoweringMethod) {
  case DoNotLower:
    break;
  case VccExtToSel:
    return lowerVccExtToSel(MI);
  case UniExtToSel: {
    // Uniform s1 {s,z}ext: select between -1/1 and 0 on the SGPR bank.
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    auto True = B.buildConstant({SgprRB, Ty},
                                MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
    auto False = B.buildConstant({SgprRB, Ty}, 0);
    // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is compare.
    // We are making select here. S1 cond was already 'any-extended to S32' +
    // 'AND with 1 to clean high bits' by Sgpr32AExtBoolInReg.
    B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
                  False);
    MI.eraseFromParent();
    return true;
  }
  case UnpackBitShift:
    return lowerUnpackBitShift(MI);
  case UnpackMinMax:
    return lowerUnpackMinMax(MI);
  case ScalarizeToS16:
    return lowerSplitTo16(MI);
  case Ext32To64: {
    // Widen a 32-bit extend to 64 bits by synthesizing the hi half.
    const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
    MachineInstrBuilder Hi;
    switch (MI.getOpcode()) {
    case AMDGPU::G_ZEXT: {
      Hi = B.buildConstant({RB, S32}, 0);
      break;
    }
    case AMDGPU::G_SEXT: {
      // Replicate sign bit from 32-bit extended part.
      auto ShiftAmt = B.buildConstant({RB, S32}, 31);
      Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
      break;
    }
    case AMDGPU::G_ANYEXT: {
      Hi = B.buildUndef({RB, S32});
      break;
    }
    default:
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
                         MI);
      return false;
    }

    B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
                          {MI.getOperand(1).getReg(), Hi});
    MI.eraseFromParent();
    return true;
  }
  case UniCstExt: {
    // Re-materialize the constant at the destination's (wider) type.
    uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
    B.buildConstant(MI.getOperand(0).getReg(), ConstVal);

    MI.eraseFromParent();
    return true;
  }
  case VgprToVccCopy: {
    Register Src = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(Src);
    // Take lowest bit from each lane and put it in lane mask.
    // Lowering via compare, but we need to clean high bits first as compare
    // compares all bits in register.
    Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
    if (Ty == S64) {
      auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
      auto One = B.buildConstant(VgprRB_S32, 1);
      auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
      auto Zero = B.buildConstant(VgprRB_S32, 0);
      auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
      B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
    } else {
      assert(Ty == S32 || Ty == S16);
      auto One = B.buildConstant({VgprRB, Ty}, 1);
      B.buildAnd(BoolSrc, Src, One);
    }
    auto Zero = B.buildConstant({VgprRB, Ty}, 0);
    B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
    MI.eraseFromParent();
    return true;
  }
  case V_BFE:
    return lowerV_BFE(MI);
  case S_BFE:
    return lowerS_BFE(MI);
  case UniMAD64:
    return lowerUniMAD64(MI);
  case UniMul64: {
    B.buildMul(MI.getOperand(0), MI.getOperand(1), MI.getOperand(2));
    MI.eraseFromParent();
    return true;
  }
  case DivSMulToMAD: {
    // Divergent scalar mul64 -> mad64_32 with a zero addend; the extra SGPR
    // S32 def is the carry-out, which is unused here.
    auto Op1 = B.buildTrunc(VgprRB_S32, MI.getOperand(1));
    auto Op2 = B.buildTrunc(VgprRB_S32, MI.getOperand(2));
    auto Zero = B.buildConstant({VgprRB, S64}, 0);

    unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
                          ? AMDGPU::G_AMDGPU_MAD_U64_U32
                          : AMDGPU::G_AMDGPU_MAD_I64_I32;

    B.buildInstr(NewOpc, {MI.getOperand(0).getReg(), {SgprRB, S32}},
                 {Op1, Op2, Zero});
    MI.eraseFromParent();
    return true;
  }
  case SplitTo32:
    return lowerSplitTo32(MI);
  case SplitTo32Mul:
    return lowerSplitTo32Mul(MI);
  case SplitTo32Select:
    return lowerSplitTo32Select(MI);
  case SplitTo32SExtInReg:
    return lowerSplitTo32SExtInReg(MI);
  case SplitLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = DstTy.getSizeInBits();
    // Even split to 128-bit loads
    if (Size > 128) {
      LLT B128;
      if (DstTy.isVector()) {
        LLT EltTy = DstTy.getElementType();
        B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
      } else {
        B128 = LLT::scalar(128);
      }
      if (Size / 128 == 2)
        splitLoad(MI, {B128, B128});
      else if (Size / 128 == 4)
        splitLoad(MI, {B128, B128, B128, B128});
      else {
        reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                           "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
                           MI);
        return false;
      }
    }
    // 64 and 32 bit load
    else if (DstTy == S96)
      splitLoad(MI, {S64, S32}, S32);
    else if (DstTy == V3S32)
      splitLoad(MI, {V2S32, S32}, S32);
    else if (DstTy == V6S16)
      splitLoad(MI, {V4S16, V2S16}, V2S16);
    else {
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
                         MI);
      return false;
    }
    return true;
  }
  case WidenLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == S96)
      widenLoad(MI, S128);
    else if (DstTy == V3S32)
      widenLoad(MI, V4S32, S32);
    else if (DstTy == V6S16)
      widenLoad(MI, V8S16, V2S16);
    else {
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
                         MI);
      return false;
    }
    return true;
  }
  case UnpackAExt:
    return lowerUnpackAExt(MI);
  case WidenMMOToS32:
    return widenMMOToS32(cast<GAnyLoad>(MI));
  case VerifyAllSgpr: {
    // Assert-only check: every register operand is already on the SGPR bank.
    assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
      return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
    }));
    return true;
  }
  case ApplyAllVgpr: {
    // Defs must already be VGPR; copy any non-VGPR use into a VGPR.
    assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) {
      return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
    }));
    B.setInstrAndDebugLoc(MI);
    for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) {
      MachineOperand &Op = MI.getOperand(i);
      if (!Op.isReg())
        continue;
      Register Reg = Op.getReg();
      if (MRI.getRegBank(Reg) != VgprRB) {
        auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
        Op.setReg(Copy.getReg(0));
      }
    }
    return true;
  }
  case UnmergeToShiftTrunc: {
    // Unmerge of sub-32-bit pieces via V2S16 unpack (shift) + trunc.
    GUnmerge *Unmerge = dyn_cast<GUnmerge>(&MI);
    LLT Ty = MRI.getType(Unmerge->getSourceReg());
    if (Ty.getSizeInBits() % 32 != 0) {
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: unmerge not multiple of 32",
                         MI);
      return false;
    }

    B.setInstrAndDebugLoc(MI);
    if (Ty.getSizeInBits() > 32) {
      auto UnmergeV2S16 =
          B.buildUnmerge({SgprRB, V2S16}, Unmerge->getSourceReg());
      for (unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
        auto [Dst0S32, Dst1S32] =
            unpackAExt(UnmergeV2S16->getOperand(i).getReg());
        B.buildTrunc(MI.getOperand(i * 2).getReg(), Dst0S32);
        B.buildTrunc(MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
      }
    } else {
      auto [Dst0S32, Dst1S32] = unpackAExt(MI.getOperand(2).getReg());
      B.buildTrunc(MI.getOperand(0).getReg(), Dst0S32);
      B.buildTrunc(MI.getOperand(1).getReg(), Dst1S32);
    }

    MI.eraseFromParent();
    return true;
  }
    // NOTE(review): a `case` label appears to have been lost from this
    // listing here — verify against the upstream file. The body widens a
    // sub-32-bit PHI: the PHI def is replaced by an S32 register (truncated
    // back after the PHIs), and each incoming value is any-extended to S32
    // right after its def.
    Register Dst = MI.getOperand(0).getReg();
    Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
    B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());
    MI.getOperand(0).setReg(NewDst);
    B.buildTrunc(Dst, NewDst);

    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      Register UseReg = MI.getOperand(i).getReg();

      auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
      MachineBasicBlock *DefMBB = DefMI->getParent();

      B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));

      auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
      MI.getOperand(i).setReg(NewUse.getReg(0));
    }
    break;
  }
  case VerifyAllSgprGPHI: {
    // Assert-only check for G_PHI: all non-MBB operands are SGPR.
    assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
      if (Op.isMBB())
        return true;
      return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
    }));
    return true;
  }
    // NOTE(review): a `case` label appears to have been lost here as well —
    // verify against upstream. Assert-only check: the PHI def is VGPR and
    // incoming values are on either the VGPR or SGPR bank.
    assert(MRI.getRegBankOrNull(MI.getOperand(0).getReg()) == VgprRB);
    assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
      if (Op.isMBB())
        return true;
      const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
      return RB == VgprRB || RB == SgprRB;
    }));
    return true;
  }
  case ApplyINTRIN_IMAGE: {
    const AMDGPU::RsrcIntrinsic *RSrcIntrin =
    // NOTE(review): the initializer line (an AMDGPU rsrc-intrinsic lookup)
    // was lost from this listing — verify against upstream.
    assert(RSrcIntrin && RSrcIntrin->IsImage);
    // The reported argument index is relative to the IR intrinsic call
    // arguments, so shift by the number of defs and the intrinsic ID.
    unsigned RsrcIdx = RSrcIntrin->RsrcArg + MI.getNumExplicitDefs() + 1;
    return applyRegisterBanksVgprWithSgprRsrc(MI, RsrcIdx);
  }
    // NOTE(review): a `case` label (BVH intersect-ray handling) appears to
    // have been lost here — verify against upstream.
    // Rsrc is the last register operand. Base BVH trails an A16 immediate
    // after rsrc; dual/BVH8 do not. Scan backwards for the last virtual
    // register.
    unsigned RsrcIdx = MI.getNumOperands();
    while (RsrcIdx-- > MI.getNumExplicitDefs()) {
      const MachineOperand &Op = MI.getOperand(RsrcIdx);
      if (Op.isReg() && Op.getReg().isVirtual())
        break;
    }
    return applyRegisterBanksVgprWithSgprRsrc(MI, RsrcIdx);
  }
    // NOTE(review): a `case` label for the 64->32 bit-count split appears to
    // have been lost here — verify against upstream.
    return lowerSplitBitCount64To32(MI);
  case ExtrVecEltToSel:
    return lowerExtrVecEltToSel(MI);
  case ExtrVecEltTo32:
    return lowerExtrVecEltTo32(MI);
  case InsVecEltToSel:
    return lowerInsVecEltToSel(MI);
  case InsVecEltTo32:
    return lowerInsVecEltTo32(MI);
  case AbsToNegMax:
    return lowerAbsToNegMax(MI);
  case AbsToS32:
    return lowerAbsToS32(MI);
  }

  // If the rule collected SGPR operands that are actually divergent, wrap MI
  // in a waterfall loop over those registers.
  if (!WFI.SgprWaterfallOperandRegs.empty()) {
    if (!executeInWaterfallLoop(B, WFI))
      return false;
  }
  return true;
}
1443
LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
  // Map a fixed-type mapping ID to its LLT. Size-only (B-type) IDs are
  // handled by getBTyFromID instead; unknown IDs yield the invalid LLT().
  switch (ID) {
  case Vcc:
  case UniInVcc:
    return LLT::scalar(1);
  case Sgpr16:
  case Vgpr16:
  case UniInVgprS16:
    return LLT::scalar(16);
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  // NOTE(review): a `case` label appears to have been lost from this listing
  // here — verify against the upstream file.
  case Sgpr32SExt:
  case Sgpr32ZExt:
  case UniInVgprS32:
  case Sgpr32ToVgprDst:
  case Vgpr32:
  case Vgpr32AExt:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return LLT::scalar(32);
  case Sgpr64:
  case Vgpr64:
  case UniInVgprS64:
  case Sgpr64ToVgprDst:
    return LLT::scalar(64);
  case Sgpr128:
  case Vgpr128:
    return LLT::scalar(128);
  case SgprP0:
  case SgprP0Call_WF:
  case VgprP0:
    return LLT::pointer(0, 64);
  case SgprP1:
  case VgprP1:
    return LLT::pointer(1, 64);
  case SgprP2:
  case VgprP2:
    return LLT::pointer(2, 32);
  case SgprP3:
  case VgprP3:
    return LLT::pointer(3, 32);
  case SgprP4:
  case SgprP4Call_WF:
  case VgprP4:
    return LLT::pointer(4, 64);
  case SgprP5:
  case VgprP5:
    return LLT::pointer(5, 32);
  case SgprP8:
    return LLT::pointer(8, 128);
  case SgprV2S16:
  case VgprV2S16:
  case UniInVgprV2S16:
    return LLT::fixed_vector(2, 16);
  case SgprV2S32:
  case VgprV2S32:
  case UniInVgprV2S32:
    return LLT::fixed_vector(2, 32);
  case VgprV3S32:
    return LLT::fixed_vector(3, 32);
  case VgprV4S16:
    return LLT::fixed_vector(4, 16);
  case SgprV4S32:
  case SgprV4S32_WF:
  // NOTE(review): a `case` label appears to have been lost from this listing
  // here — verify against the upstream file.
  case VgprV4S32:
  case UniInVgprV4S32:
    return LLT::fixed_vector(4, 32);
  case VgprV8S32:
    return LLT::fixed_vector(8, 32);
  case VgprV2S64:
  case UniInVgprV2S64:
    return LLT::fixed_vector(2, 64);
  case VgprV6S32:
    return LLT::fixed_vector(6, 32);
  case VgprV32S16:
    return LLT::fixed_vector(32, 16);
  case VgprV32S32:
    return LLT::fixed_vector(32, 32);
  default:
    return LLT();
  }
}
1530
LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
  // Validate Ty against a size-only (B-type) mapping ID: return Ty when it is
  // one of the layouts accepted for that bit width, otherwise the invalid
  // LLT().
  switch (ID) {
  case SgprB32:
  case VgprB32:
  case SgprB32_M0:
  // NOTE(review): a `case` label appears to have been lost from this listing
  // here — verify against the upstream file.
  case UniInVgprB32:
    if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
        isAnyPtr(Ty, 32))
      return Ty;
    return LLT();
  case SgprPtr32:
  case VgprPtr32:
    return isAnyPtr(Ty, 32) ? Ty : LLT();
  case SgprPtr64:
  case VgprPtr64:
    return isAnyPtr(Ty, 64) ? Ty : LLT();
  case SgprPtr128:
  case VgprPtr128:
    return isAnyPtr(Ty, 128) ? Ty : LLT();
  case SgprB64:
  case VgprB64:
  // NOTE(review): a `case` label appears to have been lost from this listing
  // here — verify against the upstream file.
  case UniInVgprB64:
    if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
        Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
      return Ty;
    return LLT();
  case SgprB96:
  case VgprB96:
  case UniInVgprB96:
    if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
        Ty == LLT::fixed_vector(6, 16))
      return Ty;
    return LLT();
  case SgprB128:
  case VgprB128:
  case UniInVgprB128:
    if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
        Ty == LLT::fixed_vector(2, 64) || Ty == LLT::fixed_vector(8, 16) ||
        isAnyPtr(Ty, 128))
      return Ty;
    return LLT();
  case VgprB160:
  case UniInVgprB160:
    if (Ty.getSizeInBits() == 160)
      return Ty;
    return LLT();
  case SgprB256:
  case VgprB256:
  case UniInVgprB256:
    if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
        Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
      return Ty;
    return LLT();
  case SgprB512:
  case VgprB512:
  case UniInVgprB512:
    if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
        Ty == LLT::fixed_vector(8, 64))
      return Ty;
    return LLT();
  case SgprBRC: {
    // Accept any type whose width has a matching SGPR register class.
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    unsigned LLTSize = Ty.getSizeInBits();
    if (LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize))
      return Ty;
    return LLT();
  }
  case VgprBRC: {
    // Uses the SGPR class query as a width check for the VGPR case too.
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->getSGPRClassForBitWidth(Ty.getSizeInBits()))
      return Ty;
    return LLT();
  }
  default:
    return LLT();
  }
}
1612
const RegisterBank *
RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
  // Map a mapping ID to the register bank it implies (VCC, SGPR, AGPR or
  // VGPR). Unknown IDs return nullptr.
  switch (ID) {
  case Vcc:
    return VccRB;
  case Sgpr16:
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr64:
  case Sgpr128:
  case SgprP0:
  case SgprP0Call_WF:
  case SgprP1:
  case SgprP2:
  case SgprP3:
  case SgprP4:
  case SgprP4Call_WF:
  case SgprP5:
  case SgprP8:
  case SgprPtr32:
  case SgprPtr64:
  case SgprPtr128:
  case SgprV2S16:
  case SgprV2S32:
  case SgprV4S32:
  case SgprV4S32_WF:
  // NOTE(review): a `case` label appears to have been lost from this listing
  // here — verify against the upstream file.
  case SgprB32:
  case SgprB64:
  case SgprB96:
  case SgprB128:
  case SgprB256:
  case SgprB512:
  case SgprBRC:
  case UniInVcc:
  case UniInVgprS16:
  case UniInVgprS32:
  case UniInVgprS64:
  case UniInVgprV2S16:
  case UniInVgprV2S32:
  case UniInVgprV4S32:
  case UniInVgprV2S64:
  case UniInVgprB32:
  case UniInVgprB64:
  case UniInVgprB96:
  case UniInVgprB128:
  case UniInVgprB160:
  case UniInVgprB256:
  case UniInVgprB512:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  // NOTE(review): a `case` label appears to have been lost from this listing
  // here — verify against the upstream file.
  case Sgpr32SExt:
  case Sgpr32ZExt:
    return SgprRB;
  case AgprAnyTy:
    return AgprRB;
  case Vgpr16:
  case Vgpr32:
  case Vgpr64:
  case Vgpr128:
  case VgprP0:
  case VgprP1:
  case VgprP2:
  case VgprP3:
  case VgprP4:
  case VgprP5:
  case VgprPtr32:
  case VgprPtr64:
  case VgprPtr128:
  case VgprV2S16:
  case VgprV2S32:
  case VgprV2S64:
  case VgprV3S32:
  case VgprV4S16:
  case VgprV4S32:
  case VgprV6S32:
  case VgprV8S32:
  case VgprV32S16:
  case VgprB32:
  case VgprB64:
  case VgprB96:
  case VgprB128:
  case VgprB160:
  case VgprB256:
  case VgprB512:
  case VgprBRC:
  case VgprAnyTy:
  case Vgpr32AExt:
  case Vgpr32SExt:
  case Vgpr32ZExt:
  // Sgpr*ToVgprDst: SGPR-defined value whose destination lives in a VGPR.
  case Sgpr32ToVgprDst:
  case Sgpr64ToVgprDst:
    return VgprRB;
  default:
    return nullptr;
  }
}
1711
1712bool RegBankLegalizeHelper::applyMappingDst(
1713 MachineInstr &MI, unsigned &OpIdx,
1714 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
1715 // Defs start from operand 0
1716 for (; OpIdx < MethodIDs.size(); ++OpIdx) {
1717 if (MethodIDs[OpIdx] == None)
1718 continue;
1719 MachineOperand &Op = MI.getOperand(OpIdx);
1720 Register Reg = Op.getReg();
1721 LLT Ty = MRI.getType(Reg);
1722 [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);
1723
1724 switch (MethodIDs[OpIdx]) {
1725 // vcc, sgpr and vgpr scalars, pointers and vectors
1726 case Vcc:
1727 case Sgpr16:
1728 case Sgpr32:
1729 case Sgpr64:
1730 case Sgpr128:
1731 case SgprP0:
1732 case SgprP1:
1733 case SgprP3:
1734 case SgprP4:
1735 case SgprP5:
1736 case SgprP8:
1737 case SgprV2S16:
1738 case SgprV2S32:
1739 case SgprV4S32:
1740 case Vgpr16:
1741 case Vgpr32:
1742 case Vgpr64:
1743 case Vgpr128:
1744 case VgprP0:
1745 case VgprP1:
1746 case VgprP2:
1747 case VgprP3:
1748 case VgprP4:
1749 case VgprP5:
1750 case VgprV2S16:
1751 case VgprV2S32:
1752 case VgprV2S64:
1753 case VgprV3S32:
1754 case VgprV4S16:
1755 case VgprV4S32:
1756 case VgprV6S32:
1757 case VgprV8S32:
1758 case VgprV32S16: {
1759 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1760 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1761 break;
1762 }
1763 // sgpr and vgpr B-types
1764 case SgprB32:
1765 case SgprB64:
1766 case SgprB96:
1767 case SgprB128:
1768 case SgprB256:
1769 case SgprB512:
1770 case SgprBRC:
1771 case SgprPtr32:
1772 case SgprPtr64:
1773 case SgprPtr128:
1774 case VgprB32:
1775 case VgprB64:
1776 case VgprB96:
1777 case VgprB128:
1778 case VgprB160:
1779 case VgprB256:
1780 case VgprB512:
1781 case VgprBRC:
1782 case VgprPtr32:
1783 case VgprPtr64:
1784 case VgprPtr128: {
1785 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1786 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1787 break;
1788 }
1789 case VgprAnyTy: {
1790 assert(RB == VgprRB);
1791 break;
1792 }
1793 case AgprAnyTy: {
1794 if (RB == AgprRB)
1795 break;
1796 Register NewAgprDst = MRI.createVirtualRegister({AgprRB, Ty});
1797 Op.setReg(NewAgprDst);
1798 if (!MRI.use_nodbg_empty(Reg))
1799 B.buildCopy(Reg, NewAgprDst);
1800 break;
1801 }
1802 // uniform in vcc/vgpr: scalars, vectors and B-types
1803 case UniInVcc: {
1804 assert(Ty == S1);
1805 assert(RB == SgprRB);
1806 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
1807 Op.setReg(NewDst);
1808 if (!MRI.use_empty(Reg)) {
1809 auto CopyS32_Vcc =
1810 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
1811 B.buildTrunc(Reg, CopyS32_Vcc);
1812 }
1813 break;
1814 }
1815 case UniInVgprS16: {
1816 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1817 assert(RB == SgprRB);
1818 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
1819 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
1820 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
1821 Op.setReg(NewVgprDstS16);
1822 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
1823 buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
1824 B.buildTrunc(Reg, NewSgprDstS32);
1825 break;
1826 }
1827 case UniInVgprS32:
1828 case UniInVgprS64:
1829 case UniInVgprV2S16:
1830 case UniInVgprV2S32:
1831 case UniInVgprV4S32:
1832 case UniInVgprV2S64: {
1833 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1834 assert(RB == SgprRB);
1835 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1836 Op.setReg(NewVgprDst);
1837 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1838 break;
1839 }
1840 case UniInVgprB32:
1841 case UniInVgprB64:
1842 case UniInVgprB96:
1843 case UniInVgprB128:
1844 case UniInVgprB160:
1845 case UniInVgprB256:
1846 case UniInVgprB512: {
1847 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1848 assert(RB == SgprRB);
1849 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1850 Op.setReg(NewVgprDst);
1851 AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1852 break;
1853 }
1854 // sgpr trunc
1855 case Sgpr32Trunc: {
1856 assert(Ty.getSizeInBits() < 32);
1857 assert(RB == SgprRB);
1858 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1859 Op.setReg(NewDst);
1860 if (!MRI.use_empty(Reg))
1861 B.buildTrunc(Reg, NewDst);
1862 break;
1863 }
1864 case Sgpr32ToVgprDst:
1865 case Sgpr64ToVgprDst: {
1866 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1867 assert(RB == VgprRB);
1868 Op.setReg(MRI.createVirtualRegister({SgprRB, Ty}));
1869 B.buildCopy(Reg, Op.getReg());
1870 break;
1871 }
1872 case InvalidMapping: {
1874 MF, MORE, "amdgpu-regbanklegalize",
1875 "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
1876 return false;
1877 }
1878 default:
1880 MF, MORE, "amdgpu-regbanklegalize",
1881 "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
1882 return false;
1883 }
1884 }
1885
1886 return true;
1887}
1888
// Legalize the source (use) operands of MI, one MethodIDs entry per operand,
// to the register bank / LLT each ID requires. OpIdx is passed by reference
// and advanced past every processed operand so the caller can resume after
// the defs already handled separately. Operands that need a waterfall loop
// are not rewritten here; they are recorded in WFI (register set plus the
// instruction range to waterfall). Returns false, after reporting a GISel
// failure, when an unsupported mapping ID is encountered.
bool RegBankLegalizeHelper::applyMappingSrc(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
    WaterfallInfo &WFI) {
  for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
    // Non-register operands (intrinsic IDs, immediates) need no legalization.
    if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
      continue;

    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[i]) {
    case Vcc: {
      assert(Ty == S1);
      assert(RB == VccRB || RB == SgprRB);
      // An sgpr S1 (SCC-style bool) source must become a VCC lane mask:
      // any-extend to S32, then convert via G_AMDGPU_COPY_VCC_SCC.
      if (RB == SgprRB) {
        auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
        auto CopyVcc_Scc =
            B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
        Op.setReg(CopyVcc_Scc.getReg(0));
      }
      break;
    }
    // sgpr scalars, pointers and vectors: the operand must already be in the
    // sgpr bank with the exact type — assert only, no rewrite.
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP0:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprP8:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // sgpr B-types (size-only "bag of bits" IDs): assert only, no rewrite.
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprBRC:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // vgpr scalars, pointers and vectors: an sgpr source is legal here, it is
    // moved into vgpr with a plain copy.
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP2:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV2S64:
    case VgprV3S32:
    case VgprV4S16:
    case VgprV4S32:
    case VgprV6S32:
    case VgprV8S32:
    case VgprV32S16:
    case VgprV32S32: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // vgpr B-types: same copy-to-vgpr treatment, type checked by size only.
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB160:
    case VgprB256:
    case VgprB512:
    case VgprBRC:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // Any type accepted; only the bank is forced to vgpr.
    case VgprAnyTy: {
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // Any type accepted; only the bank is forced to agpr.
    case AgprAnyTy: {
      if (RB != AgprRB) {
        auto CopyToAgpr = B.buildCopy({AgprRB, Ty}, Reg);
        Op.setReg(CopyToAgpr.getReg(0));
      }
      break;
    }
    // sgpr waterfall, scalars, and vectors: a non-sgpr operand is recorded in
    // WFI; the waterfall range is initialized to just MI on first use.
    case Sgpr32_WF:
    case SgprV4S32_WF: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != SgprRB) {
        WFI.SgprWaterfallOperandRegs.insert(Reg);
        if (!WFI.Start.isValid()) {
          WFI.Start = MI.getIterator();
          WFI.End = std::next(MI.getIterator());
        }
      }
      break;
    }
    // Waterfall for a call target: the waterfall range must cover the entire
    // call sequence, from ADJCALLSTACKUP through ADJCALLSTACKDOWN.
    case SgprP0Call_WF:
    case SgprP4Call_WF: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != SgprRB) {
        WFI.SgprWaterfallOperandRegs.insert(Reg);

        // Find the ADJCALLSTACKUP before the call.
        MachineBasicBlock::iterator Start = MI.getIterator();
        while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
          --Start;

        // Find the ADJCALLSTACKDOWN after the call (include it in range).
        MachineBasicBlock::iterator End = MI.getIterator();
        while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
          ++End;
        ++End;

        B.setInsertPt(*MI.getParent(), Start);
        WFI.Start = Start;
        WFI.End = End;
      }
      break;
    }
    case SgprB32_M0:
    // NOTE(review): the extraction this chunk came from dropped one line here
    // (original line 2045) — most likely another case label; verify against
    // upstream before building.
    case SgprB64_ReadFirstLane: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      if (RB == SgprRB)
        break;
      // A vgpr source is made uniform by reading lane 0 into a fresh sgpr.
      assert(RB == VgprRB);
      Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
      buildReadFirstLane(B, NewSGPR, Op.getReg(), RBI);
      Op.setReg(NewSGPR);
      break;
    }
    // NOTE(review): a case label with its opening '{' is missing here
    // (original line 2056 was dropped in extraction); the statements below
    // are orphaned until it is restored — verify against upstream.
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB == SgprRB)
        break;
      assert(RB == VgprRB);
      Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
      buildReadFirstLane(B, NewSGPR, Op.getReg(), RBI);
      Op.setReg(NewSGPR);
      break;
    }
    // sgpr and vgpr scalars with extend
    case Sgpr32AExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      Op.setReg(Aext.getReg(0));
      break;
    }
    case Sgpr32AExtBoolInReg: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() == 1);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      // Zext SgprS1 is not legal, make AND with 1 instead. This instruction is
      // most of times meant to be combined away in AMDGPURegBankCombiner.
      auto Cst1 = B.buildConstant(SgprRB_S32, 1);
      auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
      Op.setReg(BoolInReg.getReg(0));
      break;
    }
    case Sgpr32SExt: {
      // S1 is excluded here: sign-extending an sgpr bool is not wanted.
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Sext = B.buildSExt(SgprRB_S32, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    case Sgpr32ZExt: {
      // S1 is excluded here: zext of sgpr S1 is handled by Sgpr32AExtBoolInReg.
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Zext = B.buildZExt({SgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    case Vgpr32AExt: {
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Aext = B.buildAnyExt({VgprRB, S32}, Reg);
      Op.setReg(Aext.getReg(0));
      break;
    }
    case Vgpr32SExt: {
      // Note this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Sext = B.buildSExt({VgprRB, S32}, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    case Vgpr32ZExt: {
      // Note this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Zext = B.buildZExt({VgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    default:
    // NOTE(review): the 'reportGISelFailure(' call line (original line 2125)
    // was dropped in extraction; restore it so the arguments below have a
    // callee — verify against upstream.
          MF, MORE, "amdgpu-regbanklegalize",
          "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
      return false;
    }
  }
  return true;
}
2133
2134[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
2135 const RegisterBank *RB,
2137 unsigned StartOpIdx,
2138 unsigned EndOpIdx) {
2139 for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
2140 if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
2141 return false;
2142 }
2143 return true;
2144}
2145
2146bool RegBankLegalizeHelper::applyRegisterBanksVgprWithSgprRsrc(
2147 MachineInstr &MI, unsigned RsrcIdx) {
2148 const unsigned NumDefs = MI.getNumExplicitDefs();
2149
2150 MachineBasicBlock *MBB = MI.getParent();
2151 B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(MI.getIterator())));
2152
2153 // Defs are vgpr.
2154 for (unsigned i = 0; i < NumDefs; ++i) {
2155 Register Reg = MI.getOperand(i).getReg();
2156 if (MRI.getRegBank(Reg) == VgprRB)
2157 continue;
2158
2159 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(Reg)});
2160 MI.getOperand(i).setReg(NewVgprDst);
2161 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
2162 }
2163
2164 B.setInstrAndDebugLoc(MI);
2165
2166 // Register uses before RsrcIdx are vgpr.
2167 for (unsigned i = NumDefs; i < RsrcIdx; ++i) {
2168 MachineOperand &Op = MI.getOperand(i);
2169 if (!Op.isReg())
2170 continue;
2171
2172 Register Reg = Op.getReg();
2173 if (!Reg.isVirtual())
2174 continue;
2175
2176 if (MRI.getRegBank(Reg) == VgprRB)
2177 continue;
2178
2179 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
2180 Op.setReg(Copy.getReg(0));
2181 }
2182
2183 SmallSet<Register, 4> OpsToWaterfall;
2184
2185 // Register use RsrcIdx (and later register operands) is sgpr.
2186 for (unsigned i = RsrcIdx; i < MI.getNumOperands(); ++i) {
2187 MachineOperand &Op = MI.getOperand(i);
2188 if (!Op.isReg())
2189 continue;
2190
2191 Register Reg = Op.getReg();
2192 if (MRI.getRegBank(Reg) != SgprRB)
2193 OpsToWaterfall.insert(Reg);
2194 }
2195
2196 if (!OpsToWaterfall.empty()) {
2197 MachineBasicBlock::iterator MII = MI.getIterator();
2198 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
2199 }
2200
2201 return true;
2202}
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
Provides AMDGPU specific target descriptions.
static bool isSignedBFE(MachineInstr &MI)
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB, MachineRegisterInfo &MRI, unsigned StartOpIdx, unsigned EndOpIdx)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
static Register UseReg(const MachineOperand &MO)
IRTranslator LLVM IR MI
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register Reg
Register const TargetRegisterInfo * TRI
Machine IR instance of the generic uniformity analysis.
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
const SmallVectorImpl< MachineOperand > & Cond
static const LaneMaskConstants & get(const GCNSubtarget &ST)
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
@ ICMP_NE
not equal
Definition InstrTypes.h:698
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
const SIRegisterInfo * getRegisterInfo() const override
Represents a call to an intrinsic.
Register getSourceReg() const
Get the unmerge source register.
constexpr bool isScalar() const
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
TypeSize getValue() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator SkipPHIsAndLabels(iterator I)
Return the first instruction in MBB after I that is not a PHI or a label.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
BasicBlockListType::iterator iterator
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Helper class to build MachineInstr.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank * getRegBank(Register Reg) const
Return the register bank of Reg.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
Holds all the information related to register banks.
This class implements the register bank concept.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
bool empty() const
Definition SmallSet.h:169
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isAnyPtr(LLT Ty, unsigned Width)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Bitcast
Perform the operation on a different, but equivalently sized type.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
@ Offset
Definition DWP.cpp:557
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition Utils.cpp:155
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI void reportGISelFailure(MachineFunction &MF, MachineOptimizationRemarkEmitter &MORE, MachineOptimizationRemarkMissed &R)
Report an ISel error as a missed optimization remark to the LLVMContext's diagnostic stream.
Definition Utils.cpp:257
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:432
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
Holds waterfall loop information: the set of SGPR operand registers that need waterfalling,...
MachineBasicBlock::iterator Start
SmallSet< Register, 4 > SgprWaterfallOperandRegs