LLVM 23.0.0git
AMDGPURegBankLegalizeHelper.cpp
Go to the documentation of this file.
1//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Implements actual lowering algorithms for each ID that can be used in
10/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
11//
12//===----------------------------------------------------------------------===//
13
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
20#include "GCNSubtarget.h"
27#include "llvm/IR/IntrinsicsAMDGPU.h"
28
29#define DEBUG_TYPE "amdgpu-regbanklegalize"
30
31using namespace llvm;
32using namespace AMDGPU;
33
// NOTE(review): the line that opens this constructor is missing from this
// listing; only the end of the parameter list and the initializers remain.
36 const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
// The machine function, its GCN subtarget and the register info are all
// derived from the builder B; MUI, RBI and RBLRules are stored as passed in.
37 : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B),
38 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
39 RBLRules(RBLRules), IsWave32(ST.isWave32()),
// Cache pointers to the SGPR, VGPR, AGPR and VCC register banks.
40 SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
41 VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
42 AgprRB(&RBI.getRegBank(AMDGPU::AGPRRegBankID)),
43 VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
44
// NOTE(review): the signature line of this function is missing from this
// listing. The body looks up the rule set for MI's opcode, selects the mapping
// that matches MI, applies that mapping to the defs and then to the uses, and
// finally lowers MI. It returns false as soon as any step fails; the two
// lookup failures are reported through reportGISelFailure first.
46 const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
47 if (!RuleSet) {
48 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
49 "No AMDGPU RegBankLegalize rules defined for opcode",
50 MI);
51 return false;
52 }
53
54 const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
55 if (!Mapping) {
56 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
57 "AMDGPU RegBankLegalize: none of the rules defined with "
58 "'Any' for MI's opcode matched MI",
59 MI);
60 return false;
61 }
62
// WFI is handed to applyMappingSrc and then to lower(). OpIdx is passed to
// both mapping steps (presumably advanced by the def step so the use step
// continues after the defs -- confirm against applyMappingDst).
63 WaterfallInfo WFI;
64 unsigned OpIdx = 0;
// Defs: anything built while mapping them is inserted right after MI.
65 if (!Mapping->DstOpMapping.empty()) {
66 B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
67 if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping))
68 return false;
69 }
// Uses: anything built while mapping them is inserted at MI itself.
70 if (!Mapping->SrcOpMapping.empty()) {
71 B.setInstr(MI);
72 if (!applyMappingSrc(MI, OpIdx, Mapping->SrcOpMapping, WFI))
73 return false;
74 }
75
76 if (!lower(MI, *Mapping, WFI))
77 return false;
78
79 return true;
80}
81
// Wraps an instruction range in a loop that runs it once per distinct value of
// the registers listed in WFI.SgprWaterfallOperandRegs. Each iteration reads
// the value held by the first active lane, limits EXEC to the lanes holding
// that same value, runs the range with the uniform value, and then removes
// those lanes from EXEC (see the block diagram further down). The original
// EXEC is saved before the loop and restored after it. On return the builder's
// insert point is the start of the remainder block. Always returns true.
// NOTE(review): several declarations used below (BeginIt, EndIt, TRI, LMC,
// LoopBB, BodyBB, MBBI, OpParts) are missing from this listing.
82bool RegBankLegalizeHelper::executeInWaterfallLoop(MachineIRBuilder &B,
83 const WaterfallInfo &WFI) {
84 assert(WFI.Start.isValid() && WFI.End.isValid() &&
85 "Waterfall range not initialized");
86
87 // Track use registers which have already been expanded with a readfirstlane
88 // sequence. This may have multiple uses if moving a sequence.
89 DenseMap<Register, Register> WaterfalledRegMap;
90
91 MachineBasicBlock &MBB = B.getMBB();
92 MachineFunction &MF = B.getMF();
93
96
98 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
100
// The range size is only recorded to back the assert after the splice.
101#ifndef NDEBUG
102 const int OrigRangeSize = std::distance(BeginIt, EndIt);
103#endif
104
105 MachineRegisterInfo &MRI = *B.getMRI();
106 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
107 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
108
109 // Don't bother using generic instructions/registers for the exec mask.
110 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
111
112 Register SavedExec = MRI.createVirtualRegister(WaveRC);
113
114 // To insert the loop we need to split the block. Move everything before
115 // this point to a new block, and insert a new empty block before this
116 // instruction.
119 MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
120 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
122 ++MBBI;
123 MF.insert(MBBI, LoopBB);
124 MF.insert(MBBI, BodyBB);
125 MF.insert(MBBI, RestoreExecBB);
126 MF.insert(MBBI, RemainderBB);
127
// CFG edges: LoopBB -> BodyBB, and BodyBB either exits to RestoreExecBB or
// branches back to LoopBB.
128 LoopBB->addSuccessor(BodyBB);
129 BodyBB->addSuccessor(RestoreExecBB);
130 BodyBB->addSuccessor(LoopBB);
131
132 // Move the rest of the block into a new block.
134 RemainderBB->splice(RemainderBB->begin(), &MBB, EndIt, MBB.end());
135
136 MBB.addSuccessor(LoopBB);
137 RestoreExecBB->addSuccessor(RemainderBB);
138
139 B.setInsertPt(*LoopBB, LoopBB->end());
140
141 // +-MBB:------------+
142 // | ... |
143 // | %0 = G_INST_1 |
144 // | %Dst = MI %Vgpr |
145 // | %1 = G_INST_2 |
146 // | ... |
147 // +-----------------+
148 // ->
149 // +-MBB-------------------------------+
150 // | ... |
151 // | %0 = G_INST_1 |
152 // | %SaveExecReg = S_MOV_B32 $exec_lo |
153 // +----------------|------------------+
154 // | /------------------------------|
155 // V V |
156 // +-LoopBB---------------------------------------------------------------+ |
157 // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
158 // | instead of executing for each lane, see if other lanes had | |
159 // | same value for %Vgpr and execute for them also. | |
160 // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
161 // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
162 // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
163 // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
164 // +----------------|-----------------------------------------------------+ |
165 // V |
166 // +-BodyBB------------------------------------------------------------+ |
167 // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
168 // | executed only for active lanes and written to Dst | |
169 // | $exec = S_XOR_B32 $exec, %SavedExec | |
170 // | set active lanes to 0 in SavedExec, lanes that did not write to | |
171 // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
172 // | SI_WATERFALL_LOOP LoopBB |-----|
173 // +----------------|--------------------------------------------------+
174 // V
175 // +-RestoreExecBB--------------------------+
176 // | $exec_lo = S_MOV_B32_term %SaveExecReg |
177 // +----------------|-----------------------+
178 // V
179 // +-RemainderBB:----------------------+
180 // | %1 = G_INST_2 |
181 // | ... |
182 // +---------------------------------- +
183
184 // Move the instruction into the loop body. Note we moved everything after
185 // Range.end() already into a new block, so Range.end() is no longer valid.
186 BodyBB->splice(BodyBB->end(), &MBB, BeginIt, MBB.end());
187
188 // Figure out the iterator range after splicing the instructions.
189 MachineBasicBlock::iterator NewBegin = BeginIt;
190 auto NewEnd = BodyBB->end();
191 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
192
// The builder now emits into LoopBB. CondReg accumulates the comparison
// results of every waterfall operand processed below.
193 B.setMBB(*LoopBB);
194 Register CondReg;
195
// Walk the moved range and replace each use of a waterfall operand with the
// uniform per-iteration value; uses of other registers are left alone.
196 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
197 for (MachineOperand &Op : MI.all_uses()) {
198 Register OldReg = Op.getReg();
199 if (!WFI.SgprWaterfallOperandRegs.count(OldReg))
200 continue;
201
202 // See if we already processed this register in another instruction in
203 // the sequence.
204 auto OldVal = WaterfalledRegMap.find(OldReg);
205 if (OldVal != WaterfalledRegMap.end()) {
206 Op.setReg(OldVal->second);
207 continue;
208 }
209
210 Register OpReg = Op.getReg();
211 LLT OpTy = MRI.getType(OpReg);
212
213 // TODO: support for agpr
214 assert(MRI.getRegBank(OpReg) == VgprRB);
215 Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
216 buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
217
218 // Build the comparison(s), CurrentLaneReg == OpReg.
// Compare in 64-bit parts when the size is a multiple of 64 bits, otherwise
// in 32-bit parts.
219 unsigned OpSize = OpTy.getSizeInBits();
220 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
221 LLT PartTy = LLT::scalar(PartSize);
222 unsigned NumParts = OpSize / PartSize;
224 SmallVector<Register, 8> CurrentLaneParts;
225
226 if (NumParts == 1) {
227 OpParts.push_back(OpReg);
228 CurrentLaneParts.push_back(CurrentLaneReg);
229 } else {
230 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
231 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
232 for (unsigned i = 0; i < NumParts; ++i) {
233 OpParts.push_back(UnmergeOp.getReg(i));
234 CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
235 }
236 }
237
// One equality compare per part; the first result seeds CondReg and every
// later one is AND-ed into it.
238 for (unsigned i = 0; i < NumParts; ++i) {
239 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
240 B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
241
242 if (!CondReg)
243 CondReg = CmpReg;
244 else
245 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
246 }
247
248 Op.setReg(CurrentLaneReg);
249
250 // Make sure we don't re-process this register again.
251 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
252 }
253 }
254
255 // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
256 Register CondRegLM =
257 MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
258 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
259
260 // Update EXEC, save the original EXEC value to SavedExec.
261 B.buildInstr(LMC.AndSaveExecOpc)
262 .addDef(SavedExec)
263 .addReg(CondRegLM, RegState::Kill);
264 MRI.setSimpleHint(SavedExec, CondRegLM);
265
// The remaining loop-control instructions go after the moved range in BodyBB.
266 B.setInsertPt(*BodyBB, BodyBB->end());
267
268 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
269 B.buildInstr(LMC.XorTermOpc)
270 .addDef(LMC.ExecReg)
271 .addReg(LMC.ExecReg)
272 .addReg(SavedExec);
273
274 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
275 // s_cbranch_scc0?
276
277 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
278 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
279
280 // Save the EXEC mask before the loop.
281 B.setInsertPt(MBB, MBB.end());
282 B.buildInstr(LMC.MovOpc).addDef(SaveExecReg).addReg(LMC.ExecReg);
283
284 // Restore the EXEC mask after the loop.
285 B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
286 B.buildInstr(LMC.MovTermOpc).addDef(LMC.ExecReg).addReg(SaveExecReg);
287
288 // Set the insert point after the original instruction, so any new
289 // instructions will be in the remainder.
290 B.setInsertPt(*RemainderBB, RemainderBB->begin());
291
292 return true;
293}
294
295bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
296 ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
297 MachineFunction &MF = B.getMF();
298 assert(MI.getNumMemOperands() == 1);
299 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
300 Register Dst = MI.getOperand(0).getReg();
301 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
302 Register Base = MI.getOperand(1).getReg();
303 LLT PtrTy = MRI.getType(Base);
304 const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
305 LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
306 SmallVector<Register, 4> LoadPartRegs;
307
308 unsigned ByteOffset = 0;
309 for (LLT PartTy : LLTBreakdown) {
310 Register BasePlusOffset;
311 if (ByteOffset == 0) {
312 BasePlusOffset = Base;
313 } else {
314 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
315 BasePlusOffset =
316 B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
317 }
318 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
319 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
320 LoadPartRegs.push_back(LoadPart.getReg(0));
321 ByteOffset += PartTy.getSizeInBytes();
322 }
323
324 if (!MergeTy.isValid()) {
325 // Loads are of same size, concat or merge them together.
326 B.buildMergeLikeInstr(Dst, LoadPartRegs);
327 } else {
328 // Loads are not all of same size, need to unmerge them to smaller pieces
329 // of MergeTy type, then merge pieces to Dst.
330 SmallVector<Register, 4> MergeTyParts;
331 for (Register Reg : LoadPartRegs) {
332 if (MRI.getType(Reg) == MergeTy) {
333 MergeTyParts.push_back(Reg);
334 } else {
335 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
336 for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
337 MergeTyParts.push_back(Unmerge.getReg(i));
338 }
339 }
340 B.buildMergeLikeInstr(Dst, MergeTyParts);
341 }
342 MI.eraseFromParent();
343 return true;
344}
345
346bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
347 LLT MergeTy) {
348 MachineFunction &MF = B.getMF();
349 assert(MI.getNumMemOperands() == 1);
350 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
351 Register Dst = MI.getOperand(0).getReg();
352 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
353 Register Base = MI.getOperand(1).getReg();
354
355 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
356 auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);
357
358 if (WideTy.isScalar()) {
359 B.buildTrunc(Dst, WideLoad);
360 } else {
361 SmallVector<Register, 4> MergeTyParts;
362 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
363
364 LLT DstTy = MRI.getType(Dst);
365 unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
366 for (unsigned i = 0; i < NumElts; ++i) {
367 MergeTyParts.push_back(Unmerge.getReg(i));
368 }
369 B.buildMergeLikeInstr(Dst, MergeTyParts);
370 }
371 MI.eraseFromParent();
372 return true;
373}
374
375bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
376 Register Dst = MI.getDstReg();
377 Register Ptr = MI.getPointerReg();
378 MachineMemOperand &MMO = MI.getMMO();
379 unsigned MemSize = 8 * MMO.getSize().getValue();
380
381 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
382
383 if (MI.getOpcode() == G_LOAD) {
384 B.buildLoad(Dst, Ptr, *WideMMO);
385 } else {
386 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
387
388 if (MI.getOpcode() == G_ZEXTLOAD) {
389 APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
390 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
391 B.buildAnd(Dst, Load, MaskCst);
392 } else {
393 assert(MI.getOpcode() == G_SEXTLOAD);
394 B.buildSExtInReg(Dst, Load, MemSize);
395 }
396 }
397
398 MI.eraseFromParent();
399 return true;
400}
401
// Lowers an extension whose source (operand 1) is used as a select condition
// into a select between two constants: the "true" constant is -1 for G_SEXT
// and 1 for every other opcode, the "false" constant is 0. S16 and S32
// destinations use a single select; an S64 destination is built from a 32-bit
// select for the low half plus a separately chosen high half. MI is erased on
// success. Returns false for unsupported destination types, and for S64 also
// for opcodes other than G_SEXT, G_ZEXT and G_ANYEXT.
// NOTE(review): the two lines that open the failure-report calls are missing
// from this listing.
402bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
403 Register Dst = MI.getOperand(0).getReg();
404 LLT Ty = MRI.getType(Dst);
405 Register Src = MI.getOperand(1).getReg();
406 unsigned Opc = MI.getOpcode();
407 int TrueExtCst = Opc == G_SEXT ? -1 : 1;
408 if (Ty == S32 || Ty == S16) {
409 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
410 auto False = B.buildConstant({VgprRB, Ty}, 0);
411 B.buildSelect(Dst, Src, True, False);
412 } else if (Ty == S64) {
413 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
414 auto False = B.buildConstant({VgprRB_S32}, 0);
415 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
// High half: a copy of the low half for sign-extension, the zero constant for
// zero-extension, and an undefined value for any-extension.
416 MachineInstrBuilder Hi;
417 switch (Opc) {
418 case G_SEXT:
419 Hi = Lo;
420 break;
421 case G_ZEXT:
422 Hi = False;
423 break;
424 case G_ANYEXT:
425 Hi = B.buildUndef({VgprRB_S32});
426 break;
427 default:
429 MF, MORE, "amdgpu-regbanklegalize",
430 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
431 return false;
432 }
433
434 B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
435 } else {
437 MF, MORE, "amdgpu-regbanklegalize",
438 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
439 return false;
440 }
441
442 MI.eraseFromParent();
443 return true;
444}
445
446std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
447 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
448 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
449 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
450 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
451 return {Lo.getReg(0), Hi.getReg(0)};
452}
453
454std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
455 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
456 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
457 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
458 return {Lo.getReg(0), Hi.getReg(0)};
459}
460
461std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
462 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
463 auto Lo = PackedS32;
464 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
465 return {Lo.getReg(0), Hi.getReg(0)};
466}
467
468std::pair<Register, Register>
469RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
470 auto [Lo32, Hi32] = unpackAExt(Reg);
471 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
472 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
473}
474
// Lowers a shift by splitting the value (operand 1) and the shift amount
// (operand 2) into low/high halves held in 32-bit SGPR values, shifting each
// half separately, and rebuilding the destination with a truncating
// build-vector. The unpack flavour follows the opcode: any-extend for G_SHL,
// zero-extend for G_LSHR and sign-extend for G_ASHR. MI is erased on success.
// Returns false for any other opcode.
// NOTE(review): the line that opens the failure-report call in the default
// case is missing from this listing.
475bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
476 Register Lo, Hi;
477 switch (MI.getOpcode()) {
478 case AMDGPU::G_SHL: {
479 auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
480 auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
481 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
482 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
483 break;
484 }
485 case AMDGPU::G_LSHR: {
486 auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
487 auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
488 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
489 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
490 break;
491 }
492 case AMDGPU::G_ASHR: {
493 auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
494 auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
495 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
496 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
497 break;
498 }
499 default:
501 MF, MORE, "amdgpu-regbanklegalize",
502 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
503 MI);
504 return false;
505 }
// Pack the two 32-bit results back into the destination.
506 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
507 MI.eraseFromParent();
508 return true;
509}
510
// Lowers a min/max by splitting both inputs (operands 1 and 2) into low/high
// halves held in 32-bit SGPR values, applying the same opcode to each pair of
// halves, and rebuilding the destination with a truncating build-vector.
// Signed opcodes unpack with sign extension, unsigned ones with zero
// extension. MI is erased on success. Returns false for any other opcode.
// NOTE(review): the line that opens the failure-report call in the default
// case is missing from this listing.
511bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
512 Register Lo, Hi;
513 switch (MI.getOpcode()) {
514 case AMDGPU::G_SMIN:
515 case AMDGPU::G_SMAX: {
516 // For signed operations, use sign extension
517 auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
518 auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
519 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
520 .getReg(0);
521 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
522 .getReg(0);
523 break;
524 }
525 case AMDGPU::G_UMIN:
526 case AMDGPU::G_UMAX: {
527 // For unsigned operations, use zero extension
528 auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
529 auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
530 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
531 .getReg(0);
532 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
533 .getReg(0);
534 break;
535 }
536 default:
538 MF, MORE, "amdgpu-regbanklegalize",
539 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
540 return false;
541 }
// Pack the two 32-bit results back into the destination.
542 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
543 MI.eraseFromParent();
544 return true;
545}
546
547bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
548 auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
549 auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
550 auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
551 auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
552 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
553 {ResLo.getReg(0), ResHi.getReg(0)});
554 MI.eraseFromParent();
555 return true;
556}
557
// NOTE(review): the opening lines of this function (its signature and the
// statement that introduces GI) are missing from this listing.
// With GI in hand the answer is whether it is the amdgcn_sbfe intrinsic
// (presumably this return is guarded by a check that MI is an intrinsic --
// confirm against the full source).
560 return (GI->is(Intrinsic::amdgcn_sbfe));
561
// Otherwise the answer is whether MI is a G_SBFX.
562 return MI.getOpcode() == AMDGPU::G_SBFX;
563}
564
565bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
566 Register Dst = MI.getOperand(0).getReg();
567 assert(MRI.getType(Dst) == LLT::scalar(64));
568 bool Signed = isSignedBFE(MI);
569 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
570 // Extract bitfield from Src, LSBit is the least-significant bit for the
571 // extraction (field offset) and Width is size of bitfield.
572 Register Src = MI.getOperand(FirstOpnd).getReg();
573 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
574 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
575 // Comments are for signed bitfield extract, similar for unsigned. x is sign
576 // bit. s is sign, l is LSB and y are remaining bits of bitfield to extract.
577
578 // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
579 unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
580 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
581
582 auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);
583
584 // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
585 // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
586 // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
587 if (!ConstWidth) {
588 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
589 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
590 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
591 MI.eraseFromParent();
592 return true;
593 }
594
595 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
596 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
597 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
598 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
599 auto Zero = B.buildConstant({VgprRB, S32}, 0);
600 unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
601
602 if (WidthImm <= 32) {
603 // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
604 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
605 MachineInstrBuilder Hi;
606 if (Signed) {
607 // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
608 Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
609 } else {
610 // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
611 Hi = Zero;
612 }
613 B.buildMergeLikeInstr(Dst, {Lo, Hi});
614 } else {
615 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
616 // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
617 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
618 B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
619 }
620
621 MI.eraseFromParent();
622 return true;
623}
624
625bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
626 Register DstReg = MI.getOperand(0).getReg();
627 LLT Ty = MRI.getType(DstReg);
628 bool Signed = isSignedBFE(MI);
629 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
630 Register Src = MI.getOperand(FirstOpnd).getReg();
631 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
632 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
633 // For uniform bit field extract there are 4 available instructions, but
634 // LSBit(field offset) and Width(size of bitfield) need to be packed in S32,
635 // field offset in low and size in high 16 bits.
636
637 // Src1 Hi16|Lo16 = Size|FieldOffset
638 auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
639 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
640 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
641 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
642 unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
643 unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
644 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
645
646 // Select machine instruction, because of reg class constraining, insert
647 // copies from reg class to reg bank.
648 auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
649 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
650 constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
651 *ST.getRegisterInfo(), RBI);
652
653 B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
654 MI.eraseFromParent();
655 return true;
656}
657
658bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
659 Register Dst = MI.getOperand(0).getReg();
660 LLT DstTy = MRI.getType(Dst);
661 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
662 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
663 auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
664 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
665 unsigned Opc = MI.getOpcode();
666 auto Flags = MI.getFlags();
667 auto Lo =
668 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
669 auto Hi =
670 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
671 B.buildMergeLikeInstr(Dst, {Lo, Hi});
672 MI.eraseFromParent();
673 return true;
674}
675
676bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
677 Register Dst = MI.getOperand(0).getReg();
678 assert(MRI.getType(Dst) == S64);
679 auto Op1 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(1).getReg());
680 auto Op2 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(2).getReg());
681
682 // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to
683 // match GlobalISel with old regbankselect.
684 auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
685 auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
686 auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
687 auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
688 auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
689 auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
690
691 B.buildMergeLikeInstr(Dst, {Lo, Hi});
692 MI.eraseFromParent();
693 return true;
694}
695
696bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
697 Register Dst = MI.getOperand(0).getReg();
698 assert(MRI.getType(Dst) == V2S16);
699 unsigned Opc = MI.getOpcode();
700 unsigned NumOps = MI.getNumOperands();
701 auto Flags = MI.getFlags();
702
703 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
704
705 if (NumOps == 2) {
706 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
707 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
708 B.buildMergeLikeInstr(Dst, {Lo, Hi});
709 MI.eraseFromParent();
710 return true;
711 }
712
713 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());
714
715 if (NumOps == 3) {
716 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
717 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
718 B.buildMergeLikeInstr(Dst, {Lo, Hi});
719 MI.eraseFromParent();
720 return true;
721 }
722
723 assert(NumOps == 4);
724 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg());
725 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags);
726 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags);
727 B.buildMergeLikeInstr(Dst, {Lo, Hi});
728 MI.eraseFromParent();
729 return true;
730}
731
732bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
733 Register Dst0 = MI.getOperand(0).getReg();
734 Register Dst1 = MI.getOperand(1).getReg();
735 Register Src0 = MI.getOperand(2).getReg();
736 Register Src1 = MI.getOperand(3).getReg();
737 Register Src2 = MI.getOperand(4).getReg();
738
739 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
740
741 // Keep the multiplication on the SALU.
742 Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
743 Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
744 if (ST.hasScalarMulHiInsts()) {
745 B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
746 } else {
747 auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
748 auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
749 auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
750 buildReadAnyLane(B, DstHi, MulHi.getReg(0), RBI);
751 }
752
753 // Accumulate and produce the "carry-out" bit.
754
755 // The "carry-out" is defined as bit 64 of the result when computed as a
756 // big integer. For unsigned multiply-add, this matches the usual
757 // definition of carry-out.
758 if (mi_match(Src2, MRI, MIPatternMatch::m_ZeroInt())) {
759 // No accumulate: result is just the multiplication, carry is 0.
760 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
761 B.buildConstant(Dst1, 0);
762 } else {
763 // Accumulate: add Src2 to the multiplication result with carry chain.
764 Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
765 Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
766 B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
767
768 auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
769 auto AddHi =
770 B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
771 B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
772 B.buildCopy(Dst1, AddHi.getReg(1));
773 }
774
775 MI.eraseFromParent();
776 return true;
777}
778
779bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
780 Register Dst = MI.getOperand(0).getReg();
781 LLT DstTy = MRI.getType(Dst);
782 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
783 (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
784 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
785 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
786 auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
787 Register Cond = MI.getOperand(1).getReg();
788 auto Flags = MI.getFlags();
789 auto Lo =
790 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
791 auto Hi =
792 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);
793
794 B.buildMergeLikeInstr(Dst, {Lo, Hi});
795 MI.eraseFromParent();
796 return true;
797}
798
799bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
800 auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
801 int Amt = MI.getOperand(2).getImm();
802 Register Lo, Hi;
803 // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
804 if (Amt <= 32) {
805 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
806 if (Amt == 32) {
807 // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
808 Lo = Freeze.getReg(0);
809 } else {
810 // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
811 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
812 }
813
814 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
815 Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
816 } else {
817 // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
818 Lo = Op1.getReg(0);
819 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
820 }
821
822 B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
823 MI.eraseFromParent();
824 return true;
825}
826
827bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &MI) {
828 // Split 64-bit find-first-bit operations into 32-bit halves:
829 // (ffbh hi:lo) -> umin(ffbh(hi), uaddsat(ffbh(lo), 32))
830 // (ffbl hi:lo) -> umin(ffbl(lo), uaddsat(ffbl(hi), 32))
831 // (ctlz_zero_poison hi:lo) -> umin(ffbh(hi), add(ffbh(lo), 32))
832 // (cttz_zero_poison hi:lo) -> umin(ffbl(lo), add(ffbl(hi), 32))
833 unsigned Opc = MI.getOpcode();
834
835 // FFBH/FFBL return 0xFFFFFFFF on zero input, using uaddsat to avoid
836 // wrapping. CTLZ/CTTZ guarantee non-zero input (zero_poison), so plain add
837 // is fine.
838 unsigned FFBOpc;
839 unsigned AddOpc;
840 bool SearchFromMSB;
841 switch (Opc) {
842 case AMDGPU::G_AMDGPU_FFBH_U32:
843 FFBOpc = Opc;
844 AddOpc = AMDGPU::G_UADDSAT;
845 SearchFromMSB = true;
846 break;
847 case AMDGPU::G_AMDGPU_FFBL_B32:
848 FFBOpc = Opc;
849 AddOpc = AMDGPU::G_UADDSAT;
850 SearchFromMSB = false;
851 break;
852 case AMDGPU::G_CTLZ_ZERO_POISON:
853 FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
854 AddOpc = AMDGPU::G_ADD;
855 SearchFromMSB = true;
856 break;
857 case AMDGPU::G_CTTZ_ZERO_POISON:
858 FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
859 AddOpc = AMDGPU::G_ADD;
860 SearchFromMSB = false;
861 break;
862 default:
863 llvm_unreachable("unexpected opcode in lowerSplitBitCount64To32");
864 }
865
866 auto Unmerge = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
867 Register Lo = Unmerge.getReg(0);
868 Register Hi = Unmerge.getReg(1);
869
870 // MSB-first (FFBH/CTLZ) searches hi first; LSB-first (FFBL/CTTZ) searches
871 // lo first. The secondary half adds 32 to account for the primary half's
872 // width.
873 auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Hi : Lo});
874 auto Secondary =
875 B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Lo : Hi});
876
877 auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
878 {Secondary, B.buildConstant(VgprRB_S32, 32)});
879 B.buildUMin(MI.getOperand(0).getReg(), Primary, Adjusted);
880
881 MI.eraseFromParent();
882 return true;
883}
884
885bool RegBankLegalizeHelper::lowerExtrVecEltToSel(MachineInstr &MI) {
886 // Lower extract vector element to a compare-select chain:
887 // result = elt[0]
888 // for i in 1..N-1:
889 // result = (idx == i) ? elt[i] : result
890 //
891 // When the index is divergent, each lane may want a different element, so
892 // we must check every element per lane.
893 Register Dst = MI.getOperand(0).getReg();
894 Register Src = MI.getOperand(1).getReg();
895 Register Idx = MI.getOperand(2).getReg();
896
897 LLT VecTy = MRI.getType(Src);
898 LLT ScalarTy = VecTy.getScalarType();
899 unsigned NumElts = VecTy.getNumElements();
900 MachineRegisterInfo::VRegAttrs VgprRB_EltTy = {VgprRB, ScalarTy};
901
902 auto Unmerge = B.buildUnmerge(VgprRB_EltTy, Src);
903
904 if (ScalarTy.getSizeInBits() == 32) {
905 Register PrevSelect = Unmerge.getReg(0);
906 for (unsigned I = 1; I < NumElts; ++I) {
907 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)}, I);
908 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
909 PrevSelect =
910 B.buildSelect(VgprRB_EltTy, Cmp, Unmerge.getReg(I), PrevSelect)
911 .getReg(0);
912 }
913 B.buildCopy(Dst, PrevSelect);
914 } else if (ScalarTy.getSizeInBits() == 64) {
915 auto InitUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(0));
916 Register PrevLo = InitUnmerge.getReg(0);
917 Register PrevHi = InitUnmerge.getReg(1);
918 for (unsigned I = 1; I < NumElts; ++I) {
919 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)}, I);
920 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
921 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(I));
922 PrevLo = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(0), PrevLo)
923 .getReg(0);
924 PrevHi = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(1), PrevHi)
925 .getReg(0);
926 }
927 B.buildMergeLikeInstr(Dst, {PrevLo, PrevHi});
928 } else {
930 MF, MORE, "amdgpu-regbanklegalize",
931 "AMDGPU RegBankLegalize: ExtrVecEltToSel unsupported element type", MI);
932 return false;
933 }
934
935 MI.eraseFromParent();
936 return true;
937}
938
939bool RegBankLegalizeHelper::lowerExtrVecEltTo32(MachineInstr &MI) {
940 // Reduce a 64-bit element extract to two 32-bit extracts:
941 // vec32 = bitcast <N x s64> to <2N x s32>
942 // lo = vec32[idx * 2]
943 // hi = vec32[idx * 2 + 1]
944 // result = merge(lo, hi)
945 //
946 // When the index is uniform, all lanes extract the same element, so we can
947 // just split the s64 extract into two s32 extracts which lower to MOVREL.
948 Register Dst = MI.getOperand(0).getReg();
949 Register Src = MI.getOperand(1).getReg();
950 Register Idx = MI.getOperand(2).getReg();
951
952 LLT SrcTy = MRI.getType(Src);
953 LLT Vec32Ty = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
954
955 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
956 "expected VGPR src and SGPR idx");
957
958 auto CastSrc = B.buildBitcast({VgprRB, Vec32Ty}, Src);
959
960 // Calculate new Lo and Hi indices
961 auto One = B.buildConstant(SgprRB_S32, 1);
962 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
963 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
964
965 auto ExtLo = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxLo);
966 auto ExtHi = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxHi);
967
968 B.buildMergeLikeInstr(Dst, {ExtLo.getReg(0), ExtHi.getReg(0)});
969
970 MI.eraseFromParent();
971 return true;
972}
973
974bool RegBankLegalizeHelper::lowerInsVecEltToSel(MachineInstr &MI) {
975 // Lower insert vector element to a compare-select chain:
976 // for i in 0..N-1:
977 // result[i] = (idx == i) ? elt : srcVec[i]
978 // dst = merge(result[0..N-1])
979 //
980 // VGPR B64 requires splitting to lo/hi s32 pairs since there is no
981 // v_cndmask_b64. SGPR B64/B32 and VGPR B32 can be handled natively.
982 Register Dst = MI.getOperand(0).getReg();
983 Register Src = MI.getOperand(1).getReg();
984 Register Elt = MI.getOperand(2).getReg();
985 Register Idx = MI.getOperand(3).getReg();
986
987 LLT VecTy = MRI.getType(Src);
988 LLT ScalarTy = VecTy.getScalarType();
989 unsigned NumElts = VecTy.getNumElements();
990 const RegisterBank *SrcRB = MRI.getRegBank(Src);
991 bool IsSGPR = (SrcRB == SgprRB);
992 SmallVector<Register, 16> Selects;
993
994 if (!IsSGPR && ScalarTy.getSizeInBits() == 64) {
995 // VGPR B64: split to 32-bit lo/hi since there is no v_cndmask_b64.
996 auto Unmerge = B.buildUnmerge(VgprRB_S32, Src);
997 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
998 Register EltLo = EltUnmerge.getReg(0);
999 Register EltHi = EltUnmerge.getReg(1);
1000 for (unsigned I = 0; I < NumElts; ++I) {
1001 auto IdxConst = B.buildConstant(VgprRB_S32, I);
1002 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
1003 Selects.push_back(
1004 B.buildSelect(VgprRB_S32, Cmp, EltLo, Unmerge.getReg(2 * I))
1005 .getReg(0));
1006 Selects.push_back(
1007 B.buildSelect(VgprRB_S32, Cmp, EltHi, Unmerge.getReg(2 * I + 1))
1008 .getReg(0));
1009 }
1010 LLT Vec32Ty = LLT::fixed_vector(2 * NumElts, 32);
1011 auto Vec32 = B.buildBuildVector({VgprRB, Vec32Ty}, Selects);
1012 B.buildBitcast(Dst, Vec32);
1013 } else if (ScalarTy.getSizeInBits() == 32 || ScalarTy.getSizeInBits() == 64) {
1014 // B32 (any bank) and SGPR B64: element-wise select at native width.
1015 MachineRegisterInfo::VRegAttrs SrcRB_EltTy = {SrcRB, ScalarTy};
1016 MachineRegisterInfo::VRegAttrs CmpTy = IsSGPR ? SgprRB_S32 : VccRB_S1;
1017 auto Unmerge = B.buildUnmerge(SrcRB_EltTy, Src);
1018 for (unsigned I = 0; I < NumElts; ++I) {
1019 auto IdxConst = B.buildConstant(SgprRB_S32, I);
1020 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CmpTy, Idx, IdxConst);
1021 Selects.push_back(
1022 B.buildSelect(SrcRB_EltTy, Cmp, Elt, Unmerge.getReg(I)).getReg(0));
1023 }
1024 B.buildMergeLikeInstr(Dst, Selects);
1025 } else {
1027 MF, MORE, "amdgpu-regbanklegalize",
1028 "AMDGPU RegBankLegalize: InsVecEltToSel unsupported element type", MI);
1029 return false;
1030 }
1031
1032 MI.eraseFromParent();
1033 return true;
1034}
1035
1036bool RegBankLegalizeHelper::lowerInsVecEltTo32(MachineInstr &MI) {
1037 // Reduce a 64-bit element insert to two 32-bit inserts:
1038 // vec32 = bitcast <N x s64> to <2N x s32>
1039 // lo, hi = unmerge elt
1040 // vec32[idx * 2] = lo
1041 // vec32[idx * 2 + 1] = hi
1042 // dst = bitcast <2N x s32> to <N x s64>
1043 //
1044 // When the index is uniform, all lanes insert at the same position, so we
1045 // can split the s64 insert into two s32 inserts which lower to MOVREL/GPRIDX.
1046 Register Dst = MI.getOperand(0).getReg();
1047 Register Src = MI.getOperand(1).getReg();
1048 Register Elt = MI.getOperand(2).getReg();
1049 Register Idx = MI.getOperand(3).getReg();
1050
1051 LLT SrcTy = MRI.getType(Src);
1052 LLT Vec32Ty = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
1053
1054 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1055 "expected VGPR src and SGPR idx");
1056
1057 MachineRegisterInfo::VRegAttrs VgprRB_Vec32Ty = {VgprRB, Vec32Ty};
1058
1059 auto CastSrc = B.buildBitcast(VgprRB_Vec32Ty, Src);
1060 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1061
1062 // Calculate new Lo and Hi indices
1063 auto One = B.buildConstant(SgprRB_S32, 1);
1064 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
1065 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
1066
1067 auto InsLo = B.buildInsertVectorElement(VgprRB_Vec32Ty, CastSrc,
1068 EltUnmerge.getReg(0), IdxLo);
1069 auto InsHi = B.buildInsertVectorElement(VgprRB_Vec32Ty, InsLo,
1070 EltUnmerge.getReg(1), IdxHi);
1071
1072 B.buildBitcast(Dst, InsHi);
1073
1074 MI.eraseFromParent();
1075 return true;
1076}
1077
1078bool RegBankLegalizeHelper::lowerAbsToNegMax(MachineInstr &MI) {
1079 // Lower divergent G_ABS to smax(x, 0 - x) in the VGPR bank:
1080 // zero = 0
1081 // neg = G_SUB zero, x
1082 // dst = G_SMAX x, neg
1083 //
1084 // There is no integer v_abs instruction on AMDGPU, so divergent G_ABS is
1085 // expanded to this sub/smax pair.
1086 Register DstReg = MI.getOperand(0).getReg();
1087 Register SrcReg = MI.getOperand(1).getReg();
1088 LLT Ty = MRI.getType(DstReg);
1089
1090 Register Zero;
1091 if (Ty == V2S16) {
1092 // buildConstant cannot produce a V2S16 directly; pack two S16 zeros.
1093 Register Zero16 = B.buildConstant({VgprRB, S16}, 0).getReg(0);
1094 Zero = B.buildBuildVector({VgprRB, Ty}, {Zero16, Zero16}).getReg(0);
1095 } else {
1096 assert((Ty == S32 || Ty == S16) && "unexpected type for AbsToNegMax");
1097 Zero = B.buildConstant({VgprRB, Ty}, 0).getReg(0);
1098 }
1099
1100 auto Neg = B.buildSub({VgprRB, Ty}, Zero, SrcReg);
1101 B.buildSMax(DstReg, SrcReg, Neg);
1102 MI.eraseFromParent();
1103 return true;
1104}
1105
1106bool RegBankLegalizeHelper::lowerAbsToS32(MachineInstr &MI) {
1107 // Lower uniform V2S16 abs by unpacking the values to two separate SGPR
1108 // registers and re-emitting G_ABS on each:
1109 // packed = bitcast <2 x s16> src to s32
1110 // lo = sext_inreg packed, 16
1111 // hi = ashr packed, 16
1112 // dst = build_vector_trunc G_ABS(lo), G_ABS(hi)
1113 //
1114 // SALU only has s_abs_i32, with no direct uniform V2S16 abs. The
1115 // re-emitted G_ABS(SgprRB, S32) selects to s_abs_i32 on each value.
1116 auto Bitcast = B.buildBitcast({SgprRB_S32}, MI.getOperand(1).getReg());
1117 auto SextInReg = B.buildSExtInReg({SgprRB_S32}, Bitcast, 16);
1118 auto ShiftHi =
1119 B.buildAShr({SgprRB_S32}, Bitcast, B.buildConstant({SgprRB_S32}, 16));
1120
1121 auto AbsLo = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {SextInReg});
1122 auto AbsHi = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {ShiftHi});
1123 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
1124 {AbsLo.getReg(0), AbsHi.getReg(0)});
1125
1126 MI.eraseFromParent();
1127 return true;
1128}
1129
// Apply the lowering selected by Mapping.LoweringMethod to MI.
//
// Most cases either forward to a dedicated helper and return its result, or
// build the replacement inline, erase MI and return true. Cases that cannot
// handle MI report through reportGISelFailure and return false. Cases that
// leave the switch with `break` reach the trailing step, which runs
// executeInWaterfallLoop when WFI.SgprWaterfallOperandRegs is not empty and
// returns false if that fails.
1130bool RegBankLegalizeHelper::lower(MachineInstr &MI,
1131 const RegBankLLTMapping &Mapping,
1132 WaterfallInfo &WFI) {
1133
1134 switch (Mapping.LoweringMethod) {
// Nothing to rewrite; only the waterfall step after the switch may run.
1135 case DoNotLower:
1136 break;
1137 case VccExtToSel:
1138 return lowerVccExtToSel(MI);
// Replace the extension with a select between two SGPR constants of the
// destination type: -1 for G_SEXT, 1 for any other opcode, and 0 otherwise.
1139 case UniExtToSel: {
1140 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1141 auto True = B.buildConstant({SgprRB, Ty},
1142 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
1143 auto False = B.buildConstant({SgprRB, Ty}, 0);
1144 // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is compare.
1145 // We are making select here. S1 cond was already 'any-extended to S32' +
1146 // 'AND with 1 to clean high bits' by Sgpr32AExtBoolInReg.
1147 B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
1148 False);
1149 MI.eraseFromParent();
1150 return true;
1151 }
1152 case UnpackBitShift:
1153 return lowerUnpackBitShift(MI);
1154 case UnpackMinMax:
1155 return lowerUnpackMinMax(MI);
1156 case ScalarizeToS16:
1157 return lowerSplitTo16(MI);
// Build the 64-bit result as merge(src, Hi), where Hi is 0 for G_ZEXT, the
// source shifted right arithmetically by 31 for G_SEXT, and undef for
// G_ANYEXT. Hi is created in the bank of the destination register.
1158 case Ext32To64: {
1159 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
1160 MachineInstrBuilder Hi;
1161 switch (MI.getOpcode()) {
1162 case AMDGPU::G_ZEXT: {
1163 Hi = B.buildConstant({RB, S32}, 0);
1164 break;
1165 }
1166 case AMDGPU::G_SEXT: {
1167 // Replicate sign bit from 32-bit extended part.
1168 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
1169 Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
1170 break;
1171 }
1172 case AMDGPU::G_ANYEXT: {
1173 Hi = B.buildUndef({RB, S32});
1174 break;
1175 }
1176 default:
1177 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1178 "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
1179 MI);
1180 return false;
1181 }
1182
1183 B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
1184 {MI.getOperand(1).getReg(), Hi});
1185 MI.eraseFromParent();
1186 return true;
1187 }
// Rebuild the constant into the destination register from the zero-extended
// value of the original immediate.
1188 case UniCstExt: {
1189 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
1190 B.buildConstant(MI.getOperand(0).getReg(), ConstVal);
1191
1192 MI.eraseFromParent();
1193 return true;
1194 }
// Result is `(Src & 1) != 0`. For S64 the masking is done on the two 32-bit
// halves (low half AND 1, high half AND 0) and merged back before the compare.
1195 case VgprToVccCopy: {
1196 Register Src = MI.getOperand(1).getReg();
1197 LLT Ty = MRI.getType(Src);
1198 // Take lowest bit from each lane and put it in lane mask.
1199 // Lowering via compare, but we need to clean high bits first as compare
1200 // compares all bits in register.
1201 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
1202 if (Ty == S64) {
1203 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
1204 auto One = B.buildConstant(VgprRB_S32, 1);
1205 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
1206 auto Zero = B.buildConstant(VgprRB_S32, 0);
1207 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
1208 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
1209 } else {
1210 assert(Ty == S32 || Ty == S16);
1211 auto One = B.buildConstant({VgprRB, Ty}, 1);
1212 B.buildAnd(BoolSrc, Src, One);
1213 }
1214 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
1215 B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
1216 MI.eraseFromParent();
1217 return true;
1218 }
1219 case V_BFE:
1220 return lowerV_BFE(MI);
1221 case S_BFE:
1222 return lowerS_BFE(MI);
1223 case UniMAD64:
1224 return lowerUniMAD64(MI);
// Re-emit MI as a plain multiply of operands 1 and 2 into operand 0.
1225 case UniMul64: {
1226 B.buildMul(MI.getOperand(0), MI.getOperand(1), MI.getOperand(2));
1227 MI.eraseFromParent();
1228 return true;
1229 }
// Truncate both sources to VGPR S32 and emit a MAD with a zero S64 addend:
// G_AMDGPU_MAD_U64_U32 when MI is G_AMDGPU_S_MUL_U64_U32, otherwise
// G_AMDGPU_MAD_I64_I32. The second def of the MAD is a fresh SGPR S32.
1230 case DivSMulToMAD: {
1231 auto Op1 = B.buildTrunc(VgprRB_S32, MI.getOperand(1));
1232 auto Op2 = B.buildTrunc(VgprRB_S32, MI.getOperand(2));
1233 auto Zero = B.buildConstant({VgprRB, S64}, 0);
1234
1235 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
1236 ? AMDGPU::G_AMDGPU_MAD_U64_U32
1237 : AMDGPU::G_AMDGPU_MAD_I64_I32;
1238
1239 B.buildInstr(NewOpc, {MI.getOperand(0).getReg(), {SgprRB, S32}},
1240 {Op1, Op2, Zero});
1241 MI.eraseFromParent();
1242 return true;
1243 }
1244 case SplitTo32:
1245 return lowerSplitTo32(MI);
1246 case SplitTo32Mul:
1247 return lowerSplitTo32Mul(MI);
1248 case SplitTo32Select:
1249 return lowerSplitTo32Select(MI);
1250 case SplitTo32SExtInReg:
1251 return lowerSplitTo32SExtInReg(MI);
// 64-bit popcount = popcount(lo) + popcount(hi), computed on VGPR S32 halves.
1252 case CtPop64To32: {
1253 auto Unmerge = B.buildUnmerge({VgprRB, S32}, MI.getOperand(1).getReg());
1254 auto LoPopCnt = B.buildCTPOP({VgprRB, S32}, Unmerge.getReg(0));
1255 auto HiPopCnt = B.buildCTPOP({VgprRB, S32}, Unmerge.getReg(1));
1256 // Max popcount of two 32-bit values is 64, so this add cannot overflow.
// NOTE(review): listing line 1258 is absent from this extract, so the
// buildAdd call below is missing its final argument(s) and closing
// parenthesis. Restore it from the upstream file.
1257 B.buildAdd(MI.getOperand(0).getReg(), LoPopCnt, HiPopCnt,
1259
// NOTE(review): unlike the other cases that erase MI, this one leaves the
// switch with `break` instead of `return true`, so the waterfall step after
// the switch still runs once MI is gone. Confirm this is intended.
1260 MI.eraseFromParent();
1261 break;
1262 }
// Split a load into pieces through splitLoad. Types wider than 128 bits are
// split into two or four 128-bit pieces (Size / 128 must be 2 or 4); S96,
// V3S32 and V6S16 are split into a 64-bit and a 32-bit piece. Anything else
// is reported as a failure. The meaning of splitLoad's optional third
// argument is not visible here.
1263 case SplitLoad: {
1264 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
1265 unsigned Size = DstTy.getSizeInBits();
1266 // Even split to 128-bit loads
1267 if (Size > 128) {
1268 LLT B128;
1269 if (DstTy.isVector()) {
1270 LLT EltTy = DstTy.getElementType();
1271 B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1272 } else {
1273 B128 = LLT::scalar(128);
1274 }
1275 if (Size / 128 == 2)
1276 splitLoad(MI, {B128, B128});
1277 else if (Size / 128 == 4)
1278 splitLoad(MI, {B128, B128, B128, B128});
1279 else {
1280 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1281 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1282 MI);
1283 return false;
1284 }
1285 }
1286 // 64 and 32 bit load
1287 else if (DstTy == S96)
1288 splitLoad(MI, {S64, S32}, S32);
1289 else if (DstTy == V3S32)
1290 splitLoad(MI, {V2S32, S32}, S32);
1291 else if (DstTy == V6S16)
1292 splitLoad(MI, {V4S16, V2S16}, V2S16);
1293 else {
1294 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1295 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1296 MI);
1297 return false;
1298 }
1299 return true;
1300 }
// Widen a 96-bit load to the matching 128-bit type through widenLoad; only
// S96, V3S32 and V6S16 are handled, anything else is reported as a failure.
1301 case WidenLoad: {
1302 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
1303 if (DstTy == S96)
1304 widenLoad(MI, S128);
1305 else if (DstTy == V3S32)
1306 widenLoad(MI, V4S32, S32);
1307 else if (DstTy == V6S16)
1308 widenLoad(MI, V8S16, V2S16);
1309 else {
1310 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1311 "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
1312 MI);
1313 return false;
1314 }
1315 return true;
1316 }
1317 case UnpackAExt:
1318 return lowerUnpackAExt(MI);
1319 case WidenMMOToS32:
1320 return widenMMOToS32(cast<GAnyLoad>(MI));
// Assert-only check that every operand is in the SGPR bank; MI is unchanged.
1321 case VerifyAllSgpr: {
1322 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1323 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1324 }));
1325 return true;
1326 }
// Asserts that all defs are VGPR, then replaces every register use that is
// not in the VGPR bank with a VGPR copy built at MI.
1327 case ApplyAllVgpr: {
1328 assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) {
1329 return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
1330 }));
1331 B.setInstrAndDebugLoc(MI);
1332 for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) {
1333 MachineOperand &Op = MI.getOperand(i);
1334 if (!Op.isReg())
1335 continue;
1336 Register Reg = Op.getReg();
1337 if (MRI.getRegBank(Reg) != VgprRB) {
1338 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1339 Op.setReg(Copy.getReg(0));
1340 }
1341 }
1342 return true;
1343 }
// Replace an SGPR unmerge to 16-bit pieces with unpackAExt on V2S16 values
// followed by truncs into the original defs. Sources wider than 32 bits are
// first unmerged into SGPR V2S16 pieces.
1344 case UnmergeToShiftTrunc: {
// NOTE(review): the dyn_cast result is dereferenced on the next line without
// a null check; if MI is always a GUnmerge here, cast<> would state that.
1345 GUnmerge *Unmerge = dyn_cast<GUnmerge>(&MI);
1346 LLT Ty = MRI.getType(Unmerge->getSourceReg());
1347 if (Ty.getSizeInBits() % 32 != 0) {
1348 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1349 "AMDGPU RegBankLegalize: unmerge not multiple of 32",
1350 MI);
1351 return false;
1352 }
1353
1354 B.setInstrAndDebugLoc(MI);
1355 if (Ty.getSizeInBits() > 32) {
1356 auto UnmergeV2S16 =
1357 B.buildUnmerge({SgprRB, V2S16}, Unmerge->getSourceReg());
1358 for (unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
1359 auto [Dst0S32, Dst1S32] =
1360 unpackAExt(UnmergeV2S16->getOperand(i).getReg());
1361 B.buildTrunc(MI.getOperand(i * 2).getReg(), Dst0S32);
1362 B.buildTrunc(MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
1363 }
1364 } else {
1365 auto [Dst0S32, Dst1S32] = unpackAExt(MI.getOperand(2).getReg());
1366 B.buildTrunc(MI.getOperand(0).getReg(), Dst0S32);
1367 B.buildTrunc(MI.getOperand(1).getReg(), Dst1S32);
1368 }
1369
1370 MI.eraseFromParent();
1371 return true;
1372 }
// NOTE(review): listing line 1373 is absent from this extract. The
// statements below, up to the matching `break; }`, presumably sit under a
// case label that opens a scope; restore it from the upstream file.
//
// The def of MI is replaced by a new SGPR S32 register and truncated back
// into the original destination at the first non-PHI position of MI's block.
// Operands 1, 3, 5, ... (presumably the incoming values of a G_PHI, with the
// block operands in between) are any-extended to SGPR S32 right after their
// defining instruction, skipping PHIs and labels.
1374 Register Dst = MI.getOperand(0).getReg();
1375 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1376 B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());
1377 MI.getOperand(0).setReg(NewDst);
1378 B.buildTrunc(Dst, NewDst);
1379
1380 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1381 Register UseReg = MI.getOperand(i).getReg();
1382
1383 auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
1384 MachineBasicBlock *DefMBB = DefMI->getParent();
1385
1386 B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
1387
1388 auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
1389 MI.getOperand(i).setReg(NewUse.getReg(0));
1390 }
1391 break;
1392 }
// Assert-only check that every non-MBB operand is in the SGPR bank.
1393 case VerifyAllSgprGPHI: {
1394 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1395 if (Op.isMBB())
1396 return true;
1397 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1398 }));
1399 return true;
1400 }
// NOTE(review): listing line 1401 is absent from this extract; a case label
// opening a scope is presumably missing here. The asserts below require a
// VGPR def and VGPR or SGPR banks on every non-MBB operand.
1402 assert(MRI.getRegBankOrNull(MI.getOperand(0).getReg()) == VgprRB);
1403 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1404 if (Op.isMBB())
1405 return true;
1406 const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
1407 return RB == VgprRB || RB == SgprRB;
1408 }));
1409 return true;
1410 }
1411 case ApplyINTRIN_IMAGE: {
// NOTE(review): listing line 1413 is absent from this extract, so the
// initializer of RSrcIntrin is missing. Restore it from the upstream file.
1412 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1414 assert(RSrcIntrin && RSrcIntrin->IsImage);
1415 // The reported argument index is relative to the IR intrinsic call
1416 // arguments, so shift by the number of defs and the intrinsic ID.
1417 unsigned RsrcIdx = RSrcIntrin->RsrcArg + MI.getNumExplicitDefs() + 1;
1418 return applyRegisterBanksVgprWithSgprRsrc(MI, RsrcIdx);
1419 }
// NOTE(review): listing line 1420 is absent from this extract; a case label
// opening a scope is presumably missing here.
1421 // Rsrc is the last register operand. Base BVH trails an A16 immediate
1422 // after rsrc; dual/BVH8 do not. Scan backwards for the last virtual
1423 // register.
1424 unsigned RsrcIdx = MI.getNumOperands();
1425 while (RsrcIdx-- > MI.getNumExplicitDefs()) {
1426 const MachineOperand &Op = MI.getOperand(RsrcIdx);
1427 if (Op.isReg() && Op.getReg().isVirtual())
1428 break;
1429 }
1430 return applyRegisterBanksVgprWithSgprRsrc(MI, RsrcIdx);
1431 }
// NOTE(review): listing line 1432 is absent from this extract; the return
// below presumably belongs to a case label that is missing here.
1433 return lowerSplitBitCount64To32(MI);
1434 case ExtrVecEltToSel:
1435 return lowerExtrVecEltToSel(MI);
1436 case ExtrVecEltTo32:
1437 return lowerExtrVecEltTo32(MI);
1438 case InsVecEltToSel:
1439 return lowerInsVecEltToSel(MI);
1440 case InsVecEltTo32:
1441 return lowerInsVecEltTo32(MI);
1442 case AbsToNegMax:
1443 return lowerAbsToNegMax(MI);
1444 case AbsToS32:
1445 return lowerAbsToS32(MI);
1446 }
1447
// Only reached by the cases that `break` out of the switch.
1448 if (!WFI.SgprWaterfallOperandRegs.empty()) {
1449 if (!executeInWaterfallLoop(B, WFI))
1450 return false;
1451 }
1452 return true;
1453}
1454
// Map a mapping-apply ID that names one exact type to that LLT: scalars
// (S1..S128), pointers (address space and width per case) and fixed vectors.
// Several IDs share a type, e.g. the Sgpr*, Vgpr* and UniInVgpr* variants of
// the same width. IDs that are not listed return a default-constructed LLT().
1455LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
1456 switch (ID) {
1457 case Vcc:
1458 case UniInVcc:
1459 return LLT::scalar(1);
1460 case Sgpr16:
1461 case Vgpr16:
1462 case UniInVgprS16:
1463 return LLT::scalar(16);
1464 case Sgpr32:
1465 case Sgpr32_WF:
1466 case Sgpr32Trunc:
1467 case Sgpr32AExt:
// NOTE(review): listing line 1468 is absent from this extract; presumably
// one more case label of this S32 group is missing here.
1469 case Sgpr32SExt:
1470 case Sgpr32ZExt:
1471 case UniInVgprS32:
1472 case Sgpr32ToVgprDst:
1473 case Vgpr32:
1474 case Vgpr32AExt:
1475 case Vgpr32SExt:
1476 case Vgpr32ZExt:
1477 return LLT::scalar(32);
1478 case Sgpr64:
1479 case Vgpr64:
1480 case UniInVgprS64:
1481 case Sgpr64ToVgprDst:
1482 return LLT::scalar(64);
1483 case Sgpr128:
1484 case Vgpr128:
1485 return LLT::scalar(128);
// Pointer IDs: the digit in the name is the address space passed to
// LLT::pointer; the second argument is the pointer width in bits.
1486 case SgprP0:
1487 case SgprP0Call_WF:
1488 case VgprP0:
1489 return LLT::pointer(0, 64);
1490 case SgprP1:
1491 case VgprP1:
1492 return LLT::pointer(1, 64);
1493 case SgprP2:
1494 case VgprP2:
1495 return LLT::pointer(2, 32);
1496 case SgprP3:
1497 case VgprP3:
1498 return LLT::pointer(3, 32);
1499 case SgprP4:
1500 case SgprP4Call_WF:
1501 case VgprP4:
1502 return LLT::pointer(4, 64);
1503 case SgprP5:
1504 case VgprP5:
1505 return LLT::pointer(5, 32);
1506 case SgprP8:
1507 return LLT::pointer(8, 128);
1508 case SgprV2S16:
1509 case VgprV2S16:
1510 case UniInVgprV2S16:
1511 return LLT::fixed_vector(2, 16);
1512 case SgprV2S32:
1513 case VgprV2S32:
1514 case UniInVgprV2S32:
1515 return LLT::fixed_vector(2, 32);
1516 case VgprV3S32:
1517 return LLT::fixed_vector(3, 32);
1518 case VgprV4S16:
1519 return LLT::fixed_vector(4, 16);
1520 case SgprV4S32:
1521 case SgprV4S32_WF:
// NOTE(review): listing line 1522 is absent from this extract; presumably
// one more case label of this V4S32 group is missing here.
1523 case VgprV4S32:
1524 case UniInVgprV4S32:
1525 return LLT::fixed_vector(4, 32);
1526 case VgprV8S32:
1527 return LLT::fixed_vector(8, 32);
1528 case VgprV2S64:
1529 case UniInVgprV2S64:
1530 return LLT::fixed_vector(2, 64);
1531 case VgprV6S32:
1532 return LLT::fixed_vector(6, 32);
1533 case VgprV32S16:
1534 return LLT::fixed_vector(32, 16);
1535 case VgprV32S32:
1536 return LLT::fixed_vector(32, 32);
1537 default:
1538 return LLT();
1539 }
1540}
1541
// Check Ty against a size-based ("B-type") or pointer-class mapping ID.
// Returns Ty itself when it is one of the types accepted for ID, and a
// default-constructed LLT() otherwise (including for IDs not handled here).
// Callers in this file compare the result against Ty.
1542LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
1543 switch (ID) {
1544 case SgprB32:
1545 case VgprB32:
1546 case SgprB32_M0:
// NOTE(review): listing line 1547 is absent from this extract; presumably
// one more case label of this B32 group is missing here.
1548 case UniInVgprB32:
1549 if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
1550 isAnyPtr(Ty, 32))
1551 return Ty;
1552 return LLT();
// Pointer-only IDs: accept Ty when isAnyPtr holds for the given width.
1553 case SgprPtr32:
1554 case VgprPtr32:
1555 return isAnyPtr(Ty, 32) ? Ty : LLT();
1556 case SgprPtr64:
1557 case VgprPtr64:
1558 return isAnyPtr(Ty, 64) ? Ty : LLT();
1559 case SgprPtr128:
1560 case VgprPtr128:
1561 return isAnyPtr(Ty, 128) ? Ty : LLT();
1562 case SgprB64:
1563 case VgprB64:
// NOTE(review): listing line 1564 is absent from this extract; presumably
// one more case label of this B64 group is missing here.
1565 case UniInVgprB64:
1566 if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
1567 Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
1568 return Ty;
1569 return LLT();
1570 case SgprB96:
1571 case VgprB96:
1572 case UniInVgprB96:
1573 if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
1574 Ty == LLT::fixed_vector(6, 16))
1575 return Ty;
1576 return LLT();
1577 case SgprB128:
1578 case VgprB128:
1579 case UniInVgprB128:
1580 if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
1581 Ty == LLT::fixed_vector(2, 64) || Ty == LLT::fixed_vector(8, 16) ||
1582 isAnyPtr(Ty, 128))
1583 return Ty;
1584 return LLT();
// B160 accepts any type whose size is 160 bits, without an explicit list.
1585 case VgprB160:
1586 case UniInVgprB160:
1587 if (Ty.getSizeInBits() == 160)
1588 return Ty;
1589 return LLT();
1590 case SgprB256:
1591 case VgprB256:
1592 case UniInVgprB256:
1593 if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
1594 Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
1595 return Ty;
1596 return LLT();
1597 case SgprB512:
1598 case VgprB512:
1599 case UniInVgprB512:
1600 if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
1601 Ty == LLT::fixed_vector(8, 64))
1602 return Ty;
1603 return LLT();
// Register-class based IDs: accept Ty when the target register info reports
// a register class for its bit width. SgprBRC additionally requires at least
// 32 bits.
1604 case SgprBRC: {
1605 const SIRegisterInfo *TRI =
1606 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1607 unsigned LLTSize = Ty.getSizeInBits();
1608 if (LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize))
1609 return Ty;
1610 return LLT();
1611 }
1612 case VgprBRC: {
1613 const SIRegisterInfo *TRI =
1614 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
// NOTE(review): this VGPR case queries the SGPR class lookup, same as
// SgprBRC above. Confirm whether a VGPR class lookup was intended.
1615 if (TRI->getSGPRClassForBitWidth(Ty.getSizeInBits()))
1616 return Ty;
1617 return LLT();
1618 }
1619 default:
1620 return LLT();
1621 }
1622}
1623
// Map a mapping-apply ID to one of the cached register banks (VccRB, SgprRB,
// AgprRB, VgprRB). IDs that are not listed return nullptr.
//
// Note that the UniIn* IDs and the Sgpr32 Trunc/AExt/SExt/ZExt IDs map to the
// SGPR bank, while Sgpr32ToVgprDst and Sgpr64ToVgprDst map to the VGPR bank.
//
// NOTE(review): VgprV32S32 has a type in getTyFromID but is not listed here,
// so it yields nullptr through the default case. Confirm this is intended.
1624const RegisterBank *
1625RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
1626 switch (ID) {
1627 case Vcc:
1628 return VccRB;
1629 case Sgpr16:
1630 case Sgpr32:
1631 case Sgpr32_WF:
1632 case Sgpr64:
1633 case Sgpr128:
1634 case SgprP0:
1635 case SgprP0Call_WF:
1636 case SgprP1:
1637 case SgprP2:
1638 case SgprP3:
1639 case SgprP4:
1640 case SgprP4Call_WF:
1641 case SgprP5:
1642 case SgprP8:
1643 case SgprPtr32:
1644 case SgprPtr64:
1645 case SgprPtr128:
1646 case SgprV2S16:
1647 case SgprV2S32:
1648 case SgprV4S32:
1649 case SgprV4S32_WF:
// NOTE(review): listing line 1650 is absent from this extract; presumably
// one more SGPR case label is missing here.
1651 case SgprB32:
1652 case SgprB64:
1653 case SgprB96:
1654 case SgprB128:
1655 case SgprB256:
1656 case SgprB512:
1657 case SgprBRC:
1658 case UniInVcc:
1659 case UniInVgprS16:
1660 case UniInVgprS32:
1661 case UniInVgprS64:
1662 case UniInVgprV2S16:
1663 case UniInVgprV2S32:
1664 case UniInVgprV4S32:
1665 case UniInVgprV2S64:
1666 case UniInVgprB32:
1667 case UniInVgprB64:
1668 case UniInVgprB96:
1669 case UniInVgprB128:
1670 case UniInVgprB160:
1671 case UniInVgprB256:
1672 case UniInVgprB512:
1673 case Sgpr32Trunc:
1674 case Sgpr32AExt:
// NOTE(review): listing line 1675 is absent from this extract; presumably
// one more SGPR case label is missing here.
1676 case Sgpr32SExt:
1677 case Sgpr32ZExt:
1678 return SgprRB;
1679 case AgprAnyTy:
1680 return AgprRB;
1681 case Vgpr16:
1682 case Vgpr32:
1683 case Vgpr64:
1684 case Vgpr128:
1685 case VgprP0:
1686 case VgprP1:
1687 case VgprP2:
1688 case VgprP3:
1689 case VgprP4:
1690 case VgprP5:
1691 case VgprPtr32:
1692 case VgprPtr64:
1693 case VgprPtr128:
1694 case VgprV2S16:
1695 case VgprV2S32:
1696 case VgprV2S64:
1697 case VgprV3S32:
1698 case VgprV4S16:
1699 case VgprV4S32:
1700 case VgprV6S32:
1701 case VgprV8S32:
1702 case VgprV32S16:
1703 case VgprB32:
1704 case VgprB64:
1705 case VgprB96:
1706 case VgprB128:
1707 case VgprB160:
1708 case VgprB256:
1709 case VgprB512:
1710 case VgprBRC:
1711 case VgprAnyTy:
1712 case Vgpr32AExt:
1713 case Vgpr32SExt:
1714 case Vgpr32ZExt:
1715 case Sgpr32ToVgprDst:
1716 case Sgpr64ToVgprDst:
1717 return VgprRB;
1718 default:
1719 return nullptr;
1720 }
1721}
1722
1723bool RegBankLegalizeHelper::applyMappingDst(
1724 MachineInstr &MI, unsigned &OpIdx,
1725 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
1726 // Defs start from operand 0
1727 for (; OpIdx < MethodIDs.size(); ++OpIdx) {
1728 if (MethodIDs[OpIdx] == None)
1729 continue;
1730 MachineOperand &Op = MI.getOperand(OpIdx);
1731 Register Reg = Op.getReg();
1732 LLT Ty = MRI.getType(Reg);
1733 [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);
1734
1735 switch (MethodIDs[OpIdx]) {
1736 // vcc, sgpr and vgpr scalars, pointers and vectors
1737 case Vcc:
1738 case Sgpr16:
1739 case Sgpr32:
1740 case Sgpr64:
1741 case Sgpr128:
1742 case SgprP0:
1743 case SgprP1:
1744 case SgprP3:
1745 case SgprP4:
1746 case SgprP5:
1747 case SgprP8:
1748 case SgprV2S16:
1749 case SgprV2S32:
1750 case SgprV4S32:
1751 case Vgpr16:
1752 case Vgpr32:
1753 case Vgpr64:
1754 case Vgpr128:
1755 case VgprP0:
1756 case VgprP1:
1757 case VgprP2:
1758 case VgprP3:
1759 case VgprP4:
1760 case VgprP5:
1761 case VgprV2S16:
1762 case VgprV2S32:
1763 case VgprV2S64:
1764 case VgprV3S32:
1765 case VgprV4S16:
1766 case VgprV4S32:
1767 case VgprV6S32:
1768 case VgprV8S32:
1769 case VgprV32S16: {
1770 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1771 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1772 break;
1773 }
1774 // sgpr and vgpr B-types
1775 case SgprB32:
1776 case SgprB64:
1777 case SgprB96:
1778 case SgprB128:
1779 case SgprB256:
1780 case SgprB512:
1781 case SgprBRC:
1782 case SgprPtr32:
1783 case SgprPtr64:
1784 case SgprPtr128:
1785 case VgprB32:
1786 case VgprB64:
1787 case VgprB96:
1788 case VgprB128:
1789 case VgprB160:
1790 case VgprB256:
1791 case VgprB512:
1792 case VgprBRC:
1793 case VgprPtr32:
1794 case VgprPtr64:
1795 case VgprPtr128: {
1796 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1797 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1798 break;
1799 }
1800 case VgprAnyTy: {
1801 assert(RB == VgprRB);
1802 break;
1803 }
1804 case AgprAnyTy: {
1805 if (RB == AgprRB)
1806 break;
1807 Register NewAgprDst = MRI.createVirtualRegister({AgprRB, Ty});
1808 Op.setReg(NewAgprDst);
1809 if (!MRI.use_nodbg_empty(Reg))
1810 B.buildCopy(Reg, NewAgprDst);
1811 break;
1812 }
1813 // uniform in vcc/vgpr: scalars, vectors and B-types
1814 case UniInVcc: {
1815 assert(Ty == S1);
1816 assert(RB == SgprRB);
1817 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
1818 Op.setReg(NewDst);
1819 if (!MRI.use_empty(Reg)) {
1820 auto CopyS32_Vcc =
1821 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
1822 B.buildTrunc(Reg, CopyS32_Vcc);
1823 }
1824 break;
1825 }
1826 case UniInVgprS16: {
1827 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1828 assert(RB == SgprRB);
1829 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
1830 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
1831 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
1832 Op.setReg(NewVgprDstS16);
1833 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
1834 buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
1835 B.buildTrunc(Reg, NewSgprDstS32);
1836 break;
1837 }
1838 case UniInVgprS32:
1839 case UniInVgprS64:
1840 case UniInVgprV2S16:
1841 case UniInVgprV2S32:
1842 case UniInVgprV4S32:
1843 case UniInVgprV2S64: {
1844 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1845 assert(RB == SgprRB);
1846 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1847 Op.setReg(NewVgprDst);
1848 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1849 break;
1850 }
1851 case UniInVgprB32:
1852 case UniInVgprB64:
1853 case UniInVgprB96:
1854 case UniInVgprB128:
1855 case UniInVgprB160:
1856 case UniInVgprB256:
1857 case UniInVgprB512: {
1858 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1859 assert(RB == SgprRB);
1860 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1861 Op.setReg(NewVgprDst);
1862 AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1863 break;
1864 }
1865 // sgpr trunc
1866 case Sgpr32Trunc: {
1867 assert(Ty.getSizeInBits() < 32);
1868 assert(RB == SgprRB);
1869 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1870 Op.setReg(NewDst);
1871 if (!MRI.use_empty(Reg))
1872 B.buildTrunc(Reg, NewDst);
1873 break;
1874 }
1875 case Sgpr32ToVgprDst:
1876 case Sgpr64ToVgprDst: {
1877 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1878 assert(RB == VgprRB);
1879 Op.setReg(MRI.createVirtualRegister({SgprRB, Ty}));
1880 B.buildCopy(Reg, Op.getReg());
1881 break;
1882 }
1883 case InvalidMapping: {
1885 MF, MORE, "amdgpu-regbanklegalize",
1886 "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
1887 return false;
1888 }
1889 default:
1891 MF, MORE, "amdgpu-regbanklegalize",
1892 "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
1893 return false;
1894 }
1895 }
1896
1897 return true;
1898}
1899
/// Applies the per-operand source mapping in \p MethodIDs to the operands of
/// \p MI, starting at operand index \p OpIdx.
///
/// \p OpIdx is taken by reference and is incremented once for every entry of
/// \p MethodIDs, including the entries that are skipped. Entries equal to
/// None, IntrId or Imm leave the operand untouched. For every other entry the
/// operand's register, type and register bank are read; the type/bank are
/// checked with asserts and, where the ID asks for it, a copy, extend or
/// read-first-lane is built with the builder B and the operand is rewritten
/// to use the new register. The waterfall IDs do not rewrite the operand;
/// they record the register and an instruction range in \p WFI.
///
/// \returns true after all entries have been processed; the default case of
/// the switch returns false.
1900bool RegBankLegalizeHelper::applyMappingSrc(
1901 MachineInstr &MI, unsigned &OpIdx,
1902 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
1903 WaterfallInfo &WFI) {
// i indexes MethodIDs while OpIdx indexes MI's operands; both advance together.
1904 for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
// These IDs require no work on the operand.
1905 if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
1906 continue;
1907
1908 MachineOperand &Op = MI.getOperand(OpIdx);
1909 Register Reg = Op.getReg();
1910 LLT Ty = MRI.getType(Reg);
1911 const RegisterBank *RB = MRI.getRegBank(Reg);
1912
1913 switch (MethodIDs[i]) {
// S1 condition operand: a vcc-bank value is used as is; an sgpr-bank value is
// any-extended to S32 and converted with G_AMDGPU_COPY_VCC_SCC.
1914 case Vcc: {
1915 assert(Ty == S1);
1916 assert(RB == VccRB || RB == SgprRB);
1917 if (RB == SgprRB) {
1918 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1919 auto CopyVcc_Scc =
1920 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
1921 Op.setReg(CopyVcc_Scc.getReg(0));
1922 }
1923 break;
1924 }
1925 // sgpr scalars, pointers and vectors
// Nothing is inserted for these IDs: type and bank are only asserted.
1926 case Sgpr16:
1927 case Sgpr32:
1928 case Sgpr64:
1929 case Sgpr128:
1930 case SgprP0:
1931 case SgprP1:
1932 case SgprP3:
1933 case SgprP4:
1934 case SgprP5:
1935 case SgprP8:
1936 case SgprV2S16:
1937 case SgprV2S32:
1938 case SgprV4S32: {
1939 assert(Ty == getTyFromID(MethodIDs[i]));
1940 assert(RB == getRegBankFromID(MethodIDs[i]));
1941 break;
1942 }
1943 // sgpr B-types
// Same as above, but the expected type comes from getBTyFromID.
1944 case SgprB32:
1945 case SgprB64:
1946 case SgprB96:
1947 case SgprB128:
1948 case SgprB256:
1949 case SgprB512:
1950 case SgprBRC:
1951 case SgprPtr32:
1952 case SgprPtr64:
1953 case SgprPtr128: {
1954 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1955 assert(RB == getRegBankFromID(MethodIDs[i]));
1956 break;
1957 }
1958 // vgpr scalars, pointers and vectors
// An operand that is not already in the vgpr bank is copied into a new vgpr
// register of the same type and the operand is switched to that copy.
1959 case Vgpr16:
1960 case Vgpr32:
1961 case Vgpr64:
1962 case Vgpr128:
1963 case VgprP0:
1964 case VgprP1:
1965 case VgprP2:
1966 case VgprP3:
1967 case VgprP4:
1968 case VgprP5:
1969 case VgprV2S16:
1970 case VgprV2S32:
1971 case VgprV2S64:
1972 case VgprV3S32:
1973 case VgprV4S16:
1974 case VgprV4S32:
1975 case VgprV6S32:
1976 case VgprV8S32:
1977 case VgprV32S16:
1978 case VgprV32S32: {
1979 assert(Ty == getTyFromID(MethodIDs[i]));
1980 if (RB != VgprRB) {
1981 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
1982 Op.setReg(CopyToVgpr.getReg(0));
1983 }
1984 break;
1985 }
1986 // vgpr B-types
1987 case VgprB32:
1988 case VgprB64:
1989 case VgprB96:
1990 case VgprB128:
1991 case VgprB160:
1992 case VgprB256:
1993 case VgprB512:
1994 case VgprBRC:
1995 case VgprPtr32:
1996 case VgprPtr64:
1997 case VgprPtr128: {
1998 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1999 if (RB != VgprRB) {
2000 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
2001 Op.setReg(CopyToVgpr.getReg(0));
2002 }
2003 break;
2004 }
// No type assert here: any type is accepted, only the bank is fixed up.
2005 case VgprAnyTy: {
2006 if (RB != VgprRB) {
2007 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
2008 Op.setReg(CopyToVgpr.getReg(0));
2009 }
2010 break;
2011 }
// Agpr counterpart of VgprAnyTy.
2012 case AgprAnyTy: {
2013 if (RB != AgprRB) {
2014 auto CopyToAgpr = B.buildCopy({AgprRB, Ty}, Reg);
2015 Op.setReg(CopyToAgpr.getReg(0));
2016 }
2017 break;
2018 }
2019 // sgpr waterfall, scalars, and vectors
// A non-sgpr operand is recorded in WFI instead of being rewritten here. The
// range [MI, next(MI)) is set only when WFI.Start is not valid yet.
2020 case Sgpr32_WF:
2021 case SgprV4S32_WF: {
2022 assert(Ty == getTyFromID(MethodIDs[i]));
2023 if (RB != SgprRB) {
2024 WFI.SgprWaterfallOperandRegs.insert(Reg);
2025 if (!WFI.Start.isValid()) {
2026 WFI.Start = MI.getIterator();
2027 WFI.End = std::next(MI.getIterator());
2028 }
2029 }
2030 break;
2031 }
// Waterfall for a call target: the recorded range spans from the
// ADJCALLSTACKUP before MI through the ADJCALLSTACKDOWN after it, and it
// overwrites WFI.Start/WFI.End unconditionally.
// NOTE(review): both scans below have no begin()/end() check, so they rely on
// the two markers being present around MI in the same block - confirm.
2032 case SgprP0Call_WF:
2033 case SgprP4Call_WF: {
2034 assert(Ty == getTyFromID(MethodIDs[i]));
2035 if (RB != SgprRB) {
2036 WFI.SgprWaterfallOperandRegs.insert(Reg);
2037
2038 // Find the ADJCALLSTACKUP before the call.
2039 MachineBasicBlock::iterator Start = MI.getIterator();
2040 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
2041 --Start;
2042
2043 // Find the ADJCALLSTACKDOWN after the call (include it in range).
2044 MachineBasicBlock::iterator End = MI.getIterator();
2045 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
2046 ++End;
2047 ++End;
2048
// Move the builder's insertion point to the start of the recorded range.
2049 B.setInsertPt(*MI.getParent(), Start);
2050 WFI.Start = Start;
2051 WFI.End = End;
2052 }
2053 break;
2054 }
// B-type operand that must end up in sgpr: an sgpr operand is left alone, a
// vgpr operand is replaced by a new sgpr register filled via
// buildReadFirstLane.
// NOTE(review): listing line 2056 is absent here; presumably one more case
// label sits between the two below - confirm against the real file.
2055 case SgprB32_M0:
2057 case SgprB64_ReadFirstLane: {
2058 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2059 if (RB == SgprRB)
2060 break;
2061 assert(RB == VgprRB);
2062 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2063 buildReadFirstLane(B, NewSGPR, Op.getReg(), RBI);
2064 Op.setReg(NewSGPR);
2065 break;
2066 }
// NOTE(review): the case label(s) that open this block (listing line 2067)
// are absent here. The body matches the read-first-lane block above except
// that the expected type comes from getTyFromID.
2068 assert(Ty == getTyFromID(MethodIDs[i]));
2069 if (RB == SgprRB)
2070 break;
2071 assert(RB == VgprRB);
2072 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2073 buildReadFirstLane(B, NewSGPR, Op.getReg(), RBI);
2074 Op.setReg(NewSGPR);
2075 break;
2076 }
2077 // sgpr and vgpr scalars with extend
2078 case Sgpr32AExt: {
2079 // Note: this ext allows S1, and it is meant to be combined away.
2080 assert(Ty.getSizeInBits() < 32);
2081 assert(RB == SgprRB);
2082 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
2083 Op.setReg(Aext.getReg(0));
2084 break;
2085 }
2086 case Sgpr32AExtBoolInReg: {
2087 // Note: this ext allows S1, and it is meant to be combined away.
2088 assert(Ty.getSizeInBits() == 1);
2089 assert(RB == SgprRB);
2090 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
2091 // Zext SgprS1 is not legal, make AND with 1 instead. This instruction is
2092 // usually meant to be combined away in AMDGPURegBankCombiner.
2093 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
2094 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
2095 Op.setReg(BoolInReg.getReg(0));
2096 break;
2097 }
// Unlike the any-extend above, the sgpr sign/zero extends exclude S1: the
// accepted width is strictly between 1 and 32 bits.
2098 case Sgpr32SExt: {
2099 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
2100 assert(RB == SgprRB);
2101 auto Sext = B.buildSExt(SgprRB_S32, Reg);
2102 Op.setReg(Sext.getReg(0));
2103 break;
2104 }
2105 case Sgpr32ZExt: {
2106 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
2107 assert(RB == SgprRB);
2108 auto Zext = B.buildZExt({SgprRB, S32}, Reg);
2109 Op.setReg(Zext.getReg(0));
2110 break;
2111 }
// Vgpr extends to S32; the operand must already be in the vgpr bank.
2112 case Vgpr32AExt: {
2113 assert(Ty.getSizeInBits() < 32);
2114 assert(RB == VgprRB);
2115 auto Aext = B.buildAnyExt({VgprRB, S32}, Reg);
2116 Op.setReg(Aext.getReg(0));
2117 break;
2118 }
2119 case Vgpr32SExt: {
2120 // Note this ext allows S1, and it is meant to be combined away.
2121 assert(Ty.getSizeInBits() < 32);
2122 assert(RB == VgprRB);
2123 auto Sext = B.buildSExt({VgprRB, S32}, Reg);
2124 Op.setReg(Sext.getReg(0));
2125 break;
2126 }
2127 case Vgpr32ZExt: {
2128 // Note this ext allows S1, and it is meant to be combined away.
2129 assert(Ty.getSizeInBits() < 32);
2130 assert(RB == VgprRB);
2131 auto Zext = B.buildZExt({VgprRB, S32}, Reg);
2132 Op.setReg(Zext.getReg(0));
2133 break;
2134 }
// Any ID not handled above makes the function return false.
// NOTE(review): the head of the call that takes the arguments below (listing
// line 2136) is absent here; presumably a failure report - confirm.
2135 default:
2137 MF, MORE, "amdgpu-regbanklegalize",
2138 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
2139 return false;
2140 }
2141 }
2142 return true;
2143}
2144
2145[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
2146 const RegisterBank *RB,
2148 unsigned StartOpIdx,
2149 unsigned EndOpIdx) {
2150 for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
2151 if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
2152 return false;
2153 }
2154 return true;
2155}
2156
2157bool RegBankLegalizeHelper::applyRegisterBanksVgprWithSgprRsrc(
2158 MachineInstr &MI, unsigned RsrcIdx) {
2159 const unsigned NumDefs = MI.getNumExplicitDefs();
2160
2161 MachineBasicBlock *MBB = MI.getParent();
2162 B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(MI.getIterator())));
2163
2164 // Defs are vgpr.
2165 for (unsigned i = 0; i < NumDefs; ++i) {
2166 Register Reg = MI.getOperand(i).getReg();
2167 if (MRI.getRegBank(Reg) == VgprRB)
2168 continue;
2169
2170 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(Reg)});
2171 MI.getOperand(i).setReg(NewVgprDst);
2172 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
2173 }
2174
2175 B.setInstrAndDebugLoc(MI);
2176
2177 // Register uses before RsrcIdx are vgpr.
2178 for (unsigned i = NumDefs; i < RsrcIdx; ++i) {
2179 MachineOperand &Op = MI.getOperand(i);
2180 if (!Op.isReg())
2181 continue;
2182
2183 Register Reg = Op.getReg();
2184 if (!Reg.isVirtual())
2185 continue;
2186
2187 if (MRI.getRegBank(Reg) == VgprRB)
2188 continue;
2189
2190 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
2191 Op.setReg(Copy.getReg(0));
2192 }
2193
2194 SmallSet<Register, 4> OpsToWaterfall;
2195
2196 // Register use RsrcIdx (and later register operands) is sgpr.
2197 for (unsigned i = RsrcIdx; i < MI.getNumOperands(); ++i) {
2198 MachineOperand &Op = MI.getOperand(i);
2199 if (!Op.isReg())
2200 continue;
2201
2202 Register Reg = Op.getReg();
2203 if (MRI.getRegBank(Reg) != SgprRB)
2204 OpsToWaterfall.insert(Reg);
2205 }
2206
2207 if (!OpsToWaterfall.empty()) {
2208 MachineBasicBlock::iterator MII = MI.getIterator();
2209 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
2210 }
2211
2212 return true;
2213}
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
Provides AMDGPU specific target descriptions.
static bool isSignedBFE(MachineInstr &MI)
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB, MachineRegisterInfo &MRI, unsigned StartOpIdx, unsigned EndOpIdx)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
static Register UseReg(const MachineOperand &MO)
IRTranslator LLVM IR MI
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register Reg
Register const TargetRegisterInfo * TRI
Machine IR instance of the generic uniformity analysis.
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
const SmallVectorImpl< MachineOperand > & Cond
static const LaneMaskConstants & get(const GCNSubtarget &ST)
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
@ ICMP_NE
not equal
Definition InstrTypes.h:762
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:239
const SIRegisterInfo * getRegisterInfo() const override
Represents a call to an intrinsic.
Register getSourceReg() const
Get the unmerge source register.
constexpr bool isScalar() const
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
TypeSize getValue() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator SkipPHIsAndLabels(iterator I)
Return the first instruction in MBB after I that is not a PHI or a label.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
BasicBlockListType::iterator iterator
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Helper class to build MachineInstr.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank * getRegBank(Register Reg) const
Return the register bank of Reg.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
Holds all the information related to register banks.
This class implements the register bank concept.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
bool empty() const
Definition SmallSet.h:169
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isAnyPtr(LLT Ty, unsigned Width)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ Bitcast
Perform the operation on a different, but equivalently sized type.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
@ Offset
Definition DWP.cpp:558
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition Utils.cpp:156
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI void reportGISelFailure(MachineFunction &MF, MachineOptimizationRemarkEmitter &MORE, MachineOptimizationRemarkMissed &R)
Report an ISel error as a missed optimization remark to the LLVMContext's diagnostic stream.
Definition Utils.cpp:258
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
Holds waterfall loop information: the set of SGPR operand registers that need waterfalling,...
MachineBasicBlock::iterator Start
SmallSet< Register, 4 > SgprWaterfallOperandRegs