AMDGPURegBankLegalizeHelper.cpp
1//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Implements actual lowering algorithms for each ID that can be used in
10/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
11//
12//===----------------------------------------------------------------------===//
13
16#include "AMDGPUInstrInfo.h"
19#include "GCNSubtarget.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27
28#define DEBUG_TYPE "amdgpu-regbanklegalize"
29
30using namespace llvm;
31using namespace AMDGPU;
32
33RegBankLegalizeHelper::RegBankLegalizeHelper(
34 MachineIRBuilder &B, const MachineUniformityInfo &MUI,
35 const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
36 : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B),
37 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
38 RBLRules(RBLRules), IsWave32(ST.isWave32()),
39 SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
40 VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
41 VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
42
43bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
44 const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
45 if (!RuleSet) {
46 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
47 "No AMDGPU RegBankLegalize rules defined for opcode",
48 MI);
49 return false;
50 }
51
52 const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
53 if (!Mapping) {
54 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
55 "AMDGPU RegBankLegalize: none of the rules defined with "
56 "'Any' for MI's opcode matched MI",
57 MI);
58 return false;
59 }
60
61 WaterfallInfo WFI;
62 unsigned OpIdx = 0;
63 if (Mapping->DstOpMapping.size() > 0) {
64 B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
65 if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping))
66 return false;
67 }
68 if (Mapping->SrcOpMapping.size() > 0) {
69 B.setInstr(MI);
70 if (!applyMappingSrc(MI, OpIdx, Mapping->SrcOpMapping, WFI))
71 return false;
72 }
73
74 if (!lower(MI, *Mapping, WFI))
75 return false;
76
77 return true;
78}
79
80bool RegBankLegalizeHelper::executeInWaterfallLoop(MachineIRBuilder &B,
81 const WaterfallInfo &WFI) {
82 assert(WFI.Start.isValid() && WFI.End.isValid() &&
83 "Waterfall range not initialized");
84
85 // Track use registers which have already been expanded with a readfirstlane
86 // sequence. This may have multiple uses if moving a sequence.
87 DenseMap<Register, Register> WaterfalledRegMap;
88
89 MachineBasicBlock &MBB = B.getMBB();
90 MachineFunction &MF = B.getMF();
91
92 MachineBasicBlock::iterator BeginIt = WFI.Start;
93 MachineBasicBlock::iterator EndIt = WFI.End;
94
95 const SIRegisterInfo *TRI = ST.getRegisterInfo();
96 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
97 unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
98 if (IsWave32) {
99 MovExecOpc = AMDGPU::S_MOV_B32;
100 MovExecTermOpc = AMDGPU::S_MOV_B32_term;
101 XorTermOpc = AMDGPU::S_XOR_B32_term;
102 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
103 ExecReg = AMDGPU::EXEC_LO;
104 } else {
105 MovExecOpc = AMDGPU::S_MOV_B64;
106 MovExecTermOpc = AMDGPU::S_MOV_B64_term;
107 XorTermOpc = AMDGPU::S_XOR_B64_term;
108 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
109 ExecReg = AMDGPU::EXEC;
110 }
111
112#ifndef NDEBUG
113 const int OrigRangeSize = std::distance(BeginIt, EndIt);
114#endif
115
116 MachineRegisterInfo &MRI = *B.getMRI();
117 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
118 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
119
120 // Don't bother using generic instructions/registers for the exec mask.
121 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
122
123 Register SavedExec = MRI.createVirtualRegister(WaveRC);
124
125 // To insert the loop we need to split the block. Move everything before
126 // this point to a new block, and insert a new empty block before this
127 // instruction.
128 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
129 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
130 MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
131 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
132 MachineFunction::iterator MBBI(MBB);
133 ++MBBI;
134 MF.insert(MBBI, LoopBB);
135 MF.insert(MBBI, BodyBB);
136 MF.insert(MBBI, RestoreExecBB);
137 MF.insert(MBBI, RemainderBB);
138
139 LoopBB->addSuccessor(BodyBB);
140 BodyBB->addSuccessor(RestoreExecBB);
141 BodyBB->addSuccessor(LoopBB);
142
143 // Move the rest of the block into a new block.
144 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
145 RemainderBB->splice(RemainderBB->begin(), &MBB, EndIt, MBB.end());
146
147 MBB.addSuccessor(LoopBB);
148 RestoreExecBB->addSuccessor(RemainderBB);
149
150 B.setInsertPt(*LoopBB, LoopBB->end());
151
152 // +-MBB:------------+
153 // | ... |
154 // | %0 = G_INST_1 |
155 // | %Dst = MI %Vgpr |
156 // | %1 = G_INST_2 |
157 // | ... |
158 // +-----------------+
159 // ->
160 // +-MBB-------------------------------+
161 // | ... |
162 // | %0 = G_INST_1 |
163 // | %SaveExecReg = S_MOV_B32 $exec_lo |
164 // +----------------|------------------+
165 // | /------------------------------|
166 // V V |
167 // +-LoopBB---------------------------------------------------------------+ |
168 // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
169 // | instead of executing for each lane, see if other lanes had | |
170 // | same value for %Vgpr and execute for them also. | |
171 // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
172 // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
173 // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
174 // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
175 // +----------------|-----------------------------------------------------+ |
176 // V |
177 // +-BodyBB------------------------------------------------------------+ |
178 // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
179 // | executed only for active lanes and written to Dst | |
180 // | $exec = S_XOR_B32 $exec, %SavedExec | |
181 // | set active lanes to 0 in SavedExec, lanes that did not write to | |
182 // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
183 // | SI_WATERFALL_LOOP LoopBB |-----|
184 // +----------------|--------------------------------------------------+
185 // V
186 // +-RestoreExecBB--------------------------+
187 // | $exec_lo = S_MOV_B32_term %SaveExecReg |
188 // +----------------|-----------------------+
189 // V
190 // +-RemainderBB:----------------------+
191 // | %1 = G_INST_2 |
192 // | ... |
193 // +-----------------------------------+
194
195 // Move the instruction into the loop body. Note we moved everything after
196 // EndIt already into a new block, so EndIt is no longer valid.
197 BodyBB->splice(BodyBB->end(), &MBB, BeginIt, MBB.end());
198
199 // Figure out the iterator range after splicing the instructions.
200 MachineBasicBlock::iterator NewBegin = BeginIt;
201 auto NewEnd = BodyBB->end();
202 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
203
204 B.setMBB(*LoopBB);
205 Register CondReg;
206
207 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
208 for (MachineOperand &Op : MI.all_uses()) {
209 Register OldReg = Op.getReg();
210 if (!WFI.SgprWaterfallOperandRegs.count(OldReg))
211 continue;
212
213 // See if we already processed this register in another instruction in
214 // the sequence.
215 auto OldVal = WaterfalledRegMap.find(OldReg);
216 if (OldVal != WaterfalledRegMap.end()) {
217 Op.setReg(OldVal->second);
218 continue;
219 }
220
221 Register OpReg = Op.getReg();
222 LLT OpTy = MRI.getType(OpReg);
223
224 // TODO: support for agpr
225 assert(MRI.getRegBank(OpReg) == VgprRB);
226 Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
227 buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
228
229 // Build the comparison(s), CurrentLaneReg == OpReg.
230 unsigned OpSize = OpTy.getSizeInBits();
231 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
232 LLT PartTy = LLT::scalar(PartSize);
233 unsigned NumParts = OpSize / PartSize;
234 SmallVector<Register, 8> OpParts;
235 SmallVector<Register, 8> CurrentLaneParts;
236
237 if (NumParts == 1) {
238 OpParts.push_back(OpReg);
239 CurrentLaneParts.push_back(CurrentLaneReg);
240 } else {
241 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
242 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
243 for (unsigned i = 0; i < NumParts; ++i) {
244 OpParts.push_back(UnmergeOp.getReg(i));
245 CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
246 }
247 }
248
249 for (unsigned i = 0; i < NumParts; ++i) {
250 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
251 B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
252
253 if (!CondReg)
254 CondReg = CmpReg;
255 else
256 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
257 }
258
259 Op.setReg(CurrentLaneReg);
260
261 // Make sure we don't re-process this register again.
262 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
263 }
264 }
265
266 // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
267 Register CondRegLM =
268 MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
269 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
270
271 // Update EXEC, save the original EXEC value to SavedExec.
272 B.buildInstr(AndSaveExecOpc)
273 .addDef(SavedExec)
274 .addReg(CondRegLM, RegState::Kill);
275 MRI.setSimpleHint(SavedExec, CondRegLM);
276
277 B.setInsertPt(*BodyBB, BodyBB->end());
278
279 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
280 B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);
281
282 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
283 // s_cbranch_scc0?
284
285 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
286 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
287
288 // Save the EXEC mask before the loop.
289 B.setInsertPt(MBB, MBB.end());
290 B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);
291
292 // Restore the EXEC mask after the loop.
293 B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
294 B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);
295
296 // Set the insert point after the original instruction, so any new
297 // instructions will be in the remainder.
298 B.setInsertPt(*RemainderBB, RemainderBB->begin());
299
300 return true;
301}
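// A rough illustration (hypothetical MIR, not taken from a test): if a rule
// marks a resource operand as SgprV4S32_WF but the value is divergent in a
// VGPR,
//   %dst:vgpr = MI ..., %rsrc:vgpr(<4 x s32>), ...
// is rewritten so that LoopBB readfirstlanes %rsrc, ballots the per-lane
// equality into the exec mask, BodyBB runs MI with the uniform lane value,
// and SI_WATERFALL_LOOP repeats until every distinct %rsrc value is covered.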
302
303bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
304 ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
305 MachineFunction &MF = B.getMF();
306 assert(MI.getNumMemOperands() == 1);
307 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
308 Register Dst = MI.getOperand(0).getReg();
309 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
310 Register Base = MI.getOperand(1).getReg();
311 LLT PtrTy = MRI.getType(Base);
312 const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
313 LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
314 SmallVector<Register, 4> LoadPartRegs;
315
316 unsigned ByteOffset = 0;
317 for (LLT PartTy : LLTBreakdown) {
318 Register BasePlusOffset;
319 if (ByteOffset == 0) {
320 BasePlusOffset = Base;
321 } else {
322 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
323 BasePlusOffset =
324 B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
325 }
326 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
327 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
328 LoadPartRegs.push_back(LoadPart.getReg(0));
329 ByteOffset += PartTy.getSizeInBytes();
330 }
331
332 if (!MergeTy.isValid()) {
333 // Loads are of same size, concat or merge them together.
334 B.buildMergeLikeInstr(Dst, LoadPartRegs);
335 } else {
336 // Loads are not all of same size, need to unmerge them to smaller pieces
337 // of MergeTy type, then merge pieces to Dst.
338 SmallVector<Register, 4> MergeTyParts;
339 for (Register Reg : LoadPartRegs) {
340 if (MRI.getType(Reg) == MergeTy) {
341 MergeTyParts.push_back(Reg);
342 } else {
343 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
344 for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
345 MergeTyParts.push_back(Unmerge.getReg(i));
346 }
347 }
348 B.buildMergeLikeInstr(Dst, MergeTyParts);
349 }
350 MI.eraseFromParent();
351 return true;
352}
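// For example (a sketch of what the code above produces): a 256-bit load with
// LLTBreakdown = {s128, s128} becomes two 128-bit loads at byte offsets 0 and
// 16 that are merged back into Dst, while an s96 load split as {s64, s32}
// with MergeTy = s32 first unmerges the s64 part into two s32 pieces before
// rebuilding Dst from three s32 values.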
353
354bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
355 LLT MergeTy) {
356 MachineFunction &MF = B.getMF();
357 assert(MI.getNumMemOperands() == 1);
358 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
359 Register Dst = MI.getOperand(0).getReg();
360 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
361 Register Base = MI.getOperand(1).getReg();
362
363 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
364 auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);
365
366 if (WideTy.isScalar()) {
367 B.buildTrunc(Dst, WideLoad);
368 } else {
369 SmallVector<Register, 4> MergeTyParts;
370 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
371
372 LLT DstTy = MRI.getType(Dst);
373 unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
374 for (unsigned i = 0; i < NumElts; ++i) {
375 MergeTyParts.push_back(Unmerge.getReg(i));
376 }
377 B.buildMergeLikeInstr(Dst, MergeTyParts);
378 }
379 MI.eraseFromParent();
380 return true;
381}
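// For example (sketch): an s96 load widened to WideTy = s128 is simply
// truncated back to s96, while a <3 x s32> load widened to <4 x s32> with
// MergeTy = s32 is unmerged into s32 pieces and the first three are merged
// into Dst; the extra element is dropped.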
382
383bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
384 Register Dst = MI.getDstReg();
385 Register Ptr = MI.getPointerReg();
386 MachineMemOperand &MMO = MI.getMMO();
387 unsigned MemSize = 8 * MMO.getSize().getValue();
388
389 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
390
391 if (MI.getOpcode() == G_LOAD) {
392 B.buildLoad(Dst, Ptr, *WideMMO);
393 } else {
394 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
395
396 if (MI.getOpcode() == G_ZEXTLOAD) {
397 APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
398 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
399 B.buildAnd(Dst, Load, MaskCst);
400 } else {
401 assert(MI.getOpcode() == G_SEXTLOAD);
402 B.buildSExtInReg(Dst, Load, MemSize);
403 }
404 }
405
406 MI.eraseFromParent();
407 return true;
408}
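// For example (sketch): a uniform G_ZEXTLOAD of s8 becomes a 32-bit load
// followed by an AND with 0xff, and a G_SEXTLOAD of s16 becomes a 32-bit
// load followed by G_SEXT_INREG to 16 bits; a plain G_LOAD just gets the
// widened MMO.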
409
410bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
411 Register Dst = MI.getOperand(0).getReg();
412 LLT Ty = MRI.getType(Dst);
413 Register Src = MI.getOperand(1).getReg();
414 unsigned Opc = MI.getOpcode();
415 int TrueExtCst = Opc == G_SEXT ? -1 : 1;
416 if (Ty == S32 || Ty == S16) {
417 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
418 auto False = B.buildConstant({VgprRB, Ty}, 0);
419 B.buildSelect(Dst, Src, True, False);
420 } else if (Ty == S64) {
421 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
422 auto False = B.buildConstant({VgprRB_S32}, 0);
423 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
424 MachineInstrBuilder Hi;
425 switch (Opc) {
426 case G_SEXT:
427 Hi = Lo;
428 break;
429 case G_ZEXT:
430 Hi = False;
431 break;
432 case G_ANYEXT:
433 Hi = B.buildUndef({VgprRB_S32});
434 break;
435 default:
436 reportGISelFailure(
437 MF, MORE, "amdgpu-regbanklegalize",
438 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
439 return false;
440 }
441
442 B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
443 } else {
444 reportGISelFailure(
445 MF, MORE, "amdgpu-regbanklegalize",
446 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
447 return false;
448 }
449
450 MI.eraseFromParent();
451 return true;
452}
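// For example (sketch): %dst:vgpr(s32) = G_SEXT %c:vcc(s1) becomes
//   %dst = G_SELECT %c, -1, 0
// and for an s64 destination the same select supplies the low half while the
// high half reuses it (sext), is zero (zext) or is undef (anyext).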
453
454std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
455 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
456 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
457 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
458 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
459 return {Lo.getReg(0), Hi.getReg(0)};
460}
461
462std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
463 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
464 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
465 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
466 return {Lo.getReg(0), Hi.getReg(0)};
467}
468
469std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
470 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
471 auto Lo = PackedS32;
472 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
473 return {Lo.getReg(0), Hi.getReg(0)};
474}
475
476std::pair<Register, Register>
477RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
478 auto [Lo32, Hi32] = unpackAExt(Reg);
479 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
480 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
481}
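// For example (sketch): given a packed sgpr(<2 x s16>) value %v bitcast to
// s32, unpackZExt yields Lo = %v & 0xffff and Hi = %v lshr 16, unpackSExt
// uses G_SEXT_INREG and an arithmetic shift instead, and unpackAExt keeps
// the low half as-is since its upper bits are don't-care.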
482
483bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
484 Register Lo, Hi;
485 switch (MI.getOpcode()) {
486 case AMDGPU::G_SHL: {
487 auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
488 auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
489 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
490 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
491 break;
492 }
493 case AMDGPU::G_LSHR: {
494 auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
495 auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
496 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
497 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
498 break;
499 }
500 case AMDGPU::G_ASHR: {
501 auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
502 auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
503 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
504 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
505 break;
506 }
507 default:
508 reportGISelFailure(
509 MF, MORE, "amdgpu-regbanklegalize",
510 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
511 MI);
512 return false;
513 }
514 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
515 MI.eraseFromParent();
516 return true;
517}
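// For example (sketch): a uniform <2 x s16> G_LSHR zero-extends both value
// and amount halves to s32, performs two scalar 32-bit shifts, and repacks
// the results with G_BUILD_VECTOR_TRUNC; G_SHL uses any-extend and G_ASHR
// sign-extend, matching the cases above.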
518
519bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
520 Register Lo, Hi;
521 switch (MI.getOpcode()) {
522 case AMDGPU::G_SMIN:
523 case AMDGPU::G_SMAX: {
524 // For signed operations, use sign extension
525 auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
526 auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
527 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
528 .getReg(0);
529 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
530 .getReg(0);
531 break;
532 }
533 case AMDGPU::G_UMIN:
534 case AMDGPU::G_UMAX: {
535 // For unsigned operations, use zero extension
536 auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
537 auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
538 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
539 .getReg(0);
540 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
541 .getReg(0);
542 break;
543 }
544 default:
545 reportGISelFailure(
546 MF, MORE, "amdgpu-regbanklegalize",
547 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
548 return false;
549 }
550 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
551 MI.eraseFromParent();
552 return true;
553}
554
555bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
556 auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
557 auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
558 auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
559 auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
560 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
561 {ResLo.getReg(0), ResHi.getReg(0)});
562 MI.eraseFromParent();
563 return true;
564}
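// For example (sketch): a uniform <2 x s16> operation handled here is
// performed as two independent s32 operations on the any-extended halves,
// and the low 16 bits of each result are repacked with G_BUILD_VECTOR_TRUNC.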
565
566static bool isSignedBFE(MachineInstr &MI) {
567 if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))
568 return (GI->is(Intrinsic::amdgcn_sbfe));
569
570 return MI.getOpcode() == AMDGPU::G_SBFX;
571}
572
573bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
574 Register Dst = MI.getOperand(0).getReg();
575 assert(MRI.getType(Dst) == LLT::scalar(64));
576 bool Signed = isSignedBFE(MI);
577 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
578 // Extract bitfield from Src, LSBit is the least-significant bit for the
579 // extraction (field offset) and Width is size of bitfield.
580 Register Src = MI.getOperand(FirstOpnd).getReg();
581 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
582 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
583 // Comments are for signed bitfield extract; unsigned is similar. x is the
584 // sign bit of Src, s the bitfield's sign bit, l its LSB, y its other bits.
585
586 // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
587 unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
588 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
589
590 auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);
591
592 // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
593 // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
594 // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
595 if (!ConstWidth) {
596 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
597 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
598 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
599 MI.eraseFromParent();
600 return true;
601 }
602
603 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
604 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
605 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
606 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
607 auto Zero = B.buildConstant({VgprRB, S32}, 0);
608 unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
609
610 if (WidthImm <= 32) {
611 // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
612 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
613 MachineInstrBuilder Hi;
614 if (Signed) {
615 // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
616 Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
617 } else {
618 // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
619 Hi = Zero;
620 }
621 B.buildMergeLikeInstr(Dst, {Lo, Hi});
622 } else {
623 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
624 // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
625 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
626 B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
627 }
628
629 MI.eraseFromParent();
630 return true;
631}
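// For example (sketch): a divergent s64 G_UBFX with constant Width = 8 and
// LSBit = 4 becomes a 64-bit logical shift right by 4 followed by a 32-bit
// G_UBFX of the low half, with the high half simply set to zero; the signed
// form uses arithmetic shifts and G_SBFX and broadcasts the sign bit instead.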
632
633bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
634 Register DstReg = MI.getOperand(0).getReg();
635 LLT Ty = MRI.getType(DstReg);
636 bool Signed = isSignedBFE(MI);
637 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
638 Register Src = MI.getOperand(FirstOpnd).getReg();
639 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
640 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
641 // For uniform bit field extract there are 4 available instructions, but
642 // LSBit(field offset) and Width(size of bitfield) need to be packed in S32,
643 // field offset in low and size in high 16 bits.
644
645 // Src1 Hi16|Lo16 = Size|FieldOffset
646 auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
647 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
648 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
649 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
650 unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
651 unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
652 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
653
654 // Select the machine instruction directly; because its operands are
655 // constrained to register classes, insert copies to/from reg-bank registers.
656 auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
657 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
658 constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
659 *ST.getRegisterInfo(), RBI);
660
661 B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
662 MI.eraseFromParent();
663 return true;
664}
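// For example (sketch): a uniform s32 G_SBFX with offset %o and width %w
// packs its operands as (%o & 0x3f) | (%w << 16), feeds that to S_BFE_I32,
// and copies the constrained result back to the reg-bank destination.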
665
666bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
667 Register Dst = MI.getOperand(0).getReg();
668 LLT DstTy = MRI.getType(Dst);
669 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
670 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
671 auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
672 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
673 unsigned Opc = MI.getOpcode();
674 auto Flags = MI.getFlags();
675 auto Lo =
676 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
677 auto Hi =
678 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
679 B.buildMergeLikeInstr(Dst, {Lo, Hi});
680 MI.eraseFromParent();
681 return true;
682}
683
684bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
685 Register Dst = MI.getOperand(0).getReg();
686 assert(MRI.getType(Dst) == S64);
687 auto Op1 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(1).getReg());
688 auto Op2 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(2).getReg());
689
690 // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to
691 // match GlobalISel with old regbankselect.
692 auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
693 auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
694 auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
695 auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
696 auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
697 auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
698
699 B.buildMergeLikeInstr(Dst, {Lo, Hi});
700 MI.eraseFromParent();
701 return true;
702}
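// For example (sketch), with a = aHi:aLo and b = bHi:bLo this computes the
// standard 64-bit product from 32-bit pieces:
//   lo(a*b) = aLo*bLo
//   hi(a*b) = umulh(aLo, bLo) + aLo*bHi + aHi*bLo   (mod 2^32)
// which is exactly the Lo/Carry/Sum/Hi sequence built above.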
703
704bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
705 Register Dst = MI.getOperand(0).getReg();
706 assert(MRI.getType(Dst) == V2S16);
707 unsigned Opc = MI.getOpcode();
708 unsigned NumOps = MI.getNumOperands();
709 auto Flags = MI.getFlags();
710
711 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
712
713 if (NumOps == 2) {
714 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
715 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
716 B.buildMergeLikeInstr(Dst, {Lo, Hi});
717 MI.eraseFromParent();
718 return true;
719 }
720
721 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());
722
723 if (NumOps == 3) {
724 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
725 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
726 B.buildMergeLikeInstr(Dst, {Lo, Hi});
727 MI.eraseFromParent();
728 return true;
729 }
730
731 assert(NumOps == 4);
732 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg());
733 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags);
734 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags);
735 B.buildMergeLikeInstr(Dst, {Lo, Hi});
736 MI.eraseFromParent();
737 return true;
738}
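// For example (sketch): a uniform <2 x s16> unary, binary or ternary
// operation is split into two s16 operations on the any-extend-unpacked
// halves of each source, and the two s16 results are repacked into the
// <2 x s16> destination.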
739
740bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
741 Register Dst0 = MI.getOperand(0).getReg();
742 Register Dst1 = MI.getOperand(1).getReg();
743 Register Src0 = MI.getOperand(2).getReg();
744 Register Src1 = MI.getOperand(3).getReg();
745 Register Src2 = MI.getOperand(4).getReg();
746
747 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
748
749 // Keep the multiplication on the SALU.
750 Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
751 Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
752 if (ST.hasScalarMulHiInsts()) {
753 B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
754 } else {
755 auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
756 auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
757 auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
758 buildReadAnyLane(B, DstHi, MulHi.getReg(0), RBI);
759 }
760
761 // Accumulate and produce the "carry-out" bit.
762
763 // The "carry-out" is defined as bit 64 of the result when computed as a
764 // big integer. For unsigned multiply-add, this matches the usual
765 // definition of carry-out.
766 if (mi_match(Src2, MRI, MIPatternMatch::m_ZeroInt())) {
767 // No accumulate: result is just the multiplication, carry is 0.
768 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
769 B.buildConstant(Dst1, 0);
770 } else {
771 // Accumulate: add Src2 to the multiplication result with carry chain.
772 Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
773 Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
774 B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
775
776 auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
777 auto AddHi =
778 B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
779 B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
780 B.buildCopy(Dst1, AddHi.getReg(1));
781 }
782
783 MI.eraseFromParent();
784 return true;
785}
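// For example (sketch): a uniform mad with Src2 = 0 reduces to a scalar
// 32-bit multiply plus a mul-hi for the 64-bit product and a constant-0
// carry-out, while a nonzero Src2 is added with a 32-bit uaddo/uadde chain
// whose final carry becomes Dst1.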
786
787bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
788 Register Dst = MI.getOperand(0).getReg();
789 LLT DstTy = MRI.getType(Dst);
790 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
791 (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
792 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
793 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
794 auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
795 Register Cond = MI.getOperand(1).getReg();
796 auto Flags = MI.getFlags();
797 auto Lo =
798 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
799 auto Hi =
800 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);
801
802 B.buildMergeLikeInstr(Dst, {Lo, Hi});
803 MI.eraseFromParent();
804 return true;
805}
806
807bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
808 auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
809 int Amt = MI.getOperand(2).getImm();
810 Register Lo, Hi;
811 // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
812 if (Amt <= 32) {
813 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
814 if (Amt == 32) {
815 // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
816 Lo = Freeze.getReg(0);
817 } else {
818 // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
819 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
820 }
821
822 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
823 Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
824 } else {
825 // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
826 Lo = Op1.getReg(0);
827 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
828 }
829
830 B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
831 MI.eraseFromParent();
832 return true;
833}
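// For example (sketch): G_SEXT_INREG %x:vgpr(s64), 8 becomes
//   Lo = G_SEXT_INREG %xLo, 8
//   Hi = G_ASHR Lo, 31
// while an amount of 48 keeps Lo unchanged and sign-extends only the high
// half by 48 - 32 = 16 bits.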
834
835bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &MI) {
836 // Split 64-bit find-first-bit operations into 32-bit halves:
837 // (ffbh hi:lo) -> umin(ffbh(hi), uaddsat(ffbh(lo), 32))
838 // (ffbl hi:lo) -> umin(ffbl(lo), uaddsat(ffbl(hi), 32))
839 // (ctlz_zero_undef hi:lo) -> umin(ffbh(hi), add(ffbh(lo), 32))
840 // (cttz_zero_undef hi:lo) -> umin(ffbl(lo), add(ffbl(hi), 32))
841 unsigned Opc = MI.getOpcode();
842
843 // FFBH/FFBL return 0xFFFFFFFF on zero input, so use uaddsat to avoid
844 // wrapping. CTLZ/CTTZ guarantee non-zero input (zero_undef), so plain add
845 // is fine.
846 unsigned FFBOpc;
847 unsigned AddOpc;
848 bool SearchFromMSB;
849 switch (Opc) {
850 case AMDGPU::G_AMDGPU_FFBH_U32:
851 FFBOpc = Opc;
852 AddOpc = AMDGPU::G_UADDSAT;
853 SearchFromMSB = true;
854 break;
855 case AMDGPU::G_AMDGPU_FFBL_B32:
856 FFBOpc = Opc;
857 AddOpc = AMDGPU::G_UADDSAT;
858 SearchFromMSB = false;
859 break;
860 case AMDGPU::G_CTLZ_ZERO_UNDEF:
861 FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
862 AddOpc = AMDGPU::G_ADD;
863 SearchFromMSB = true;
864 break;
865 case AMDGPU::G_CTTZ_ZERO_UNDEF:
866 FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
867 AddOpc = AMDGPU::G_ADD;
868 SearchFromMSB = false;
869 break;
870 default:
871 llvm_unreachable("unexpected opcode in lowerSplitBitCount64To32");
872 }
873
874 auto Unmerge = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
875 Register Lo = Unmerge.getReg(0);
876 Register Hi = Unmerge.getReg(1);
877
878 // MSB-first (FFBH/CTLZ) searches hi first; LSB-first (FFBL/CTTZ) searches
879 // lo first. The secondary half adds 32 to account for the primary half's
880 // width.
881 auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Hi : Lo});
882 auto Secondary =
883 B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Lo : Hi});
884
885 auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
886 {Secondary, B.buildConstant(VgprRB_S32, 32)});
887 B.buildUMin(MI.getOperand(0).getReg(), Primary, Adjusted);
888
889 MI.eraseFromParent();
890 return true;
891}
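// For example (sketch): G_CTTZ_ZERO_UNDEF %x:vgpr(s64) becomes
//   umin(ffbl(xLo), ffbl(xHi) + 32)
// exactly as in the formulas listed at the top of this function.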
892
893bool RegBankLegalizeHelper::lower(MachineInstr &MI,
894 const RegBankLLTMapping &Mapping,
895 WaterfallInfo &WFI) {
896
897 switch (Mapping.LoweringMethod) {
898 case DoNotLower:
899 break;
900 case VccExtToSel:
901 return lowerVccExtToSel(MI);
902 case UniExtToSel: {
903 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
904 auto True = B.buildConstant({SgprRB, Ty},
905 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
906 auto False = B.buildConstant({SgprRB, Ty}, 0);
907 // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is compare.
908 // We are making select here. S1 cond was already 'any-extended to S32' +
909 // 'AND with 1 to clean high bits' by Sgpr32AExtBoolInReg.
910 B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
911 False);
912 MI.eraseFromParent();
913 return true;
914 }
915 case UnpackBitShift:
916 return lowerUnpackBitShift(MI);
917 case UnpackMinMax:
918 return lowerUnpackMinMax(MI);
919 case ScalarizeToS16:
920 return lowerSplitTo16(MI);
921 case Ext32To64: {
922 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
923 MachineInstrBuilder Hi;
924 switch (MI.getOpcode()) {
925 case AMDGPU::G_ZEXT: {
926 Hi = B.buildConstant({RB, S32}, 0);
927 break;
928 }
929 case AMDGPU::G_SEXT: {
930 // Replicate sign bit from 32-bit extended part.
931 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
932 Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
933 break;
934 }
935 case AMDGPU::G_ANYEXT: {
936 Hi = B.buildUndef({RB, S32});
937 break;
938 }
939 default:
940 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
941 "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
942 MI);
943 return false;
944 }
945
946 B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
947 {MI.getOperand(1).getReg(), Hi});
948 MI.eraseFromParent();
949 return true;
950 }
951 case UniCstExt: {
952 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
953 B.buildConstant(MI.getOperand(0).getReg(), ConstVal);
954
955 MI.eraseFromParent();
956 return true;
957 }
958 case VgprToVccCopy: {
959 Register Src = MI.getOperand(1).getReg();
960 LLT Ty = MRI.getType(Src);
961 // Take lowest bit from each lane and put it in lane mask.
962 // Lowering via compare, but we need to clean high bits first as compare
963 // compares all bits in register.
964 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
965 if (Ty == S64) {
966 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
967 auto One = B.buildConstant(VgprRB_S32, 1);
968 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
969 auto Zero = B.buildConstant(VgprRB_S32, 0);
970 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
971 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
972 } else {
973 assert(Ty == S32 || Ty == S16);
974 auto One = B.buildConstant({VgprRB, Ty}, 1);
975 B.buildAnd(BoolSrc, Src, One);
976 }
977 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
978 B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
979 MI.eraseFromParent();
980 return true;
981 }
982 case V_BFE:
983 return lowerV_BFE(MI);
984 case S_BFE:
985 return lowerS_BFE(MI);
986 case UniMAD64:
987 return lowerUniMAD64(MI);
988 case UniMul64: {
989 B.buildMul(MI.getOperand(0), MI.getOperand(1), MI.getOperand(2));
990 MI.eraseFromParent();
991 return true;
992 }
993 case DivSMulToMAD: {
994 auto Op1 = B.buildTrunc(VgprRB_S32, MI.getOperand(1));
995 auto Op2 = B.buildTrunc(VgprRB_S32, MI.getOperand(2));
996 auto Zero = B.buildConstant({VgprRB, S64}, 0);
997
998 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
999 ? AMDGPU::G_AMDGPU_MAD_U64_U32
1000 : AMDGPU::G_AMDGPU_MAD_I64_I32;
1001
1002 B.buildInstr(NewOpc, {MI.getOperand(0).getReg(), {SgprRB, S32}},
1003 {Op1, Op2, Zero});
1004 MI.eraseFromParent();
1005 return true;
1006 }
1007 case SplitTo32:
1008 return lowerSplitTo32(MI);
1009 case SplitTo32Mul:
1010 return lowerSplitTo32Mul(MI);
1011 case SplitTo32Select:
1012 return lowerSplitTo32Select(MI);
1013 case SplitTo32SExtInReg:
1014 return lowerSplitTo32SExtInReg(MI);
1015 case SplitLoad: {
1016 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
1017 unsigned Size = DstTy.getSizeInBits();
1018 // Even split to 128-bit loads
1019 if (Size > 128) {
1020 LLT B128;
1021 if (DstTy.isVector()) {
1022 LLT EltTy = DstTy.getElementType();
1023 B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1024 } else {
1025 B128 = LLT::scalar(128);
1026 }
1027 if (Size / 128 == 2)
1028 splitLoad(MI, {B128, B128});
1029 else if (Size / 128 == 4)
1030 splitLoad(MI, {B128, B128, B128, B128});
1031 else {
1032 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1033 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1034 MI);
1035 return false;
1036 }
1037 }
1038 // 64 and 32 bit load
1039 else if (DstTy == S96)
1040 splitLoad(MI, {S64, S32}, S32);
1041 else if (DstTy == V3S32)
1042 splitLoad(MI, {V2S32, S32}, S32);
1043 else if (DstTy == V6S16)
1044 splitLoad(MI, {V4S16, V2S16}, V2S16);
1045 else {
1046 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1047 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1048 MI);
1049 return false;
1050 }
1051 return true;
1052 }
1053 case WidenLoad: {
1054 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
1055 if (DstTy == S96)
1056 widenLoad(MI, S128);
1057 else if (DstTy == V3S32)
1058 widenLoad(MI, V4S32, S32);
1059 else if (DstTy == V6S16)
1060 widenLoad(MI, V8S16, V2S16);
1061 else {
1062 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1063 "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
1064 MI);
1065 return false;
1066 }
1067 return true;
1068 }
1069 case UnpackAExt:
1070 return lowerUnpackAExt(MI);
1071 case WidenMMOToS32:
1072 return widenMMOToS32(cast<GAnyLoad>(MI));
1073 case VerifyAllSgpr: {
1074 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1075 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1076 }));
1077 return true;
1078 }
1079 case ApplyAllVgpr: {
1080 assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) {
1081 return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
1082 }));
1083 B.setInstrAndDebugLoc(MI);
1084 for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) {
1085 Register Reg = MI.getOperand(i).getReg();
1086 if (MRI.getRegBank(Reg) != VgprRB) {
1087 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1088 MI.getOperand(i).setReg(Copy.getReg(0));
1089 }
1090 }
1091 return true;
1092 }
1093 case UnmergeToShiftTrunc: {
1094 GUnmerge *Unmerge = dyn_cast<GUnmerge>(&MI);
1095 LLT Ty = MRI.getType(Unmerge->getSourceReg());
1096 if (Ty.getSizeInBits() % 32 != 0) {
1097 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1098 "AMDGPU RegBankLegalize: unmerge not multiple of 32",
1099 MI);
1100 return false;
1101 }
1102
1103 B.setInstrAndDebugLoc(MI);
1104 if (Ty.getSizeInBits() > 32) {
1105 auto UnmergeV2S16 =
1106 B.buildUnmerge({SgprRB, V2S16}, Unmerge->getSourceReg());
1107 for (unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
1108 auto [Dst0S32, Dst1S32] =
1109 unpackAExt(UnmergeV2S16->getOperand(i).getReg());
1110 B.buildTrunc(MI.getOperand(i * 2).getReg(), Dst0S32);
1111 B.buildTrunc(MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
1112 }
1113 } else {
1114 auto [Dst0S32, Dst1S32] = unpackAExt(MI.getOperand(2).getReg());
1115 B.buildTrunc(MI.getOperand(0).getReg(), Dst0S32);
1116 B.buildTrunc(MI.getOperand(1).getReg(), Dst1S32);
1117 }
1118
1119 MI.eraseFromParent();
1120 return true;
1121 }
1123 Register Dst = MI.getOperand(0).getReg();
1124 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1125 B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());
1126 MI.getOperand(0).setReg(NewDst);
1127 B.buildTrunc(Dst, NewDst);
1128
1129 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1130 Register UseReg = MI.getOperand(i).getReg();
1131
1132 auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
1133 MachineBasicBlock *DefMBB = DefMI->getParent();
1134
1135 B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
1136
1137 auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
1138 MI.getOperand(i).setReg(NewUse.getReg(0));
1139 }
1140 break;
1141 }
1142 case VerifyAllSgprGPHI: {
1143 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1144 if (Op.isMBB())
1145 return true;
1146 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1147 }));
1148 return true;
1149 }
1151 assert(MRI.getRegBankOrNull(MI.getOperand(0).getReg()) == VgprRB);
1152 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1153 if (Op.isMBB())
1154 return true;
1155 const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
1156 return RB == VgprRB || RB == SgprRB;
1157 }));
1158 return true;
1159 }
1160 case ApplyINTRIN_IMAGE:
1161 return applyRegisterBanksINTRIN_IMAGE(MI);
1163 return lowerSplitBitCount64To32(MI);
1164 }
1165
1166 if (!WFI.SgprWaterfallOperandRegs.empty()) {
1167 if (!executeInWaterfallLoop(B, WFI))
1168 return false;
1169 }
1170 return true;
1171}
1172
1173LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
1174 switch (ID) {
1175 case Vcc:
1176 case UniInVcc:
1177 return LLT::scalar(1);
1178 case Sgpr16:
1179 case Vgpr16:
1180 case UniInVgprS16:
1181 return LLT::scalar(16);
1182 case Sgpr32:
1183 case Sgpr32_WF:
1184 case Sgpr32Trunc:
1185 case Sgpr32AExt:
1186 case Sgpr32AExtBoolInReg:
1187 case Sgpr32SExt:
1188 case Sgpr32ZExt:
1189 case UniInVgprS32:
1190 case Sgpr32ToVgprDst:
1191 case Vgpr32:
1192 case Vgpr32AExt:
1193 case Vgpr32SExt:
1194 case Vgpr32ZExt:
1195 return LLT::scalar(32);
1196 case Sgpr64:
1197 case Vgpr64:
1198 case UniInVgprS64:
1199 case Sgpr64ToVgprDst:
1200 return LLT::scalar(64);
1201 case Sgpr128:
1202 case Vgpr128:
1203 return LLT::scalar(128);
1204 case SgprP0:
1205 case SgprP0Call_WF:
1206 case VgprP0:
1207 return LLT::pointer(0, 64);
1208 case SgprP1:
1209 case VgprP1:
1210 return LLT::pointer(1, 64);
1211 case SgprP2:
1212 case VgprP2:
1213 return LLT::pointer(2, 32);
1214 case SgprP3:
1215 case VgprP3:
1216 return LLT::pointer(3, 32);
1217 case SgprP4:
1218 case SgprP4Call_WF:
1219 case VgprP4:
1220 return LLT::pointer(4, 64);
1221 case SgprP5:
1222 case VgprP5:
1223 return LLT::pointer(5, 32);
1224 case SgprP8:
1225 return LLT::pointer(8, 128);
1226 case SgprV2S16:
1227 case VgprV2S16:
1228 case UniInVgprV2S16:
1229 return LLT::fixed_vector(2, 16);
1230 case SgprV2S32:
1231 case VgprV2S32:
1232 case UniInVgprV2S32:
1233 return LLT::fixed_vector(2, 32);
1234 case VgprV3S32:
1235 return LLT::fixed_vector(3, 32);
1236 case VgprV4S16:
1237 return LLT::fixed_vector(4, 16);
1238 case SgprV4S32:
1239 case SgprV4S32_WF:
1240 case VgprV4S32:
1241 case UniInVgprV4S32:
1242 return LLT::fixed_vector(4, 32);
1243 case VgprV8S32:
1244 return LLT::fixed_vector(8, 32);
1245 case VgprV2S64:
1246 case UniInVgprV2S64:
1247 return LLT::fixed_vector(2, 64);
1248 default:
1249 return LLT();
1250 }
1251}
1252
1253LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
1254 switch (ID) {
1255 case SgprB32:
1256 case VgprB32:
1257 case SgprB32_M0:
1258 case SgprB32_ReadFirstLane:
1259 case UniInVgprB32:
1260 if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
1261 isAnyPtr(Ty, 32))
1262 return Ty;
1263 return LLT();
1264 case SgprPtr32:
1265 case VgprPtr32:
1266 return isAnyPtr(Ty, 32) ? Ty : LLT();
1267 case SgprPtr64:
1268 case VgprPtr64:
1269 return isAnyPtr(Ty, 64) ? Ty : LLT();
1270 case SgprPtr128:
1271 case VgprPtr128:
1272 return isAnyPtr(Ty, 128) ? Ty : LLT();
1273 case SgprB64:
1274 case VgprB64:
1275 case UniInVgprB64:
1276 if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
1277 Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
1278 return Ty;
1279 return LLT();
1280 case SgprB96:
1281 case VgprB96:
1282 case UniInVgprB96:
1283 if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
1284 Ty == LLT::fixed_vector(6, 16))
1285 return Ty;
1286 return LLT();
1287 case SgprB128:
1288 case VgprB128:
1289 case UniInVgprB128:
1290 if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
1291 Ty == LLT::fixed_vector(2, 64) || Ty == LLT::fixed_vector(8, 16) ||
1292 isAnyPtr(Ty, 128))
1293 return Ty;
1294 return LLT();
1295 case VgprB160:
1296 case UniInVgprB160:
1297 if (Ty.getSizeInBits() == 160)
1298 return Ty;
1299 return LLT();
1300 case SgprB256:
1301 case VgprB256:
1302 case UniInVgprB256:
1303 if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
1304 Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
1305 return Ty;
1306 return LLT();
1307 case SgprB512:
1308 case VgprB512:
1309 case UniInVgprB512:
1310 if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
1311 Ty == LLT::fixed_vector(8, 64))
1312 return Ty;
1313 return LLT();
1314 case SgprBRC: {
1315 const SIRegisterInfo *TRI =
1316 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1317 unsigned LLTSize = Ty.getSizeInBits();
1318 if (LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize))
1319 return Ty;
1320 return LLT();
1321 }
1322 case VgprBRC: {
1323 const SIRegisterInfo *TRI =
1324 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1325 if (TRI->getSGPRClassForBitWidth(Ty.getSizeInBits()))
1326 return Ty;
1327 return LLT();
1328 }
1329 default:
1330 return LLT();
1331 }
1332}
1333
1334const RegisterBank *
1335RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
1336 switch (ID) {
1337 case Vcc:
1338 return VccRB;
1339 case Sgpr16:
1340 case Sgpr32:
1341 case Sgpr32_WF:
1342 case Sgpr64:
1343 case Sgpr128:
1344 case SgprP0:
1345 case SgprP0Call_WF:
1346 case SgprP1:
1347 case SgprP2:
1348 case SgprP3:
1349 case SgprP4:
1350 case SgprP4Call_WF:
1351 case SgprP5:
1352 case SgprP8:
1353 case SgprPtr32:
1354 case SgprPtr64:
1355 case SgprPtr128:
1356 case SgprV2S16:
1357 case SgprV2S32:
1358 case SgprV4S32:
1359 case SgprV4S32_WF:
1360 case SgprB32:
1361 case SgprB64:
1362 case SgprB96:
1363 case SgprB128:
1364 case SgprB256:
1365 case SgprB512:
1366 case SgprBRC:
1367 case UniInVcc:
1368 case UniInVgprS16:
1369 case UniInVgprS32:
1370 case UniInVgprS64:
1371 case UniInVgprV2S16:
1372 case UniInVgprV2S32:
1373 case UniInVgprV4S32:
1374 case UniInVgprV2S64:
1375 case UniInVgprB32:
1376 case UniInVgprB64:
1377 case UniInVgprB96:
1378 case UniInVgprB128:
1379 case UniInVgprB160:
1380 case UniInVgprB256:
1381 case UniInVgprB512:
1382 case Sgpr32Trunc:
1383 case Sgpr32AExt:
1384 case Sgpr32AExtBoolInReg:
1385 case Sgpr32SExt:
1386 case Sgpr32ZExt:
1387 return SgprRB;
1388 case Vgpr16:
1389 case Vgpr32:
1390 case Vgpr64:
1391 case Vgpr128:
1392 case VgprP0:
1393 case VgprP1:
1394 case VgprP2:
1395 case VgprP3:
1396 case VgprP4:
1397 case VgprP5:
1398 case VgprPtr32:
1399 case VgprPtr64:
1400 case VgprPtr128:
1401 case VgprV2S16:
1402 case VgprV2S32:
1403 case VgprV2S64:
1404 case VgprV3S32:
1405 case VgprV4S16:
1406 case VgprV4S32:
1407 case VgprV8S32:
1408 case VgprB32:
1409 case VgprB64:
1410 case VgprB96:
1411 case VgprB128:
1412 case VgprB160:
1413 case VgprB256:
1414 case VgprB512:
1415 case VgprBRC:
1416 case Vgpr32AExt:
1417 case Vgpr32SExt:
1418 case Vgpr32ZExt:
1419 case Sgpr32ToVgprDst:
1420 case Sgpr64ToVgprDst:
1421 return VgprRB;
1422 default:
1423 return nullptr;
1424 }
1425}
1426
1427bool RegBankLegalizeHelper::applyMappingDst(
1428 MachineInstr &MI, unsigned &OpIdx,
1429 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
1430 // Defs start from operand 0
1431 for (; OpIdx < MethodIDs.size(); ++OpIdx) {
1432 if (MethodIDs[OpIdx] == None)
1433 continue;
1434 MachineOperand &Op = MI.getOperand(OpIdx);
1435 Register Reg = Op.getReg();
1436 LLT Ty = MRI.getType(Reg);
1437 [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);
1438
1439 switch (MethodIDs[OpIdx]) {
1440 // vcc, sgpr and vgpr scalars, pointers and vectors
1441 case Vcc:
1442 case Sgpr16:
1443 case Sgpr32:
1444 case Sgpr64:
1445 case Sgpr128:
1446 case SgprP0:
1447 case SgprP1:
1448 case SgprP3:
1449 case SgprP4:
1450 case SgprP5:
1451 case SgprP8:
1452 case SgprV2S16:
1453 case SgprV2S32:
1454 case SgprV4S32:
1455 case Vgpr16:
1456 case Vgpr32:
1457 case Vgpr64:
1458 case Vgpr128:
1459 case VgprP0:
1460 case VgprP1:
1461 case VgprP2:
1462 case VgprP3:
1463 case VgprP4:
1464 case VgprP5:
1465 case VgprV2S16:
1466 case VgprV2S32:
1467 case VgprV2S64:
1468 case VgprV3S32:
1469 case VgprV4S16:
1470 case VgprV4S32:
1471 case VgprV8S32: {
1472 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1473 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1474 break;
1475 }
1476 // sgpr and vgpr B-types
1477 case SgprB32:
1478 case SgprB64:
1479 case SgprB96:
1480 case SgprB128:
1481 case SgprB256:
1482 case SgprB512:
1483 case SgprBRC:
1484 case SgprPtr32:
1485 case SgprPtr64:
1486 case SgprPtr128:
1487 case VgprB32:
1488 case VgprB64:
1489 case VgprB96:
1490 case VgprB128:
1491 case VgprB160:
1492 case VgprB256:
1493 case VgprB512:
1494 case VgprBRC:
1495 case VgprPtr32:
1496 case VgprPtr64:
1497 case VgprPtr128: {
1498 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1499 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1500 break;
1501 }
1502 // uniform in vcc/vgpr: scalars, vectors and B-types
1503 case UniInVcc: {
1504 assert(Ty == S1);
1505 assert(RB == SgprRB);
1506 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
1507 Op.setReg(NewDst);
1508 if (!MRI.use_empty(Reg)) {
1509 auto CopyS32_Vcc =
1510 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
1511 B.buildTrunc(Reg, CopyS32_Vcc);
1512 }
1513 break;
1514 }
1515 case UniInVgprS16: {
1516 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1517 assert(RB == SgprRB);
1518 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
1519 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
1520 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
1521 Op.setReg(NewVgprDstS16);
1522 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
1523 buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
1524 B.buildTrunc(Reg, NewSgprDstS32);
1525 break;
1526 }
1527 case UniInVgprS32:
1528 case UniInVgprS64:
1529 case UniInVgprV2S16:
1530 case UniInVgprV2S32:
1531 case UniInVgprV4S32:
1532 case UniInVgprV2S64: {
1533 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1534 assert(RB == SgprRB);
1535 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1536 Op.setReg(NewVgprDst);
1537 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1538 break;
1539 }
1540 case UniInVgprB32:
1541 case UniInVgprB64:
1542 case UniInVgprB96:
1543 case UniInVgprB128:
1544 case UniInVgprB160:
1545 case UniInVgprB256:
1546 case UniInVgprB512: {
1547 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1548 assert(RB == SgprRB);
1549 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1550 Op.setReg(NewVgprDst);
1551 AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1552 break;
1553 }
1554 // sgpr trunc
1555 case Sgpr32Trunc: {
1556 assert(Ty.getSizeInBits() < 32);
1557 assert(RB == SgprRB);
1558 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1559 Op.setReg(NewDst);
1560 if (!MRI.use_empty(Reg))
1561 B.buildTrunc(Reg, NewDst);
1562 break;
1563 }
1564 case Sgpr32ToVgprDst:
1565 case Sgpr64ToVgprDst: {
1566 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1567 assert(RB == VgprRB);
1568 Op.setReg(MRI.createVirtualRegister({SgprRB, Ty}));
1569 B.buildCopy(Reg, Op.getReg());
1570 break;
1571 }
1572 case InvalidMapping: {
1573 reportGISelFailure(
1574 MF, MORE, "amdgpu-regbanklegalize",
1575 "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
1576 return false;
1577 }
1578 default:
1579 reportGISelFailure(
1580 MF, MORE, "amdgpu-regbanklegalize",
1581 "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
1582 return false;
1583 }
1584 }
1585
1586 return true;
1587}
1588
1589bool RegBankLegalizeHelper::applyMappingSrc(
1590 MachineInstr &MI, unsigned &OpIdx,
1591 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
1592 WaterfallInfo &WFI) {
1593 for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
1594 if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
1595 continue;
1596
1597 MachineOperand &Op = MI.getOperand(OpIdx);
1598 Register Reg = Op.getReg();
1599 LLT Ty = MRI.getType(Reg);
1600 const RegisterBank *RB = MRI.getRegBank(Reg);
1601
1602 switch (MethodIDs[i]) {
1603 case Vcc: {
1604 assert(Ty == S1);
1605 assert(RB == VccRB || RB == SgprRB);
1606 if (RB == SgprRB) {
1607 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1608 auto CopyVcc_Scc =
1609 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
1610 Op.setReg(CopyVcc_Scc.getReg(0));
1611 }
1612 break;
1613 }
1614 // sgpr scalars, pointers and vectors
1615 case Sgpr16:
1616 case Sgpr32:
1617 case Sgpr64:
1618 case Sgpr128:
1619 case SgprP0:
1620 case SgprP1:
1621 case SgprP3:
1622 case SgprP4:
1623 case SgprP5:
1624 case SgprP8:
1625 case SgprV2S16:
1626 case SgprV2S32:
1627 case SgprV4S32: {
1628 assert(Ty == getTyFromID(MethodIDs[i]));
1629 assert(RB == getRegBankFromID(MethodIDs[i]));
1630 break;
1631 }
1632 // sgpr B-types
1633 case SgprB32:
1634 case SgprB64:
1635 case SgprB96:
1636 case SgprB128:
1637 case SgprB256:
1638 case SgprB512:
1639 case SgprBRC:
1640 case SgprPtr32:
1641 case SgprPtr64:
1642 case SgprPtr128: {
1643 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1644 assert(RB == getRegBankFromID(MethodIDs[i]));
1645 break;
1646 }
1647 // vgpr scalars, pointers and vectors
1648 case Vgpr16:
1649 case Vgpr32:
1650 case Vgpr64:
1651 case Vgpr128:
1652 case VgprP0:
1653 case VgprP1:
1654 case VgprP2:
1655 case VgprP3:
1656 case VgprP4:
1657 case VgprP5:
1658 case VgprV2S16:
1659 case VgprV2S32:
1660 case VgprV2S64:
1661 case VgprV3S32:
1662 case VgprV4S16:
1663 case VgprV4S32:
1664 case VgprV8S32: {
1665 assert(Ty == getTyFromID(MethodIDs[i]));
1666 if (RB != VgprRB) {
1667 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
1668 Op.setReg(CopyToVgpr.getReg(0));
1669 }
1670 break;
1671 }
1672 // vgpr B-types
1673 case VgprB32:
1674 case VgprB64:
1675 case VgprB96:
1676 case VgprB128:
1677 case VgprB160:
1678 case VgprB256:
1679 case VgprB512:
1680 case VgprBRC:
1681 case VgprPtr32:
1682 case VgprPtr64:
1683 case VgprPtr128: {
1684 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1685 if (RB != VgprRB) {
1686 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
1687 Op.setReg(CopyToVgpr.getReg(0));
1688 }
1689 break;
1690 }
1691 // sgpr waterfall, scalars, and vectors
1692 case Sgpr32_WF:
1693 case SgprV4S32_WF: {
1694 assert(Ty == getTyFromID(MethodIDs[i]));
1695 if (RB != SgprRB) {
1696 WFI.SgprWaterfallOperandRegs.insert(Reg);
1697 if (!WFI.Start.isValid()) {
1698 WFI.Start = MI.getIterator();
1699 WFI.End = std::next(MI.getIterator());
1700 }
1701 }
1702 break;
1703 }
1704 case SgprP0Call_WF:
1705 case SgprP4Call_WF: {
1706 assert(Ty == getTyFromID(MethodIDs[i]));
1707 if (RB != SgprRB) {
1708 WFI.SgprWaterfallOperandRegs.insert(Reg);
1709
1710 // Find the ADJCALLSTACKUP before the call.
1711 MachineBasicBlock::iterator Start = MI.getIterator();
1712 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
1713 --Start;
1714
1715 // Find the ADJCALLSTACKDOWN after the call (include it in range).
1716 MachineBasicBlock::iterator End = MI.getIterator();
1717 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
1718 ++End;
1719 ++End;
1720
1721 B.setInsertPt(*MI.getParent(), Start);
1722 WFI.Start = Start;
1723 WFI.End = End;
1724 }
1725 break;
1726 }
1727 case SgprB32_M0:
1728 case SgprB32_ReadFirstLane: {
1729 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1730 if (RB == SgprRB)
1731 break;
1732 assert(RB == VgprRB);
1733 Register NewSGPR32 = MRI.createVirtualRegister({SgprRB, Ty});
1734 buildReadFirstLane(B, NewSGPR32, Op.getReg(), RBI);
1735 Op.setReg(NewSGPR32);
1736 break;
1737 }
1738 // sgpr and vgpr scalars with extend
1739 case Sgpr32AExt: {
1740 // Note: this ext allows S1, and it is meant to be combined away.
1741 assert(Ty.getSizeInBits() < 32);
1742 assert(RB == SgprRB);
1743 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1744 Op.setReg(Aext.getReg(0));
1745 break;
1746 }
1747 case Sgpr32AExtBoolInReg: {
1748 // Note: this ext allows S1, and it is meant to be combined away.
1749 assert(Ty.getSizeInBits() == 1);
1750 assert(RB == SgprRB);
1751 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
 1752 // Zext of SgprS1 is not legal, make AND with 1 instead. This instruction is
 1753 // most of the time meant to be combined away in AMDGPURegBankCombiner.
1754 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
1755 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
1756 Op.setReg(BoolInReg.getReg(0));
1757 break;
1758 }
1759 case Sgpr32SExt: {
1760 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
1761 assert(RB == SgprRB);
1762 auto Sext = B.buildSExt(SgprRB_S32, Reg);
1763 Op.setReg(Sext.getReg(0));
1764 break;
1765 }
1766 case Sgpr32ZExt: {
1767 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
1768 assert(RB == SgprRB);
1769 auto Zext = B.buildZExt({SgprRB, S32}, Reg);
1770 Op.setReg(Zext.getReg(0));
1771 break;
1772 }
1773 case Vgpr32AExt: {
1774 assert(Ty.getSizeInBits() < 32);
1775 assert(RB == VgprRB);
1776 auto Aext = B.buildAnyExt({VgprRB, S32}, Reg);
1777 Op.setReg(Aext.getReg(0));
1778 break;
1779 }
1780 case Vgpr32SExt: {
1781 // Note this ext allows S1, and it is meant to be combined away.
1782 assert(Ty.getSizeInBits() < 32);
1783 assert(RB == VgprRB);
1784 auto Sext = B.buildSExt({VgprRB, S32}, Reg);
1785 Op.setReg(Sext.getReg(0));
1786 break;
1787 }
1788 case Vgpr32ZExt: {
1789 // Note this ext allows S1, and it is meant to be combined away.
1790 assert(Ty.getSizeInBits() < 32);
1791 assert(RB == VgprRB);
1792 auto Zext = B.buildZExt({VgprRB, S32}, Reg);
1793 Op.setReg(Zext.getReg(0));
1794 break;
1795 }
1796 default:
 1797 reportGISelFailure(
 1798 MF, MORE, "amdgpu-regbanklegalize",
1799 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
1800 return false;
1801 }
1802 }
1803 return true;
1804}
1805
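// Assert-only helper: returns true if every register operand in the range
// [StartOpIdx, EndOpIdx] is assigned to the given register bank.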
1806[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
1807 const RegisterBank *RB,
 1808 MachineRegisterInfo &MRI,
 1809 unsigned StartOpIdx,
1810 unsigned EndOpIdx) {
1811 for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
1812 if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
1813 return false;
1814 }
1815 return true;
1816}
1817
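// Trivial mapping: every operand takes the register bank of the first def.
// For the sgpr bank all operands must already be sgpr (asserted); for the
// vgpr bank, operands in other banks are copied to vgprs.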
 1818 void RegBankLegalizeHelper::applyMappingTrivial(MachineInstr &MI) {
 1819 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
1820 // Put RB on all registers
1821 unsigned NumDefs = MI.getNumDefs();
1822 unsigned NumOperands = MI.getNumOperands();
1823
1824 assert(verifyRegBankOnOperands(MI, RB, MRI, 0, NumDefs - 1));
1825 if (RB == SgprRB)
1826 assert(verifyRegBankOnOperands(MI, RB, MRI, NumDefs, NumOperands - 1));
1827
1828 if (RB == VgprRB) {
1829 B.setInstr(MI);
1830 for (unsigned i = NumDefs; i < NumOperands; ++i) {
1831 Register Reg = MI.getOperand(i).getReg();
1832 if (MRI.getRegBank(Reg) != RB) {
1833 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1834 MI.getOperand(i).setReg(Copy.getReg(0));
1835 }
1836 }
1837 }
1838}
1839
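// Image intrinsics: results and address operands go to vgprs, while the
// resource/sampler descriptors must be sgpr; divergent descriptors are
// handled with a waterfall loop below.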
1840bool RegBankLegalizeHelper::applyRegisterBanksINTRIN_IMAGE(MachineInstr &MI) {
1841 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
 1842 AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI));
 1843 assert(RSrcIntrin && RSrcIntrin->IsImage);
1844
1845 unsigned RsrcIdx = RSrcIntrin->RsrcArg;
1846 const unsigned NumDefs = MI.getNumExplicitDefs();
1847
1848 // The reported argument index is relative to the IR intrinsic call arguments,
1849 // so we need to shift by the number of defs and the intrinsic ID.
1850 RsrcIdx += NumDefs + 1;
1851
1852 MachineBasicBlock *MBB = MI.getParent();
1853 B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(MI.getIterator())));
1854
 1855 // Defs (for image loads with return) are vgpr.
1856 for (unsigned i = 0; i < NumDefs; ++i) {
1857 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(i).getReg());
1858 if (RB == VgprRB)
1859 continue;
1860
1861 Register Reg = MI.getOperand(i).getReg();
1862 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(Reg)});
1863 MI.getOperand(i).setReg(NewVgprDst);
1864 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1865 }
1866
1867 B.setInstrAndDebugLoc(MI);
1868
 1869 // Register uses (before RsrcIdx) are vgpr.
1870 for (unsigned i = 1; i < RsrcIdx; ++i) {
1871 MachineOperand &Op = MI.getOperand(i);
1872 if (!Op.isReg())
1873 continue;
1874
1875 Register Reg = Op.getReg();
1876 if (!Reg.isVirtual())
1877 continue;
1878
1879 if (MRI.getRegBank(Reg) == VgprRB)
1880 continue;
1881
1882 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1883 Op.setReg(Copy.getReg(0));
1884 }
1885
1886 SmallSet<Register, 4> OpsToWaterfall;
1887
 1888 // Register use RsrcIdx (and RsrcIdx+1 in some cases) must be sgpr.
1889 for (unsigned i = RsrcIdx; i < MI.getNumOperands(); ++i) {
1890 MachineOperand &Op = MI.getOperand(i);
1891 if (!Op.isReg())
1892 continue;
1893
1894 Register Reg = Op.getReg();
1895 if (MRI.getRegBank(Reg) != SgprRB)
1896 OpsToWaterfall.insert(Reg);
1897 }
1898
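// If any descriptor operand is not already sgpr, wrap the instruction in a
// waterfall loop over those registers.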
1899 if (!OpsToWaterfall.empty()) {
1900 MachineBasicBlock::iterator MII = MI.getIterator();
1901 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
1902 }
1903
1904 return true;
1905}