AMDGPURegBankLegalizeHelper.cpp
1//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Implements actual lowering algorithms for each ID that can be used in
10/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
11//
12//===----------------------------------------------------------------------===//
13
16#include "AMDGPUInstrInfo.h"
19#include "GCNSubtarget.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27
28#define DEBUG_TYPE "amdgpu-regbanklegalize"
29
30using namespace llvm;
31using namespace AMDGPU;
32
33RegBankLegalizeHelper::RegBankLegalizeHelper(
34 MachineIRBuilder &B, const MachineUniformityInfo &MUI,
35 const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
36 : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B),
37 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
38 RBLRules(RBLRules), IsWave32(ST.isWave32()),
39 SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
40 VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
41 VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
42
43bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
44 const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
45 if (!RuleSet) {
46 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
47 "No AMDGPU RegBankLegalize rules defined for opcode",
48 MI);
49 return false;
50 }
51
52 const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
53 if (!Mapping) {
54 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
55 "AMDGPU RegBankLegalize: none of the rules defined with "
56 "'Any' for MI's opcode matched MI",
57 MI);
58 return false;
59 }
60
61 WaterfallInfo WFI;
62 unsigned OpIdx = 0;
63 if (Mapping->DstOpMapping.size() > 0) {
64 B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
65 if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping))
66 return false;
67 }
68 if (Mapping->SrcOpMapping.size() > 0) {
69 B.setInstr(MI);
70 if (!applyMappingSrc(MI, OpIdx, Mapping->SrcOpMapping, WFI))
71 return false;
72 }
73
74 if (!lower(MI, *Mapping, WFI))
75 return false;
76
77 return true;
78}
79
80bool RegBankLegalizeHelper::executeInWaterfallLoop(MachineIRBuilder &B,
81 const WaterfallInfo &WFI) {
82 assert(WFI.Start.isValid() && WFI.End.isValid() &&
83 "Waterfall range not initialized");
84
85 // Track use registers which have already been expanded with a readfirstlane
86 // sequence. This may have multiple uses if moving a sequence.
87 DenseMap<Register, Register> WaterfalledRegMap;
88
89 MachineBasicBlock &MBB = B.getMBB();
90 MachineFunction &MF = B.getMF();
91
92 MachineBasicBlock::iterator BeginIt = WFI.Start;
93 MachineBasicBlock::iterator EndIt = WFI.End;
94
95 const SIRegisterInfo *TRI = ST.getRegisterInfo();
96 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
97 unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
98 if (IsWave32) {
99 MovExecOpc = AMDGPU::S_MOV_B32;
100 MovExecTermOpc = AMDGPU::S_MOV_B32_term;
101 XorTermOpc = AMDGPU::S_XOR_B32_term;
102 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
103 ExecReg = AMDGPU::EXEC_LO;
104 } else {
105 MovExecOpc = AMDGPU::S_MOV_B64;
106 MovExecTermOpc = AMDGPU::S_MOV_B64_term;
107 XorTermOpc = AMDGPU::S_XOR_B64_term;
108 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
109 ExecReg = AMDGPU::EXEC;
110 }
111
112#ifndef NDEBUG
113 const int OrigRangeSize = std::distance(BeginIt, EndIt);
114#endif
115
116 MachineRegisterInfo &MRI = *B.getMRI();
117 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
118 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
119
120 // Don't bother using generic instructions/registers for the exec mask.
121 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
122
123 Register SavedExec = MRI.createVirtualRegister(WaveRC);
124
125 // To insert the loop we need to split the block. Move everything before
126 // this point to a new block, and insert a new empty block before this
127 // instruction.
128 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
129 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
130 MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
131 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
132 MachineFunction::iterator MBBI(MBB);
133 ++MBBI;
134 MF.insert(MBBI, LoopBB);
135 MF.insert(MBBI, BodyBB);
136 MF.insert(MBBI, RestoreExecBB);
137 MF.insert(MBBI, RemainderBB);
138
139 LoopBB->addSuccessor(BodyBB);
140 BodyBB->addSuccessor(RestoreExecBB);
141 BodyBB->addSuccessor(LoopBB);
142
143 // Move the rest of the block into a new block.
144 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
145 RemainderBB->splice(RemainderBB->begin(), &MBB, EndIt, MBB.end());
146
147 MBB.addSuccessor(LoopBB);
148 RestoreExecBB->addSuccessor(RemainderBB);
149
150 B.setInsertPt(*LoopBB, LoopBB->end());
151
152 // +-MBB:------------+
153 // | ... |
154 // | %0 = G_INST_1 |
155 // | %Dst = MI %Vgpr |
156 // | %1 = G_INST_2 |
157 // | ... |
158 // +-----------------+
159 // ->
160 // +-MBB-------------------------------+
161 // | ... |
162 // | %0 = G_INST_1 |
163 // | %SaveExecReg = S_MOV_B32 $exec_lo |
164 // +----------------|------------------+
165 // | /------------------------------|
166 // V V |
167 // +-LoopBB---------------------------------------------------------------+ |
168 // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
169 // | instead of executing for each lane, see if other lanes had | |
170 // | same value for %Vgpr and execute for them also. | |
171 // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
172 // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
173 // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
174 // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
175 // +----------------|-----------------------------------------------------+ |
176 // V |
177 // +-BodyBB------------------------------------------------------------+ |
178 // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
179 // | executed only for active lanes and written to Dst | |
180 // | $exec = S_XOR_B32 $exec, %SavedExec | |
181 // | set active lanes to 0 in SavedExec, lanes that did not write to | |
182 // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
183 // | SI_WATERFALL_LOOP LoopBB |-----|
184 // +----------------|--------------------------------------------------+
185 // V
186 // +-RestoreExecBB--------------------------+
187 // | $exec_lo = S_MOV_B32_term %SaveExecReg |
188 // +----------------|-----------------------+
189 // V
190 // +-RemainderBB:----------------------+
191 // | %1 = G_INST_2 |
192 // | ... |
193 // +-----------------------------------+
194
195 // Move the instruction into the loop body. Note we moved everything after
196 // Range.end() already into a new block, so Range.end() is no longer valid.
197 BodyBB->splice(BodyBB->end(), &MBB, BeginIt, MBB.end());
198
199 // Figure out the iterator range after splicing the instructions.
200 MachineBasicBlock::iterator NewBegin = BeginIt;
201 auto NewEnd = BodyBB->end();
202 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
203
204 B.setMBB(*LoopBB);
205 Register CondReg;
206
207 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
208 for (MachineOperand &Op : MI.all_uses()) {
209 Register OldReg = Op.getReg();
210 if (!WFI.SgprWaterfallOperandRegs.count(OldReg))
211 continue;
212
213 // See if we already processed this register in another instruction in
214 // the sequence.
215 auto OldVal = WaterfalledRegMap.find(OldReg);
216 if (OldVal != WaterfalledRegMap.end()) {
217 Op.setReg(OldVal->second);
218 continue;
219 }
220
221 Register OpReg = Op.getReg();
222 LLT OpTy = MRI.getType(OpReg);
223
224 // TODO: support for agpr
225 assert(MRI.getRegBank(OpReg) == VgprRB);
226 Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
227 buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
228
229 // Build the comparison(s), CurrentLaneReg == OpReg.
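      // Operands wider than 32 bits are compared piecewise: 64-bit pieces when
      // the size is a multiple of 64, 32-bit pieces otherwise. The per-piece
      // results are ANDed together into CondReg below.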
230 unsigned OpSize = OpTy.getSizeInBits();
231 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
232 LLT PartTy = LLT::scalar(PartSize);
233 unsigned NumParts = OpSize / PartSize;
234 SmallVector<Register, 8> OpParts;
235 SmallVector<Register, 8> CurrentLaneParts;
236
237 if (NumParts == 1) {
238 OpParts.push_back(OpReg);
239 CurrentLaneParts.push_back(CurrentLaneReg);
240 } else {
241 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
242 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
243 for (unsigned i = 0; i < NumParts; ++i) {
244 OpParts.push_back(UnmergeOp.getReg(i));
245 CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
246 }
247 }
248
249 for (unsigned i = 0; i < NumParts; ++i) {
250 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
251 B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
252
253 if (!CondReg)
254 CondReg = CmpReg;
255 else
256 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
257 }
258
259 Op.setReg(CurrentLaneReg);
260
261 // Make sure we don't re-process this register again.
262 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
263 }
264 }
265
266 // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
267 Register CondRegLM =
268 MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
269 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
270
271 // Update EXEC, save the original EXEC value to SavedExec.
272 B.buildInstr(AndSaveExecOpc)
273 .addDef(SavedExec)
274 .addReg(CondRegLM, RegState::Kill);
275 MRI.setSimpleHint(SavedExec, CondRegLM);
276
277 B.setInsertPt(*BodyBB, BodyBB->end());
278
279 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
280 B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);
281
282 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
283 // s_cbranch_scc0?
284
285 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
286 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
287
288 // Save the EXEC mask before the loop.
289 B.setInsertPt(MBB, MBB.end());
290 B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);
291
292 // Restore the EXEC mask after the loop.
293 B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
294 B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);
295
296 // Set the insert point after the original instruction, so any new
297 // instructions will be in the remainder.
298 B.setInsertPt(*RemainderBB, RemainderBB->begin());
299
300 return true;
301}
302
303bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
304 ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
305 MachineFunction &MF = B.getMF();
306 assert(MI.getNumMemOperands() == 1);
307 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
308 Register Dst = MI.getOperand(0).getReg();
309 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
310 Register Base = MI.getOperand(1).getReg();
311 LLT PtrTy = MRI.getType(Base);
312 const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
313 LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
314 SmallVector<Register, 4> LoadPartRegs;
315
316 unsigned ByteOffset = 0;
317 for (LLT PartTy : LLTBreakdown) {
318 Register BasePlusOffset;
319 if (ByteOffset == 0) {
320 BasePlusOffset = Base;
321 } else {
322 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
323 BasePlusOffset =
324 B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
325 }
326 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
327 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
328 LoadPartRegs.push_back(LoadPart.getReg(0));
329 ByteOffset += PartTy.getSizeInBytes();
330 }
331
332 if (!MergeTy.isValid()) {
333 // Loads are of same size, concat or merge them together.
334 B.buildMergeLikeInstr(Dst, LoadPartRegs);
335 } else {
336 // Loads are not all of same size, need to unmerge them to smaller pieces
337 // of MergeTy type, then merge pieces to Dst.
338 SmallVector<Register, 4> MergeTyParts;
339 for (Register Reg : LoadPartRegs) {
340 if (MRI.getType(Reg) == MergeTy) {
341 MergeTyParts.push_back(Reg);
342 } else {
343 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
344 for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
345 MergeTyParts.push_back(Unmerge.getReg(i));
346 }
347 }
348 B.buildMergeLikeInstr(Dst, MergeTyParts);
349 }
350 MI.eraseFromParent();
351 return true;
352}
353
354bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
355 LLT MergeTy) {
356 MachineFunction &MF = B.getMF();
357 assert(MI.getNumMemOperands() == 1);
358 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
359 Register Dst = MI.getOperand(0).getReg();
360 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
361 Register Base = MI.getOperand(1).getReg();
362
363 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
364 auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);
365
366 if (WideTy.isScalar()) {
367 B.buildTrunc(Dst, WideLoad);
368 } else {
369 SmallVector<Register, 4> MergeTyParts;
370 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
371
372 LLT DstTy = MRI.getType(Dst);
373 unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
374 for (unsigned i = 0; i < NumElts; ++i) {
375 MergeTyParts.push_back(Unmerge.getReg(i));
376 }
377 B.buildMergeLikeInstr(Dst, MergeTyParts);
378 }
379 MI.eraseFromParent();
380 return true;
381}
382
383bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
384 Register Dst = MI.getDstReg();
385 Register Ptr = MI.getPointerReg();
386 MachineMemOperand &MMO = MI.getMMO();
387 unsigned MemSize = 8 * MMO.getSize().getValue();
388
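  // Widen the sub-dword access to a full 32-bit load, then recreate the
  // extending-load semantics on the loaded value: mask for zextload,
  // sign-extend-in-register for sextload.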
389 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
390
391 if (MI.getOpcode() == G_LOAD) {
392 B.buildLoad(Dst, Ptr, *WideMMO);
393 } else {
394 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
395
396 if (MI.getOpcode() == G_ZEXTLOAD) {
397 APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
398 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
399 B.buildAnd(Dst, Load, MaskCst);
400 } else {
401 assert(MI.getOpcode() == G_SEXTLOAD);
402 B.buildSExtInReg(Dst, Load, MemSize);
403 }
404 }
405
406 MI.eraseFromParent();
407 return true;
408}
409
410bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
411 Register Dst = MI.getOperand(0).getReg();
412 LLT Ty = MRI.getType(Dst);
413 Register Src = MI.getOperand(1).getReg();
414 unsigned Opc = MI.getOpcode();
415 int TrueExtCst = Opc == G_SEXT ? -1 : 1;
416 if (Ty == S32 || Ty == S16) {
417 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
418 auto False = B.buildConstant({VgprRB, Ty}, 0);
419 B.buildSelect(Dst, Src, True, False);
420 } else if (Ty == S64) {
421 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
422 auto False = B.buildConstant({VgprRB_S32}, 0);
423 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
424 MachineInstrBuilder Hi;
425 switch (Opc) {
426 case G_SEXT:
427 Hi = Lo;
428 break;
429 case G_ZEXT:
430 Hi = False;
431 break;
432 case G_ANYEXT:
433 Hi = B.buildUndef({VgprRB_S32});
434 break;
435 default:
436 reportGISelFailure(
437 MF, MORE, "amdgpu-regbanklegalize",
438 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
439 return false;
440 }
441
442 B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
443 } else {
444 reportGISelFailure(
445 MF, MORE, "amdgpu-regbanklegalize",
446 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
447 return false;
448 }
449
450 MI.eraseFromParent();
451 return true;
452}
453
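// Unpack helpers: view a packed 32-bit (e.g. <2 x s16>) sgpr value as two s32
// halves, producing the low and high 16 bits with zero-, sign- or any-extend.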
454std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
455 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
456 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
457 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
458 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
459 return {Lo.getReg(0), Hi.getReg(0)};
460}
461
462std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
463 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
464 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
465 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
466 return {Lo.getReg(0), Hi.getReg(0)};
467}
468
469std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
470 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
471 auto Lo = PackedS32;
472 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
473 return {Lo.getReg(0), Hi.getReg(0)};
474}
475
476std::pair<Register, Register>
477RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
478 auto [Lo32, Hi32] = unpackAExt(Reg);
479 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
480 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
481}
482
483bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
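  // Uniform <2 x s16> shifts are done on the SALU as two s32 shifts. The
  // inputs are unpacked with the extension that matches the shift (any-ext for
  // shl, zero-ext for lshr, sign-ext for ashr) and the results repacked.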
484 Register Lo, Hi;
485 switch (MI.getOpcode()) {
486 case AMDGPU::G_SHL: {
487 auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
488 auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
489 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
490 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
491 break;
492 }
493 case AMDGPU::G_LSHR: {
494 auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
495 auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
496 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
497 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
498 break;
499 }
500 case AMDGPU::G_ASHR: {
501 auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
502 auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
503 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
504 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
505 break;
506 }
507 default:
508 reportGISelFailure(
509 MF, MORE, "amdgpu-regbanklegalize",
510 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
511 MI);
512 return false;
513 }
514 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
515 MI.eraseFromParent();
516 return true;
517}
518
519bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
520 Register Lo, Hi;
521 switch (MI.getOpcode()) {
522 case AMDGPU::G_SMIN:
523 case AMDGPU::G_SMAX: {
524 // For signed operations, use sign extension
525 auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
526 auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
527 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
528 .getReg(0);
529 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
530 .getReg(0);
531 break;
532 }
533 case AMDGPU::G_UMIN:
534 case AMDGPU::G_UMAX: {
535 // For unsigned operations, use zero extension
536 auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
537 auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
538 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
539 .getReg(0);
540 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
541 .getReg(0);
542 break;
543 }
544 default:
545 reportGISelFailure(
546 MF, MORE, "amdgpu-regbanklegalize",
547 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
548 return false;
549 }
550 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
551 MI.eraseFromParent();
552 return true;
553}
554
555bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
556 auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
557 auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
558 auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
559 auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
560 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
561 {ResLo.getReg(0), ResHi.getReg(0)});
562 MI.eraseFromParent();
563 return true;
564}
565
566static bool isSignedBFE(MachineInstr &MI) {
567 if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))
568 return (GI->is(Intrinsic::amdgcn_sbfe));
569
570 return MI.getOpcode() == AMDGPU::G_SBFX;
571}
572
573bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
574 Register Dst = MI.getOperand(0).getReg();
575 assert(MRI.getType(Dst) == LLT::scalar(64));
576 bool Signed = isSignedBFE(MI);
577 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
578 // Extract bitfield from Src, LSBit is the least-significant bit for the
579 // extraction (field offset) and Width is size of bitfield.
580 Register Src = MI.getOperand(FirstOpnd).getReg();
581 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
582 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
583 // Comments are for signed bitfield extract, similar for unsigned. x is sign
584 // bit. s is sign, l is LSB and y are remaining bits of bitfield to extract.
585
586 // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
587 unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
588 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
589
590 auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);
591
592 // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
593 // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
594 // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
595 if (!ConstWidth) {
596 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
597 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
598 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
599 MI.eraseFromParent();
600 return true;
601 }
602
603 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
604 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
605 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
606 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
607 auto Zero = B.buildConstant({VgprRB, S32}, 0);
608 unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
609
610 if (WidthImm <= 32) {
611 // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
612 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
613 MachineInstrBuilder Hi;
614 if (Signed) {
615 // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
616 Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
617 } else {
618 // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
619 Hi = Zero;
620 }
621 B.buildMergeLikeInstr(Dst, {Lo, Hi});
622 } else {
623 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
624 // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
625 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
626 B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
627 }
628
629 MI.eraseFromParent();
630 return true;
631}
632
633bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
634 Register DstReg = MI.getOperand(0).getReg();
635 LLT Ty = MRI.getType(DstReg);
636 bool Signed = isSignedBFE(MI);
637 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
638 Register Src = MI.getOperand(FirstOpnd).getReg();
639 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
640 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
641 // For uniform bit field extract there are 4 available instructions, but
642 // LSBit(field offset) and Width(size of bitfield) need to be packed in S32,
643 // field offset in low and size in high 16 bits.
644
645 // Src1 Hi16|Lo16 = Size|FieldOffset
646 auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
647 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
648 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
649 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
650 unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
651 unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
652 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
653
654 // Select machine instruction, because of reg class constraining, insert
655 // copies from reg class to reg bank.
656 auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
657 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
658 constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
659 *ST.getRegisterInfo(), RBI);
660
661 B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
662 MI.eraseFromParent();
663 return true;
664}
665
666bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
667 Register Dst = MI.getOperand(0).getReg();
668 LLT DstTy = MRI.getType(Dst);
669 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
670 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
671 auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
672 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
673 unsigned Opc = MI.getOpcode();
674 auto Flags = MI.getFlags();
675 auto Lo =
676 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
677 auto Hi =
678 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
679 B.buildMergeLikeInstr(Dst, {Lo, Hi});
680 MI.eraseFromParent();
681 return true;
682}
683
684bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
685 Register Dst = MI.getOperand(0).getReg();
686 assert(MRI.getType(Dst) == S64);
687 auto Op1 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(1).getReg());
688 auto Op2 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(2).getReg());
689
690 // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to
691 // match GlobalISel with old regbankselect.
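  // 64-bit product from 32-bit halves (a = a1:a0, b = b1:b0):
  //   Lo = lo32(a0 * b0)
  //   Hi = hi32(a0 * b0) + lo32(a0 * b1) + lo32(a1 * b0)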
692 auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
693 auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
694 auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
695 auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
696 auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
697 auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
698
699 B.buildMergeLikeInstr(Dst, {Lo, Hi});
700 MI.eraseFromParent();
701 return true;
702}
703
704bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
705 Register Dst = MI.getOperand(0).getReg();
706 assert(MRI.getType(Dst) == V2S16);
707 unsigned Opc = MI.getOpcode();
708 unsigned NumOps = MI.getNumOperands();
709 auto Flags = MI.getFlags();
710
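  // Scalarize a uniform <2 x s16> operation: unpack each operand into two s16
  // halves, apply the operation to the low and high halves separately, then
  // repack the results.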
711 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
712
713 if (NumOps == 2) {
714 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
715 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
716 B.buildMergeLikeInstr(Dst, {Lo, Hi});
717 MI.eraseFromParent();
718 return true;
719 }
720
721 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());
722
723 if (NumOps == 3) {
724 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
725 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
726 B.buildMergeLikeInstr(Dst, {Lo, Hi});
727 MI.eraseFromParent();
728 return true;
729 }
730
731 assert(NumOps == 4);
732 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg());
733 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags);
734 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags);
735 B.buildMergeLikeInstr(Dst, {Lo, Hi});
736 MI.eraseFromParent();
737 return true;
738}
739
740bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
741 Register Dst0 = MI.getOperand(0).getReg();
742 Register Dst1 = MI.getOperand(1).getReg();
743 Register Src0 = MI.getOperand(2).getReg();
744 Register Src1 = MI.getOperand(3).getReg();
745 Register Src2 = MI.getOperand(4).getReg();
746
747 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
748
749 // Keep the multiplication on the SALU.
750 Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
751 Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
752 if (ST.hasScalarMulHiInsts()) {
753 B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
754 } else {
755 auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
756 auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
757 auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
758 buildReadAnyLane(B, DstHi, MulHi.getReg(0), RBI);
759 }
760
761 // Accumulate and produce the "carry-out" bit.
762
763 // The "carry-out" is defined as bit 64 of the result when computed as a
764 // big integer. For unsigned multiply-add, this matches the usual
765 // definition of carry-out.
766 if (mi_match(Src2, MRI, MIPatternMatch::m_ZeroInt())) {
767 // No accumulate: result is just the multiplication, carry is 0.
768 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
769 B.buildConstant(Dst1, 0);
770 } else {
771 // Accumulate: add Src2 to the multiplication result with carry chain.
772 Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
773 Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
774 B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
775
776 auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
777 auto AddHi =
778 B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
779 B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
780 B.buildCopy(Dst1, AddHi.getReg(1));
781 }
782
783 MI.eraseFromParent();
784 return true;
785}
786
787bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
788 Register Dst = MI.getOperand(0).getReg();
789 LLT DstTy = MRI.getType(Dst);
790 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
791 (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
792 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
793 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
794 auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
795 Register Cond = MI.getOperand(1).getReg();
796 auto Flags = MI.getFlags();
797 auto Lo =
798 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
799 auto Hi =
800 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);
801
802 B.buildMergeLikeInstr(Dst, {Lo, Hi});
803 MI.eraseFromParent();
804 return true;
805}
806
807bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
808 auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
809 int Amt = MI.getOperand(2).getImm();
810 Register Lo, Hi;
811 // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
812 if (Amt <= 32) {
813 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
814 if (Amt == 32) {
815 // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
816 Lo = Freeze.getReg(0);
817 } else {
818 // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
819 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
820 }
821
822 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
823 Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
824 } else {
825 // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
826 Lo = Op1.getReg(0);
827 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
828 }
829
830 B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
831 MI.eraseFromParent();
832 return true;
833}
834
835bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &MI) {
836 // Split 64-bit find-first-bit operations into 32-bit halves:
837 // (ffbh hi:lo) -> umin(ffbh(hi), uaddsat(ffbh(lo), 32))
838 // (ffbl hi:lo) -> umin(ffbl(lo), uaddsat(ffbl(hi), 32))
839 // (ctlz_zero_undef hi:lo) -> umin(ffbh(hi), add(ffbh(lo), 32))
840 // (cttz_zero_undef hi:lo) -> umin(ffbl(lo), add(ffbl(hi), 32))
841 unsigned Opc = MI.getOpcode();
842
843 // FFBH/FFBL return 0xFFFFFFFF on zero input, using uaddsat to avoid
844 // wrapping. CTLZ/CTTZ guarantee non-zero input (zero_undef), so plain add
845 // is fine.
846 unsigned FFBOpc;
847 unsigned AddOpc;
848 bool SearchFromMSB;
849 switch (Opc) {
850 case AMDGPU::G_AMDGPU_FFBH_U32:
851 FFBOpc = Opc;
852 AddOpc = AMDGPU::G_UADDSAT;
853 SearchFromMSB = true;
854 break;
855 case AMDGPU::G_AMDGPU_FFBL_B32:
856 FFBOpc = Opc;
857 AddOpc = AMDGPU::G_UADDSAT;
858 SearchFromMSB = false;
859 break;
860 case AMDGPU::G_CTLZ_ZERO_UNDEF:
861 FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
862 AddOpc = AMDGPU::G_ADD;
863 SearchFromMSB = true;
864 break;
865 case AMDGPU::G_CTTZ_ZERO_UNDEF:
866 FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
867 AddOpc = AMDGPU::G_ADD;
868 SearchFromMSB = false;
869 break;
870 default:
871 llvm_unreachable("unexpected opcode in lowerSplitBitCount64To32");
872 }
873
874 auto Unmerge = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
875 Register Lo = Unmerge.getReg(0);
876 Register Hi = Unmerge.getReg(1);
877
878 // MSB-first (FFBH/CTLZ) searches hi first; LSB-first (FFBL/CTTZ) searches
879 // lo first. The secondary half adds 32 to account for the primary half's
880 // width.
881 auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Hi : Lo});
882 auto Secondary =
883 B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Lo : Hi});
884
885 auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
886 {Secondary, B.buildConstant(VgprRB_S32, 32)});
887 B.buildUMin(MI.getOperand(0).getReg(), Primary, Adjusted);
888
889 MI.eraseFromParent();
890 return true;
891}
892
893bool RegBankLegalizeHelper::lower(MachineInstr &MI,
894 const RegBankLLTMapping &Mapping,
895 WaterfallInfo &WFI) {
896
897 switch (Mapping.LoweringMethod) {
898 case DoNotLower:
899 break;
900 case VccExtToSel:
901 return lowerVccExtToSel(MI);
902 case UniExtToSel: {
903 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
904 auto True = B.buildConstant({SgprRB, Ty},
905 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
906 auto False = B.buildConstant({SgprRB, Ty}, 0);
907 // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is compare.
908 // We are making select here. S1 cond was already 'any-extended to S32' +
909 // 'AND with 1 to clean high bits' by Sgpr32AExtBoolInReg.
910 B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
911 False);
912 MI.eraseFromParent();
913 return true;
914 }
915 case UnpackBitShift:
916 return lowerUnpackBitShift(MI);
917 case UnpackMinMax:
918 return lowerUnpackMinMax(MI);
919 case ScalarizeToS16:
920 return lowerSplitTo16(MI);
921 case Ext32To64: {
922 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
923 MachineInstrBuilder Hi;
924 switch (MI.getOpcode()) {
925 case AMDGPU::G_ZEXT: {
926 Hi = B.buildConstant({RB, S32}, 0);
927 break;
928 }
929 case AMDGPU::G_SEXT: {
930 // Replicate sign bit from 32-bit extended part.
931 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
932 Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
933 break;
934 }
935 case AMDGPU::G_ANYEXT: {
936 Hi = B.buildUndef({RB, S32});
937 break;
938 }
939 default:
940 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
941 "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
942 MI);
943 return false;
944 }
945
946 B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
947 {MI.getOperand(1).getReg(), Hi});
948 MI.eraseFromParent();
949 return true;
950 }
951 case UniCstExt: {
952 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
953 B.buildConstant(MI.getOperand(0).getReg(), ConstVal);
954
955 MI.eraseFromParent();
956 return true;
957 }
958 case VgprToVccCopy: {
959 Register Src = MI.getOperand(1).getReg();
960 LLT Ty = MRI.getType(Src);
961 // Take lowest bit from each lane and put it in lane mask.
962 // Lowering via compare, but we need to clean high bits first as compare
963 // compares all bits in register.
964 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
965 if (Ty == S64) {
966 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
967 auto One = B.buildConstant(VgprRB_S32, 1);
968 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
969 auto Zero = B.buildConstant(VgprRB_S32, 0);
970 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
971 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
972 } else {
973 assert(Ty == S32 || Ty == S16);
974 auto One = B.buildConstant({VgprRB, Ty}, 1);
975 B.buildAnd(BoolSrc, Src, One);
976 }
977 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
978 B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
979 MI.eraseFromParent();
980 return true;
981 }
982 case V_BFE:
983 return lowerV_BFE(MI);
984 case S_BFE:
985 return lowerS_BFE(MI);
986 case UniMAD64:
987 return lowerUniMAD64(MI);
988 case UniMul64: {
989 B.buildMul(MI.getOperand(0), MI.getOperand(1), MI.getOperand(2));
990 MI.eraseFromParent();
991 return true;
992 }
993 case DivSMulToMAD: {
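    // Divergent S_MUL_U64_U32/S_MUL_I64_I32 is implemented on the VALU as the
    // corresponding MAD_64_32 with a zero accumulator.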
994 auto Op1 = B.buildTrunc(VgprRB_S32, MI.getOperand(1));
995 auto Op2 = B.buildTrunc(VgprRB_S32, MI.getOperand(2));
996 auto Zero = B.buildConstant({VgprRB, S64}, 0);
997
998 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
999 ? AMDGPU::G_AMDGPU_MAD_U64_U32
1000 : AMDGPU::G_AMDGPU_MAD_I64_I32;
1001
1002 B.buildInstr(NewOpc, {MI.getOperand(0).getReg(), {SgprRB, S32}},
1003 {Op1, Op2, Zero});
1004 MI.eraseFromParent();
1005 return true;
1006 }
1007 case SplitTo32:
1008 return lowerSplitTo32(MI);
1009 case SplitTo32Mul:
1010 return lowerSplitTo32Mul(MI);
1011 case SplitTo32Select:
1012 return lowerSplitTo32Select(MI);
1013 case SplitTo32SExtInReg:
1014 return lowerSplitTo32SExtInReg(MI);
1015 case SplitLoad: {
1016 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
1017 unsigned Size = DstTy.getSizeInBits();
1018 // Even split to 128-bit loads
1019 if (Size > 128) {
1020 LLT B128;
1021 if (DstTy.isVector()) {
1022 LLT EltTy = DstTy.getElementType();
1023 B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1024 } else {
1025 B128 = LLT::scalar(128);
1026 }
1027 if (Size / 128 == 2)
1028 splitLoad(MI, {B128, B128});
1029 else if (Size / 128 == 4)
1030 splitLoad(MI, {B128, B128, B128, B128});
1031 else {
1032 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1033 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1034 MI);
1035 return false;
1036 }
1037 }
1038 // 64 and 32 bit load
1039 else if (DstTy == S96)
1040 splitLoad(MI, {S64, S32}, S32);
1041 else if (DstTy == V3S32)
1042 splitLoad(MI, {V2S32, S32}, S32);
1043 else if (DstTy == V6S16)
1044 splitLoad(MI, {V4S16, V2S16}, V2S16);
1045 else {
1046 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1047 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1048 MI);
1049 return false;
1050 }
1051 return true;
1052 }
1053 case WidenLoad: {
1054 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
1055 if (DstTy == S96)
1056 widenLoad(MI, S128);
1057 else if (DstTy == V3S32)
1058 widenLoad(MI, V4S32, S32);
1059 else if (DstTy == V6S16)
1060 widenLoad(MI, V8S16, V2S16);
1061 else {
1062 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1063 "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
1064 MI);
1065 return false;
1066 }
1067 return true;
1068 }
1069 case UnpackAExt:
1070 return lowerUnpackAExt(MI);
1071 case WidenMMOToS32:
1072 return widenMMOToS32(cast<GAnyLoad>(MI));
1073 case VerifyAllSgpr: {
1074 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1075 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1076 }));
1077 return true;
1078 }
1079 case ApplyAllVgpr: {
1080 assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) {
1081 return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
1082 }));
1083 B.setInstrAndDebugLoc(MI);
1084 for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) {
1085 Register Reg = MI.getOperand(i).getReg();
1086 if (MRI.getRegBank(Reg) != VgprRB) {
1087 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1088 MI.getOperand(i).setReg(Copy.getReg(0));
1089 }
1090 }
1091 return true;
1092 }
1093 case UnmergeToShiftTrunc: {
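    // Unmerge of a uniform value into sub-32-bit (s16) pieces: split the
    // source into 32-bit chunks, then extract the low and high half of each
    // chunk with a shift and truncate to the destination type.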
1094 GUnmerge *Unmerge = dyn_cast<GUnmerge>(&MI);
1095 LLT Ty = MRI.getType(Unmerge->getSourceReg());
1096 if (Ty.getSizeInBits() % 32 != 0) {
1097 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1098 "AMDGPU RegBankLegalize: unmerge not multiple of 32",
1099 MI);
1100 return false;
1101 }
1102
1103 B.setInstrAndDebugLoc(MI);
1104 if (Ty.getSizeInBits() > 32) {
1105 auto UnmergeV2S16 =
1106 B.buildUnmerge({SgprRB, V2S16}, Unmerge->getSourceReg());
1107 for (unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
1108 auto [Dst0S32, Dst1S32] =
1109 unpackAExt(UnmergeV2S16->getOperand(i).getReg());
1110 B.buildTrunc(MI.getOperand(i * 2).getReg(), Dst0S32);
1111 B.buildTrunc(MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
1112 }
1113 } else {
1114 auto [Dst0S32, Dst1S32] = unpackAExt(MI.getOperand(2).getReg());
1115 B.buildTrunc(MI.getOperand(0).getReg(), Dst0S32);
1116 B.buildTrunc(MI.getOperand(1).getReg(), Dst1S32);
1117 }
1118
1119 MI.eraseFromParent();
1120 return true;
1121 }
1123 Register Dst = MI.getOperand(0).getReg();
1124 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1125 B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());
1126 MI.getOperand(0).setReg(NewDst);
1127 B.buildTrunc(Dst, NewDst);
1128
1129 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1130 Register UseReg = MI.getOperand(i).getReg();
1131
1132 auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
1133 MachineBasicBlock *DefMBB = DefMI->getParent();
1134
1135 B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
1136
1137 auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
1138 MI.getOperand(i).setReg(NewUse.getReg(0));
1139 }
1140 break;
1141 }
1142 case VerifyAllSgprGPHI: {
1143 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1144 if (Op.isMBB())
1145 return true;
1146 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1147 }));
1148 return true;
1149 }
1151 assert(MRI.getRegBankOrNull(MI.getOperand(0).getReg()) == VgprRB);
1152 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1153 if (Op.isMBB())
1154 return true;
1155 const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
1156 return RB == VgprRB || RB == SgprRB;
1157 }));
1158 return true;
1159 }
1160 case ApplyINTRIN_IMAGE:
1161 return applyRegisterBanksINTRIN_IMAGE(MI);
1162 case SplitBitCount64To32:
1163 return lowerSplitBitCount64To32(MI);
1164 }
1165
1166 if (!WFI.SgprWaterfallOperandRegs.empty()) {
1167 if (!executeInWaterfallLoop(B, WFI))
1168 return false;
1169 }
1170 return true;
1171}
1172
1173LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
1174 switch (ID) {
1175 case Vcc:
1176 case UniInVcc:
1177 return LLT::scalar(1);
1178 case Sgpr16:
1179 case Vgpr16:
1180 case UniInVgprS16:
1181 return LLT::scalar(16);
1182 case Sgpr32:
1183 case Sgpr32_WF:
1184 case Sgpr32Trunc:
1185 case Sgpr32AExt:
1186 case Sgpr32AExtBoolInReg:
1187 case Sgpr32SExt:
1188 case Sgpr32ZExt:
1189 case UniInVgprS32:
1190 case Vgpr32:
1191 case Vgpr32AExt:
1192 case Vgpr32SExt:
1193 case Vgpr32ZExt:
1194 return LLT::scalar(32);
1195 case Sgpr64:
1196 case Vgpr64:
1197 case UniInVgprS64:
1198 return LLT::scalar(64);
1199 case Sgpr128:
1200 case Vgpr128:
1201 return LLT::scalar(128);
1202 case SgprP0:
1203 case SgprP0Call_WF:
1204 case VgprP0:
1205 return LLT::pointer(0, 64);
1206 case SgprP1:
1207 case VgprP1:
1208 return LLT::pointer(1, 64);
1209 case SgprP2:
1210 case VgprP2:
1211 return LLT::pointer(2, 32);
1212 case SgprP3:
1213 case VgprP3:
1214 return LLT::pointer(3, 32);
1215 case SgprP4:
1216 case SgprP4Call_WF:
1217 case VgprP4:
1218 return LLT::pointer(4, 64);
1219 case SgprP5:
1220 case VgprP5:
1221 return LLT::pointer(5, 32);
1222 case SgprP8:
1223 return LLT::pointer(8, 128);
1224 case SgprV2S16:
1225 case VgprV2S16:
1226 case UniInVgprV2S16:
1227 return LLT::fixed_vector(2, 16);
1228 case SgprV2S32:
1229 case VgprV2S32:
1230 case UniInVgprV2S32:
1231 return LLT::fixed_vector(2, 32);
1232 case VgprV3S32:
1233 return LLT::fixed_vector(3, 32);
1234 case VgprV4S16:
1235 return LLT::fixed_vector(4, 16);
1236 case SgprV4S32:
1237 case SgprV4S32_WF:
1238 case VgprV4S32:
1239 case UniInVgprV4S32:
1240 return LLT::fixed_vector(4, 32);
1241 case VgprV8S32:
1242 return LLT::fixed_vector(8, 32);
1243 case VgprV2S64:
1244 case UniInVgprV2S64:
1245 return LLT::fixed_vector(2, 64);
1246 default:
1247 return LLT();
1248 }
1249}
1250
1251LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
1252 switch (ID) {
1253 case SgprB32:
1254 case VgprB32:
1255 case SgprB32_M0:
1256 case UniInVgprB32:
1257 if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
1258 isAnyPtr(Ty, 32))
1259 return Ty;
1260 return LLT();
1261 case SgprPtr32:
1262 case VgprPtr32:
1263 return isAnyPtr(Ty, 32) ? Ty : LLT();
1264 case SgprPtr64:
1265 case VgprPtr64:
1266 return isAnyPtr(Ty, 64) ? Ty : LLT();
1267 case SgprPtr128:
1268 case VgprPtr128:
1269 return isAnyPtr(Ty, 128) ? Ty : LLT();
1270 case SgprB64:
1271 case VgprB64:
1272 case UniInVgprB64:
1273 if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
1274 Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
1275 return Ty;
1276 return LLT();
1277 case SgprB96:
1278 case VgprB96:
1279 case UniInVgprB96:
1280 if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
1281 Ty == LLT::fixed_vector(6, 16))
1282 return Ty;
1283 return LLT();
1284 case SgprB128:
1285 case VgprB128:
1286 case UniInVgprB128:
1287 if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
1288 Ty == LLT::fixed_vector(2, 64) || Ty == LLT::fixed_vector(8, 16) ||
1289 isAnyPtr(Ty, 128))
1290 return Ty;
1291 return LLT();
1292 case VgprB160:
1293 case UniInVgprB160:
1294 if (Ty.getSizeInBits() == 160)
1295 return Ty;
1296 return LLT();
1297 case SgprB256:
1298 case VgprB256:
1299 case UniInVgprB256:
1300 if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
1301 Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
1302 return Ty;
1303 return LLT();
1304 case SgprB512:
1305 case VgprB512:
1306 case UniInVgprB512:
1307 if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
1308 Ty == LLT::fixed_vector(8, 64))
1309 return Ty;
1310 return LLT();
1311 case SgprBRC: {
1312 const SIRegisterInfo *TRI =
1313 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1314 unsigned LLTSize = Ty.getSizeInBits();
1315 if (LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize))
1316 return Ty;
1317 return LLT();
1318 }
1319 case VgprBRC: {
1320 const SIRegisterInfo *TRI =
1321 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1322 if (TRI->getSGPRClassForBitWidth(Ty.getSizeInBits()))
1323 return Ty;
1324 return LLT();
1325 }
1326 default:
1327 return LLT();
1328 }
1329}
1330
1331const RegisterBank *
1332RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
1333 switch (ID) {
1334 case Vcc:
1335 return VccRB;
1336 case Sgpr16:
1337 case Sgpr32:
1338 case Sgpr32_WF:
1339 case Sgpr64:
1340 case Sgpr128:
1341 case SgprP0:
1342 case SgprP0Call_WF:
1343 case SgprP1:
1344 case SgprP2:
1345 case SgprP3:
1346 case SgprP4:
1347 case SgprP4Call_WF:
1348 case SgprP5:
1349 case SgprP8:
1350 case SgprPtr32:
1351 case SgprPtr64:
1352 case SgprPtr128:
1353 case SgprV2S16:
1354 case SgprV2S32:
1355 case SgprV4S32:
1356 case SgprV4S32_WF:
1357 case SgprB32:
1358 case SgprB64:
1359 case SgprB96:
1360 case SgprB128:
1361 case SgprB256:
1362 case SgprB512:
1363 case SgprBRC:
1364 case UniInVcc:
1365 case UniInVgprS16:
1366 case UniInVgprS32:
1367 case UniInVgprS64:
1368 case UniInVgprV2S16:
1369 case UniInVgprV2S32:
1370 case UniInVgprV4S32:
1371 case UniInVgprV2S64:
1372 case UniInVgprB32:
1373 case UniInVgprB64:
1374 case UniInVgprB96:
1375 case UniInVgprB128:
1376 case UniInVgprB160:
1377 case UniInVgprB256:
1378 case UniInVgprB512:
1379 case Sgpr32Trunc:
1380 case Sgpr32AExt:
1381 case Sgpr32AExtBoolInReg:
1382 case Sgpr32SExt:
1383 case Sgpr32ZExt:
1384 return SgprRB;
1385 case Vgpr16:
1386 case Vgpr32:
1387 case Vgpr64:
1388 case Vgpr128:
1389 case VgprP0:
1390 case VgprP1:
1391 case VgprP2:
1392 case VgprP3:
1393 case VgprP4:
1394 case VgprP5:
1395 case VgprPtr32:
1396 case VgprPtr64:
1397 case VgprPtr128:
1398 case VgprV2S16:
1399 case VgprV2S32:
1400 case VgprV2S64:
1401 case VgprV3S32:
1402 case VgprV4S16:
1403 case VgprV4S32:
1404 case VgprV8S32:
1405 case VgprB32:
1406 case VgprB64:
1407 case VgprB96:
1408 case VgprB128:
1409 case VgprB160:
1410 case VgprB256:
1411 case VgprB512:
1412 case VgprBRC:
1413 case Vgpr32AExt:
1414 case Vgpr32SExt:
1415 case Vgpr32ZExt:
1416 return VgprRB;
1417 default:
1418 return nullptr;
1419 }
1420}
1421
1422bool RegBankLegalizeHelper::applyMappingDst(
1423 MachineInstr &MI, unsigned &OpIdx,
1424 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
1425 // Defs start from operand 0
1426 for (; OpIdx < MethodIDs.size(); ++OpIdx) {
1427 if (MethodIDs[OpIdx] == None)
1428 continue;
1429 MachineOperand &Op = MI.getOperand(OpIdx);
1430 Register Reg = Op.getReg();
1431 LLT Ty = MRI.getType(Reg);
1432 [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);
1433
1434 switch (MethodIDs[OpIdx]) {
1435 // vcc, sgpr and vgpr scalars, pointers and vectors
1436 case Vcc:
1437 case Sgpr16:
1438 case Sgpr32:
1439 case Sgpr64:
1440 case Sgpr128:
1441 case SgprP0:
1442 case SgprP1:
1443 case SgprP3:
1444 case SgprP4:
1445 case SgprP5:
1446 case SgprP8:
1447 case SgprV2S16:
1448 case SgprV2S32:
1449 case SgprV4S32:
1450 case Vgpr16:
1451 case Vgpr32:
1452 case Vgpr64:
1453 case Vgpr128:
1454 case VgprP0:
1455 case VgprP1:
1456 case VgprP2:
1457 case VgprP3:
1458 case VgprP4:
1459 case VgprP5:
1460 case VgprV2S16:
1461 case VgprV2S32:
1462 case VgprV2S64:
1463 case VgprV3S32:
1464 case VgprV4S16:
1465 case VgprV4S32:
1466 case VgprV8S32: {
1467 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1468 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1469 break;
1470 }
1471 // sgpr and vgpr B-types
1472 case SgprB32:
1473 case SgprB64:
1474 case SgprB96:
1475 case SgprB128:
1476 case SgprB256:
1477 case SgprB512:
1478 case SgprBRC:
1479 case SgprPtr32:
1480 case SgprPtr64:
1481 case SgprPtr128:
1482 case VgprB32:
1483 case VgprB64:
1484 case VgprB96:
1485 case VgprB128:
1486 case VgprB160:
1487 case VgprB256:
1488 case VgprB512:
1489 case VgprBRC:
1490 case VgprPtr32:
1491 case VgprPtr64:
1492 case VgprPtr128: {
1493 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1494 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1495 break;
1496 }
1497 // uniform in vcc/vgpr: scalars, vectors and B-types
1498 case UniInVcc: {
1499 assert(Ty == S1);
1500 assert(RB == SgprRB);
1501 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
1502 Op.setReg(NewDst);
1503 if (!MRI.use_empty(Reg)) {
1504 auto CopyS32_Vcc =
1505 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
1506 B.buildTrunc(Reg, CopyS32_Vcc);
1507 }
1508 break;
1509 }
1510 case UniInVgprS16: {
1511 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1512 assert(RB == SgprRB);
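      // No 16-bit read-any-lane: compute the result in a VGPR s16, any-extend
      // it to s32, read that back into an SGPR s32 and truncate to the
      // original s16 destination.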
1513 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
1514 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
1515 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
1516 Op.setReg(NewVgprDstS16);
1517 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
1518 buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
1519 B.buildTrunc(Reg, NewSgprDstS32);
1520 break;
1521 }
1522 case UniInVgprS32:
1523 case UniInVgprS64:
1524 case UniInVgprV2S16:
1525 case UniInVgprV2S32:
1526 case UniInVgprV4S32:
1527 case UniInVgprV2S64: {
1528 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1529 assert(RB == SgprRB);
1530 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1531 Op.setReg(NewVgprDst);
1532 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1533 break;
1534 }
1535 case UniInVgprB32:
1536 case UniInVgprB64:
1537 case UniInVgprB96:
1538 case UniInVgprB128:
1539 case UniInVgprB160:
1540 case UniInVgprB256:
1541 case UniInVgprB512: {
1542 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1543 assert(RB == SgprRB);
1544 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1545 Op.setReg(NewVgprDst);
1546 AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1547 break;
1548 }
1549 // sgpr trunc
1550 case Sgpr32Trunc: {
1551 assert(Ty.getSizeInBits() < 32);
1552 assert(RB == SgprRB);
1553 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1554 Op.setReg(NewDst);
1555 if (!MRI.use_empty(Reg))
1556 B.buildTrunc(Reg, NewDst);
1557 break;
1558 }
1559 case InvalidMapping: {
1560 reportGISelFailure(
1561 MF, MORE, "amdgpu-regbanklegalize",
1562 "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
1563 return false;
1564 }
1565 default:
1566 reportGISelFailure(
1567 MF, MORE, "amdgpu-regbanklegalize",
1568 "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
1569 return false;
1570 }
1571 }
1572
1573 return true;
1574}
1575
1576bool RegBankLegalizeHelper::applyMappingSrc(
1577 MachineInstr &MI, unsigned &OpIdx,
1578 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
1579 WaterfallInfo &WFI) {
1580 for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
1581 if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
1582 continue;
1583
1584 MachineOperand &Op = MI.getOperand(OpIdx);
1585 Register Reg = Op.getReg();
1586 LLT Ty = MRI.getType(Reg);
1587 const RegisterBank *RB = MRI.getRegBank(Reg);
1588
1589 switch (MethodIDs[i]) {
1590 case Vcc: {
1591 assert(Ty == S1);
1592 assert(RB == VccRB || RB == SgprRB);
1593 if (RB == SgprRB) {
1594 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1595 auto CopyVcc_Scc =
1596 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
1597 Op.setReg(CopyVcc_Scc.getReg(0));
1598 }
1599 break;
1600 }
1601 // sgpr scalars, pointers and vectors
1602 case Sgpr16:
1603 case Sgpr32:
1604 case Sgpr64:
1605 case Sgpr128:
1606 case SgprP0:
1607 case SgprP1:
1608 case SgprP3:
1609 case SgprP4:
1610 case SgprP5:
1611 case SgprP8:
1612 case SgprV2S16:
1613 case SgprV2S32:
1614 case SgprV4S32: {
1615 assert(Ty == getTyFromID(MethodIDs[i]));
1616 assert(RB == getRegBankFromID(MethodIDs[i]));
1617 break;
1618 }
1619 // sgpr B-types
1620 case SgprB32:
1621 case SgprB64:
1622 case SgprB96:
1623 case SgprB128:
1624 case SgprB256:
1625 case SgprB512:
1626 case SgprBRC:
1627 case SgprPtr32:
1628 case SgprPtr64:
1629 case SgprPtr128: {
1630 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1631 assert(RB == getRegBankFromID(MethodIDs[i]));
1632 break;
1633 }
1634 // vgpr scalars, pointers and vectors
1635 case Vgpr16:
1636 case Vgpr32:
1637 case Vgpr64:
1638 case Vgpr128:
1639 case VgprP0:
1640 case VgprP1:
1641 case VgprP2:
1642 case VgprP3:
1643 case VgprP4:
1644 case VgprP5:
1645 case VgprV2S16:
1646 case VgprV2S32:
1647 case VgprV2S64:
1648 case VgprV3S32:
1649 case VgprV4S16:
1650 case VgprV4S32:
1651 case VgprV8S32: {
1652 assert(Ty == getTyFromID(MethodIDs[i]));
1653 if (RB != VgprRB) {
1654 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
1655 Op.setReg(CopyToVgpr.getReg(0));
1656 }
1657 break;
1658 }
1659 // vgpr B-types
1660 case VgprB32:
1661 case VgprB64:
1662 case VgprB96:
1663 case VgprB128:
1664 case VgprB160:
1665 case VgprB256:
1666 case VgprB512:
1667 case VgprBRC:
1668 case VgprPtr32:
1669 case VgprPtr64:
1670 case VgprPtr128: {
1671 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1672 if (RB != VgprRB) {
1673 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
1674 Op.setReg(CopyToVgpr.getReg(0));
1675 }
1676 break;
1677 }
1678 // sgpr waterfall, scalars, and vectors
1679 case Sgpr32_WF:
1680 case SgprV4S32_WF: {
1681 assert(Ty == getTyFromID(MethodIDs[i]));
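      // Operand must end up uniform (sgpr) but is divergent: record it so the
      // instruction gets wrapped in a waterfall loop, and remember the range
      // of instructions the loop has to enclose.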
1682 if (RB != SgprRB) {
1683 WFI.SgprWaterfallOperandRegs.insert(Reg);
1684 if (!WFI.Start.isValid()) {
1685 WFI.Start = MI.getIterator();
1686 WFI.End = std::next(MI.getIterator());
1687 }
1688 }
1689 break;
1690 }
1691 case SgprP0Call_WF:
1692 case SgprP4Call_WF: {
1693 assert(Ty == getTyFromID(MethodIDs[i]));
1694 if (RB != SgprRB) {
1695 WFI.SgprWaterfallOperandRegs.insert(Reg);
1696
1697 // Find the ADJCALLSTACKUP before the call.
1698 MachineBasicBlock::iterator Start = MI.getIterator();
1699 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
1700 --Start;
1701
1702 // Find the ADJCALLSTACKDOWN after the call (include it in range).
1703 MachineBasicBlock::iterator End = MI.getIterator();
1704 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
1705 ++End;
1706 ++End;
1707
1708 B.setInsertPt(*MI.getParent(), Start);
1709 WFI.Start = Start;
1710 WFI.End = End;
1711 }
1712 break;
1713 }
1714 case SgprB32_M0: {
1715 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1716 if (RB == SgprRB)
1717 break;
1718 assert(RB == VgprRB);
1719 Register NewSGPR32 = MRI.createVirtualRegister({SgprRB, Ty});
1720 buildReadFirstLane(B, NewSGPR32, Op.getReg(), RBI);
1721 Op.setReg(NewSGPR32);
1722 break;
1723 }
1724 // sgpr and vgpr scalars with extend
1725 case Sgpr32AExt: {
1726 // Note: this ext allows S1, and it is meant to be combined away.
1727 assert(Ty.getSizeInBits() < 32);
1728 assert(RB == SgprRB);
1729 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1730 Op.setReg(Aext.getReg(0));
1731 break;
1732 }
1733 case Sgpr32AExtBoolInReg: {
1734 // Note: this ext allows S1, and it is meant to be combined away.
1735 assert(Ty.getSizeInBits() == 1);
1736 assert(RB == SgprRB);
1737 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1738 // Zext SgprS1 is not legal, make AND with 1 instead. This instruction is
1739 // most of the time meant to be combined away in AMDGPURegBankCombiner.
1740 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
1741 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
1742 Op.setReg(BoolInReg.getReg(0));
1743 break;
1744 }
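// The sign- and zero-extend ids below expect a genuine narrow scalar
// (wider than S1, narrower than S32); S1 is not allowed here, unlike the
// any-extend ids above.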
1745 case Sgpr32SExt: {
1746 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
1747 assert(RB == SgprRB);
1748 auto Sext = B.buildSExt(SgprRB_S32, Reg);
1749 Op.setReg(Sext.getReg(0));
1750 break;
1751 }
1752 case Sgpr32ZExt: {
1753 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
1754 assert(RB == SgprRB);
1755 auto Zext = B.buildZExt({SgprRB, S32}, Reg);
1756 Op.setReg(Zext.getReg(0));
1757 break;
1758 }
1759 case Vgpr32AExt: {
1760 assert(Ty.getSizeInBits() < 32);
1761 assert(RB == VgprRB);
1762 auto Aext = B.buildAnyExt({VgprRB, S32}, Reg);
1763 Op.setReg(Aext.getReg(0));
1764 break;
1765 }
1766 case Vgpr32SExt: {
1767 // Note this ext allows S1, and it is meant to be combined away.
1768 assert(Ty.getSizeInBits() < 32);
1769 assert(RB == VgprRB);
1770 auto Sext = B.buildSExt({VgprRB, S32}, Reg);
1771 Op.setReg(Sext.getReg(0));
1772 break;
1773 }
1774 case Vgpr32ZExt: {
1775 // Note this ext allows S1, and it is meant to be combined away.
1776 assert(Ty.getSizeInBits() < 32);
1777 assert(RB == VgprRB);
1778 auto Zext = B.buildZExt({VgprRB, S32}, Reg);
1779 Op.setReg(Zext.getReg(0));
1780 break;
1781 }
1782 default:
1783 reportGISelFailure(
1784 MF, MORE, "amdgpu-regbanklegalize",
1785 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
1786 return false;
1787 }
1788 }
1789 return true;
1790}
1791
1792[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
1793 const RegisterBank *RB,
1794 MachineRegisterInfo &MRI,
1795 unsigned StartOpIdx,
1796 unsigned EndOpIdx) {
1797 for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
1798 if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
1799 return false;
1800 }
1801 return true;
1802}
1803
1804 void RegBankLegalizeHelper::applyMappingTrivial(MachineInstr &MI) {
1805 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
1806 // Put RB on all registers
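// The bank of the first def decides the bank of every operand: sgpr defs
// require all-sgpr uses (only asserted), while vgpr defs accept sgpr uses
// and copy them to vgpr below.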
1807 unsigned NumDefs = MI.getNumDefs();
1808 unsigned NumOperands = MI.getNumOperands();
1809
1810 assert(verifyRegBankOnOperands(MI, RB, MRI, 0, NumDefs - 1));
1811 if (RB == SgprRB)
1812 assert(verifyRegBankOnOperands(MI, RB, MRI, NumDefs, NumOperands - 1));
1813
1814 if (RB == VgprRB) {
1815 B.setInstr(MI);
1816 for (unsigned i = NumDefs; i < NumOperands; ++i) {
1817 Register Reg = MI.getOperand(i).getReg();
1818 if (MRI.getRegBank(Reg) != RB) {
1819 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1820 MI.getOperand(i).setReg(Copy.getReg(0));
1821 }
1822 }
1823 }
1824}
1825
1826bool RegBankLegalizeHelper::applyRegisterBanksINTRIN_IMAGE(MachineInstr &MI) {
1827 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1828 AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI));
1829 assert(RSrcIntrin && RSrcIntrin->IsImage);
1830
1831 unsigned RsrcIdx = RSrcIntrin->RsrcArg;
1832 const unsigned NumDefs = MI.getNumExplicitDefs();
1833
1834 // The reported argument index is relative to the IR intrinsic call arguments,
1835 // so we need to shift by the number of defs and the intrinsic ID.
1836 RsrcIdx += NumDefs + 1;
1837
1838 MachineBasicBlock *MBB = MI.getParent();
1839 B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(MI.getIterator())));
1840
1841 // Defs (for image loads with return) are vgpr.
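// Each non-vgpr def is redirected to a fresh vgpr, and the original
// (uniform) register is then defined by a read-any-lane of that vgpr,
// inserted right after the image instruction.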
1842 for (unsigned i = 0; i < NumDefs; ++i) {
1843 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(i).getReg());
1844 if (RB == VgprRB)
1845 continue;
1846
1847 Register Reg = MI.getOperand(i).getReg();
1848 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(Reg)});
1849 MI.getOperand(i).setReg(NewVgprDst);
1850 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1851 }
1852
1853 B.setInstrAndDebugLoc(MI);
1854
1855 // Register uses (before RsrcIdx) are vgpr.
1856 for (unsigned i = 1; i < RsrcIdx; ++i) {
1857 MachineOperand &Op = MI.getOperand(i);
1858 if (!Op.isReg())
1859 continue;
1860
1861 Register Reg = Op.getReg();
1862 if (!Reg.isVirtual())
1863 continue;
1864
1865 if (MRI.getRegBank(Reg) == VgprRB)
1866 continue;
1867
1868 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1869 Op.setReg(Copy.getReg(0));
1870 }
1871
1872 SmallSet<Register, 4> OpsToWaterfall;
1873
1874 // The register use at RsrcIdx (and at RsrcIdx+1 in some cases) must be sgpr.
1875 for (unsigned i = RsrcIdx; i < MI.getNumOperands(); ++i) {
1876 MachineOperand &Op = MI.getOperand(i);
1877 if (!Op.isReg())
1878 continue;
1879
1880 Register Reg = Op.getReg();
1881 if (MRI.getRegBank(Reg) != SgprRB)
1882 OpsToWaterfall.insert(Reg);
1883 }
1884
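// Descriptor operands that are not already uniform cannot be fixed with a
// copy to sgpr; wrap just this instruction in a waterfall loop instead.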
1885 if (!OpsToWaterfall.empty()) {
1886 MachineBasicBlock::iterator MII = MI.getIterator();
1887 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
1888 }
1889
1890 return true;
1891}