LLVM 23.0.0git
AMDGPURegBankLegalizeHelper.cpp
Go to the documentation of this file.
//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// Implements actual lowering algorithms for each ID that can be used in
/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
//
//===----------------------------------------------------------------------===//
16#include "AMDGPUInstrInfo.h"
19#include "GCNSubtarget.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27
28#define DEBUG_TYPE "amdgpu-regbanklegalize"
29
30using namespace llvm;
31using namespace AMDGPU;
32
35 const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
36 : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B),
37 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
38 RBLRules(RBLRules), IsWave32(ST.isWave32()),
39 SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
40 VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
41 VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
42
44 const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
45 if (!RuleSet) {
46 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
47 "No AMDGPU RegBankLegalize rules defined for opcode",
48 MI);
49 return false;
50 }
51
52 const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
53 if (!Mapping) {
54 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
55 "AMDGPU RegBankLegalize: none of the rules defined with "
56 "'Any' for MI's opcode matched MI",
57 MI);
58 return false;
59 }
60
61 WaterfallInfo WFI;
62 unsigned OpIdx = 0;
63 if (Mapping->DstOpMapping.size() > 0) {
64 B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
65 if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping))
66 return false;
67 }
68 if (Mapping->SrcOpMapping.size() > 0) {
69 B.setInstr(MI);
70 if (!applyMappingSrc(MI, OpIdx, Mapping->SrcOpMapping, WFI))
71 return false;
72 }
73
74 if (!lower(MI, *Mapping, WFI))
75 return false;
76
77 return true;
78}
79
// Execute the instruction range [WFI.Start, WFI.End) inside a waterfall
// loop: VGPR use-registers listed in WFI.SgprWaterfallOperandRegs are
// replaced by a readfirstlane'd SGPR value, and the loop body re-executes
// until every active lane's value has been covered (see the block diagram
// below). Returns true on success.
//
// NOTE(review): this copy of the file is a doxygen scrape that dropped
// several source lines (92, 93, 95, 128, 129, 132, 144, 234). The
// declarations of TRI, BeginIt/EndIt, LoopBB, BodyBB, MBBI and OpParts
// referenced below are among the dropped lines — confirm against upstream
// before building. The remaining code text is preserved byte-for-byte.
80bool RegBankLegalizeHelper::executeInWaterfallLoop(MachineIRBuilder &B,
81 const WaterfallInfo &WFI) {
82 assert(WFI.Start.isValid() && WFI.End.isValid() &&
83 "Waterfall range not initialized");
84
85 // Track use registers which have already been expanded with a readfirstlane
86 // sequence. This may have multiple uses if moving a sequence.
87 DenseMap<Register, Register> WaterfalledRegMap;
88
89 MachineBasicBlock &MBB = B.getMBB();
90 MachineFunction &MF = B.getMF();
91
// NOTE(review): lines 92-93 and 95 missing here — presumably BeginIt/EndIt
// (taken from WFI.Start/WFI.End) and the TRI declaration used just below.
94
96 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
// Select wave-size-dependent exec-mask opcodes and the physical exec reg.
97 unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
98 if (IsWave32) {
99 MovExecOpc = AMDGPU::S_MOV_B32;
100 MovExecTermOpc = AMDGPU::S_MOV_B32_term;
101 XorTermOpc = AMDGPU::S_XOR_B32_term;
102 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
103 ExecReg = AMDGPU::EXEC_LO;
104 } else {
105 MovExecOpc = AMDGPU::S_MOV_B64;
106 MovExecTermOpc = AMDGPU::S_MOV_B64_term;
107 XorTermOpc = AMDGPU::S_XOR_B64_term;
108 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
109 ExecReg = AMDGPU::EXEC;
110 }
111
112#ifndef NDEBUG
113 const int OrigRangeSize = std::distance(BeginIt, EndIt);
114#endif
115
116 MachineRegisterInfo &MRI = *B.getMRI();
117 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
118 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
119
120 // Don't bother using generic instructions/registers for the exec mask.
121 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
122
123 Register SavedExec = MRI.createVirtualRegister(WaveRC);
124
125 // To insert the loop we need to split the block. Move everything before
126 // this point to a new block, and insert a new empty block before this
127 // instruction.
// NOTE(review): lines 128-129 missing — presumably
// MF.CreateMachineBasicBlock() for LoopBB and BodyBB; confirm upstream.
130 MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
131 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
// NOTE(review): line 132 missing — presumably the MBBI function-iterator
// declaration incremented on the next line.
133 ++MBBI;
134 MF.insert(MBBI, LoopBB);
135 MF.insert(MBBI, BodyBB);
136 MF.insert(MBBI, RestoreExecBB);
137 MF.insert(MBBI, RemainderBB);
138
139 LoopBB->addSuccessor(BodyBB);
140 BodyBB->addSuccessor(RestoreExecBB);
141 BodyBB->addSuccessor(LoopBB);
142
143 // Move the rest of the block into a new block.
// NOTE(review): line 144 missing here.
145 RemainderBB->splice(RemainderBB->begin(), &MBB, EndIt, MBB.end());
146
147 MBB.addSuccessor(LoopBB);
148 RestoreExecBB->addSuccessor(RemainderBB);
149
150 B.setInsertPt(*LoopBB, LoopBB->end());
151
152 // +-MBB:------------+
153 // | ... |
154 // | %0 = G_INST_1 |
155 // | %Dst = MI %Vgpr |
156 // | %1 = G_INST_2 |
157 // | ... |
158 // +-----------------+
159 // ->
160 // +-MBB-------------------------------+
161 // | ... |
162 // | %0 = G_INST_1 |
163 // | %SaveExecReg = S_MOV_B32 $exec_lo |
164 // +----------------|------------------+
165 // | /------------------------------|
166 // V V |
167 // +-LoopBB---------------------------------------------------------------+ |
168 // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
169 // | instead of executing for each lane, see if other lanes had | |
170 // | same value for %Vgpr and execute for them also. | |
171 // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
172 // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
173 // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
174 // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
175 // +----------------|-----------------------------------------------------+ |
176 // V |
177 // +-BodyBB------------------------------------------------------------+ |
178 // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
179 // | executed only for active lanes and written to Dst | |
180 // | $exec = S_XOR_B32 $exec, %SavedExec | |
181 // | set active lanes to 0 in SavedExec, lanes that did not write to | |
182 // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
183 // | SI_WATERFALL_LOOP LoopBB |-----|
184 // +----------------|--------------------------------------------------+
185 // V
186 // +-RestoreExecBB--------------------------+
187 // | $exec_lo = S_MOV_B32_term %SaveExecReg |
188 // +----------------|-----------------------+
189 // V
190 // +-RemainderBB:----------------------+
191 // | %1 = G_INST_2 |
192 // | ... |
193 // +---------------------------------- +
194
195 // Move the instruction into the loop body. Note we moved everything after
196 // Range.end() already into a new block, so Range.end() is no longer valid.
197 BodyBB->splice(BodyBB->end(), &MBB, BeginIt, MBB.end());
198
199 // Figure out the iterator range after splicing the instructions.
200 MachineBasicBlock::iterator NewBegin = BeginIt;
201 auto NewEnd = BodyBB->end();
202 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
203
204 B.setMBB(*LoopBB);
// CondReg accumulates the AND of all per-part lane-equality compares.
205 Register CondReg;
206
207 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
208 for (MachineOperand &Op : MI.all_uses()) {
209 Register OldReg = Op.getReg();
210 if (!WFI.SgprWaterfallOperandRegs.count(OldReg))
211 continue;
212
213 // See if we already processed this register in another instruction in
214 // the sequence.
215 auto OldVal = WaterfalledRegMap.find(OldReg);
216 if (OldVal != WaterfalledRegMap.end()) {
217 Op.setReg(OldVal->second);
218 continue;
219 }
220
221 Register OpReg = Op.getReg();
222 LLT OpTy = MRI.getType(OpReg);
223
224 // TODO: support for agpr
225 assert(MRI.getRegBank(OpReg) == VgprRB);
226 Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
227 buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
228
229 // Build the comparison(s), CurrentLaneReg == OpReg.
230 unsigned OpSize = OpTy.getSizeInBits();
// Compare in 64-bit parts when the size is a multiple of 64, else 32-bit.
231 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
232 LLT PartTy = LLT::scalar(PartSize);
233 unsigned NumParts = OpSize / PartSize;
// NOTE(review): line 234 missing — presumably
// `SmallVector<Register, 8> OpParts;` (used below next to CurrentLaneParts).
235 SmallVector<Register, 8> CurrentLaneParts;
236
237 if (NumParts == 1) {
238 OpParts.push_back(OpReg);
239 CurrentLaneParts.push_back(CurrentLaneReg);
240 } else {
241 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
242 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
243 for (unsigned i = 0; i < NumParts; ++i) {
244 OpParts.push_back(UnmergeOp.getReg(i));
245 CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
246 }
247 }
248
249 for (unsigned i = 0; i < NumParts; ++i) {
250 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
251 B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
252
253 if (!CondReg)
254 CondReg = CmpReg;
255 else
256 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
257 }
258
259 Op.setReg(CurrentLaneReg);
260
261 // Make sure we don't re-process this register again.
262 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
263 }
264 }
265
266 // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
267 Register CondRegLM =
268 MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
269 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
270
271 // Update EXEC, save the original EXEC value to SavedExec.
272 B.buildInstr(AndSaveExecOpc)
273 .addDef(SavedExec)
274 .addReg(CondRegLM, RegState::Kill);
275 MRI.setSimpleHint(SavedExec, CondRegLM);
276
277 B.setInsertPt(*BodyBB, BodyBB->end());
278
279 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
280 B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);
281
282 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
283 // s_cbranch_scc0?
284
285 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
286 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
287
288 // Save the EXEC mask before the loop.
289 B.setInsertPt(MBB, MBB.end());
290 B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);
291
292 // Restore the EXEC mask after the loop.
293 B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
294 B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);
295
296 // Set the insert point after the original instruction, so any new
297 // instructions will be in the remainder.
298 B.setInsertPt(*RemainderBB, RemainderBB->begin());
299
300 return true;
301}
302
303bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
304 ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
305 MachineFunction &MF = B.getMF();
306 assert(MI.getNumMemOperands() == 1);
307 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
308 Register Dst = MI.getOperand(0).getReg();
309 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
310 Register Base = MI.getOperand(1).getReg();
311 LLT PtrTy = MRI.getType(Base);
312 const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
313 LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
314 SmallVector<Register, 4> LoadPartRegs;
315
316 unsigned ByteOffset = 0;
317 for (LLT PartTy : LLTBreakdown) {
318 Register BasePlusOffset;
319 if (ByteOffset == 0) {
320 BasePlusOffset = Base;
321 } else {
322 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
323 BasePlusOffset =
324 B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
325 }
326 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
327 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
328 LoadPartRegs.push_back(LoadPart.getReg(0));
329 ByteOffset += PartTy.getSizeInBytes();
330 }
331
332 if (!MergeTy.isValid()) {
333 // Loads are of same size, concat or merge them together.
334 B.buildMergeLikeInstr(Dst, LoadPartRegs);
335 } else {
336 // Loads are not all of same size, need to unmerge them to smaller pieces
337 // of MergeTy type, then merge pieces to Dst.
338 SmallVector<Register, 4> MergeTyParts;
339 for (Register Reg : LoadPartRegs) {
340 if (MRI.getType(Reg) == MergeTy) {
341 MergeTyParts.push_back(Reg);
342 } else {
343 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
344 for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
345 MergeTyParts.push_back(Unmerge.getReg(i));
346 }
347 }
348 B.buildMergeLikeInstr(Dst, MergeTyParts);
349 }
350 MI.eraseFromParent();
351 return true;
352}
353
354bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
355 LLT MergeTy) {
356 MachineFunction &MF = B.getMF();
357 assert(MI.getNumMemOperands() == 1);
358 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
359 Register Dst = MI.getOperand(0).getReg();
360 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
361 Register Base = MI.getOperand(1).getReg();
362
363 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
364 auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);
365
366 if (WideTy.isScalar()) {
367 B.buildTrunc(Dst, WideLoad);
368 } else {
369 SmallVector<Register, 4> MergeTyParts;
370 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
371
372 LLT DstTy = MRI.getType(Dst);
373 unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
374 for (unsigned i = 0; i < NumElts; ++i) {
375 MergeTyParts.push_back(Unmerge.getReg(i));
376 }
377 B.buildMergeLikeInstr(Dst, MergeTyParts);
378 }
379 MI.eraseFromParent();
380 return true;
381}
382
383bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
384 Register Dst = MI.getDstReg();
385 Register Ptr = MI.getPointerReg();
386 MachineMemOperand &MMO = MI.getMMO();
387 unsigned MemSize = 8 * MMO.getSize().getValue();
388
389 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
390
391 if (MI.getOpcode() == G_LOAD) {
392 B.buildLoad(Dst, Ptr, *WideMMO);
393 } else {
394 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
395
396 if (MI.getOpcode() == G_ZEXTLOAD) {
397 APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
398 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
399 B.buildAnd(Dst, Load, MaskCst);
400 } else {
401 assert(MI.getOpcode() == G_SEXTLOAD);
402 B.buildSExtInReg(Dst, Load, MemSize);
403 }
404 }
405
406 MI.eraseFromParent();
407 return true;
408}
409
410bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
411 Register Dst = MI.getOperand(0).getReg();
412 LLT Ty = MRI.getType(Dst);
413 Register Src = MI.getOperand(1).getReg();
414 unsigned Opc = MI.getOpcode();
415 int TrueExtCst = Opc == G_SEXT ? -1 : 1;
416 if (Ty == S32 || Ty == S16) {
417 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
418 auto False = B.buildConstant({VgprRB, Ty}, 0);
419 B.buildSelect(Dst, Src, True, False);
420 } else if (Ty == S64) {
421 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
422 auto False = B.buildConstant({VgprRB_S32}, 0);
423 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
424 MachineInstrBuilder Hi;
425 switch (Opc) {
426 case G_SEXT:
427 Hi = Lo;
428 break;
429 case G_ZEXT:
430 Hi = False;
431 break;
432 case G_ANYEXT:
433 Hi = B.buildUndef({VgprRB_S32});
434 break;
435 default:
437 MF, MORE, "amdgpu-regbanklegalize",
438 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
439 return false;
440 }
441
442 B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
443 } else {
445 MF, MORE, "amdgpu-regbanklegalize",
446 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
447 return false;
448 }
449
450 MI.eraseFromParent();
451 return true;
452}
453
454std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
455 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
456 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
457 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
458 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
459 return {Lo.getReg(0), Hi.getReg(0)};
460}
461
462std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
463 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
464 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
465 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
466 return {Lo.getReg(0), Hi.getReg(0)};
467}
468
469std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
470 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
471 auto Lo = PackedS32;
472 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
473 return {Lo.getReg(0), Hi.getReg(0)};
474}
475
476std::pair<Register, Register>
477RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
478 auto [Lo32, Hi32] = unpackAExt(Reg);
479 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
480 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
481}
482
483bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
484 Register Lo, Hi;
485 switch (MI.getOpcode()) {
486 case AMDGPU::G_SHL: {
487 auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
488 auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
489 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
490 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
491 break;
492 }
493 case AMDGPU::G_LSHR: {
494 auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
495 auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
496 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
497 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
498 break;
499 }
500 case AMDGPU::G_ASHR: {
501 auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
502 auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
503 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
504 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
505 break;
506 }
507 default:
509 MF, MORE, "amdgpu-regbanklegalize",
510 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
511 MI);
512 return false;
513 }
514 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
515 MI.eraseFromParent();
516 return true;
517}
518
519bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
520 Register Lo, Hi;
521 switch (MI.getOpcode()) {
522 case AMDGPU::G_SMIN:
523 case AMDGPU::G_SMAX: {
524 // For signed operations, use sign extension
525 auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
526 auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
527 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
528 .getReg(0);
529 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
530 .getReg(0);
531 break;
532 }
533 case AMDGPU::G_UMIN:
534 case AMDGPU::G_UMAX: {
535 // For unsigned operations, use zero extension
536 auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
537 auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
538 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
539 .getReg(0);
540 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
541 .getReg(0);
542 break;
543 }
544 default:
546 MF, MORE, "amdgpu-regbanklegalize",
547 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
548 return false;
549 }
550 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
551 MI.eraseFromParent();
552 return true;
553}
554
555bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
556 auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
557 auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
558 auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
559 auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
560 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
561 {ResLo.getReg(0), ResHi.getReg(0)});
562 MI.eraseFromParent();
563 return true;
564}
565
568 return (GI->is(Intrinsic::amdgcn_sbfe));
569
570 return MI.getOpcode() == AMDGPU::G_SBFX;
571}
572
573bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
574 Register Dst = MI.getOperand(0).getReg();
575 assert(MRI.getType(Dst) == LLT::scalar(64));
576 bool Signed = isSignedBFE(MI);
577 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
578 // Extract bitfield from Src, LSBit is the least-significant bit for the
579 // extraction (field offset) and Width is size of bitfield.
580 Register Src = MI.getOperand(FirstOpnd).getReg();
581 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
582 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
583 // Comments are for signed bitfield extract, similar for unsigned. x is sign
584 // bit. s is sign, l is LSB and y are remaining bits of bitfield to extract.
585
586 // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
587 unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
588 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
589
590 auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);
591
592 // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
593 // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
594 // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
595 if (!ConstWidth) {
596 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
597 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
598 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
599 MI.eraseFromParent();
600 return true;
601 }
602
603 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
604 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
605 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
606 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
607 auto Zero = B.buildConstant({VgprRB, S32}, 0);
608 unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
609
610 if (WidthImm <= 32) {
611 // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
612 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
613 MachineInstrBuilder Hi;
614 if (Signed) {
615 // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
616 Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
617 } else {
618 // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
619 Hi = Zero;
620 }
621 B.buildMergeLikeInstr(Dst, {Lo, Hi});
622 } else {
623 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
624 // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
625 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
626 B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
627 }
628
629 MI.eraseFromParent();
630 return true;
631}
632
633bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
634 Register DstReg = MI.getOperand(0).getReg();
635 LLT Ty = MRI.getType(DstReg);
636 bool Signed = isSignedBFE(MI);
637 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
638 Register Src = MI.getOperand(FirstOpnd).getReg();
639 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
640 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
641 // For uniform bit field extract there are 4 available instructions, but
642 // LSBit(field offset) and Width(size of bitfield) need to be packed in S32,
643 // field offset in low and size in high 16 bits.
644
645 // Src1 Hi16|Lo16 = Size|FieldOffset
646 auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
647 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
648 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
649 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
650 unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
651 unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
652 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
653
654 // Select machine instruction, because of reg class constraining, insert
655 // copies from reg class to reg bank.
656 auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
657 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
658 constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
659 *ST.getRegisterInfo(), RBI);
660
661 B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
662 MI.eraseFromParent();
663 return true;
664}
665
666bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
667 Register Dst = MI.getOperand(0).getReg();
668 LLT DstTy = MRI.getType(Dst);
669 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
670 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
671 auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
672 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
673 unsigned Opc = MI.getOpcode();
674 auto Flags = MI.getFlags();
675 auto Lo =
676 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
677 auto Hi =
678 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
679 B.buildMergeLikeInstr(Dst, {Lo, Hi});
680 MI.eraseFromParent();
681 return true;
682}
683
684bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
685 Register Dst = MI.getOperand(0).getReg();
686 assert(MRI.getType(Dst) == S64);
687 auto Op1 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(1).getReg());
688 auto Op2 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(2).getReg());
689
690 // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to
691 // match GlobalISel with old regbankselect.
692 auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
693 auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
694 auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
695 auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
696 auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
697 auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
698
699 B.buildMergeLikeInstr(Dst, {Lo, Hi});
700 MI.eraseFromParent();
701 return true;
702}
703
704bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
705 Register Dst = MI.getOperand(0).getReg();
706 assert(MRI.getType(Dst) == V2S16);
707 unsigned Opc = MI.getOpcode();
708 unsigned NumOps = MI.getNumOperands();
709 auto Flags = MI.getFlags();
710
711 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
712
713 if (NumOps == 2) {
714 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
715 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
716 B.buildMergeLikeInstr(Dst, {Lo, Hi});
717 MI.eraseFromParent();
718 return true;
719 }
720
721 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());
722
723 if (NumOps == 3) {
724 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
725 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
726 B.buildMergeLikeInstr(Dst, {Lo, Hi});
727 MI.eraseFromParent();
728 return true;
729 }
730
731 assert(NumOps == 4);
732 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg());
733 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags);
734 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags);
735 B.buildMergeLikeInstr(Dst, {Lo, Hi});
736 MI.eraseFromParent();
737 return true;
738}
739
740bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
741 Register Dst0 = MI.getOperand(0).getReg();
742 Register Dst1 = MI.getOperand(1).getReg();
743 Register Src0 = MI.getOperand(2).getReg();
744 Register Src1 = MI.getOperand(3).getReg();
745 Register Src2 = MI.getOperand(4).getReg();
746
747 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
748
749 // Keep the multiplication on the SALU.
750 Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
751 Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
752 if (ST.hasScalarMulHiInsts()) {
753 B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
754 } else {
755 auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
756 auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
757 auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
758 buildReadAnyLane(B, DstHi, MulHi.getReg(0), RBI);
759 }
760
761 // Accumulate and produce the "carry-out" bit.
762
763 // The "carry-out" is defined as bit 64 of the result when computed as a
764 // big integer. For unsigned multiply-add, this matches the usual
765 // definition of carry-out.
766 if (mi_match(Src2, MRI, MIPatternMatch::m_ZeroInt())) {
767 // No accumulate: result is just the multiplication, carry is 0.
768 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
769 B.buildConstant(Dst1, 0);
770 } else {
771 // Accumulate: add Src2 to the multiplication result with carry chain.
772 Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
773 Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
774 B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
775
776 auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
777 auto AddHi =
778 B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
779 B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
780 B.buildCopy(Dst1, AddHi.getReg(1));
781 }
782
783 MI.eraseFromParent();
784 return true;
785}
786
787bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
788 Register Dst = MI.getOperand(0).getReg();
789 LLT DstTy = MRI.getType(Dst);
790 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
791 (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
792 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
793 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
794 auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
795 Register Cond = MI.getOperand(1).getReg();
796 auto Flags = MI.getFlags();
797 auto Lo =
798 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
799 auto Hi =
800 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);
801
802 B.buildMergeLikeInstr(Dst, {Lo, Hi});
803 MI.eraseFromParent();
804 return true;
805}
806
807bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
808 auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
809 int Amt = MI.getOperand(2).getImm();
810 Register Lo, Hi;
811 // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
812 if (Amt <= 32) {
813 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
814 if (Amt == 32) {
815 // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
816 Lo = Freeze.getReg(0);
817 } else {
818 // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
819 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
820 }
821
822 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
823 Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
824 } else {
825 // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
826 Lo = Op1.getReg(0);
827 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
828 }
829
830 B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
831 MI.eraseFromParent();
832 return true;
833}
834
835bool RegBankLegalizeHelper::lower(MachineInstr &MI,
836 const RegBankLLTMapping &Mapping,
837 WaterfallInfo &WFI) {
838
839 switch (Mapping.LoweringMethod) {
840 case DoNotLower:
841 break;
842 case VccExtToSel:
843 return lowerVccExtToSel(MI);
844 case UniExtToSel: {
845 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
846 auto True = B.buildConstant({SgprRB, Ty},
847 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
848 auto False = B.buildConstant({SgprRB, Ty}, 0);
849 // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is compare.
850 // We are making select here. S1 cond was already 'any-extended to S32' +
851 // 'AND with 1 to clean high bits' by Sgpr32AExtBoolInReg.
852 B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
853 False);
854 MI.eraseFromParent();
855 return true;
856 }
857 case UnpackBitShift:
858 return lowerUnpackBitShift(MI);
859 case UnpackMinMax:
860 return lowerUnpackMinMax(MI);
861 case ScalarizeToS16:
862 return lowerSplitTo16(MI);
863 case Ext32To64: {
864 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
865 MachineInstrBuilder Hi;
866 switch (MI.getOpcode()) {
867 case AMDGPU::G_ZEXT: {
868 Hi = B.buildConstant({RB, S32}, 0);
869 break;
870 }
871 case AMDGPU::G_SEXT: {
872 // Replicate sign bit from 32-bit extended part.
873 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
874 Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
875 break;
876 }
877 case AMDGPU::G_ANYEXT: {
878 Hi = B.buildUndef({RB, S32});
879 break;
880 }
881 default:
882 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
883 "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
884 MI);
885 return false;
886 }
887
888 B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
889 {MI.getOperand(1).getReg(), Hi});
890 MI.eraseFromParent();
891 return true;
892 }
893 case UniCstExt: {
894 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
895 B.buildConstant(MI.getOperand(0).getReg(), ConstVal);
896
897 MI.eraseFromParent();
898 return true;
899 }
900 case VgprToVccCopy: {
901 Register Src = MI.getOperand(1).getReg();
902 LLT Ty = MRI.getType(Src);
903 // Take lowest bit from each lane and put it in lane mask.
904 // Lowering via compare, but we need to clean high bits first as compare
905 // compares all bits in register.
906 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
907 if (Ty == S64) {
908 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
909 auto One = B.buildConstant(VgprRB_S32, 1);
910 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
911 auto Zero = B.buildConstant(VgprRB_S32, 0);
912 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
913 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
914 } else {
915 assert(Ty == S32 || Ty == S16);
916 auto One = B.buildConstant({VgprRB, Ty}, 1);
917 B.buildAnd(BoolSrc, Src, One);
918 }
919 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
920 B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
921 MI.eraseFromParent();
922 return true;
923 }
924 case V_BFE:
925 return lowerV_BFE(MI);
926 case S_BFE:
927 return lowerS_BFE(MI);
928 case UniMAD64:
929 return lowerUniMAD64(MI);
930 case UniMul64: {
931 B.buildMul(MI.getOperand(0), MI.getOperand(1), MI.getOperand(2));
932 MI.eraseFromParent();
933 return true;
934 }
935 case DivSMulToMAD: {
936 auto Op1 = B.buildTrunc(VgprRB_S32, MI.getOperand(1));
937 auto Op2 = B.buildTrunc(VgprRB_S32, MI.getOperand(2));
938 auto Zero = B.buildConstant({VgprRB, S64}, 0);
939
940 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
941 ? AMDGPU::G_AMDGPU_MAD_U64_U32
942 : AMDGPU::G_AMDGPU_MAD_I64_I32;
943
944 B.buildInstr(NewOpc, {MI.getOperand(0).getReg(), {SgprRB, S32}},
945 {Op1, Op2, Zero});
946 MI.eraseFromParent();
947 return true;
948 }
949 case SplitTo32:
950 return lowerSplitTo32(MI);
951 case SplitTo32Mul:
952 return lowerSplitTo32Mul(MI);
953 case SplitTo32Select:
954 return lowerSplitTo32Select(MI);
956 return lowerSplitTo32SExtInReg(MI);
957 case SplitLoad: {
958 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
959 unsigned Size = DstTy.getSizeInBits();
960 // Even split to 128-bit loads
961 if (Size > 128) {
962 LLT B128;
963 if (DstTy.isVector()) {
964 LLT EltTy = DstTy.getElementType();
965 B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
966 } else {
967 B128 = LLT::scalar(128);
968 }
969 if (Size / 128 == 2)
970 splitLoad(MI, {B128, B128});
971 else if (Size / 128 == 4)
972 splitLoad(MI, {B128, B128, B128, B128});
973 else {
974 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
975 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
976 MI);
977 return false;
978 }
979 }
980 // 64 and 32 bit load
981 else if (DstTy == S96)
982 splitLoad(MI, {S64, S32}, S32);
983 else if (DstTy == V3S32)
984 splitLoad(MI, {V2S32, S32}, S32);
985 else if (DstTy == V6S16)
986 splitLoad(MI, {V4S16, V2S16}, V2S16);
987 else {
988 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
989 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
990 MI);
991 return false;
992 }
993 return true;
994 }
995 case WidenLoad: {
996 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
997 if (DstTy == S96)
998 widenLoad(MI, S128);
999 else if (DstTy == V3S32)
1000 widenLoad(MI, V4S32, S32);
1001 else if (DstTy == V6S16)
1002 widenLoad(MI, V8S16, V2S16);
1003 else {
1004 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1005 "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
1006 MI);
1007 return false;
1008 }
1009 return true;
1010 }
1011 case UnpackAExt:
1012 return lowerUnpackAExt(MI);
1013 case WidenMMOToS32:
1014 return widenMMOToS32(cast<GAnyLoad>(MI));
1015 case VerifyAllSgpr: {
1016 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1017 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1018 }));
1019 return true;
1020 }
1021 case ApplyAllVgpr: {
1022 assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) {
1023 return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
1024 }));
1025 B.setInstrAndDebugLoc(MI);
1026 for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) {
1027 Register Reg = MI.getOperand(i).getReg();
1028 if (MRI.getRegBank(Reg) != VgprRB) {
1029 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1030 MI.getOperand(i).setReg(Copy.getReg(0));
1031 }
1032 }
1033 return true;
1034 }
1035 case UnmergeToShiftTrunc: {
1036 GUnmerge *Unmerge = dyn_cast<GUnmerge>(&MI);
1037 LLT Ty = MRI.getType(Unmerge->getSourceReg());
1038 if (Ty.getSizeInBits() % 32 != 0) {
1039 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1040 "AMDGPU RegBankLegalize: unmerge not multiple of 32",
1041 MI);
1042 return false;
1043 }
1044
1045 B.setInstrAndDebugLoc(MI);
1046 if (Ty.getSizeInBits() > 32) {
1047 auto UnmergeV2S16 =
1048 B.buildUnmerge({SgprRB, V2S16}, Unmerge->getSourceReg());
1049 for (unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
1050 auto [Dst0S32, Dst1S32] =
1051 unpackAExt(UnmergeV2S16->getOperand(i).getReg());
1052 B.buildTrunc(MI.getOperand(i * 2).getReg(), Dst0S32);
1053 B.buildTrunc(MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
1054 }
1055 } else {
1056 auto [Dst0S32, Dst1S32] = unpackAExt(MI.getOperand(2).getReg());
1057 B.buildTrunc(MI.getOperand(0).getReg(), Dst0S32);
1058 B.buildTrunc(MI.getOperand(1).getReg(), Dst1S32);
1059 }
1060
1061 MI.eraseFromParent();
1062 return true;
1063 }
1064 case ApplyINTRIN_IMAGE:
1065 return applyRegisterBanksINTRIN_IMAGE(MI);
1066 }
1067
1068 if (!WFI.SgprWaterfallOperandRegs.empty()) {
1069 if (!executeInWaterfallLoop(B, WFI))
1070 return false;
1071 }
1072 return true;
1073}
1074
1075LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
1076 switch (ID) {
1077 case Vcc:
1078 case UniInVcc:
1079 return LLT::scalar(1);
1080 case Sgpr16:
1081 case Vgpr16:
1082 case UniInVgprS16:
1083 return LLT::scalar(16);
1084 case Sgpr32:
1085 case Sgpr32_WF:
1086 case Sgpr32Trunc:
1087 case Sgpr32AExt:
1089 case Sgpr32SExt:
1090 case Sgpr32ZExt:
1091 case UniInVgprS32:
1092 case Vgpr32:
1093 case Vgpr32AExt:
1094 case Vgpr32SExt:
1095 case Vgpr32ZExt:
1096 return LLT::scalar(32);
1097 case Sgpr64:
1098 case Vgpr64:
1099 case UniInVgprS64:
1100 return LLT::scalar(64);
1101 case Sgpr128:
1102 case Vgpr128:
1103 return LLT::scalar(128);
1104 case SgprP0:
1105 case SgprP0Call_WF:
1106 case VgprP0:
1107 return LLT::pointer(0, 64);
1108 case SgprP1:
1109 case VgprP1:
1110 return LLT::pointer(1, 64);
1111 case SgprP2:
1112 case VgprP2:
1113 return LLT::pointer(2, 32);
1114 case SgprP3:
1115 case VgprP3:
1116 return LLT::pointer(3, 32);
1117 case SgprP4:
1118 case SgprP4Call_WF:
1119 case VgprP4:
1120 return LLT::pointer(4, 64);
1121 case SgprP5:
1122 case VgprP5:
1123 return LLT::pointer(5, 32);
1124 case SgprP8:
1125 return LLT::pointer(8, 128);
1126 case SgprV2S16:
1127 case VgprV2S16:
1128 case UniInVgprV2S16:
1129 return LLT::fixed_vector(2, 16);
1130 case SgprV2S32:
1131 case VgprV2S32:
1132 case UniInVgprV2S32:
1133 return LLT::fixed_vector(2, 32);
1134 case VgprV3S32:
1135 return LLT::fixed_vector(3, 32);
1136 case SgprV4S32:
1137 case SgprV4S32_WF:
1138 case VgprV4S32:
1139 case UniInVgprV4S32:
1140 return LLT::fixed_vector(4, 32);
1141 case VgprV2S64:
1142 case UniInVgprV2S64:
1143 return LLT::fixed_vector(2, 64);
1144 default:
1145 return LLT();
1146 }
1147}
1148
1149LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
1150 switch (ID) {
1151 case SgprB32:
1152 case VgprB32:
1153 case UniInVgprB32:
1154 if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
1155 isAnyPtr(Ty, 32))
1156 return Ty;
1157 return LLT();
1158 case SgprPtr32:
1159 case VgprPtr32:
1160 return isAnyPtr(Ty, 32) ? Ty : LLT();
1161 case SgprPtr64:
1162 case VgprPtr64:
1163 return isAnyPtr(Ty, 64) ? Ty : LLT();
1164 case SgprPtr128:
1165 case VgprPtr128:
1166 return isAnyPtr(Ty, 128) ? Ty : LLT();
1167 case SgprB64:
1168 case VgprB64:
1169 case UniInVgprB64:
1170 if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
1171 Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
1172 return Ty;
1173 return LLT();
1174 case SgprB96:
1175 case VgprB96:
1176 case UniInVgprB96:
1177 if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
1178 Ty == LLT::fixed_vector(6, 16))
1179 return Ty;
1180 return LLT();
1181 case SgprB128:
1182 case VgprB128:
1183 case UniInVgprB128:
1184 if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
1185 Ty == LLT::fixed_vector(2, 64) || Ty == LLT::fixed_vector(8, 16) ||
1186 isAnyPtr(Ty, 128))
1187 return Ty;
1188 return LLT();
1189 case VgprB160:
1190 case UniInVgprB160:
1191 if (Ty.getSizeInBits() == 160)
1192 return Ty;
1193 return LLT();
1194 case SgprB256:
1195 case VgprB256:
1196 case UniInVgprB256:
1197 if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
1198 Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
1199 return Ty;
1200 return LLT();
1201 case SgprB512:
1202 case VgprB512:
1203 case UniInVgprB512:
1204 if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
1205 Ty == LLT::fixed_vector(8, 64))
1206 return Ty;
1207 return LLT();
1208 case SgprBRC: {
1209 const SIRegisterInfo *TRI =
1210 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1211 unsigned LLTSize = Ty.getSizeInBits();
1212 if (LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize))
1213 return Ty;
1214 return LLT();
1215 }
1216 case VgprBRC: {
1217 const SIRegisterInfo *TRI =
1218 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1219 if (TRI->getSGPRClassForBitWidth(Ty.getSizeInBits()))
1220 return Ty;
1221 return LLT();
1222 }
1223 default:
1224 return LLT();
1225 }
1226}
1227
1228const RegisterBank *
1229RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
1230 switch (ID) {
1231 case Vcc:
1232 return VccRB;
1233 case Sgpr16:
1234 case Sgpr32:
1235 case Sgpr32_WF:
1236 case Sgpr64:
1237 case Sgpr128:
1238 case SgprP0:
1239 case SgprP0Call_WF:
1240 case SgprP1:
1241 case SgprP2:
1242 case SgprP3:
1243 case SgprP4:
1244 case SgprP4Call_WF:
1245 case SgprP5:
1246 case SgprP8:
1247 case SgprPtr32:
1248 case SgprPtr64:
1249 case SgprPtr128:
1250 case SgprV2S16:
1251 case SgprV2S32:
1252 case SgprV4S32:
1253 case SgprV4S32_WF:
1254 case SgprB32:
1255 case SgprB64:
1256 case SgprB96:
1257 case SgprB128:
1258 case SgprB256:
1259 case SgprB512:
1260 case SgprBRC:
1261 case UniInVcc:
1262 case UniInVgprS16:
1263 case UniInVgprS32:
1264 case UniInVgprS64:
1265 case UniInVgprV2S16:
1266 case UniInVgprV2S32:
1267 case UniInVgprV4S32:
1268 case UniInVgprV2S64:
1269 case UniInVgprB32:
1270 case UniInVgprB64:
1271 case UniInVgprB96:
1272 case UniInVgprB128:
1273 case UniInVgprB160:
1274 case UniInVgprB256:
1275 case UniInVgprB512:
1276 case Sgpr32Trunc:
1277 case Sgpr32AExt:
1279 case Sgpr32SExt:
1280 case Sgpr32ZExt:
1281 return SgprRB;
1282 case Vgpr16:
1283 case Vgpr32:
1284 case Vgpr64:
1285 case Vgpr128:
1286 case VgprP0:
1287 case VgprP1:
1288 case VgprP2:
1289 case VgprP3:
1290 case VgprP4:
1291 case VgprP5:
1292 case VgprPtr32:
1293 case VgprPtr64:
1294 case VgprPtr128:
1295 case VgprV2S16:
1296 case VgprV2S32:
1297 case VgprV2S64:
1298 case VgprV3S32:
1299 case VgprV4S32:
1300 case VgprB32:
1301 case VgprB64:
1302 case VgprB96:
1303 case VgprB128:
1304 case VgprB160:
1305 case VgprB256:
1306 case VgprB512:
1307 case VgprBRC:
1308 case Vgpr32AExt:
1309 case Vgpr32SExt:
1310 case Vgpr32ZExt:
1311 return VgprRB;
1312 default:
1313 return nullptr;
1314 }
1315}
1316
1317bool RegBankLegalizeHelper::applyMappingDst(
1318 MachineInstr &MI, unsigned &OpIdx,
1319 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
1320 // Defs start from operand 0
1321 for (; OpIdx < MethodIDs.size(); ++OpIdx) {
1322 if (MethodIDs[OpIdx] == None)
1323 continue;
1324 MachineOperand &Op = MI.getOperand(OpIdx);
1325 Register Reg = Op.getReg();
1326 LLT Ty = MRI.getType(Reg);
1327 [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);
1328
1329 switch (MethodIDs[OpIdx]) {
1330 // vcc, sgpr and vgpr scalars, pointers and vectors
1331 case Vcc:
1332 case Sgpr16:
1333 case Sgpr32:
1334 case Sgpr64:
1335 case Sgpr128:
1336 case SgprP0:
1337 case SgprP1:
1338 case SgprP3:
1339 case SgprP4:
1340 case SgprP5:
1341 case SgprP8:
1342 case SgprV2S16:
1343 case SgprV2S32:
1344 case SgprV4S32:
1345 case Vgpr16:
1346 case Vgpr32:
1347 case Vgpr64:
1348 case Vgpr128:
1349 case VgprP0:
1350 case VgprP1:
1351 case VgprP2:
1352 case VgprP3:
1353 case VgprP4:
1354 case VgprP5:
1355 case VgprV2S16:
1356 case VgprV2S32:
1357 case VgprV2S64:
1358 case VgprV3S32:
1359 case VgprV4S32: {
1360 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1361 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1362 break;
1363 }
1364 // sgpr and vgpr B-types
1365 case SgprB32:
1366 case SgprB64:
1367 case SgprB96:
1368 case SgprB128:
1369 case SgprB256:
1370 case SgprB512:
1371 case SgprBRC:
1372 case SgprPtr32:
1373 case SgprPtr64:
1374 case SgprPtr128:
1375 case VgprB32:
1376 case VgprB64:
1377 case VgprB96:
1378 case VgprB128:
1379 case VgprB160:
1380 case VgprB256:
1381 case VgprB512:
1382 case VgprBRC:
1383 case VgprPtr32:
1384 case VgprPtr64:
1385 case VgprPtr128: {
1386 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1387 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1388 break;
1389 }
1390 // uniform in vcc/vgpr: scalars, vectors and B-types
1391 case UniInVcc: {
1392 assert(Ty == S1);
1393 assert(RB == SgprRB);
1394 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
1395 Op.setReg(NewDst);
1396 if (!MRI.use_empty(Reg)) {
1397 auto CopyS32_Vcc =
1398 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
1399 B.buildTrunc(Reg, CopyS32_Vcc);
1400 }
1401 break;
1402 }
1403 case UniInVgprS16: {
1404 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1405 assert(RB == SgprRB);
1406 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
1407 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
1408 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
1409 Op.setReg(NewVgprDstS16);
1410 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
1411 buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
1412 B.buildTrunc(Reg, NewSgprDstS32);
1413 break;
1414 }
1415 case UniInVgprS32:
1416 case UniInVgprS64:
1417 case UniInVgprV2S16:
1418 case UniInVgprV2S32:
1419 case UniInVgprV4S32:
1420 case UniInVgprV2S64: {
1421 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1422 assert(RB == SgprRB);
1423 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1424 Op.setReg(NewVgprDst);
1425 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1426 break;
1427 }
1428 case UniInVgprB32:
1429 case UniInVgprB64:
1430 case UniInVgprB96:
1431 case UniInVgprB128:
1432 case UniInVgprB160:
1433 case UniInVgprB256:
1434 case UniInVgprB512: {
1435 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1436 assert(RB == SgprRB);
1437 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1438 Op.setReg(NewVgprDst);
1439 AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1440 break;
1441 }
1442 // sgpr trunc
1443 case Sgpr32Trunc: {
1444 assert(Ty.getSizeInBits() < 32);
1445 assert(RB == SgprRB);
1446 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1447 Op.setReg(NewDst);
1448 if (!MRI.use_empty(Reg))
1449 B.buildTrunc(Reg, NewDst);
1450 break;
1451 }
1452 case InvalidMapping: {
1454 MF, MORE, "amdgpu-regbanklegalize",
1455 "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
1456 return false;
1457 }
1458 default:
1460 MF, MORE, "amdgpu-regbanklegalize",
1461 "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
1462 return false;
1463 }
1464 }
1465
1466 return true;
1467}
1468
1469bool RegBankLegalizeHelper::applyMappingSrc(
1470 MachineInstr &MI, unsigned &OpIdx,
1471 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
1472 WaterfallInfo &WFI) {
1473 for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
1474 if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
1475 continue;
1476
1477 MachineOperand &Op = MI.getOperand(OpIdx);
1478 Register Reg = Op.getReg();
1479 LLT Ty = MRI.getType(Reg);
1480 const RegisterBank *RB = MRI.getRegBank(Reg);
1481
1482 switch (MethodIDs[i]) {
1483 case Vcc: {
1484 assert(Ty == S1);
1485 assert(RB == VccRB || RB == SgprRB);
1486 if (RB == SgprRB) {
1487 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1488 auto CopyVcc_Scc =
1489 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
1490 Op.setReg(CopyVcc_Scc.getReg(0));
1491 }
1492 break;
1493 }
1494 // sgpr scalars, pointers and vectors
1495 case Sgpr16:
1496 case Sgpr32:
1497 case Sgpr64:
1498 case Sgpr128:
1499 case SgprP0:
1500 case SgprP1:
1501 case SgprP3:
1502 case SgprP4:
1503 case SgprP5:
1504 case SgprP8:
1505 case SgprV2S16:
1506 case SgprV2S32:
1507 case SgprV4S32: {
1508 assert(Ty == getTyFromID(MethodIDs[i]));
1509 assert(RB == getRegBankFromID(MethodIDs[i]));
1510 break;
1511 }
1512 // sgpr B-types
1513 case SgprB32:
1514 case SgprB64:
1515 case SgprB96:
1516 case SgprB128:
1517 case SgprB256:
1518 case SgprB512:
1519 case SgprBRC:
1520 case SgprPtr32:
1521 case SgprPtr64:
1522 case SgprPtr128: {
1523 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1524 assert(RB == getRegBankFromID(MethodIDs[i]));
1525 break;
1526 }
1527 // vgpr scalars, pointers and vectors
1528 case Vgpr16:
1529 case Vgpr32:
1530 case Vgpr64:
1531 case Vgpr128:
1532 case VgprP0:
1533 case VgprP1:
1534 case VgprP2:
1535 case VgprP3:
1536 case VgprP4:
1537 case VgprP5:
1538 case VgprV2S16:
1539 case VgprV2S32:
1540 case VgprV2S64:
1541 case VgprV3S32:
1542 case VgprV4S32: {
1543 assert(Ty == getTyFromID(MethodIDs[i]));
1544 if (RB != VgprRB) {
1545 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
1546 Op.setReg(CopyToVgpr.getReg(0));
1547 }
1548 break;
1549 }
1550 // vgpr B-types
1551 case VgprB32:
1552 case VgprB64:
1553 case VgprB96:
1554 case VgprB128:
1555 case VgprB160:
1556 case VgprB256:
1557 case VgprB512:
1558 case VgprBRC:
1559 case VgprPtr32:
1560 case VgprPtr64:
1561 case VgprPtr128: {
1562 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1563 if (RB != VgprRB) {
1564 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
1565 Op.setReg(CopyToVgpr.getReg(0));
1566 }
1567 break;
1568 }
1569 // sgpr waterfall, scalars, and vectors
1570 case Sgpr32_WF:
1571 case SgprV4S32_WF: {
1572 assert(Ty == getTyFromID(MethodIDs[i]));
1573 if (RB != SgprRB) {
1574 WFI.SgprWaterfallOperandRegs.insert(Reg);
1575 if (!WFI.Start.isValid()) {
1576 WFI.Start = MI.getIterator();
1577 WFI.End = std::next(MI.getIterator());
1578 }
1579 }
1580 break;
1581 }
1582 case SgprP0Call_WF:
1583 case SgprP4Call_WF: {
1584 assert(Ty == getTyFromID(MethodIDs[i]));
1585 if (RB != SgprRB) {
1586 WFI.SgprWaterfallOperandRegs.insert(Reg);
1587
1588 // Find the ADJCALLSTACKUP before the call.
1589 MachineBasicBlock::iterator Start = MI.getIterator();
1590 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
1591 --Start;
1592
1593 // Find the ADJCALLSTACKDOWN after the call (include it in range).
1594 MachineBasicBlock::iterator End = MI.getIterator();
1595 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
1596 ++End;
1597 ++End;
1598
1599 B.setInsertPt(*MI.getParent(), Start);
1600 WFI.Start = Start;
1601 WFI.End = End;
1602 }
1603 break;
1604 }
1605 // sgpr and vgpr scalars with extend
1606 case Sgpr32AExt: {
1607 // Note: this ext allows S1, and it is meant to be combined away.
1608 assert(Ty.getSizeInBits() < 32);
1609 assert(RB == SgprRB);
1610 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1611 Op.setReg(Aext.getReg(0));
1612 break;
1613 }
1614 case Sgpr32AExtBoolInReg: {
1615 // Note: this ext allows S1, and it is meant to be combined away.
1616 assert(Ty.getSizeInBits() == 1);
1617 assert(RB == SgprRB);
1618 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1619 // Zext SgprS1 is not legal, make AND with 1 instead. This instruction is
1620 // most of times meant to be combined away in AMDGPURegBankCombiner.
1621 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
1622 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
1623 Op.setReg(BoolInReg.getReg(0));
1624 break;
1625 }
1626 case Sgpr32SExt: {
1627 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
1628 assert(RB == SgprRB);
1629 auto Sext = B.buildSExt(SgprRB_S32, Reg);
1630 Op.setReg(Sext.getReg(0));
1631 break;
1632 }
1633 case Sgpr32ZExt: {
1634 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
1635 assert(RB == SgprRB);
1636 auto Zext = B.buildZExt({SgprRB, S32}, Reg);
1637 Op.setReg(Zext.getReg(0));
1638 break;
1639 }
1640 case Vgpr32AExt: {
1641 assert(Ty.getSizeInBits() < 32);
1642 assert(RB == VgprRB);
1643 auto Aext = B.buildAnyExt({VgprRB, S32}, Reg);
1644 Op.setReg(Aext.getReg(0));
1645 break;
1646 }
1647 case Vgpr32SExt: {
1648 // Note this ext allows S1, and it is meant to be combined away.
1649 assert(Ty.getSizeInBits() < 32);
1650 assert(RB == VgprRB);
1651 auto Sext = B.buildSExt({VgprRB, S32}, Reg);
1652 Op.setReg(Sext.getReg(0));
1653 break;
1654 }
1655 case Vgpr32ZExt: {
1656 // Note this ext allows S1, and it is meant to be combined away.
1657 assert(Ty.getSizeInBits() < 32);
1658 assert(RB == VgprRB);
1659 auto Zext = B.buildZExt({VgprRB, S32}, Reg);
1660 Op.setReg(Zext.getReg(0));
1661 break;
1662 }
1663 default:
1665 MF, MORE, "amdgpu-regbanklegalize",
1666 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
1667 return false;
1668 }
1669 }
1670 return true;
1671}
1672
1674 Register Dst = MI.getOperand(0).getReg();
1675 LLT Ty = MRI.getType(Dst);
1676
1677 if (Ty == LLT::scalar(1) && MUI.isUniform(Dst)) {
1678 B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());
1679
1680 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1681 MI.getOperand(0).setReg(NewDst);
1682 B.buildTrunc(Dst, NewDst);
1683
1684 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1685 Register UseReg = MI.getOperand(i).getReg();
1686
1687 auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
1688 MachineBasicBlock *DefMBB = DefMI->getParent();
1689
1690 B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
1691
1692 auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
1693 MI.getOperand(i).setReg(NewUse.getReg(0));
1694 }
1695
1696 return true;
1697 }
1698
1699 // ALL divergent i1 phis should have been lowered and inst-selected into PHI
1700 // with sgpr reg class and S1 LLT in AMDGPUGlobalISelDivergenceLowering pass.
1701 // Note: this includes divergent phis that don't require lowering.
1702 if (Ty == LLT::scalar(1) && MUI.isDivergent(Dst)) {
1703 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1704 "AMDGPU RegBankLegalize: Can't lower divergent S1 G_PHI",
1705 MI);
1706 return false;
1707 }
1708
1709 // We accept all types that can fit in some register class.
1710 // Uniform G_PHIs have all sgpr registers.
1711 // Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr.
1712 if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) ||
1713 Ty == LLT::pointer(4, 64)) {
1714 return true;
1715 }
1716
1717 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1718 "AMDGPU RegBankLegalize: type not supported for G_PHI",
1719 MI);
1720 return false;
1721}
1722
1723[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
1724 const RegisterBank *RB,
1726 unsigned StartOpIdx,
1727 unsigned EndOpIdx) {
1728 for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
1729 if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
1730 return false;
1731 }
1732 return true;
1733}
1734
1736 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
1737 // Put RB on all registers
1738 unsigned NumDefs = MI.getNumDefs();
1739 unsigned NumOperands = MI.getNumOperands();
1740
1741 assert(verifyRegBankOnOperands(MI, RB, MRI, 0, NumDefs - 1));
1742 if (RB == SgprRB)
1743 assert(verifyRegBankOnOperands(MI, RB, MRI, NumDefs, NumOperands - 1));
1744
1745 if (RB == VgprRB) {
1746 B.setInstr(MI);
1747 for (unsigned i = NumDefs; i < NumOperands; ++i) {
1748 Register Reg = MI.getOperand(i).getReg();
1749 if (MRI.getRegBank(Reg) != RB) {
1750 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1751 MI.getOperand(i).setReg(Copy.getReg(0));
1752 }
1753 }
1754 }
1755}
1756
1757bool RegBankLegalizeHelper::applyRegisterBanksINTRIN_IMAGE(MachineInstr &MI) {
1758 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1760 assert(RSrcIntrin && RSrcIntrin->IsImage);
1761
1762 unsigned RsrcIdx = RSrcIntrin->RsrcArg;
1763 const unsigned NumDefs = MI.getNumExplicitDefs();
1764
1765 // The reported argument index is relative to the IR intrinsic call arguments,
1766 // so we need to shift by the number of defs and the intrinsic ID.
1767 RsrcIdx += NumDefs + 1;
1768
1769 MachineBasicBlock *MBB = MI.getParent();
1770 B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(MI.getIterator())));
1771
1772 // Defs(for image loads with return) are vgpr.
1773 for (unsigned i = 0; i < NumDefs; ++i) {
1774 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(i).getReg());
1775 if (RB == VgprRB)
1776 continue;
1777
1778 Register Reg = MI.getOperand(i).getReg();
1779 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(Reg)});
1780 MI.getOperand(i).setReg(NewVgprDst);
1781 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1782 }
1783
1784 B.setInstrAndDebugLoc(MI);
1785
1786 // Register uses(before RsrcIdx) are vgpr.
1787 for (unsigned i = 1; i < RsrcIdx; ++i) {
1788 MachineOperand &Op = MI.getOperand(i);
1789 if (!Op.isReg())
1790 continue;
1791
1792 Register Reg = Op.getReg();
1793 if (!Reg.isVirtual())
1794 continue;
1795
1796 if (MRI.getRegBank(Reg) == VgprRB)
1797 continue;
1798
1799 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1800 Op.setReg(Copy.getReg(0));
1801 }
1802
1803 SmallSet<Register, 4> OpsToWaterfall;
1804
1805 // Register use RsrcIdx(and RsrcIdx+1 in some cases) is sgpr.
1806 for (unsigned i = RsrcIdx; i < MI.getNumOperands(); ++i) {
1807 MachineOperand &Op = MI.getOperand(i);
1808 if (!Op.isReg())
1809 continue;
1810
1811 Register Reg = Op.getReg();
1812 if (MRI.getRegBank(Reg) != SgprRB)
1813 OpsToWaterfall.insert(Reg);
1814 }
1815
1816 if (!OpsToWaterfall.empty()) {
1817 MachineBasicBlock::iterator MII = MI.getIterator();
1818 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
1819 }
1820
1821 return true;
1822}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
Provides AMDGPU specific target descriptions.
static bool isSignedBFE(MachineInstr &MI)
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB, MachineRegisterInfo &MRI, unsigned StartOpIdx, unsigned EndOpIdx)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
static Register UseReg(const MachineOperand &MO)
IRTranslator LLVM IR MI
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register Reg
Register const TargetRegisterInfo * TRI
Machine IR instance of the generic uniformity analysis.
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
const SmallVectorImpl< MachineOperand > & Cond
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
@ ICMP_NE
not equal
Definition InstrTypes.h:698
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
const SIRegisterInfo * getRegisterInfo() const override
Represents a call to an intrinsic.
Register getSourceReg() const
Get the unmerge source register.
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
TypeSize getValue() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator SkipPHIsAndLabels(iterator I)
Return the first instruction in MBB after I that is not a PHI or a label.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
BasicBlockListType::iterator iterator
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Helper class to build MachineInstr.
Representation of each machine instruction.
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Holds all the information related to register banks.
This class implements the register bank concept.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
bool empty() const
Definition SmallSet.h:169
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
void push_back(const T &Elt)
bool isAnyPtr(LLT Ty, unsigned Width)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition Utils.cpp:155
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI void reportGISelFailure(MachineFunction &MF, MachineOptimizationRemarkEmitter &MORE, MachineOptimizationRemarkMissed &R)
Report an ISel error as a missed optimization remark to the LLVMContext's diagnostic stream.
Definition Utils.cpp:258
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
Holds waterfall loop information: the set of SGPR operand registers that need waterfalling,...
MachineBasicBlock::iterator Start
SmallSet< Register, 4 > SgprWaterfallOperandRegs