AMDGPURegBankLegalizeHelper.cpp
//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// Implements actual lowering algorithms for each ID that can be used in
/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
//
//===----------------------------------------------------------------------===//

#include "AMDGPURegBankLegalizeHelper.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegBankLegalizeRules.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-regbanklegalize"

using namespace llvm;
using namespace AMDGPU;

RegBankLegalizeHelper::RegBankLegalizeHelper(
    MachineIRBuilder &B, const MachineUniformityInfo &MUI,
    const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
    : ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()),
      MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()),
      SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
      VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
      VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}

void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
  const SetOfRulesForOpcode &RuleSet = RBLRules.getRulesForOpc(MI);
  const RegBankLLTMapping &Mapping = RuleSet.findMappingForMI(MI, MRI, MUI);

  SmallSet<Register, 4> WaterfallSgprs;
  unsigned OpIdx = 0;
  if (Mapping.DstOpMapping.size() > 0) {
    B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
    applyMappingDst(MI, OpIdx, Mapping.DstOpMapping);
  }
  if (Mapping.SrcOpMapping.size() > 0) {
    B.setInstr(MI);
    applyMappingSrc(MI, OpIdx, Mapping.SrcOpMapping, WaterfallSgprs);
  }

  lower(MI, Mapping, WaterfallSgprs);
}

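// Emit a waterfall loop around Range: read the first active lane's value of
// each register in SGPROperandRegs into an SGPR, restrict exec to the lanes
// that hold the same value, run the body for them, and repeat until every
// lane has been serviced, then restore the original exec mask.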
bool RegBankLegalizeHelper::executeInWaterfallLoop(
    MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
    SmallSet<Register, 4> &SGPROperandRegs) {
  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction &MF = B.getMF();

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
  if (IsWave32) {
    MovExecOpc = AMDGPU::S_MOV_B32;
    MovExecTermOpc = AMDGPU::S_MOV_B32_term;
    XorTermOpc = AMDGPU::S_XOR_B32_term;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    ExecReg = AMDGPU::EXEC_LO;
  } else {
    MovExecOpc = AMDGPU::S_MOV_B64;
    MovExecTermOpc = AMDGPU::S_MOV_B64_term;
    XorTermOpc = AMDGPU::S_XOR_B64_term;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    ExecReg = AMDGPU::EXEC;
  }

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  MachineRegisterInfo &MRI = *B.getMRI();
  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);

  Register SavedExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before
  // this point to a new block, and insert a new empty block before this
  // instruction.
  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF.insert(MBBI, LoopBB);
  MF.insert(MBBI, BodyBB);
  MF.insert(MBBI, RestoreExecBB);
  MF.insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(RestoreExecBB);
  BodyBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  // +-MBB:------------+
  // | ...             |
  // | %0 = G_INST_1   |
  // | %Dst = MI %Vgpr |
  // | %1 = G_INST_2   |
  // | ...             |
  // +-----------------+
  // ->
  // +-MBB-------------------------------+
  // | ...                               |
  // | %0 = G_INST_1                     |
  // | %SaveExecReg = S_MOV_B32 $exec_lo |
  // +----------------|------------------+
  //                  |   /---------------------------------------------------|
  //                  V   V                                                   |
  // +-LoopBB----------------------------------------------------------------+ |
  // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr                        | |
  // | instead of executing for each lane, see if other lanes had            | |
  // | same value for %Vgpr and execute for them also.                        | |
  // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr                    | |
  // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask   | |
  // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM                             | |
  // | exec is active for lanes with the same "CurrentLane value" in Vgpr     | |
  // +----------------|-------------------------------------------------------+ |
  //                  V                                                         |
  // +-BodyBB------------------------------------------------------------+     |
  // | %Dst = MI %CurrentLaneReg:sgpr(s32)                                |     |
  // | executed only for active lanes and written to Dst                  |     |
  // | $exec = S_XOR_B32 $exec, %SavedExec                                |     |
  // | set active lanes to 0 in SavedExec, lanes that did not write to    |     |
  // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP)     |     |
  // | SI_WATERFALL_LOOP LoopBB                                           |-----|
  // +----------------|--------------------------------------------------+
  //                  V
  // +-RestoreExecBB--------------------------+
  // | $exec_lo = S_MOV_B32_term %SaveExecReg |
  // +----------------|-----------------------+
  //                  V
  // +-RemainderBB:----------------------+
  // | %1 = G_INST_2                     |
  // | ...                               |
  // +-----------------------------------+

  // Move the instruction into the loop body. Note that we already moved
  // everything after Range.end() into a new block, so Range.end() is no
  // longer valid.
  BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = Range.begin()->getIterator();
  auto NewEnd = BodyBB->end();
  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  B.setMBB(*LoopBB);
  Register CondReg;

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.all_uses()) {
      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in
      // the sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      // TODO: support for agpr
      assert(MRI.getRegBank(OpReg) == VgprRB);
      Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
      buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);

      // Build the comparison(s), CurrentLaneReg == OpReg.
      unsigned OpSize = OpTy.getSizeInBits();
      unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
      LLT PartTy = LLT::scalar(PartSize);
      unsigned NumParts = OpSize / PartSize;
      SmallVector<Register, 8> OpParts;
      SmallVector<Register, 8> CurrentLaneParts;

      if (NumParts == 1) {
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
        auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
        }
      }

      for (unsigned i = 0; i < NumParts; ++i) {
        Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
        B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);

        if (!CondReg)
          CondReg = CmpReg;
        else
          CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
      }

      Op.setReg(CurrentLaneReg);

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
    }
  }

  // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
  Register CondRegLM =
      MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
  B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);

  // Update EXEC, save the original EXEC value to SavedExec.
  B.buildInstr(AndSaveExecOpc)
      .addDef(SavedExec)
      .addReg(CondRegLM, RegState::Kill);
  MRI.setSimpleHint(SavedExec, CondRegLM);

  B.setInsertPt(*BodyBB, BodyBB->end());

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  B.setInsertPt(MBB, MBB.end());
  B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
  B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

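// Split a wide load into parts with the types listed in LLTBreakdown, loaded
// at increasing byte offsets from the same base, then recombine the parts
// into Dst (going through MergeTy-sized pieces when the parts differ in
// size). A sketch of the V6S16 case (MIR-like, register names illustrative):
//   %dst:vgpr(<6 x s16>) = G_LOAD %ptr
// becomes
//   %lo:vgpr(<4 x s16>) = G_LOAD %ptr          ; at byte offset 0
//   %hi:vgpr(<2 x s16>) = G_LOAD %ptr + 8      ; at byte offset 8
//   %p0:vgpr(<2 x s16>), %p1:vgpr(<2 x s16>) = G_UNMERGE_VALUES %lo
//   %dst:vgpr(<6 x s16>) = G_CONCAT_VECTORS %p0, %p1, %hi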
void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
                                      ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(Base);
  const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
  LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
  SmallVector<Register, 4> LoadPartRegs;

  unsigned ByteOffset = 0;
  for (LLT PartTy : LLTBreakdown) {
    Register BasePlusOffset;
    if (ByteOffset == 0) {
      BasePlusOffset = Base;
    } else {
      auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
      BasePlusOffset =
          B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
    }
    auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
    auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
    LoadPartRegs.push_back(LoadPart.getReg(0));
    ByteOffset += PartTy.getSizeInBytes();
  }

  if (!MergeTy.isValid()) {
    // Loads are all of the same size; concat or merge them together.
    B.buildMergeLikeInstr(Dst, LoadPartRegs);
  } else {
    // Loads are not all of the same size: unmerge them into smaller pieces
    // of MergeTy type, then merge the pieces into Dst.
    SmallVector<Register, 4> MergeTyParts;
    for (Register Reg : LoadPartRegs) {
      if (MRI.getType(Reg) == MergeTy) {
        MergeTyParts.push_back(Reg);
      } else {
        auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
        for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
          MergeTyParts.push_back(Unmerge.getReg(i));
      }
    }
    B.buildMergeLikeInstr(Dst, MergeTyParts);
  }
  MI.eraseFromParent();
}

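// Widen a load to WideTy, then cut the result back down to the original
// type: scalars are truncated, vectors are unmerged into MergeTy pieces and
// only the pieces covering the original type are merged into Dst. A sketch
// for V3S32 (register names illustrative):
//   %dst:vgpr(<3 x s32>) = G_LOAD %ptr
// becomes
//   %wide:vgpr(<4 x s32>) = G_LOAD %ptr
//   %e0, %e1, %e2, %e3 = G_UNMERGE_VALUES %wide
//   %dst:vgpr(<3 x s32>) = G_BUILD_VECTOR %e0, %e1, %e2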
void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
                                      LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();

  MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
  auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);

  if (WideTy.isScalar()) {
    B.buildTrunc(Dst, WideLoad);
  } else {
    SmallVector<Register, 4> MergeTyParts;
    auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);

    LLT DstTy = MRI.getType(Dst);
    unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
    for (unsigned i = 0; i < NumElts; ++i) {
      MergeTyParts.push_back(Unmerge.getReg(i));
    }
    B.buildMergeLikeInstr(Dst, MergeTyParts);
  }
  MI.eraseFromParent();
}

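// Widen the memory access of a sub-dword (extending) load to 32 bits and
// redo the extension on the loaded value: a plain G_LOAD simply loads S32,
// G_ZEXTLOAD masks the loaded value with (1 << MemSize) - 1 (e.g. 0xff for
// an s8 load), and G_SEXTLOAD sign-extends it in-register from MemSize bits.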
void RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
  Register Dst = MI.getDstReg();
  Register Ptr = MI.getPointerReg();
  MachineMemOperand &MMO = MI.getMMO();
  unsigned MemSize = 8 * MMO.getSize().getValue();

  MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);

  if (MI.getOpcode() == G_LOAD) {
    B.buildLoad(Dst, Ptr, *WideMMO);
  } else {
    auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);

    if (MI.getOpcode() == G_ZEXTLOAD) {
      APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
      auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
      B.buildAnd(Dst, Load, MaskCst);
    } else {
      assert(MI.getOpcode() == G_SEXTLOAD);
      B.buildSExtInReg(Dst, Load, MemSize);
    }
  }

  MI.eraseFromParent();
}

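// Lower an extension of a vcc lane mask into a select of constants, e.g.
// (register names illustrative):
//   %ext:vgpr(s32) = G_ZEXT %cond:vcc(s1)
// becomes
//   %ext:vgpr(s32) = G_SELECT %cond, 1, 0
// G_SEXT selects -1 instead of 1; for S64 results the high half is the
// replicated sign (sext), zero (zext) or undef (anyext).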
void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);
  Register Src = MI.getOperand(1).getReg();
  unsigned Opc = MI.getOpcode();
  int TrueExtCst = Opc == G_SEXT ? -1 : 1;
  if (Ty == S32 || Ty == S16) {
    auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
    auto False = B.buildConstant({VgprRB, Ty}, 0);
    B.buildSelect(Dst, Src, True, False);
  } else if (Ty == S64) {
    auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
    auto False = B.buildConstant({VgprRB_S32}, 0);
    auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
    MachineInstrBuilder Hi;
    switch (Opc) {
    case G_SEXT:
      Hi = Lo;
      break;
    case G_ZEXT:
      Hi = False;
      break;
    case G_ANYEXT:
      Hi = B.buildUndef({VgprRB_S32});
      break;
    default:
      llvm_unreachable("Opcode not supported");
    }

    B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
  } else {
    llvm_unreachable("Type not supported");
  }

  MI.eraseFromParent();
}

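// Unpack a uniform V2S16 into two S32 halves: bitcast to S32, then extract
// the low and high 16 bits with the extension the user needs (zero-, sign-
// or any-extend; for any-extend the low half is the bitcast itself).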
std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
  auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
  auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Lo = PackedS32;
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

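// Lower a uniform V2S16 shift by unpacking both operands into S32 halves,
// shifting each half separately, and repacking. The unpack flavor matches
// the shift: any-extend for shl (high bits don't matter), zero-extend for
// lshr and sign-extend for ashr. Sketch for G_LSHR (register names
// illustrative):
//   %res:sgpr(<2 x s16>) = G_LSHR %val, %amt
// becomes
//   %v0, %v1 = zero-extended 16-bit halves of %val (same for %amt)
//   %r0:sgpr(s32) = G_LSHR %v0, %a0
//   %r1:sgpr(s32) = G_LSHR %v1, %a1
//   %res:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %r0, %r1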
void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
  Register Lo, Hi;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SHL: {
    auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_LSHR: {
    auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_ASHR: {
    auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
    Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
    Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
    break;
  }
  default:
    llvm_unreachable("Unpack lowering not implemented");
  }
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
}

void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
  Register Lo, Hi;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX: {
    // For signed operations, use sign extension.
    auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
    auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
             .getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
             .getReg(0);
    break;
  }
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    // For unsigned operations, use zero extension.
    auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
    auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
             .getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
             .getReg(0);
    break;
  }
  default:
    llvm_unreachable("Unpack min/max lowering not implemented");
  }
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
}

void RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
  auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
  auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
  auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
  auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
                          {ResLo.getReg(0), ResHi.getReg(0)});
  MI.eraseFromParent();
}

static bool isSignedBFE(MachineInstr &MI) {
  if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))
    return (GI->is(Intrinsic::amdgcn_sbfe));

  return MI.getOpcode() == AMDGPU::G_SBFX;
}

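// Lower a divergent 64-bit bitfield extract. There is no 64-bit V_BFE
// instruction, so the field is first shifted down to bit 0; a constant
// width is then handled with a 32-bit G_{S|U}BFX on the relevant half,
// and a variable width with a shl/shr pair by (64 - Width).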
void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == LLT::scalar(64));
  bool Signed = isSignedBFE(MI);
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  // Extract bitfield from Src, LSBit is the least-significant bit for the
  // extraction (field offset) and Width is size of bitfield.
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // Comments below trace the signed bitfield extract; unsigned is similar.
  // In the bit diagrams, 'x' is the sign bit of Src, 's' the sign bit of the
  // bitfield, 'l' the LSB of the bitfield, 'y' its remaining bits, and '?'
  // bits we don't care about.

  // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
  unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
  auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});

  auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);

  // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
  // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
  // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
  if (!ConstWidth) {
    auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
    auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
    B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
    MI.eraseFromParent();
    return;
  }

  uint64_t WidthImm = ConstWidth->Value.getZExtValue();
  auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
  Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
  Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
  auto Zero = B.buildConstant({VgprRB, S32}, 0);
  unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;

  if (WidthImm <= 32) {
    // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
    auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
    MachineInstrBuilder Hi;
    if (Signed) {
      // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
      Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
    } else {
      // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
      Hi = Zero;
    }
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  } else {
    auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
    // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
    auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
    B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
  }

  MI.eraseFromParent();
}

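// Lower a uniform bitfield extract to S_BFE_{I|U}{32|64}. A worked example
// of the packed control operand: extracting an 8-bit field that starts at
// bit 5 builds Src1 = (8 << 16) | 5 = 0x00080005.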
void RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  bool Signed = isSignedBFE(MI);
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // For uniform bitfield extract there are four available instructions, but
  // LSBit (field offset) and Width (size of the bitfield) need to be packed
  // into S32: field offset in the low 16 bits and size in the high 16 bits.

  // Src1 Hi16|Lo16 = Size|FieldOffset
  auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
  auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
  auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
  auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
  unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
  unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
  unsigned Opc = Ty == S32 ? Opc32 : Opc64;

  // We select a machine instruction here; because its operands will be
  // constrained to register classes, insert copies to bridge between
  // registers with register banks and registers with register classes.
  auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
                            {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
  if (!constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
                                        *ST.getRegisterInfo(), RBI))
    llvm_unreachable("failed to constrain BFE");

  B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
  MI.eraseFromParent();
}

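// Split a 64-bit (S64, V2S32 or V4S16) operation into two 32-bit halves and
// recombine. Using G_AND as an illustrative opcode (register names
// illustrative):
//   %dst:vgpr(s64) = G_AND %src0, %src1
// becomes
//   %a0, %a1 = G_UNMERGE_VALUES %src0 (and %b0, %b1 from %src1)
//   %lo:vgpr(s32) = G_AND %a0, %b0
//   %hi:vgpr(s32) = G_AND %a1, %b1
//   %dst:vgpr(s64) = G_MERGE_VALUES %lo, %hi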
void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  unsigned Opc = MI.getOpcode();
  auto Flags = MI.getFlags();
  auto Lo =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
  auto Hi =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
}

void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == V2S16);
  auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(1).getReg());
  auto [Op2Lo32, Op2Hi32] = unpackAExt(MI.getOperand(2).getReg());
  unsigned Opc = MI.getOpcode();
  auto Flags = MI.getFlags();
  auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32);
  auto Op1Hi = B.buildTrunc(SgprRB_S16, Op1Hi32);
  auto Op2Lo = B.buildTrunc(SgprRB_S16, Op2Lo32);
  auto Op2Hi = B.buildTrunc(SgprRB_S16, Op2Hi32);
  auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
  auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
}

void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
         (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
  Register Cond = MI.getOperand(1).getReg();
  auto Flags = MI.getFlags();
  auto Lo =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
  auto Hi =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);

  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
}

void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
  auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
  int Amt = MI.getOperand(2).getImm();
  Register Lo, Hi;
  // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
  if (Amt <= 32) {
    auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
    if (Amt == 32) {
      // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
      Lo = Freeze.getReg(0);
    } else {
      // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
      Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
    }

    auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
    Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
  } else {
    // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
    Lo = Op1.getReg(0);
    Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
  }

  B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
}

void RegBankLegalizeHelper::lower(MachineInstr &MI,
                                  const RegBankLLTMapping &Mapping,
                                  SmallSet<Register, 4> &WaterfallSgprs) {

  switch (Mapping.LoweringMethod) {
  case DoNotLower:
    break;
  case VccExtToSel:
    return lowerVccExtToSel(MI);
  case UniExtToSel: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    auto True = B.buildConstant({SgprRB, Ty},
                                MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
    auto False = B.buildConstant({SgprRB, Ty}, 0);
    // The input to G_{Z|S}EXT is a 'Legalizer-legal' S1, most commonly a
    // compare. We build a select here; the S1 condition was already
    // any-extended to S32 and ANDed with 1 to clear the high bits by
    // Sgpr32AExtBoolInReg.
    B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
                  False);
    MI.eraseFromParent();
    return;
  }
  case UnpackBitShift:
    return lowerUnpackBitShift(MI);
  case UnpackMinMax:
    return lowerUnpackMinMax(MI);
  case ScalarizeToS16:
    return lowerSplitTo16(MI);
  case Ext32To64: {
    const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
    MachineInstrBuilder Hi;
    switch (MI.getOpcode()) {
    case AMDGPU::G_ZEXT: {
      Hi = B.buildConstant({RB, S32}, 0);
      break;
    }
    case AMDGPU::G_SEXT: {
      // Replicate sign bit from 32-bit extended part.
      auto ShiftAmt = B.buildConstant({RB, S32}, 31);
      Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
      break;
    }
    case AMDGPU::G_ANYEXT: {
      Hi = B.buildUndef({RB, S32});
      break;
    }
    default:
      llvm_unreachable("Unsupported Opcode in Ext32To64");
    }

    B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
                          {MI.getOperand(1).getReg(), Hi});
    MI.eraseFromParent();
    return;
  }
  case UniCstExt: {
    uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
    B.buildConstant(MI.getOperand(0).getReg(), ConstVal);

    MI.eraseFromParent();
    return;
  }
  case VgprToVccCopy: {
    Register Src = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(Src);
    // Take the lowest bit from each lane and put it in a lane mask. Lower via
    // compare, but clear the high bits first since the compare looks at all
    // bits in the register.
    Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
    if (Ty == S64) {
      auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
      auto One = B.buildConstant(VgprRB_S32, 1);
      auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
      auto Zero = B.buildConstant(VgprRB_S32, 0);
      auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
      B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
    } else {
      assert(Ty == S32 || Ty == S16);
      auto One = B.buildConstant({VgprRB, Ty}, 1);
      B.buildAnd(BoolSrc, Src, One);
    }
    auto Zero = B.buildConstant({VgprRB, Ty}, 0);
    B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
    MI.eraseFromParent();
    return;
  }
  case V_BFE:
    return lowerV_BFE(MI);
  case S_BFE:
    return lowerS_BFE(MI);
  case SplitTo32:
    return lowerSplitTo32(MI);
  case SplitTo32Select:
    return lowerSplitTo32Select(MI);
  case SplitTo32SExtInReg:
    return lowerSplitTo32SExtInReg(MI);
  case SplitLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = DstTy.getSizeInBits();
    // Even split to 128-bit loads
    if (Size > 128) {
      LLT B128;
      if (DstTy.isVector()) {
        LLT EltTy = DstTy.getElementType();
        B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
      } else {
        B128 = LLT::scalar(128);
      }
      if (Size / 128 == 2)
        splitLoad(MI, {B128, B128});
      else if (Size / 128 == 4)
        splitLoad(MI, {B128, B128, B128, B128});
      else {
        LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
        llvm_unreachable("SplitLoad type not supported for MI");
      }
    }
    // 64- and 32-bit load
    else if (DstTy == S96)
      splitLoad(MI, {S64, S32}, S32);
    else if (DstTy == V3S32)
      splitLoad(MI, {V2S32, S32}, S32);
    else if (DstTy == V6S16)
      splitLoad(MI, {V4S16, V2S16}, V2S16);
    else {
      LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
      llvm_unreachable("SplitLoad type not supported for MI");
    }
    break;
  }
  case WidenLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == S96)
      widenLoad(MI, S128);
    else if (DstTy == V3S32)
      widenLoad(MI, V4S32, S32);
    else if (DstTy == V6S16)
      widenLoad(MI, V8S16, V2S16);
    else {
      LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
      llvm_unreachable("WidenLoad type not supported for MI");
    }
    break;
  }
  case UnpackAExt:
    return lowerUnpackAExt(MI);
  case WidenMMOToS32:
    return widenMMOToS32(cast<GAnyLoad>(MI));
  }

  if (!WaterfallSgprs.empty()) {
    MachineBasicBlock::iterator I = MI.getIterator();
    executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs);
  }
}

LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
  case UniInVcc:
    return LLT::scalar(1);
  case Sgpr16:
  case Vgpr16:
  case UniInVgprS16:
    return LLT::scalar(16);
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32AExtBoolInReg:
  case Sgpr32SExt:
  case Sgpr32ZExt:
  case UniInVgprS32:
  case Vgpr32:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return LLT::scalar(32);
  case Sgpr64:
  case Vgpr64:
  case UniInVgprS64:
    return LLT::scalar(64);
  case Sgpr128:
  case Vgpr128:
    return LLT::scalar(128);
  case VgprP0:
    return LLT::pointer(0, 64);
  case SgprP1:
  case VgprP1:
    return LLT::pointer(1, 64);
  case SgprP3:
  case VgprP3:
    return LLT::pointer(3, 32);
  case SgprP4:
  case VgprP4:
    return LLT::pointer(4, 64);
  case SgprP5:
  case VgprP5:
    return LLT::pointer(5, 32);
  case SgprV2S16:
  case VgprV2S16:
  case UniInVgprV2S16:
    return LLT::fixed_vector(2, 16);
  case SgprV2S32:
  case VgprV2S32:
    return LLT::fixed_vector(2, 32);
  case SgprV4S32:
  case SgprV4S32_WF:
  case VgprV4S32:
  case UniInVgprV4S32:
    return LLT::fixed_vector(4, 32);
  default:
    return LLT();
  }
}

LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
  switch (ID) {
  case SgprB32:
  case VgprB32:
  case UniInVgprB32:
    if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
        isAnyPtr(Ty, 32))
      return Ty;
    return LLT();
  case SgprPtr32:
  case VgprPtr32:
    return isAnyPtr(Ty, 32) ? Ty : LLT();
  case SgprPtr64:
  case VgprPtr64:
    return isAnyPtr(Ty, 64) ? Ty : LLT();
  case SgprPtr128:
  case VgprPtr128:
    return isAnyPtr(Ty, 128) ? Ty : LLT();
  case SgprB64:
  case VgprB64:
  case UniInVgprB64:
    if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
        Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
      return Ty;
    return LLT();
  case SgprB96:
  case VgprB96:
  case UniInVgprB96:
    if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
        Ty == LLT::fixed_vector(6, 16))
      return Ty;
    return LLT();
  case SgprB128:
  case VgprB128:
  case UniInVgprB128:
    if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
        Ty == LLT::fixed_vector(2, 64) || isAnyPtr(Ty, 128))
      return Ty;
    return LLT();
  case SgprB256:
  case VgprB256:
  case UniInVgprB256:
    if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
        Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
      return Ty;
    return LLT();
  case SgprB512:
  case VgprB512:
  case UniInVgprB512:
    if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
        Ty == LLT::fixed_vector(8, 64))
      return Ty;
    return LLT();
  default:
    return LLT();
  }
}

const RegisterBank *
RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
    return VccRB;
  case Sgpr16:
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr64:
  case Sgpr128:
  case SgprP1:
  case SgprP3:
  case SgprP4:
  case SgprP5:
  case SgprPtr32:
  case SgprPtr64:
  case SgprPtr128:
  case SgprV2S16:
  case SgprV2S32:
  case SgprV4S32:
  case SgprV4S32_WF:
  case SgprB32:
  case SgprB64:
  case SgprB96:
  case SgprB128:
  case SgprB256:
  case SgprB512:
  case UniInVcc:
  case UniInVgprS16:
  case UniInVgprS32:
  case UniInVgprS64:
  case UniInVgprV2S16:
  case UniInVgprV4S32:
  case UniInVgprB32:
  case UniInVgprB64:
  case UniInVgprB96:
  case UniInVgprB128:
  case UniInVgprB256:
  case UniInVgprB512:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32AExtBoolInReg:
  case Sgpr32SExt:
  case Sgpr32ZExt:
    return SgprRB;
  case Vgpr16:
  case Vgpr32:
  case Vgpr64:
  case Vgpr128:
  case VgprP0:
  case VgprP1:
  case VgprP3:
  case VgprP4:
  case VgprP5:
  case VgprPtr32:
  case VgprPtr64:
  case VgprPtr128:
  case VgprV2S16:
  case VgprV2S32:
  case VgprV4S32:
  case VgprB32:
  case VgprB64:
  case VgprB96:
  case VgprB128:
  case VgprB256:
  case VgprB512:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return VgprRB;
  default:
    return nullptr;
  }
}

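// Rewrite def operands according to MethodIDs: plain IDs only assert that
// the type and bank already match; UniInVgpr* defs get a temporary VGPR def
// that is read back into the original SGPR with readanylane; UniInVcc defs
// are copied from vcc to scc and truncated; Sgpr32Trunc defs get an S32 def
// that is truncated back to the original type.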
void RegBankLegalizeHelper::applyMappingDst(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
  // Defs start from operand 0.
  for (; OpIdx < MethodIDs.size(); ++OpIdx) {
    if (MethodIDs[OpIdx] == None)
      continue;
    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[OpIdx]) {
    // vcc, sgpr and vgpr scalars, pointers and vectors
    case Vcc:
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32:
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // sgpr and vgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128:
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB256:
    case VgprB512:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // uniform in vcc/vgpr: scalars, vectors and B-types
    case UniInVcc: {
      assert(Ty == S1);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(VccRB_S1);
      Op.setReg(NewDst);
      auto CopyS32_Vcc =
          B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
      B.buildTrunc(Reg, CopyS32_Vcc);
      break;
    }
    case UniInVgprS16: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
      Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
      Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
      Op.setReg(NewVgprDstS16);
      B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
      buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
      B.buildTrunc(Reg, NewSgprDstS32);
      break;
    }
    case UniInVgprS32:
    case UniInVgprS64:
    case UniInVgprV2S16:
    case UniInVgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }
    case UniInVgprB32:
    case UniInVgprB64:
    case UniInVgprB96:
    case UniInVgprB128:
    case UniInVgprB256:
    case UniInVgprB512: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }
    // sgpr trunc
    case Sgpr32Trunc: {
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
      Op.setReg(NewDst);
      if (!MRI.use_empty(Reg))
        B.buildTrunc(Reg, NewDst);
      break;
    }
    case InvalidMapping: {
      LLVM_DEBUG(dbgs() << "Instruction with Invalid mapping: "; MI.dump(););
      llvm_unreachable("missing fast rule for MI");
    }
    default:
      llvm_unreachable("ID not supported");
    }
  }
}

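// Rewrite use operands according to MethodIDs: assert on plain sgpr IDs,
// copy values to VGPR where a vgpr ID requires it, extend sub-32-bit scalars
// as requested, and collect registers that need a waterfall loop into
// SgprWaterfallOperandRegs.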
void RegBankLegalizeHelper::applyMappingSrc(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
    SmallSet<Register, 4> &SgprWaterfallOperandRegs) {
  for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
    if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
      continue;

    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[i]) {
    case Vcc: {
      assert(Ty == S1);
      assert(RB == VccRB || RB == SgprRB);
      if (RB == SgprRB) {
        auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
        auto CopyVcc_Scc =
            B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
        Op.setReg(CopyVcc_Scc.getReg(0));
      }
      break;
    }
    // sgpr scalars, pointers and vectors
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // sgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // vgpr scalars, pointers and vectors
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // vgpr B-types
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB256:
    case VgprB512:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // sgpr waterfall, scalars and vectors
    case Sgpr32_WF:
    case SgprV4S32_WF: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != SgprRB)
        SgprWaterfallOperandRegs.insert(Reg);
      break;
    }
    // sgpr and vgpr scalars with extend
    case Sgpr32AExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      Op.setReg(Aext.getReg(0));
      break;
    }
    case Sgpr32AExtBoolInReg: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() == 1);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      // Zext of SgprS1 is not legal; AND with 1 instead. This instruction is
      // most of the time meant to be combined away in AMDGPURegBankCombiner.
      auto Cst1 = B.buildConstant(SgprRB_S32, 1);
      auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
      Op.setReg(BoolInReg.getReg(0));
      break;
    }
    case Sgpr32SExt: {
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Sext = B.buildSExt(SgprRB_S32, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    case Sgpr32ZExt: {
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Zext = B.buildZExt({SgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    case Vgpr32SExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Sext = B.buildSExt({VgprRB, S32}, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    case Vgpr32ZExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Zext = B.buildZExt({VgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    default:
      llvm_unreachable("ID not supported");
    }
  }
}

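// Lower a uniform S1 G_PHI to an S32 phi, e.g. (register names
// illustrative):
//   %phi:sgpr(s1) = G_PHI %a(s1), %bb0, %b(s1), %bb1
// becomes
//   %a32:sgpr(s32) = G_ANYEXT %a   ; at the end of %bb0, likewise for %b
//   %phi32:sgpr(s32) = G_PHI %a32, %bb0, %b32, %bb1
//   %phi:sgpr(s1) = G_TRUNC %phi32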
void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);

  if (Ty == LLT::scalar(1) && MUI.isUniform(Dst)) {
    B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());

    Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
    MI.getOperand(0).setReg(NewDst);
    B.buildTrunc(Dst, NewDst);

    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      Register UseReg = MI.getOperand(i).getReg();

      auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
      MachineBasicBlock *DefMBB = DefMI->getParent();

      B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));

      auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
      MI.getOperand(i).setReg(NewUse.getReg(0));
    }

    return;
  }

  // All divergent i1 phis should already be lowered and inst-selected into
  // PHI with sgpr reg class and S1 LLT.
  // Note: this includes divergent phis that don't require lowering.
  if (Ty == LLT::scalar(1) && MUI.isDivergent(Dst)) {
    LLVM_DEBUG(dbgs() << "Divergent S1 G_PHI: "; MI.dump(););
    llvm_unreachable("Make sure to run AMDGPUGlobalISelDivergenceLowering "
                     "before RegBankLegalize to lower lane mask(vcc) phis");
  }

  // We accept all types that can fit in some register class.
  // Uniform G_PHIs have all sgpr registers.
  // Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr.
  if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) ||
      Ty == LLT::pointer(4, 64)) {
    return;
  }

  LLVM_DEBUG(dbgs() << "G_PHI not handled: "; MI.dump(););
  llvm_unreachable("type not supported");
}

[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
                                                     const RegisterBank *RB,
                                                     MachineRegisterInfo &MRI,
                                                     unsigned StartOpIdx,
                                                     unsigned EndOpIdx) {
  for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
    if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
      return false;
  }
  return true;
}

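// Apply the register bank of the first def to all operands: with an SGPR
// def all operands must already be SGPR; with a VGPR def, any non-VGPR use
// is copied to VGPR.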
void RegBankLegalizeHelper::applyMappingTrivial(MachineInstr &MI) {
  const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
  // Put RB on all registers.
  unsigned NumDefs = MI.getNumDefs();
  unsigned NumOperands = MI.getNumOperands();

  assert(verifyRegBankOnOperands(MI, RB, MRI, 0, NumDefs - 1));
  if (RB == SgprRB)
    assert(verifyRegBankOnOperands(MI, RB, MRI, NumDefs, NumOperands - 1));

  if (RB == VgprRB) {
    B.setInstr(MI);
    for (unsigned i = NumDefs; i < NumOperands; ++i) {
      Register Reg = MI.getOperand(i).getReg();
      if (MRI.getRegBank(Reg) != RB) {
        auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
        MI.getOperand(i).setReg(Copy.getReg(0));
      }
    }
  }
}