//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// Implements actual lowering algorithms for each ID that can be used in
/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
//
//===----------------------------------------------------------------------===//

#include "AMDGPURegBankLegalizeHelper.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegBankLegalizeRules.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-regbanklegalize"

using namespace llvm;
using namespace AMDGPU;

RegBankLegalizeHelper::RegBankLegalizeHelper(
    MachineIRBuilder &B, const MachineUniformityInfo &MUI,
    const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
    : ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()),
      MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()),
      SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
      VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
      VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}

void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
  const SetOfRulesForOpcode &RuleSet = RBLRules.getRulesForOpc(MI);
  const RegBankLLTMapping &Mapping = RuleSet.findMappingForMI(MI, MRI, MUI);

  SmallSet<Register, 4> WaterfallSgprs;
  unsigned OpIdx = 0;
  if (Mapping.DstOpMapping.size() > 0) {
    B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
    applyMappingDst(MI, OpIdx, Mapping.DstOpMapping);
  }
  if (Mapping.SrcOpMapping.size() > 0) {
    B.setInstr(MI);
    applyMappingSrc(MI, OpIdx, Mapping.SrcOpMapping, WaterfallSgprs);
  }

  lower(MI, Mapping, WaterfallSgprs);
}
58
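// Waterfall lowering for divergent values in operands that must be uniform:
// for instance, an operand whose s32 or <4 x s32> value must be in SGPRs
// (see Sgpr32_WF / SgprV4S32_WF in applyMappingSrc) but is divergent. The
// loop below readfirstlanes the operand, restricts exec to the lanes that
// hold the same value, runs the instruction, and repeats until all lanes
// are covered.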
bool RegBankLegalizeHelper::executeInWaterfallLoop(
    MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
    SmallSet<Register, 4> &SGPROperandRegs) {
  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction &MF = B.getMF();

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
  if (IsWave32) {
    MovExecOpc = AMDGPU::S_MOV_B32;
    MovExecTermOpc = AMDGPU::S_MOV_B32_term;
    XorTermOpc = AMDGPU::S_XOR_B32_term;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    ExecReg = AMDGPU::EXEC_LO;
  } else {
    MovExecOpc = AMDGPU::S_MOV_B64;
    MovExecTermOpc = AMDGPU::S_MOV_B64_term;
    XorTermOpc = AMDGPU::S_XOR_B64_term;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    ExecReg = AMDGPU::EXEC;
  }

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  MachineRegisterInfo &MRI = *B.getMRI();
  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);

  Register SavedExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before
  // this point to a new block, and insert a new empty block before this
  // instruction.
  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF.insert(MBBI, LoopBB);
  MF.insert(MBBI, BodyBB);
  MF.insert(MBBI, RestoreExecBB);
  MF.insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(RestoreExecBB);
  BodyBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  // +-MBB:------------+
  // | ...             |
  // | %0 = G_INST_1   |
  // | %Dst = MI %Vgpr |
  // | %1 = G_INST_2   |
  // | ...             |
  // +-----------------+
  // ->
  // +-MBB-------------------------------+
  // | ...                               |
  // | %0 = G_INST_1                     |
  // | %SaveExecReg = S_MOV_B32 $exec_lo |
  // +----------------|------------------+
  //                  | /------------------------------------------------------|
  //                  V V                                                      |
  // +-LoopBB---------------------------------------------------------------+ |
  // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr                       | |
  // |   instead of executing for each lane, see if other lanes had         | |
  // |   same value for %Vgpr and execute for them also.                    | |
  // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr                   | |
  // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask  | |
  // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM                            | |
  // |   exec is active for lanes with the same "CurrentLane value" in Vgpr  | |
  // +----------------|------------------------------------------------------+ |
  //                  V                                                        |
  // +-BodyBB------------------------------------------------------------+    |
  // | %Dst = MI %CurrentLaneReg:sgpr(s32)                                |    |
  // |   executed only for active lanes and written to Dst                |    |
  // | $exec = S_XOR_B32 $exec, %SavedExec                                |    |
  // |   set active lanes to 0 in SavedExec, lanes that did not write to  |    |
  // |   Dst yet, and set this as new exec (for READFIRSTLANE and ICMP)   |    |
  // | SI_WATERFALL_LOOP LoopBB                                           |----|
  // +----------------|--------------------------------------------------+
  //                  V
  // +-RestoreExecBB--------------------------+
  // | $exec_lo = S_MOV_B32_term %SaveExecReg |
  // +----------------|-----------------------+
  //                  V
  // +-RemainderBB:----------------------+
  // | %1 = G_INST_2                     |
  // | ...                               |
  // +-----------------------------------+

  // Move the instruction into the loop body. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = Range.begin()->getIterator();
  auto NewEnd = BodyBB->end();
  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  B.setMBB(*LoopBB);
  Register CondReg;

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.all_uses()) {
      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in
      // the sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      // TODO: support for agpr
      assert(MRI.getRegBank(OpReg) == VgprRB);
      Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
      buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);

      // Build the comparison(s), CurrentLaneReg == OpReg.
      unsigned OpSize = OpTy.getSizeInBits();
      unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
      LLT PartTy = LLT::scalar(PartSize);
      unsigned NumParts = OpSize / PartSize;
      SmallVector<Register, 8> OpParts;
      SmallVector<Register, 8> CurrentLaneParts;

      if (NumParts == 1) {
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
        auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
        }
      }

      for (unsigned i = 0; i < NumParts; ++i) {
        Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
        B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);

        if (!CondReg)
          CondReg = CmpReg;
        else
          CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
      }

      Op.setReg(CurrentLaneReg);

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
    }
  }

  // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
  Register CondRegLM =
      MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
  B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);

  // Update EXEC, save the original EXEC value to SavedExec.
  B.buildInstr(AndSaveExecOpc)
      .addDef(SavedExec)
      .addReg(CondRegLM, RegState::Kill);
  MRI.setSimpleHint(SavedExec, CondRegLM);

  B.setInsertPt(*BodyBB, BodyBB->end());

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  B.setInsertPt(MBB, MBB.end());
  B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
  B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}
276
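// Example: a 256-bit load with an invalid MergeTy splits into two 128-bit
// loads at byte offsets 0 and 16 whose results are merged back into Dst.
// An s96 load with MergeTy == S32 splits into s64 + s32 parts; the s64 part
// is unmerged into two s32 pieces and all three s32 pieces are merged into
// Dst.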
void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
                                      ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(Base);
  const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
  LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
  SmallVector<Register, 4> LoadPartRegs;

  unsigned ByteOffset = 0;
  for (LLT PartTy : LLTBreakdown) {
    Register BasePlusOffset;
    if (ByteOffset == 0) {
      BasePlusOffset = Base;
    } else {
      auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
      BasePlusOffset =
          B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
    }
    auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
    auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
    LoadPartRegs.push_back(LoadPart.getReg(0));
    ByteOffset += PartTy.getSizeInBytes();
  }

  if (!MergeTy.isValid()) {
    // Loads are of same size, concat or merge them together.
    B.buildMergeLikeInstr(Dst, LoadPartRegs);
  } else {
    // Loads are not all of same size, need to unmerge them to smaller pieces
    // of MergeTy type, then merge pieces to Dst.
    SmallVector<Register, 4> MergeTyParts;
    for (Register Reg : LoadPartRegs) {
      if (MRI.getType(Reg) == MergeTy) {
        MergeTyParts.push_back(Reg);
      } else {
        auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
        for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
          MergeTyParts.push_back(Unmerge.getReg(i));
      }
    }
    B.buildMergeLikeInstr(Dst, MergeTyParts);
  }
  MI.eraseFromParent();
}
326
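// Example: an s96 load is widened to an s128 load whose result is truncated
// back to s96; a <3 x s32> load is widened to <4 x s32> and the first three
// s32 elements of the unmerge are re-merged into Dst.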
void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
                                      LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();

  MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
  auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);

  if (WideTy.isScalar()) {
    B.buildTrunc(Dst, WideLoad);
  } else {
    SmallVector<Register, 4> MergeTyParts;
    auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);

    LLT DstTy = MRI.getType(Dst);
    unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
    for (unsigned i = 0; i < NumElts; ++i) {
      MergeTyParts.push_back(Unmerge.getReg(i));
    }
    B.buildMergeLikeInstr(Dst, MergeTyParts);
  }
  MI.eraseFromParent();
}
354
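// Example: a G_ZEXTLOAD with an 8-bit MMO becomes a G_LOAD with a 32-bit MMO
// followed by G_AND with 0xff; a G_SEXTLOAD becomes G_LOAD plus
// G_SEXT_INREG 8. A plain G_LOAD only gets the widened MMO.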
void RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
  Register Dst = MI.getDstReg();
  Register Ptr = MI.getPointerReg();
  MachineMemOperand &MMO = MI.getMMO();
  unsigned MemSize = 8 * MMO.getSize().getValue();

  MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);

  if (MI.getOpcode() == G_LOAD) {
    B.buildLoad(Dst, Ptr, *WideMMO);
  } else {
    auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);

    if (MI.getOpcode() == G_ZEXTLOAD) {
      APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
      auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
      B.buildAnd(Dst, Load, MaskCst);
    } else {
      assert(MI.getOpcode() == G_SEXTLOAD);
      B.buildSExtInReg(Dst, Load, MemSize);
    }
  }

  MI.eraseFromParent();
}
380
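// Example: G_ZEXT s32 of a vcc(s1) becomes G_SELECT %cond, 1, 0 and G_SEXT
// becomes G_SELECT %cond, -1, 0. For s64 the high half replicates the low
// half (sext), is zero (zext), or is undef (anyext).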
void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);
  Register Src = MI.getOperand(1).getReg();
  unsigned Opc = MI.getOpcode();
  int TrueExtCst = Opc == G_SEXT ? -1 : 1;
  if (Ty == S32 || Ty == S16) {
    auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
    auto False = B.buildConstant({VgprRB, Ty}, 0);
    B.buildSelect(Dst, Src, True, False);
  } else if (Ty == S64) {
    auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
    auto False = B.buildConstant({VgprRB_S32}, 0);
    auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
    MachineInstrBuilder Hi;
    switch (Opc) {
    case G_SEXT:
      Hi = Lo;
      break;
    case G_ZEXT:
      Hi = False;
      break;
    case G_ANYEXT:
      Hi = B.buildUndef({VgprRB_S32});
      break;
    default:
      llvm_unreachable("Opcode not supported");
    }

    B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
  } else {
    llvm_unreachable("Type not supported");
  }

  MI.eraseFromParent();
}
417
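// The unpack helpers below bitcast a packed <2 x s16> SGPR operand to s32
// and return the two halves as s32 values. For example, unpackZExt of
// 0xABCD1234 yields Lo = 0x00001234 and Hi = 0x0000ABCD.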
std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
  auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
  auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Lo = PackedS32;
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}
439
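// Example: a uniform <2 x s16> G_LSHR is lowered by zero-extend-unpacking
// the value and the shift amount into s32 halves, shifting each half as s32,
// and repacking the results with G_BUILD_VECTOR_TRUNC.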
void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
  Register Lo, Hi;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SHL: {
    auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_LSHR: {
    auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_ASHR: {
    auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
    Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
    Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
    break;
  }
  default:
    llvm_unreachable("Unpack lowering not implemented");
  }
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
}
470
void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
  Register Lo, Hi;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX: {
    // For signed operations, use sign extension.
    auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
    auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
             .getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
             .getReg(0);
    break;
  }
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    // For unsigned operations, use zero extension.
    auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
    auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
             .getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
             .getReg(0);
    break;
  }
  default:
    llvm_unreachable("Unpack min/max lowering not implemented");
  }
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
}

static bool isSignedBFE(MachineInstr &MI) {
  if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))
    return (GI->is(Intrinsic::amdgcn_sbfe));

  return MI.getOpcode() == AMDGPU::G_SBFX;
}
509
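// Example: @llvm.amdgcn.sbfe.i64 with field offset 8 and constant width 16
// becomes %sra = G_ASHR %src, 8; the low half of the result is
// G_SBFX(%sra.lo, 0, 16) and the high half is that value shifted right by 31
// to replicate the sign bit.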
void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == LLT::scalar(64));
  bool Signed = isSignedBFE(MI);
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  // Extract bitfield from Src, LSBit is the least-significant bit for the
  // extraction (field offset) and Width is size of bitfield.
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // Comments are for signed bitfield extract, similar for unsigned. x is sign
  // bit. s is sign, l is LSB and y are remaining bits of bitfield to extract.

  // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
  unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
  auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});

  auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);

  // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
  // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
  // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
  if (!ConstWidth) {
    auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
    auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
    B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
    MI.eraseFromParent();
    return;
  }

  uint64_t WidthImm = ConstWidth->Value.getZExtValue();
  auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
  Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
  Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
  auto Zero = B.buildConstant({VgprRB, S32}, 0);
  unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;

  if (WidthImm <= 32) {
    // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
    auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
    MachineInstrBuilder Hi;
    if (Signed) {
      // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
      Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
    } else {
      // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
      Hi = Zero;
    }
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  } else {
    auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
    // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
    auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
    B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
  }

  MI.eraseFromParent();
}
568
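// Example: a uniform s32 signed BFE with field offset %o and width %w packs
// the control operand as ((%w << 16) | (%o & 0x3f)) and selects S_BFE_I32.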
void RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  bool Signed = isSignedBFE(MI);
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // For uniform bit field extract there are 4 available instructions, but
  // LSBit(field offset) and Width(size of bitfield) need to be packed in S32,
  // field offset in low and size in high 16 bits.

  // Src1 Hi16|Lo16 = Size|FieldOffset
  auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
  auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
  auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
  auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
  unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
  unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
  unsigned Opc = Ty == S32 ? Opc32 : Opc64;

  // Select machine instruction, because of reg class constraining, insert
  // copies from reg class to reg bank.
  auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
                            {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
  if (!constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
                                        *ST.getRegisterInfo(), RBI))
    llvm_unreachable("failed to constrain BFE");

  B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
  MI.eraseFromParent();
}
601
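// Example: an s64 G_AND is lowered to two s32 G_ANDs on the unmerged halves,
// and a <4 x s16> operation to two <2 x s16> operations, re-merged into Dst.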
void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  unsigned Opc = MI.getOpcode();
  auto Flags = MI.getFlags();
  auto Lo =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
  auto Hi =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
}
618
void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
         (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
  Register Cond = MI.getOperand(1).getReg();
  auto Flags = MI.getFlags();
  auto Lo =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
  auto Hi =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);

  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
}
637
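// Example: G_SEXT_INREG %x(s64), 8 sign-extends within the frozen low half
// (G_SEXT_INREG ..., 8) and fills the high half with copies of the sign bit
// via a 31-bit G_ASHR; for Amt > 32 only the high half needs G_SEXT_INREG.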
void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
  auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
  int Amt = MI.getOperand(2).getImm();
  Register Lo, Hi;
  // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
  if (Amt <= 32) {
    auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
    if (Amt == 32) {
      // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
      Lo = Freeze.getReg(0);
    } else {
      // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
      Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
    }

    auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
    Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
  } else {
    // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
    Lo = Op1.getReg(0);
    Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
  }

  B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
}
664
void RegBankLegalizeHelper::lower(MachineInstr &MI,
                                  const RegBankLLTMapping &Mapping,
                                  SmallSet<Register, 4> &WaterfallSgprs) {

  switch (Mapping.LoweringMethod) {
  case DoNotLower:
    break;
  case VccExtToSel:
    return lowerVccExtToSel(MI);
  case UniExtToSel: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    auto True = B.buildConstant({SgprRB, Ty},
                                MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
    auto False = B.buildConstant({SgprRB, Ty}, 0);
    // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is compare.
    // We build a select here; the S1 condition was already any-extended to S32
    // and ANDed with 1 to clean the high bits by Sgpr32AExtBoolInReg.
    B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
                  False);
    MI.eraseFromParent();
    return;
  }
  case UnpackBitShift:
    return lowerUnpackBitShift(MI);
  case UnpackMinMax:
    return lowerUnpackMinMax(MI);
  case Ext32To64: {
    const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
    MachineInstrBuilder Hi;
    switch (MI.getOpcode()) {
    case AMDGPU::G_ZEXT: {
      Hi = B.buildConstant({RB, S32}, 0);
      break;
    }
    case AMDGPU::G_SEXT: {
      // Replicate sign bit from 32-bit extended part.
      auto ShiftAmt = B.buildConstant({RB, S32}, 31);
      Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
      break;
    }
    case AMDGPU::G_ANYEXT: {
      Hi = B.buildUndef({RB, S32});
      break;
    }
    default:
      llvm_unreachable("Unsupported Opcode in Ext32To64");
    }

    B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
                          {MI.getOperand(1).getReg(), Hi});
    MI.eraseFromParent();
    return;
  }
  case UniCstExt: {
    uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
    B.buildConstant(MI.getOperand(0).getReg(), ConstVal);

    MI.eraseFromParent();
    return;
  }
  case VgprToVccCopy: {
    Register Src = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(Src);
    // Take the lowest bit from each lane and put it in a lane mask.
    // Lowering via compare, but we need to clean high bits first as compare
    // compares all bits in register.
    Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
    if (Ty == S64) {
      auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
      auto One = B.buildConstant(VgprRB_S32, 1);
      auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
      auto Zero = B.buildConstant(VgprRB_S32, 0);
      auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
      B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
    } else {
      assert(Ty == S32 || Ty == S16);
      auto One = B.buildConstant({VgprRB, Ty}, 1);
      B.buildAnd(BoolSrc, Src, One);
    }
    auto Zero = B.buildConstant({VgprRB, Ty}, 0);
    B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
    MI.eraseFromParent();
    return;
  }
  case V_BFE:
    return lowerV_BFE(MI);
  case S_BFE:
    return lowerS_BFE(MI);
  case SplitTo32:
    return lowerSplitTo32(MI);
  case SplitTo32Select:
    return lowerSplitTo32Select(MI);
  case SplitTo32SExtInReg:
    return lowerSplitTo32SExtInReg(MI);
  case SplitLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = DstTy.getSizeInBits();
    // Even split to 128-bit loads
    if (Size > 128) {
      LLT B128;
      if (DstTy.isVector()) {
        LLT EltTy = DstTy.getElementType();
        B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
      } else {
        B128 = LLT::scalar(128);
      }
      if (Size / 128 == 2)
        splitLoad(MI, {B128, B128});
      else if (Size / 128 == 4)
        splitLoad(MI, {B128, B128, B128, B128});
      else {
        LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
        llvm_unreachable("SplitLoad type not supported for MI");
      }
    }
    // 64 and 32 bit load
    else if (DstTy == S96)
      splitLoad(MI, {S64, S32}, S32);
    else if (DstTy == V3S32)
      splitLoad(MI, {V2S32, S32}, S32);
    else if (DstTy == V6S16)
      splitLoad(MI, {V4S16, V2S16}, V2S16);
    else {
      LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
      llvm_unreachable("SplitLoad type not supported for MI");
    }
    break;
  }
  case WidenLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == S96)
      widenLoad(MI, S128);
    else if (DstTy == V3S32)
      widenLoad(MI, V4S32, S32);
    else if (DstTy == V6S16)
      widenLoad(MI, V8S16, V2S16);
    else {
      LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
      llvm_unreachable("WidenLoad type not supported for MI");
    }
    break;
  }
  case WidenMMOToS32:
    return widenMMOToS32(cast<GAnyLoad>(MI));
  }

  if (!WaterfallSgprs.empty()) {
    MachineBasicBlock::iterator I = MI.getIterator();
    executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs);
  }
}
816
LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
  case UniInVcc:
    return LLT::scalar(1);
  case Sgpr16:
  case Vgpr16:
  case UniInVgprS16:
    return LLT::scalar(16);
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32AExtBoolInReg:
  case Sgpr32SExt:
  case Sgpr32ZExt:
  case UniInVgprS32:
  case Vgpr32:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return LLT::scalar(32);
  case Sgpr64:
  case Vgpr64:
    return LLT::scalar(64);
  case Sgpr128:
  case Vgpr128:
    return LLT::scalar(128);
  case VgprP0:
    return LLT::pointer(0, 64);
  case SgprP1:
  case VgprP1:
    return LLT::pointer(1, 64);
  case SgprP3:
  case VgprP3:
    return LLT::pointer(3, 32);
  case SgprP4:
  case VgprP4:
    return LLT::pointer(4, 64);
  case SgprP5:
  case VgprP5:
    return LLT::pointer(5, 32);
  case SgprV2S16:
  case VgprV2S16:
  case UniInVgprV2S16:
    return LLT::fixed_vector(2, 16);
  case SgprV2S32:
  case VgprV2S32:
    return LLT::fixed_vector(2, 32);
  case SgprV4S32:
  case SgprV4S32_WF:
  case VgprV4S32:
  case UniInVgprV4S32:
    return LLT::fixed_vector(4, 32);
  default:
    return LLT();
  }
}
874
LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
  switch (ID) {
  case SgprB32:
  case VgprB32:
  case UniInVgprB32:
    if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
        isAnyPtr(Ty, 32))
      return Ty;
    return LLT();
  case SgprPtr32:
  case VgprPtr32:
    return isAnyPtr(Ty, 32) ? Ty : LLT();
  case SgprPtr64:
  case VgprPtr64:
    return isAnyPtr(Ty, 64) ? Ty : LLT();
  case SgprPtr128:
  case VgprPtr128:
    return isAnyPtr(Ty, 128) ? Ty : LLT();
  case SgprB64:
  case VgprB64:
  case UniInVgprB64:
    if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
        Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
      return Ty;
    return LLT();
  case SgprB96:
  case VgprB96:
  case UniInVgprB96:
    if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
        Ty == LLT::fixed_vector(6, 16))
      return Ty;
    return LLT();
  case SgprB128:
  case VgprB128:
  case UniInVgprB128:
    if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
        Ty == LLT::fixed_vector(2, 64) || isAnyPtr(Ty, 128))
      return Ty;
    return LLT();
  case SgprB256:
  case VgprB256:
  case UniInVgprB256:
    if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
        Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
      return Ty;
    return LLT();
  case SgprB512:
  case VgprB512:
  case UniInVgprB512:
    if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
        Ty == LLT::fixed_vector(8, 64))
      return Ty;
    return LLT();
  default:
    return LLT();
  }
}
932
const RegisterBank *
RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
    return VccRB;
  case Sgpr16:
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr64:
  case Sgpr128:
  case SgprP1:
  case SgprP3:
  case SgprP4:
  case SgprP5:
  case SgprPtr32:
  case SgprPtr64:
  case SgprPtr128:
  case SgprV2S16:
  case SgprV2S32:
  case SgprV4S32:
  case SgprV4S32_WF:
  case SgprB32:
  case SgprB64:
  case SgprB96:
  case SgprB128:
  case SgprB256:
  case SgprB512:
  case UniInVcc:
  case UniInVgprS16:
  case UniInVgprS32:
  case UniInVgprV2S16:
  case UniInVgprV4S32:
  case UniInVgprB32:
  case UniInVgprB64:
  case UniInVgprB96:
  case UniInVgprB128:
  case UniInVgprB256:
  case UniInVgprB512:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32AExtBoolInReg:
  case Sgpr32SExt:
  case Sgpr32ZExt:
    return SgprRB;
  case Vgpr16:
  case Vgpr32:
  case Vgpr64:
  case Vgpr128:
  case VgprP0:
  case VgprP1:
  case VgprP3:
  case VgprP4:
  case VgprP5:
  case VgprPtr32:
  case VgprPtr64:
  case VgprPtr128:
  case VgprV2S16:
  case VgprV2S32:
  case VgprV4S32:
  case VgprB32:
  case VgprB64:
  case VgprB96:
  case VgprB128:
  case VgprB256:
  case VgprB512:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return VgprRB;
  default:
    return nullptr;
  }
}
1005
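// Example of a dst mapping: for UniInVcc, a uniform s1 def is moved into a
// fresh vcc(s1) register, copied to scc as s32 via G_AMDGPU_COPY_SCC_VCC,
// and truncated back into the original s1 def.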
void RegBankLegalizeHelper::applyMappingDst(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
  // Defs start from operand 0
  for (; OpIdx < MethodIDs.size(); ++OpIdx) {
    if (MethodIDs[OpIdx] == None)
      continue;
    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[OpIdx]) {
    // vcc, sgpr and vgpr scalars, pointers and vectors
    case Vcc:
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32:
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // sgpr and vgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128:
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB256:
    case VgprB512:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // uniform in vcc/vgpr: scalars, vectors and B-types
    case UniInVcc: {
      assert(Ty == S1);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(VccRB_S1);
      Op.setReg(NewDst);
      auto CopyS32_Vcc =
          B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
      B.buildTrunc(Reg, CopyS32_Vcc);
      break;
    }
    case UniInVgprS16: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
      Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
      Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
      Op.setReg(NewVgprDstS16);
      B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
      buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
      B.buildTrunc(Reg, NewSgprDstS32);
      break;
    }
    case UniInVgprS32:
    case UniInVgprV2S16:
    case UniInVgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }
    case UniInVgprB32:
    case UniInVgprB64:
    case UniInVgprB96:
    case UniInVgprB128:
    case UniInVgprB256:
    case UniInVgprB512: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }
    // sgpr trunc
    case Sgpr32Trunc: {
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
      Op.setReg(NewDst);
      B.buildTrunc(Reg, NewDst);
      break;
    }
    case InvalidMapping: {
      LLVM_DEBUG(dbgs() << "Instruction with Invalid mapping: "; MI.dump(););
      llvm_unreachable("missing fast rule for MI");
    }
    default:
      llvm_unreachable("ID not supported");
    }
  }
}
1135
void RegBankLegalizeHelper::applyMappingSrc(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
    SmallSet<Register, 4> &SgprWaterfallOperandRegs) {
  for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
    if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
      continue;

    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[i]) {
    case Vcc: {
      assert(Ty == S1);
      assert(RB == VccRB || RB == SgprRB);
      if (RB == SgprRB) {
        auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
        auto CopyVcc_Scc =
            B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
        Op.setReg(CopyVcc_Scc.getReg(0));
      }
      break;
    }
    // sgpr scalars, pointers and vectors
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // sgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // vgpr scalars, pointers and vectors
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // vgpr B-types
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB256:
    case VgprB512:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // sgpr waterfall, scalars and vectors
    case Sgpr32_WF:
    case SgprV4S32_WF: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != SgprRB)
        SgprWaterfallOperandRegs.insert(Reg);
      break;
    }
    // sgpr and vgpr scalars with extend
    case Sgpr32AExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      Op.setReg(Aext.getReg(0));
      break;
    }
    case Sgpr32AExtBoolInReg: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() == 1);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      // Zext SgprS1 is not legal, make AND with 1 instead. This instruction is
      // most of the time meant to be combined away in AMDGPURegBankCombiner.
      auto Cst1 = B.buildConstant(SgprRB_S32, 1);
      auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
      Op.setReg(BoolInReg.getReg(0));
      break;
    }
    case Sgpr32SExt: {
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Sext = B.buildSExt(SgprRB_S32, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    case Sgpr32ZExt: {
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Zext = B.buildZExt({SgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    case Vgpr32SExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Sext = B.buildSExt({VgprRB, S32}, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    case Vgpr32ZExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Zext = B.buildZExt({VgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    default:
      llvm_unreachable("ID not supported");
    }
  }
}
1292
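// Example: a uniform s1 G_PHI becomes an s32 phi; each incoming s1 value is
// any-extended to s32 right after the instruction that defines it (skipping
// PHIs and labels), and the s32 phi result is truncated back to the original
// s1 def.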
void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);

  if (Ty == LLT::scalar(1) && MUI.isUniform(Dst)) {
    B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());

    Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
    MI.getOperand(0).setReg(NewDst);
    B.buildTrunc(Dst, NewDst);

    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      Register UseReg = MI.getOperand(i).getReg();

      auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
      MachineBasicBlock *DefMBB = DefMI->getParent();

      B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));

      auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
      MI.getOperand(i).setReg(NewUse.getReg(0));
    }

    return;
  }

  // ALL divergent i1 phis should be already lowered and inst-selected into PHI
  // with sgpr reg class and S1 LLT.
  // Note: this includes divergent phis that don't require lowering.
  if (Ty == LLT::scalar(1) && MUI.isDivergent(Dst)) {
    LLVM_DEBUG(dbgs() << "Divergent S1 G_PHI: "; MI.dump(););
    llvm_unreachable("Make sure to run AMDGPUGlobalISelDivergenceLowering "
                     "before RegBankLegalize to lower lane mask(vcc) phis");
  }

  // We accept all types that can fit in some register class.
  // Uniform G_PHIs have all sgpr registers.
  // Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr.
  if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) ||
      Ty == LLT::pointer(4, 64)) {
    return;
  }

  LLVM_DEBUG(dbgs() << "G_PHI not handled: "; MI.dump(););
  llvm_unreachable("type not supported");
}
1339
[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
                                                     const RegisterBank *RB,
                                                     MachineRegisterInfo &MRI,
                                                     unsigned StartOpIdx,
                                                     unsigned EndOpIdx) {
  for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
    if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
      return false;
  }
  return true;
}
1351
void RegBankLegalizeHelper::applyMappingTrivial(MachineInstr &MI) {
  const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
  // Put RB on all registers
  unsigned NumDefs = MI.getNumDefs();
  unsigned NumOperands = MI.getNumOperands();

  assert(verifyRegBankOnOperands(MI, RB, MRI, 0, NumDefs - 1));
  if (RB == SgprRB)
    assert(verifyRegBankOnOperands(MI, RB, MRI, NumDefs, NumOperands - 1));

  if (RB == VgprRB) {
    B.setInstr(MI);
    for (unsigned i = NumDefs; i < NumOperands; ++i) {
      Register Reg = MI.getOperand(i).getReg();
      if (MRI.getRegBank(Reg) != RB) {
        auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
        MI.getOperand(i).setReg(Copy.getReg(0));
      }
    }
  }
}