//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// Implements the actual lowering algorithms for each ID that can be used in
/// Rule.OperandMapping. Similar to the legalizer helper, but with register
/// banks.
//
//===----------------------------------------------------------------------===//

#include "AMDGPURegBankLegalizeHelper.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegBankLegalizeRules.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-regbanklegalize"

using namespace llvm;
using namespace AMDGPU;

RegBankLegalizeHelper::RegBankLegalizeHelper(
    MachineIRBuilder &B, const MachineUniformityInfo &MUI,
    const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
    : ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()),
      MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()),
      SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
      VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
      VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}

void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
  const SetOfRulesForOpcode &RuleSet = RBLRules.getRulesForOpc(MI);
  const RegBankLLTMapping &Mapping = RuleSet.findMappingForMI(MI, MRI, MUI);

  SmallSet<Register, 4> WaterfallSgprs;
  unsigned OpIdx = 0;
  if (Mapping.DstOpMapping.size() > 0) {
    B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
    applyMappingDst(MI, OpIdx, Mapping.DstOpMapping);
  }
  if (Mapping.SrcOpMapping.size() > 0) {
    B.setInstr(MI);
    applyMappingSrc(MI, OpIdx, Mapping.SrcOpMapping, WaterfallSgprs);
  }

  lower(MI, Mapping, WaterfallSgprs);
}
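
// Illustration only: a minimal sketch, under assumptions, of how an owning
// pass could drive this helper over a function, one instruction at a time.
// This is not the actual AMDGPURegBankLegalize pass, which also handles PHIs
// and trivially-mapped instructions separately; the function name here is
// hypothetical.
[[maybe_unused]] static void
runHelperOverFunction(MachineFunction &MF, MachineIRBuilder &B,
                      const MachineUniformityInfo &MUI,
                      const RegisterBankInfo &RBI,
                      const RegBankLegalizeRules &RBLRules) {
  RegBankLegalizeHelper Helper(B, MUI, RBI, RBLRules);
  for (MachineBasicBlock &MBB : MF)
    // Early-increment iteration, since lowering may erase/replace MI.
    for (MachineInstr &MI : make_early_inc_range(MBB))
      Helper.findRuleAndApplyMapping(MI);
}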

bool RegBankLegalizeHelper::executeInWaterfallLoop(
    MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
    SmallSet<Register, 4> &SGPROperandRegs) {
  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction &MF = B.getMF();

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
  if (IsWave32) {
    MovExecOpc = AMDGPU::S_MOV_B32;
    MovExecTermOpc = AMDGPU::S_MOV_B32_term;
    XorTermOpc = AMDGPU::S_XOR_B32_term;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    ExecReg = AMDGPU::EXEC_LO;
  } else {
    MovExecOpc = AMDGPU::S_MOV_B64;
    MovExecTermOpc = AMDGPU::S_MOV_B64_term;
    XorTermOpc = AMDGPU::S_XOR_B64_term;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    ExecReg = AMDGPU::EXEC;
  }

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  MachineRegisterInfo &MRI = *B.getMRI();
  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);

  Register SavedExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before
  // this point to a new block, and insert a new empty block before this
  // instruction.
  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF.insert(MBBI, LoopBB);
  MF.insert(MBBI, BodyBB);
  MF.insert(MBBI, RestoreExecBB);
  MF.insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(RestoreExecBB);
  BodyBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  // +-MBB:------------+
  // | ... |
  // | %0 = G_INST_1 |
  // | %Dst = MI %Vgpr |
  // | %1 = G_INST_2 |
  // | ... |
  // +-----------------+
  // ->
  // +-MBB-------------------------------+
  // | ... |
  // | %0 = G_INST_1 |
  // | %SaveExecReg = S_MOV_B32 $exec_lo |
  // +----------------|------------------+
  // | /------------------------------|
  // V V |
  // +-LoopBB---------------------------------------------------------------+ |
  // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
  // | instead of executing for each lane, see if other lanes had | |
  // | same value for %Vgpr and execute for them also. | |
  // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
  // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
  // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
  // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
  // +----------------|-----------------------------------------------------+ |
  // V |
  // +-BodyBB------------------------------------------------------------+ |
  // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
  // | executed only for active lanes and written to Dst | |
  // | $exec = S_XOR_B32 $exec, %SavedExec | |
  // | set active lanes to 0 in SavedExec, lanes that did not write to | |
  // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
  // | SI_WATERFALL_LOOP LoopBB |-----|
  // +----------------|--------------------------------------------------+
  // V
  // +-RestoreExecBB--------------------------+
  // | $exec_lo = S_MOV_B32_term %SaveExecReg |
  // +----------------|-----------------------+
  // V
  // +-RemainderBB:----------------------+
  // | %1 = G_INST_2 |
  // | ... |
  // +-----------------------------------+

  // Move the instruction into the loop body. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = Range.begin()->getIterator();
  auto NewEnd = BodyBB->end();
  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  B.setMBB(*LoopBB);
  Register CondReg;

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.all_uses()) {
      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in
      // the sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      // TODO: support for agpr
      assert(MRI.getRegBank(OpReg) == VgprRB);
      Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
      buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);

      // Build the comparison(s), CurrentLaneReg == OpReg.
      unsigned OpSize = OpTy.getSizeInBits();
      unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
      LLT PartTy = LLT::scalar(PartSize);
      unsigned NumParts = OpSize / PartSize;
      SmallVector<Register, 8> OpParts;
      SmallVector<Register, 8> CurrentLaneParts;

      if (NumParts == 1) {
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
        auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
        }
      }

      for (unsigned i = 0; i < NumParts; ++i) {
        Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
        B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);

        if (!CondReg)
          CondReg = CmpReg;
        else
          CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
      }

      Op.setReg(CurrentLaneReg);

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
    }
  }

  // Copy vcc to sgpr32/64; the ballot becomes a no-op during instruction
  // selection.
  Register CondRegLM =
      MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
  B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);

  // Update EXEC, save the original EXEC value to SavedExec.
  B.buildInstr(AndSaveExecOpc)
      .addDef(SavedExec)
      .addReg(CondRegLM, RegState::Kill);
  MRI.setSimpleHint(SavedExec, CondRegLM);

  B.setInsertPt(*BodyBB, BodyBB->end());

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  B.setInsertPt(MBB, MBB.end());
  B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
  B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}
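
// Illustration only: a compile-time model (an assumption-labeled sketch, not
// part of the pass) of how the loop above splits an operand for the
// lane-equality compare. Sizes divisible by 64 compare in 64-bit parts,
// everything else in 32-bit parts.
[[maybe_unused]] static constexpr unsigned waterfallPartSize(unsigned OpSize) {
  return (OpSize % 64 == 0) ? 64 : 32;
}
static_assert(waterfallPartSize(32) == 32, "s32 -> one 32-bit part");
static_assert(waterfallPartSize(64) == 64, "s64 -> one 64-bit part");
static_assert(96 / waterfallPartSize(96) == 3, "s96 -> three 32-bit parts");
static_assert(128 / waterfallPartSize(128) == 2, "s128 -> two 64-bit parts");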

void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
                                      ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(Base);
  const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
  LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
  SmallVector<Register, 4> LoadPartRegs;

  unsigned ByteOffset = 0;
  for (LLT PartTy : LLTBreakdown) {
    Register BasePlusOffset;
    if (ByteOffset == 0) {
      BasePlusOffset = Base;
    } else {
      auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
      BasePlusOffset =
          B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
    }
    auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
    auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
    LoadPartRegs.push_back(LoadPart.getReg(0));
    ByteOffset += PartTy.getSizeInBytes();
  }

  if (!MergeTy.isValid()) {
    // Loads are all of the same size; concat or merge them together.
    B.buildMergeLikeInstr(Dst, LoadPartRegs);
  } else {
    // Loads are not all of the same size: unmerge them into smaller pieces
    // of MergeTy type, then merge the pieces into Dst.
    SmallVector<Register, 4> MergeTyParts;
    for (Register Reg : LoadPartRegs) {
      if (MRI.getType(Reg) == MergeTy) {
        MergeTyParts.push_back(Reg);
      } else {
        auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
        for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
          MergeTyParts.push_back(Unmerge.getReg(i));
      }
    }
    B.buildMergeLikeInstr(Dst, MergeTyParts);
  }
  MI.eraseFromParent();
}
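
// Illustration only: the byte offsets the loop above produces for an s96
// load split as {s64, s32} (the SplitLoad lowering below uses exactly this
// breakdown): part 0 loads at +0, part 1 at +8.
[[maybe_unused]] static constexpr bool checkSplitOffsets() {
  unsigned Sizes[] = {8, 4}; // getSizeInBytes() of s64 and s32
  unsigned ByteOffset = 0;
  unsigned Offsets[2] = {};
  for (unsigned i = 0; i < 2; ++i) {
    Offsets[i] = ByteOffset;
    ByteOffset += Sizes[i];
  }
  return Offsets[0] == 0 && Offsets[1] == 8;
}
static_assert(checkSplitOffsets(), "s96 -> s64 at +0, s32 at +8");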

void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
                                      LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();

  MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
  auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);

  if (WideTy.isScalar()) {
    B.buildTrunc(Dst, WideLoad);
  } else {
    SmallVector<Register, 4> MergeTyParts;
    auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);

    LLT DstTy = MRI.getType(Dst);
    unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
    for (unsigned i = 0; i < NumElts; ++i) {
      MergeTyParts.push_back(Unmerge.getReg(i));
    }
    B.buildMergeLikeInstr(Dst, MergeTyParts);
  }
  MI.eraseFromParent();
}

void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);
  Register Src = MI.getOperand(1).getReg();
  unsigned Opc = MI.getOpcode();
  int TrueExtCst = Opc == G_SEXT ? -1 : 1;
  if (Ty == S32 || Ty == S16) {
    auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
    auto False = B.buildConstant({VgprRB, Ty}, 0);
    B.buildSelect(Dst, Src, True, False);
  } else if (Ty == S64) {
    auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
    auto False = B.buildConstant({VgprRB_S32}, 0);
    auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
    MachineInstrBuilder Hi;
    switch (Opc) {
    case G_SEXT:
      Hi = Lo;
      break;
    case G_ZEXT:
      Hi = False;
      break;
    case G_ANYEXT:
      Hi = B.buildUndef({VgprRB_S32});
      break;
    default:
      llvm_unreachable("Opcode not supported");
    }

    B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
  } else {
    llvm_unreachable("Type not supported");
  }

  MI.eraseFromParent();
}
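
// Illustration only: a scalar model (an assumption-labeled sketch) of the
// select-based lowering above for a 64-bit extend of an s1 condition. The
// anyext case is omitted since its high half is undefined.
[[maybe_unused]] static constexpr int64_t vccExtModel(bool Cond, bool IsSext) {
  int32_t TrueExtCst = IsSext ? -1 : 1;
  int32_t Lo = Cond ? TrueExtCst : 0; // select Cond, TrueExtCst, 0
  int32_t Hi = IsSext ? Lo : 0;       // sext replicates Lo, zext uses 0
  return (int64_t)((uint64_t)(uint32_t)Hi << 32 | (uint32_t)Lo);
}
static_assert(vccExtModel(true, /*IsSext=*/true) == -1, "sext i1 true -> -1");
static_assert(vccExtModel(true, /*IsSext=*/false) == 1, "zext i1 true -> 1");
static_assert(vccExtModel(false, true) == 0 && vccExtModel(false, false) == 0,
              "false -> 0 either way");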

std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
  auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
  auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Lo = PackedS32;
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}
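
// Illustration only: scalar models of the unpack helpers above, on a packed
// <2 x s16> value viewed as one 32-bit integer (lane 0 in the low half).
// These rely on the usual two's-complement shift behavior of gcc/clang.
[[maybe_unused]] static constexpr uint32_t unpackZExtLo(uint32_t P) {
  return P & 0x0000ffff; // AND with mask
}
[[maybe_unused]] static constexpr uint32_t unpackZExtHi(uint32_t P) {
  return P >> 16; // logical shift right
}
[[maybe_unused]] static constexpr int32_t unpackSExtLo(uint32_t P) {
  return (int32_t)(P << 16) >> 16; // sext_inreg from bit 15
}
[[maybe_unused]] static constexpr int32_t unpackSExtHi(uint32_t P) {
  return (int32_t)P >> 16; // arithmetic shift right
}
static_assert(unpackZExtLo(0xBBBBAAAA) == 0x0000AAAA, "low lane, zext");
static_assert(unpackZExtHi(0xBBBBAAAA) == 0x0000BBBB, "high lane, zext");
static_assert(unpackSExtLo(0xBBBBAAAA) == (int32_t)0xFFFFAAAA, "low, sext");
static_assert(unpackSExtHi(0xBBBBAAAA) == (int32_t)0xFFFFBBBB, "high, sext");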

void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
  Register Lo, Hi;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SHL: {
    auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_LSHR: {
    auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_ASHR: {
    auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
    Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
    Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
    break;
  }
  default:
    llvm_unreachable("Unpack lowering not implemented");
  }
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
}
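
// Illustration only: a scalar model of the v2s16 G_LSHR lowering above,
// assuming both 16-bit shift amounts are in range (< 16). Lanes are
// zero-extended to 32 bits, shifted independently, then repacked the way
// G_BUILD_VECTOR_TRUNC would.
[[maybe_unused]] static constexpr uint32_t lshrV2S16Model(uint32_t Val,
                                                          uint32_t Amt) {
  uint32_t Lo = (Val & 0xffff) >> (Amt & 0xffff); // zext lane 0, shift
  uint32_t Hi = (Val >> 16) >> (Amt >> 16);       // zext lane 1, shift
  return (Lo & 0xffff) | ((Hi & 0xffff) << 16);   // truncate and repack
}
static_assert(lshrV2S16Model(0x80000008, 0x00010003) == 0x40000001,
              "lane0: 8 >> 3 = 1, lane1: 0x8000 >> 1 = 0x4000");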

static bool isSignedBFE(MachineInstr &MI) {
  if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))
    return (GI->is(Intrinsic::amdgcn_sbfe));

  return MI.getOpcode() == AMDGPU::G_SBFX;
}

void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == LLT::scalar(64));
  bool Signed = isSignedBFE(MI);
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  // Extract bitfield from Src, LSBit is the least-significant bit for the
  // extraction (field offset) and Width is size of bitfield.
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // Comments describe the signed bitfield extract; unsigned is similar.
  // 'x' is the sign bit of Src, 's' the sign bit of the bitfield, 'l' the
  // LSB of the bitfield, and 'y' the remaining bitfield bits to extract.

  // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
  unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
  auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});

  auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);

  // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
  // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
  // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
  if (!ConstWidth) {
    auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
    auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
    B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
    MI.eraseFromParent();
    return;
  }

  uint64_t WidthImm = ConstWidth->Value.getZExtValue();
  auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
  Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
  Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
  auto Zero = B.buildConstant({VgprRB, S32}, 0);
  unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;

  if (WidthImm <= 32) {
    // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
    auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
    MachineInstrBuilder Hi;
    if (Signed) {
      // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
      Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
    } else {
      // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
      Hi = Zero;
    }
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  } else {
    auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
    // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
    auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
    B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
  }

  MI.eraseFromParent();
}
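
// Illustration only: a scalar model of the non-constant-Width expansion
// above for signed BFE: (Src >> LSBit) << (64 - Width) >> (64 - Width).
// Assumes 0 < Width <= 64 so the shift amounts stay in range, and the
// usual arithmetic-shift behavior of gcc/clang.
[[maybe_unused]] static constexpr int64_t sbfe64Model(int64_t Src,
                                                      unsigned LSBit,
                                                      unsigned Width) {
  int64_t SHRSrc = Src >> LSBit; // arithmetic shift: field moves to bit 0
  unsigned Amt = 64 - Width;
  // Shift the field to the top, then arithmetic-shift back down so the
  // field's own sign bit fills the high bits.
  return (int64_t)((uint64_t)SHRSrc << Amt) >> Amt;
}
static_assert(sbfe64Model(0xF0, 4, 4) == -1, "field 0b1111 -> -1 signed");
static_assert(sbfe64Model(0x70, 4, 4) == 7, "field 0b0111 -> 7");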

void RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  bool Signed = isSignedBFE(MI);
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // For uniform bitfield extract there are 4 available instructions, but
  // LSBit (field offset) and Width (bitfield size) need to be packed into
  // S32, with the field offset in the low 16 bits and the size in the high
  // 16 bits.

  // Src1 Hi16|Lo16 = Size|FieldOffset
  auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
  auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
  auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
  auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
  unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
  unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
  unsigned Opc = Ty == S32 ? Opc32 : Opc64;

  // Select the machine instruction here; because of reg class constraining,
  // insert copies between reg class and reg bank registers.
  auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
                            {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
  if (!constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
                                        *ST.getRegisterInfo(), RBI))
    llvm_unreachable("failed to constrain BFE");

  B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
  MI.eraseFromParent();
}
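
// Illustration only: a scalar model of how the second S_BFE operand is
// packed, per the comment above: field offset (6 bits used) in the low half,
// field width in the high half.
[[maybe_unused]] static constexpr uint32_t packSBFESrc1(uint32_t LSBit,
                                                        uint32_t Width) {
  uint32_t FieldOffset = LSBit & 0x3f; // maskTrailingOnes<unsigned>(6)
  uint32_t Size = Width << 16;
  return FieldOffset | Size;
}
static_assert(packSBFESrc1(5, 8) == 0x00080005, "offset 5, width 8");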

void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  unsigned Opc = MI.getOpcode();
  auto Flags = MI.getFlags();
  auto Lo =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
  auto Hi =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
}

void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
         (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
  Register Cond = MI.getOperand(1).getReg();
  auto Flags = MI.getFlags();
  auto Lo =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
  auto Hi =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);

  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
}

void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
  auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
  int Amt = MI.getOperand(2).getImm();
  Register Lo, Hi;
  // Hi|Lo: 's' is the sign bit, '?'/'x' are bits changed/unchanged by the
  // sign-extend.
  if (Amt <= 32) {
    auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
    if (Amt == 32) {
      // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
      Lo = Freeze.getReg(0);
    } else {
      // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
      Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
    }

    auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
    Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
  } else {
    // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
    Lo = Op1.getReg(0);
    Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
  }

  B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
}
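
// Illustration only: a scalar model of the 64-bit G_SEXT_INREG split above,
// assuming 0 < Amt <= 64 (the freeze is a no-op in this integer model) and
// the usual arithmetic-shift behavior of gcc/clang.
[[maybe_unused]] static constexpr int64_t sextInReg64Model(uint64_t X,
                                                           unsigned Amt) {
  uint32_t LoIn = (uint32_t)X;
  uint32_t HiIn = (uint32_t)(X >> 32);
  uint32_t Lo = 0, Hi = 0;
  if (Amt <= 32) {
    // Sign-extend within the low half, then replicate its sign into Hi.
    Lo = Amt == 32 ? LoIn
                   : (uint32_t)((int32_t)(LoIn << (32 - Amt)) >> (32 - Amt));
    Hi = (uint32_t)((int32_t)Lo >> 31);
  } else {
    // Low half is unchanged; sign-extend within the high half.
    Lo = LoIn;
    Hi = (uint32_t)((int32_t)(HiIn << (64 - Amt)) >> (64 - Amt));
  }
  return (int64_t)(((uint64_t)Hi << 32) | Lo);
}
static_assert(sextInReg64Model(0x80, 8) == -128, "bit 7 set -> -128");
static_assert(sextInReg64Model(0x7f, 8) == 127, "bit 7 clear -> positive");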

void RegBankLegalizeHelper::lower(MachineInstr &MI,
                                  const RegBankLLTMapping &Mapping,
                                  SmallSet<Register, 4> &WaterfallSgprs) {

  switch (Mapping.LoweringMethod) {
  case DoNotLower:
    break;
  case VccExtToSel:
    return lowerVccExtToSel(MI);
  case UniExtToSel: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    auto True = B.buildConstant({SgprRB, Ty},
                                MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
    auto False = B.buildConstant({SgprRB, Ty}, 0);
    // Input to G_{Z|S}EXT is 'Legalizer legal' S1; the most common case is a
    // compare. We build a select here. The S1 condition was already
    // any-extended to S32 and ANDed with 1 to clean the high bits by
    // Sgpr32AExtBoolInReg.
    B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
                  False);
    MI.eraseFromParent();
    return;
  }
  case UnpackBitShift:
    return lowerUnpackBitShift(MI);
  case Ext32To64: {
    const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
    MachineInstrBuilder Hi;
    switch (MI.getOpcode()) {
    case AMDGPU::G_ZEXT: {
      Hi = B.buildConstant({RB, S32}, 0);
      break;
    }
    case AMDGPU::G_SEXT: {
      // Replicate sign bit from 32-bit extended part.
      auto ShiftAmt = B.buildConstant({RB, S32}, 31);
      Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
      break;
    }
    case AMDGPU::G_ANYEXT: {
      Hi = B.buildUndef({RB, S32});
      break;
    }
    default:
      llvm_unreachable("Unsupported Opcode in Ext32To64");
    }

    B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
                          {MI.getOperand(1).getReg(), Hi});
    MI.eraseFromParent();
    return;
  }
  case UniCstExt: {
    uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
    B.buildConstant(MI.getOperand(0).getReg(), ConstVal);

    MI.eraseFromParent();
    return;
  }
  case VgprToVccCopy: {
    Register Src = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(Src);
    // Take the lowest bit from each lane and put it in a lane mask.
    // Lower via compare, but clean the high bits first since the compare
    // looks at all bits in the register.
    Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
    if (Ty == S64) {
      auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
      auto One = B.buildConstant(VgprRB_S32, 1);
      auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
      auto Zero = B.buildConstant(VgprRB_S32, 0);
      auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
      B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
    } else {
      assert(Ty == S32 || Ty == S16);
      auto One = B.buildConstant({VgprRB, Ty}, 1);
      B.buildAnd(BoolSrc, Src, One);
    }
    auto Zero = B.buildConstant({VgprRB, Ty}, 0);
    B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
    MI.eraseFromParent();
    return;
  }
  case V_BFE:
    return lowerV_BFE(MI);
  case S_BFE:
    return lowerS_BFE(MI);
  case SplitTo32:
    return lowerSplitTo32(MI);
  case SplitTo32Select:
    return lowerSplitTo32Select(MI);
  case SplitTo32SExtInReg:
    return lowerSplitTo32SExtInReg(MI);
  case SplitLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = DstTy.getSizeInBits();
    // Even split to 128-bit loads.
    if (Size > 128) {
      LLT B128;
      if (DstTy.isVector()) {
        LLT EltTy = DstTy.getElementType();
        B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
      } else {
        B128 = LLT::scalar(128);
      }
      if (Size / 128 == 2)
        splitLoad(MI, {B128, B128});
      else if (Size / 128 == 4)
        splitLoad(MI, {B128, B128, B128, B128});
      else {
        LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
        llvm_unreachable("SplitLoad type not supported for MI");
      }
    }
    // 64- and 32-bit load.
    else if (DstTy == S96)
      splitLoad(MI, {S64, S32}, S32);
    else if (DstTy == V3S32)
      splitLoad(MI, {V2S32, S32}, S32);
    else if (DstTy == V6S16)
      splitLoad(MI, {V4S16, V2S16}, V2S16);
    else {
      LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
      llvm_unreachable("SplitLoad type not supported for MI");
    }
    break;
  }
  case WidenLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == S96)
      widenLoad(MI, S128);
    else if (DstTy == V3S32)
      widenLoad(MI, V4S32, S32);
    else if (DstTy == V6S16)
      widenLoad(MI, V8S16, V2S16);
    else {
      LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
      llvm_unreachable("WidenLoad type not supported for MI");
    }
    break;
  }
  }

  if (!WaterfallSgprs.empty()) {
    MachineBasicBlock::iterator I = MI.getIterator();
    executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs);
  }
}
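
// Illustration only: a scalar model of the VgprToVccCopy case above. Per
// lane, only the lowest bit of the source is meaningful, so it is masked
// before the final compare-with-zero that produces the lane-mask bit.
[[maybe_unused]] static constexpr bool vgprToVccLaneModel(uint32_t SrcLane) {
  return (SrcLane & 1u) != 0u; // AND with 1, then ICMP_NE against 0
}
static_assert(vgprToVccLaneModel(3) && !vgprToVccLaneModel(2),
              "only the low bit decides the lane-mask bit");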

LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
  case UniInVcc:
    return LLT::scalar(1);
  case Sgpr16:
  case Vgpr16:
    return LLT::scalar(16);
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32AExtBoolInReg:
  case Sgpr32SExt:
  case Sgpr32ZExt:
  case UniInVgprS32:
  case Vgpr32:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return LLT::scalar(32);
  case Sgpr64:
  case Vgpr64:
    return LLT::scalar(64);
  case Sgpr128:
  case Vgpr128:
    return LLT::scalar(128);
  case VgprP0:
    return LLT::pointer(0, 64);
  case SgprP1:
  case VgprP1:
    return LLT::pointer(1, 64);
  case SgprP3:
  case VgprP3:
    return LLT::pointer(3, 32);
  case SgprP4:
  case VgprP4:
    return LLT::pointer(4, 64);
  case SgprP5:
  case VgprP5:
    return LLT::pointer(5, 32);
  case SgprV2S16:
  case VgprV2S16:
  case UniInVgprV2S16:
    return LLT::fixed_vector(2, 16);
  case SgprV2S32:
  case VgprV2S32:
    return LLT::fixed_vector(2, 32);
  case SgprV4S32:
  case SgprV4S32_WF:
  case VgprV4S32:
  case UniInVgprV4S32:
    return LLT::fixed_vector(4, 32);
  default:
    return LLT();
  }
}

LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
  switch (ID) {
  case SgprB32:
  case VgprB32:
  case UniInVgprB32:
    if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
        isAnyPtr(Ty, 32))
      return Ty;
    return LLT();
  case SgprPtr32:
  case VgprPtr32:
    return isAnyPtr(Ty, 32) ? Ty : LLT();
  case SgprPtr64:
  case VgprPtr64:
    return isAnyPtr(Ty, 64) ? Ty : LLT();
  case SgprPtr128:
  case VgprPtr128:
    return isAnyPtr(Ty, 128) ? Ty : LLT();
  case SgprB64:
  case VgprB64:
  case UniInVgprB64:
    if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
        Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
      return Ty;
    return LLT();
  case SgprB96:
  case VgprB96:
  case UniInVgprB96:
    if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
        Ty == LLT::fixed_vector(6, 16))
      return Ty;
    return LLT();
  case SgprB128:
  case VgprB128:
  case UniInVgprB128:
    if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
        Ty == LLT::fixed_vector(2, 64) || isAnyPtr(Ty, 128))
      return Ty;
    return LLT();
  case SgprB256:
  case VgprB256:
  case UniInVgprB256:
    if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
        Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
      return Ty;
    return LLT();
  case SgprB512:
  case VgprB512:
  case UniInVgprB512:
    if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
        Ty == LLT::fixed_vector(8, 64))
      return Ty;
    return LLT();
  default:
    return LLT();
  }
}

const RegisterBank *
RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
    return VccRB;
  case Sgpr16:
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr64:
  case Sgpr128:
  case SgprP1:
  case SgprP3:
  case SgprP4:
  case SgprP5:
  case SgprPtr32:
  case SgprPtr64:
  case SgprPtr128:
  case SgprV2S16:
  case SgprV2S32:
  case SgprV4S32:
  case SgprV4S32_WF:
  case SgprB32:
  case SgprB64:
  case SgprB96:
  case SgprB128:
  case SgprB256:
  case SgprB512:
  case UniInVcc:
  case UniInVgprS32:
  case UniInVgprV2S16:
  case UniInVgprV4S32:
  case UniInVgprB32:
  case UniInVgprB64:
  case UniInVgprB96:
  case UniInVgprB128:
  case UniInVgprB256:
  case UniInVgprB512:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32AExtBoolInReg:
  case Sgpr32SExt:
  case Sgpr32ZExt:
    return SgprRB;
  case Vgpr16:
  case Vgpr32:
  case Vgpr64:
  case Vgpr128:
  case VgprP0:
  case VgprP1:
  case VgprP3:
  case VgprP4:
  case VgprP5:
  case VgprPtr32:
  case VgprPtr64:
  case VgprPtr128:
  case VgprV2S16:
  case VgprV2S32:
  case VgprV4S32:
  case VgprB32:
  case VgprB64:
  case VgprB96:
  case VgprB128:
  case VgprB256:
  case VgprB512:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return VgprRB;
  default:
    return nullptr;
  }
}

void RegBankLegalizeHelper::applyMappingDst(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
  // Defs start from operand 0.
  for (; OpIdx < MethodIDs.size(); ++OpIdx) {
    if (MethodIDs[OpIdx] == None)
      continue;
    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[OpIdx]) {
    // vcc, sgpr and vgpr scalars, pointers and vectors
    case Vcc:
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32:
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // sgpr and vgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128:
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB256:
    case VgprB512:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // uniform in vcc/vgpr: scalars, vectors and B-types
    case UniInVcc: {
      assert(Ty == S1);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(VccRB_S1);
      Op.setReg(NewDst);
      auto CopyS32_Vcc =
          B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
      B.buildTrunc(Reg, CopyS32_Vcc);
      break;
    }
    case UniInVgprS32:
    case UniInVgprV2S16:
    case UniInVgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }
    case UniInVgprB32:
    case UniInVgprB64:
    case UniInVgprB96:
    case UniInVgprB128:
    case UniInVgprB256:
    case UniInVgprB512: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }
    // sgpr trunc
    case Sgpr32Trunc: {
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
      Op.setReg(NewDst);
      B.buildTrunc(Reg, NewDst);
      break;
    }
    case InvalidMapping: {
      LLVM_DEBUG(dbgs() << "Instruction with Invalid mapping: "; MI.dump(););
      llvm_unreachable("missing fast rule for MI");
    }
    default:
      llvm_unreachable("ID not supported");
    }
  }
}

void RegBankLegalizeHelper::applyMappingSrc(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
    SmallSet<Register, 4> &SgprWaterfallOperandRegs) {
  for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
    if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
      continue;

    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[i]) {
    case Vcc: {
      assert(Ty == S1);
      assert(RB == VccRB || RB == SgprRB);
      if (RB == SgprRB) {
        auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
        auto CopyVcc_Scc =
            B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
        Op.setReg(CopyVcc_Scc.getReg(0));
      }
      break;
    }
    // sgpr scalars, pointers and vectors
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // sgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    // vgpr scalars, pointers and vectors
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV4S32: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // vgpr B-types
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB256:
    case VgprB512:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // sgpr waterfall, scalars and vectors
    case Sgpr32_WF:
    case SgprV4S32_WF: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != SgprRB)
        SgprWaterfallOperandRegs.insert(Reg);
      break;
    }
    // sgpr and vgpr scalars with extend
    case Sgpr32AExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      Op.setReg(Aext.getReg(0));
      break;
    }
    case Sgpr32AExtBoolInReg: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() == 1);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      // Zext of SgprS1 is not legal; AND with 1 instead. This instruction is
      // most of the time meant to be combined away in AMDGPURegBankCombiner.
      auto Cst1 = B.buildConstant(SgprRB_S32, 1);
      auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
      Op.setReg(BoolInReg.getReg(0));
      break;
    }
    case Sgpr32SExt: {
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Sext = B.buildSExt(SgprRB_S32, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    case Sgpr32ZExt: {
      assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Zext = B.buildZExt({SgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    case Vgpr32SExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Sext = B.buildSExt({VgprRB, S32}, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    case Vgpr32ZExt: {
      // Note: this ext allows S1, and it is meant to be combined away.
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Zext = B.buildZExt({VgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    default:
      llvm_unreachable("ID not supported");
    }
  }
}

void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);

  if (Ty == LLT::scalar(1) && MUI.isUniform(Dst)) {
    B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());

    Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
    MI.getOperand(0).setReg(NewDst);
    B.buildTrunc(Dst, NewDst);

    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      Register UseReg = MI.getOperand(i).getReg();

      auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
      MachineBasicBlock *DefMBB = DefMI->getParent();

      B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));

      auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
      MI.getOperand(i).setReg(NewUse.getReg(0));
    }

    return;
  }

  // All divergent i1 phis should already be lowered and inst-selected into
  // PHIs with sgpr reg class and S1 LLT.
  // Note: this includes divergent phis that don't require lowering.
  if (Ty == LLT::scalar(1) && MUI.isDivergent(Dst)) {
    LLVM_DEBUG(dbgs() << "Divergent S1 G_PHI: "; MI.dump(););
    llvm_unreachable("Make sure to run AMDGPUGlobalISelDivergenceLowering "
                     "before RegBankLegalize to lower lane mask(vcc) phis");
  }

  // We accept all types that can fit in some register class.
  // Uniform G_PHIs have all sgpr registers.
  // Divergent G_PHIs have a vgpr dst, but inputs can be sgpr or vgpr.
  if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) ||
      Ty == LLT::pointer(4, 64)) {
    return;
  }

  LLVM_DEBUG(dbgs() << "G_PHI not handled: "; MI.dump(););
  llvm_unreachable("type not supported");
}

[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
                                                     const RegisterBank *RB,
                                                     MachineRegisterInfo &MRI,
                                                     unsigned StartOpIdx,
                                                     unsigned EndOpIdx) {
  for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
    if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
      return false;
  }
  return true;
}

void RegBankLegalizeHelper::applyMappingTrivial(MachineInstr &MI) {
  const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
  // Put RB on all registers.
  unsigned NumDefs = MI.getNumDefs();
  unsigned NumOperands = MI.getNumOperands();

  assert(verifyRegBankOnOperands(MI, RB, MRI, 0, NumDefs - 1));
  if (RB == SgprRB)
    assert(verifyRegBankOnOperands(MI, RB, MRI, NumDefs, NumOperands - 1));

  if (RB == VgprRB) {
    B.setInstr(MI);
    for (unsigned i = NumDefs; i < NumOperands; ++i) {
      Register Reg = MI.getOperand(i).getReg();
      if (MRI.getRegBank(Reg) != RB) {
        auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
        MI.getOperand(i).setReg(Copy.getReg(0));
      }
    }
  }
}