//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// Implements actual lowering algorithms for each ID that can be used in
/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
//
//===----------------------------------------------------------------------===//

#include "AMDGPURegBankLegalizeHelper.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-regbanklegalize"

using namespace llvm;
using namespace AMDGPU;

RegBankLegalizeHelper::RegBankLegalizeHelper(
    MachineIRBuilder &B, const MachineUniformityInfo &MUI,
    const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
    : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B),
      MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
      RBLRules(RBLRules), IsWave32(ST.isWave32()),
      SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
      VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
      VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}

bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
  const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
  if (!RuleSet) {
    reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                       "No AMDGPU RegBankLegalize rules defined for opcode",
                       MI);
    return false;
  }

  const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
  if (!Mapping) {
    reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                       "AMDGPU RegBankLegalize: none of the rules defined with "
                       "'Any' for MI's opcode matched MI",
                       MI);
    return false;
  }

  WaterfallInfo WFI;
  unsigned OpIdx = 0;
  if (!Mapping->DstOpMapping.empty()) {
    B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
    if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping))
      return false;
  }
  if (!Mapping->SrcOpMapping.empty()) {
    B.setInstr(MI);
    if (!applyMappingSrc(MI, OpIdx, Mapping->SrcOpMapping, WFI))
      return false;
  }

  if (!lower(MI, *Mapping, WFI))
    return false;

  return true;
}

bool RegBankLegalizeHelper::executeInWaterfallLoop(MachineIRBuilder &B,
                                                   const WaterfallInfo &WFI) {
  assert(WFI.Start.isValid() && WFI.End.isValid() &&
         "Waterfall range not initialized");

  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction &MF = B.getMF();

  MachineBasicBlock::iterator BeginIt = WFI.Start;
  MachineBasicBlock::iterator EndIt = WFI.End;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
  if (IsWave32) {
    MovExecOpc = AMDGPU::S_MOV_B32;
    MovExecTermOpc = AMDGPU::S_MOV_B32_term;
    XorTermOpc = AMDGPU::S_XOR_B32_term;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    ExecReg = AMDGPU::EXEC_LO;
  } else {
    MovExecOpc = AMDGPU::S_MOV_B64;
    MovExecTermOpc = AMDGPU::S_MOV_B64_term;
    XorTermOpc = AMDGPU::S_XOR_B64_term;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    ExecReg = AMDGPU::EXEC;
  }

#ifndef NDEBUG
  const int OrigRangeSize = std::distance(BeginIt, EndIt);
#endif

  MachineRegisterInfo &MRI = *B.getMRI();
  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);

  Register SavedExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before
  // this point to a new block, and insert a new empty block before this
  // instruction.
  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF.insert(MBBI, LoopBB);
  MF.insert(MBBI, BodyBB);
  MF.insert(MBBI, RestoreExecBB);
  MF.insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(RestoreExecBB);
  BodyBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, EndIt, MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  // +-MBB:------------+
  // | ...             |
  // | %0 = G_INST_1   |
  // | %Dst = MI %Vgpr |
  // | %1 = G_INST_2   |
  // | ...             |
  // +-----------------+
  // ->
  // +-MBB-------------------------------+
  // | ...                               |
  // | %0 = G_INST_1                     |
  // | %SaveExecReg = S_MOV_B32 $exec_lo |
  // +----------------|------------------+
  //                  |  /-----------------------------------------------------|
  //                  V  V                                                     |
  // +-LoopBB---------------------------------------------------------------+ |
  // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr                       | |
  // | instead of executing for each lane, see if other lanes had            | |
  // | same value for %Vgpr and execute for them also.                       | |
  // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr                   | |
  // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask  | |
  // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM                            | |
  // | exec is active for lanes with the same "CurrentLane value" in Vgpr    | |
  // +----------------|-----------------------------------------------------+ |
  //                  V                                                       |
  // +-BodyBB------------------------------------------------------------+   |
  // | %Dst = MI %CurrentLaneReg:sgpr(s32)                                |   |
  // | executed only for active lanes and written to Dst                  |   |
  // | $exec = S_XOR_B32 $exec, %SavedExec                                |   |
  // | set active lanes to 0 in SavedExec, lanes that did not write to    |   |
  // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP)     |   |
  // | SI_WATERFALL_LOOP LoopBB                                           |---|
  // +----------------|--------------------------------------------------+
  //                  V
  // +-RestoreExecBB--------------------------+
  // | $exec_lo = S_MOV_B32_term %SaveExecReg |
  // +----------------|-----------------------+
  //                  V
  // +-RemainderBB:----------------------+
  // | %1 = G_INST_2                     |
  // | ...                               |
  // +-----------------------------------+

  // Move the instruction into the loop body. Note we already moved everything
  // after EndIt into a new block, so EndIt is no longer valid.
  BodyBB->splice(BodyBB->end(), &MBB, BeginIt, MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = BeginIt;
  auto NewEnd = BodyBB->end();
  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  B.setMBB(*LoopBB);
  Register CondReg;

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.all_uses()) {
      Register OldReg = Op.getReg();
      if (!WFI.SgprWaterfallOperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in
      // the sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      // TODO: support for agpr
      assert(MRI.getRegBank(OpReg) == VgprRB);
      Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
      buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);

      // Build the comparison(s), CurrentLaneReg == OpReg.
      unsigned OpSize = OpTy.getSizeInBits();
      unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
      LLT PartTy = LLT::scalar(PartSize);
      unsigned NumParts = OpSize / PartSize;
      SmallVector<Register, 8> OpParts;
      SmallVector<Register, 8> CurrentLaneParts;

      if (NumParts == 1) {
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
        auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
        }
      }

      for (unsigned i = 0; i < NumParts; ++i) {
        Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
        B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);

        if (!CondReg)
          CondReg = CmpReg;
        else
          CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
      }

      Op.setReg(CurrentLaneReg);

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
    }
  }

  // Copy vcc to sgpr32/64; the ballot becomes a no-op during instruction
  // selection.
  Register CondRegLM =
      MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
  B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);

  // Update EXEC, save the original EXEC value to SavedExec.
  B.buildInstr(AndSaveExecOpc)
      .addDef(SavedExec)
      .addReg(CondRegLM, RegState::Kill);
  MRI.setSimpleHint(SavedExec, CondRegLM);

  B.setInsertPt(*BodyBB, BodyBB->end());

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  B.setInsertPt(MBB, MBB.end());
  B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
  B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
                                      ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(Base);
  const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
  LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
  SmallVector<Register, 4> LoadPartRegs;

  unsigned ByteOffset = 0;
  for (LLT PartTy : LLTBreakdown) {
    Register BasePlusOffset;
    if (ByteOffset == 0) {
      BasePlusOffset = Base;
    } else {
      auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
      BasePlusOffset =
          B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
    }
    auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
    auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
    LoadPartRegs.push_back(LoadPart.getReg(0));
    ByteOffset += PartTy.getSizeInBytes();
  }

  if (!MergeTy.isValid()) {
    // Loads are all of the same size; concat or merge them together.
    B.buildMergeLikeInstr(Dst, LoadPartRegs);
  } else {
    // Loads are not all of the same size: unmerge them into smaller pieces
    // of MergeTy type, then merge the pieces into Dst.
    SmallVector<Register, 4> MergeTyParts;
    for (Register Reg : LoadPartRegs) {
      if (MRI.getType(Reg) == MergeTy) {
        MergeTyParts.push_back(Reg);
      } else {
        auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
        for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
          MergeTyParts.push_back(Unmerge.getReg(i));
      }
    }
    B.buildMergeLikeInstr(Dst, MergeTyParts);
  }
  MI.eraseFromParent();
  return true;
}
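
// Illustrative example (not from the original source): with the breakdown
// {<4 x s32>, <4 x s32>}, a 256-bit load
//   %dst:vgpr(<8 x s32>) = G_LOAD %ptr(p1)
// becomes two <4 x s32> loads at byte offsets 0 and 16 whose results are
// merged back into %dst. When MergeTy is given, e.g. s96 split as {s64, s32}
// with MergeTy = s32, the s64 part is first unmerged into two s32 pieces so
// that all three s32 pieces can be merged into the s96 destination.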

bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
                                      LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();

  MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
  auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);

  if (WideTy.isScalar()) {
    B.buildTrunc(Dst, WideLoad);
  } else {
    SmallVector<Register, 4> MergeTyParts;
    auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);

    LLT DstTy = MRI.getType(Dst);
    unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
    for (unsigned i = 0; i < NumElts; ++i) {
      MergeTyParts.push_back(Unmerge.getReg(i));
    }
    B.buildMergeLikeInstr(Dst, MergeTyParts);
  }
  MI.eraseFromParent();
  return true;
}
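
// Illustrative example (not from the original source): widenLoad(MI, V4S32,
// S32) rewrites
//   %dst:vgpr(<3 x s32>) = G_LOAD %ptr
// as a <4 x s32> load followed by an unmerge into four s32 pieces, the first
// three of which are merged into %dst; the scalar path (s96 widened to s128)
// simply truncates the wide result instead.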

bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
  Register Dst = MI.getDstReg();
  Register Ptr = MI.getPointerReg();
  MachineMemOperand &MMO = MI.getMMO();
  unsigned MemSize = 8 * MMO.getSize().getValue();

  MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);

  if (MI.getOpcode() == G_LOAD) {
    B.buildLoad(Dst, Ptr, *WideMMO);
  } else {
    auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);

    if (MI.getOpcode() == G_ZEXTLOAD) {
      APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
      auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
      B.buildAnd(Dst, Load, MaskCst);
    } else {
      assert(MI.getOpcode() == G_SEXTLOAD);
      B.buildSExtInReg(Dst, Load, MemSize);
    }
  }

  MI.eraseFromParent();
  return true;
}
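
// Illustrative example (not from the original source): a uniform
// zero-extending byte load
//   %dst:sgpr(s32) = G_ZEXTLOAD %ptr :: (load (s8))
// becomes a full s32 load plus an explicit mask of the 8 loaded bits:
//   %wide:sgpr(s32) = G_LOAD %ptr :: (load (s32))
//   %dst:sgpr(s32) = G_AND %wide, 255
// while G_SEXTLOAD uses G_SEXT_INREG %wide, 8 instead. This assumes the
// matching rules only pick WidenMMOToS32 when the wider access is legal.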

bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);
  Register Src = MI.getOperand(1).getReg();
  unsigned Opc = MI.getOpcode();
  int TrueExtCst = Opc == G_SEXT ? -1 : 1;
  if (Ty == S32 || Ty == S16) {
    auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
    auto False = B.buildConstant({VgprRB, Ty}, 0);
    B.buildSelect(Dst, Src, True, False);
  } else if (Ty == S64) {
    auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
    auto False = B.buildConstant({VgprRB_S32}, 0);
    auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
    MachineInstrBuilder Hi;
    switch (Opc) {
    case G_SEXT:
      Hi = Lo;
      break;
    case G_ZEXT:
      Hi = False;
      break;
    case G_ANYEXT:
      Hi = B.buildUndef({VgprRB_S32});
      break;
    default:
      reportGISelFailure(
          MF, MORE, "amdgpu-regbanklegalize",
          "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
      return false;
    }

    B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
  } else {
    reportGISelFailure(
        MF, MORE, "amdgpu-regbanklegalize",
        "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
    return false;
  }

  MI.eraseFromParent();
  return true;
}
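
// Illustrative example (not from the original source): a divergent
// sign-extension of a vcc condition,
//   %ext:vgpr(s32) = G_SEXT %cond:vcc(s1)
// becomes
//   %t:vgpr(s32) = G_CONSTANT i32 -1
//   %f:vgpr(s32) = G_CONSTANT i32 0
//   %ext:vgpr(s32) = G_SELECT %cond:vcc(s1), %t, %f
// i.e. each lane picks -1 (or 1 for zext/anyext) when its condition bit is
// set, and 0 otherwise.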

std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
  auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
  auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Lo = PackedS32;
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}

std::pair<Register, Register>
RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
  auto [Lo32, Hi32] = unpackAExt(Reg);
  return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
          B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
}
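
// Worked bit patterns for the unpack helpers above (illustrative): with
// Reg = <2 x s16> holding <0x8001, 0x7fff>, i.e. 0x7fff8001 after the
// bitcast to s32:
//   unpackZExt: Lo = 0x00008001, Hi = 0x00007fff
//   unpackSExt: Lo = 0xffff8001, Hi = 0x00007fff
//   unpackAExt: Lo = 0x7fff8001 (bits above 16 are don't-care for the
//               16-bit use), Hi = 0x00007fff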

bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
  Register Lo, Hi;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SHL: {
    auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_LSHR: {
    auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_ASHR: {
    auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
    Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
    Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
    break;
  }
  default:
    reportGISelFailure(
        MF, MORE, "amdgpu-regbanklegalize",
        "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
        MI);
    return false;
  }
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
  return true;
}
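
// Illustrative example (not from the original source): a uniform <2 x s16>
// right shift
//   %dst:sgpr(<2 x s16>) = G_LSHR %val, %amt
// is performed as two s32 shifts of the zero-extended halves; the low 16
// bits of each s32 result are then repacked by G_BUILD_VECTOR_TRUNC.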

bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
  Register Lo, Hi;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX: {
    // For signed operations, use sign extension.
    auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
    auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
             .getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
             .getReg(0);
    break;
  }
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    // For unsigned operations, use zero extension.
    auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
    auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
             .getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
             .getReg(0);
    break;
  }
  default:
    reportGISelFailure(
        MF, MORE, "amdgpu-regbanklegalize",
        "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
    return false;
  }
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
  auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
  auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
  auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
  auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
                          {ResLo.getReg(0), ResHi.getReg(0)});
  MI.eraseFromParent();
  return true;
}

static bool isSignedBFE(MachineInstr &MI) {
  if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))
    return (GI->is(Intrinsic::amdgcn_sbfe));

  return MI.getOpcode() == AMDGPU::G_SBFX;
}

bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == LLT::scalar(64));
  bool Signed = isSignedBFE(MI);
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  // Extract a bitfield from Src: LSBit is the least-significant bit of the
  // extraction (the field offset) and Width is the size of the bitfield.
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // Comments are for signed bitfield extract; unsigned is similar. 'x' is the
  // sign bit of Src, 's' the sign bit of the bitfield, 'l' its LSB, and 'y'
  // the remaining bits of the bitfield to extract.

  // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
  unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
  auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});

  auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);

  // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
  // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
  // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
  if (!ConstWidth) {
    auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
    auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
    B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
    MI.eraseFromParent();
    return true;
  }

  uint64_t WidthImm = ConstWidth->Value.getZExtValue();
  auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
  Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
  Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
  auto Zero = B.buildConstant({VgprRB, S32}, 0);
  unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;

  if (WidthImm <= 32) {
    // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
    auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
    MachineInstrBuilder Hi;
    if (Signed) {
      // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
      Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
    } else {
      // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
      Hi = Zero;
    }
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  } else {
    auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
    // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
    auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
    B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
  }

  MI.eraseFromParent();
  return true;
}
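
// Worked example for the constant-width path above (illustrative): with
// LSBit = 8 and Width = 16, SHRSrc = Src >> 8 places the bitfield at bits
// [15:0] of the low half. Since Width <= 32, G_SBFX/G_UBFX extends it in
// place from bit 0, and the high half becomes a copy of bit 31 (signed) or
// zero (unsigned).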

bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  bool Signed = isSignedBFE(MI);
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // For uniform bitfield extract there are 4 available instructions, but
  // LSBit (field offset) and Width (size of the bitfield) need to be packed
  // into an S32: field offset in the low and size in the high 16 bits.

  // Src1 Hi16|Lo16 = Size|FieldOffset
  auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
  auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
  auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
  auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
  unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
  unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
  unsigned Opc = Ty == S32 ? Opc32 : Opc64;

  // Select the machine instruction directly; because of reg-class
  // constraining, insert copies between reg banks and reg classes.
  auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
                            {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
  constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
                                   *ST.getRegisterInfo(), RBI);

  B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}
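
// Illustrative value of the packed operand (not from the original source):
// for LSBit = 5 and Width = 10,
//   Src1 = (10 << 16) | (5 & 0x3f) = 0x000a0005
// so S_BFE_U32 with this operand extracts bits [14:5] of the source.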

bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  unsigned Opc = MI.getOpcode();
  auto Flags = MI.getFlags();
  auto Lo =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
  auto Hi =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == S64);
  auto Op1 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(1).getReg());
  auto Op2 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(2).getReg());

  // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to
  // match GlobalISel with old regbankselect.
  auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
  auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
  auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
  auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
  auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
  auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);

  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  return true;
}
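
// The split above computes a 64-bit multiply modulo 2^64: writing the
// operands as a1 * 2^32 + a0 and b1 * 2^32 + b0,
//   (a1 * 2^32 + a0) * (b1 * 2^32 + b0)
//       = a0*b0 + (a0*b1 + a1*b0) * 2^32 (mod 2^64)
// so Lo = mul(a0, b0), Carry = umulh(a0, b0), and Hi sums the two cross
// products with the carry; a1*b1 only contributes to bits >= 64 and is
// dropped.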

bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == V2S16);
  unsigned Opc = MI.getOpcode();
  unsigned NumOps = MI.getNumOperands();
  auto Flags = MI.getFlags();

  auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());

  if (NumOps == 2) {
    auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
    auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
    MI.eraseFromParent();
    return true;
  }

  auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());

  if (NumOps == 3) {
    auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
    auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
    MI.eraseFromParent();
    return true;
  }

  assert(NumOps == 4);
  auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg());
  auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags);
  auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags);
  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register Src1 = MI.getOperand(3).getReg();
  Register Src2 = MI.getOperand(4).getReg();

  const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();

  // Keep the multiplication on the SALU.
  Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
  Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
  if (ST.hasScalarMulHiInsts()) {
    B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
  } else {
    auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
    auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
    auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
    buildReadAnyLane(B, DstHi, MulHi.getReg(0), RBI);
  }

  // Accumulate and produce the "carry-out" bit.

  // The "carry-out" is defined as bit 64 of the result when computed as a
  // big integer. For unsigned multiply-add, this matches the usual
  // definition of carry-out.
  if (mi_match(Src2, MRI, MIPatternMatch::m_ZeroInt())) {
    // No accumulate: result is just the multiplication, carry is 0.
    B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
    B.buildConstant(Dst1, 0);
  } else {
    // Accumulate: add Src2 to the multiplication result with carry chain.
    Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
    Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
    B.buildUnmerge({Src2Lo, Src2Hi}, Src2);

    auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
    auto AddHi =
        B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
    B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
    B.buildCopy(Dst1, AddHi.getReg(1));
  }

  MI.eraseFromParent();
  return true;
}
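
// Illustrative numeric check of the carry-out definition above: with
// Src0 = Src1 = 0xffffffff and Src2 = 0x00000001fffffffe,
//   mul = 0xfffffffe00000001
//   sum = 0xffffffffffffffff with carry-out (bit 64) = 0,
// while adding one more would wrap to 0 with carry-out 1.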

bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
         (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
  Register Cond = MI.getOperand(1).getReg();
  auto Flags = MI.getFlags();
  auto Lo =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
  auto Hi =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);

  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
  auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
  int Amt = MI.getOperand(2).getImm();
  Register Lo, Hi;
  // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
  if (Amt <= 32) {
    auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
    if (Amt == 32) {
      // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
      Lo = Freeze.getReg(0);
    } else {
      // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
      Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
    }

    auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
    Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
  } else {
    // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
    Lo = Op1.getReg(0);
    Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
  }

  B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &MI) {
  // Split 64-bit find-first-bit operations into 32-bit halves:
  // (ffbh hi:lo) -> umin(ffbh(hi), uaddsat(ffbh(lo), 32))
  // (ffbl hi:lo) -> umin(ffbl(lo), uaddsat(ffbl(hi), 32))
  // (ctlz_zero_undef hi:lo) -> umin(ffbh(hi), add(ffbh(lo), 32))
  // (cttz_zero_undef hi:lo) -> umin(ffbl(lo), add(ffbl(hi), 32))
  unsigned Opc = MI.getOpcode();

  // FFBH/FFBL return 0xFFFFFFFF on zero input, so use uaddsat to avoid
  // wrapping. CTLZ/CTTZ guarantee non-zero input (zero_undef), so a plain
  // add is fine.
  unsigned FFBOpc;
  unsigned AddOpc;
  bool SearchFromMSB;
  switch (Opc) {
  case AMDGPU::G_AMDGPU_FFBH_U32:
    FFBOpc = Opc;
    AddOpc = AMDGPU::G_UADDSAT;
    SearchFromMSB = true;
    break;
  case AMDGPU::G_AMDGPU_FFBL_B32:
    FFBOpc = Opc;
    AddOpc = AMDGPU::G_UADDSAT;
    SearchFromMSB = false;
    break;
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
    FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
    AddOpc = AMDGPU::G_ADD;
    SearchFromMSB = true;
    break;
  case AMDGPU::G_CTTZ_ZERO_UNDEF:
    FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
    AddOpc = AMDGPU::G_ADD;
    SearchFromMSB = false;
    break;
  default:
    llvm_unreachable("unexpected opcode in lowerSplitBitCount64To32");
  }

  auto Unmerge = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
  Register Lo = Unmerge.getReg(0);
  Register Hi = Unmerge.getReg(1);

  // MSB-first (FFBH/CTLZ) searches hi first; LSB-first (FFBL/CTTZ) searches
  // lo first. The secondary half adds 32 to account for the primary half's
  // width.
  auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Hi : Lo});
  auto Secondary =
      B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Lo : Hi});

  auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
                               {Secondary, B.buildConstant(VgprRB_S32, 32)});
  B.buildUMin(MI.getOperand(0).getReg(), Primary, Adjusted);

  MI.eraseFromParent();
  return true;
}
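
// Illustrative example (not from the original source): cttz_zero_undef of
// 0x0000000100000000 computes ffbl(lo = 0) = 0xffffffff and
// ffbl(hi = 1) + 32 = 32, so the umin yields 32. The uaddsat variant matters
// for the plain FFBH/FFBL forms, where a zero half returns 0xffffffff and an
// ordinary add of 32 would wrap to 31 and incorrectly win the umin.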

bool RegBankLegalizeHelper::lowerExtrVecEltToSel(MachineInstr &MI) {
  // Lower extract vector element to a compare-select chain:
  //   result = elt[0]
  //   for i in 1..N-1:
  //     result = (idx == i) ? elt[i] : result
  //
  // When the index is divergent, each lane may want a different element, so
  // we must check every element per lane.
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  Register Idx = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Src);
  LLT ScalarTy = VecTy.getScalarType();
  unsigned NumElts = VecTy.getNumElements();
  MachineRegisterInfo::VRegAttrs VgprRB_EltTy = {VgprRB, ScalarTy};

  auto Unmerge = B.buildUnmerge(VgprRB_EltTy, Src);

  if (ScalarTy.getSizeInBits() == 32) {
    Register PrevSelect = Unmerge.getReg(0);
    for (unsigned I = 1; I < NumElts; ++I) {
      auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)}, I);
      auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
      PrevSelect =
          B.buildSelect(VgprRB_EltTy, Cmp, Unmerge.getReg(I), PrevSelect)
              .getReg(0);
    }
    B.buildCopy(Dst, PrevSelect);
  } else if (ScalarTy.getSizeInBits() == 64) {
    auto InitUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(0));
    Register PrevLo = InitUnmerge.getReg(0);
    Register PrevHi = InitUnmerge.getReg(1);
    for (unsigned I = 1; I < NumElts; ++I) {
      auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)}, I);
      auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
      auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(I));
      PrevLo = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(0), PrevLo)
                   .getReg(0);
      PrevHi = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(1), PrevHi)
                   .getReg(0);
    }
    B.buildMergeLikeInstr(Dst, {PrevLo, PrevHi});
  } else {
    reportGISelFailure(
        MF, MORE, "amdgpu-regbanklegalize",
        "AMDGPU RegBankLegalize: ExtrVecEltToSel unsupported element type", MI);
    return false;
  }

  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lowerExtrVecEltTo32(MachineInstr &MI) {
  // Reduce a 64-bit element extract to two 32-bit extracts:
  //   vec32 = bitcast <N x s64> to <2N x s32>
  //   lo = vec32[idx * 2]
  //   hi = vec32[idx * 2 + 1]
  //   result = merge(lo, hi)
  //
  // When the index is uniform, all lanes extract the same element, so we can
  // just split the s64 extract into two s32 extracts which lower to MOVREL.
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  Register Idx = MI.getOperand(2).getReg();

  LLT SrcTy = MRI.getType(Src);
  LLT Vec32Ty = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);

  assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
         "expected VGPR src and SGPR idx");

  auto CastSrc = B.buildBitcast({VgprRB, Vec32Ty}, Src);

  // Calculate new Lo and Hi indices.
  auto One = B.buildConstant(SgprRB_S32, 1);
  auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
  auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);

  auto ExtLo = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxLo);
  auto ExtHi = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxHi);

  B.buildMergeLikeInstr(Dst, {ExtLo.getReg(0), ExtHi.getReg(0)});

  MI.eraseFromParent();
  return true;
}

bool RegBankLegalizeHelper::lower(MachineInstr &MI,
                                  const RegBankLLTMapping &Mapping,
                                  WaterfallInfo &WFI) {

  switch (Mapping.LoweringMethod) {
  case DoNotLower:
    break;
  case VccExtToSel:
    return lowerVccExtToSel(MI);
  case UniExtToSel: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    auto True = B.buildConstant({SgprRB, Ty},
                                MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
    auto False = B.buildConstant({SgprRB, Ty}, 0);
    // Input to G_{Z|S}EXT is 'Legalizer legal' S1. The most common case is a
    // compare. We are building a select here; the S1 condition was already
    // 'any-extended to S32' + 'ANDed with 1 to clean the high bits' by
    // Sgpr32AExtBoolInReg.
    B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
                  False);
    MI.eraseFromParent();
    return true;
  }
  case UnpackBitShift:
    return lowerUnpackBitShift(MI);
  case UnpackMinMax:
    return lowerUnpackMinMax(MI);
  case ScalarizeToS16:
    return lowerSplitTo16(MI);
  case Ext32To64: {
    const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
    MachineInstrBuilder Hi;
    switch (MI.getOpcode()) {
    case AMDGPU::G_ZEXT: {
      Hi = B.buildConstant({RB, S32}, 0);
      break;
    }
    case AMDGPU::G_SEXT: {
      // Replicate sign bit from 32-bit extended part.
      auto ShiftAmt = B.buildConstant({RB, S32}, 31);
      Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
      break;
    }
    case AMDGPU::G_ANYEXT: {
      Hi = B.buildUndef({RB, S32});
      break;
    }
    default:
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: Ext32To64, unsupported opcode",
                         MI);
      return false;
    }

    B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
                          {MI.getOperand(1).getReg(), Hi});
    MI.eraseFromParent();
    return true;
  }
  case UniCstExt: {
    uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
    B.buildConstant(MI.getOperand(0).getReg(), ConstVal);

    MI.eraseFromParent();
    return true;
  }
  case VgprToVccCopy: {
    Register Src = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(Src);
    // Take the lowest bit from each lane and put it in a lane mask. Lower via
    // compare, but clean the high bits first, since the compare looks at all
    // bits in the register.
    Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
    if (Ty == S64) {
      auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
      auto One = B.buildConstant(VgprRB_S32, 1);
      auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
      auto Zero = B.buildConstant(VgprRB_S32, 0);
      auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
      B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
    } else {
      assert(Ty == S32 || Ty == S16);
      auto One = B.buildConstant({VgprRB, Ty}, 1);
      B.buildAnd(BoolSrc, Src, One);
    }
    auto Zero = B.buildConstant({VgprRB, Ty}, 0);
    B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
    MI.eraseFromParent();
    return true;
  }
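
  // Illustrative note for the VgprToVccCopy case above: for an s64 source
  // only bit 0 of the low half survives the masking (the high half is ANDed
  // with 0), so the G_ICMP ne against zero produces a lane mask that is set
  // exactly in lanes whose source had bit 0 set.
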
  case V_BFE:
    return lowerV_BFE(MI);
  case S_BFE:
    return lowerS_BFE(MI);
  case UniMAD64:
    return lowerUniMAD64(MI);
  case UniMul64: {
    B.buildMul(MI.getOperand(0), MI.getOperand(1), MI.getOperand(2));
    MI.eraseFromParent();
    return true;
  }
  case DivSMulToMAD: {
    auto Op1 = B.buildTrunc(VgprRB_S32, MI.getOperand(1));
    auto Op2 = B.buildTrunc(VgprRB_S32, MI.getOperand(2));
    auto Zero = B.buildConstant({VgprRB, S64}, 0);

    unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
                          ? AMDGPU::G_AMDGPU_MAD_U64_U32
                          : AMDGPU::G_AMDGPU_MAD_I64_I32;

    B.buildInstr(NewOpc, {MI.getOperand(0).getReg(), {SgprRB, S32}},
                 {Op1, Op2, Zero});
    MI.eraseFromParent();
    return true;
  }
  case SplitTo32:
    return lowerSplitTo32(MI);
  case SplitTo32Mul:
    return lowerSplitTo32Mul(MI);
  case SplitTo32Select:
    return lowerSplitTo32Select(MI);
  case SplitTo32SExtInReg:
    return lowerSplitTo32SExtInReg(MI);
  case SplitLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = DstTy.getSizeInBits();
    // Even split to 128-bit loads.
    if (Size > 128) {
      LLT B128;
      if (DstTy.isVector()) {
        LLT EltTy = DstTy.getElementType();
        B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
      } else {
        B128 = LLT::scalar(128);
      }
      if (Size / 128 == 2)
        splitLoad(MI, {B128, B128});
      else if (Size / 128 == 4)
        splitLoad(MI, {B128, B128, B128, B128});
      else {
        reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                           "AMDGPU RegBankLegalize: SplitLoad, unsupported type",
                           MI);
        return false;
      }
    }
    // 64 and 32 bit load.
    else if (DstTy == S96)
      splitLoad(MI, {S64, S32}, S32);
    else if (DstTy == V3S32)
      splitLoad(MI, {V2S32, S32}, S32);
    else if (DstTy == V6S16)
      splitLoad(MI, {V4S16, V2S16}, V2S16);
    else {
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: SplitLoad, unsupported type",
                         MI);
      return false;
    }
    return true;
  }
  case WidenLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == S96)
      widenLoad(MI, S128);
    else if (DstTy == V3S32)
      widenLoad(MI, V4S32, S32);
    else if (DstTy == V6S16)
      widenLoad(MI, V8S16, V2S16);
    else {
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: WidenLoad, unsupported type",
                         MI);
      return false;
    }
    return true;
  }
  case UnpackAExt:
    return lowerUnpackAExt(MI);
  case WidenMMOToS32:
    return widenMMOToS32(cast<GAnyLoad>(MI));
  case VerifyAllSgpr: {
    assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
      return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
    }));
    return true;
  }
  case ApplyAllVgpr: {
    assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) {
      return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
    }));
    B.setInstrAndDebugLoc(MI);
    for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) {
      Register Reg = MI.getOperand(i).getReg();
      if (MRI.getRegBank(Reg) != VgprRB) {
        auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
        MI.getOperand(i).setReg(Copy.getReg(0));
      }
    }
    return true;
  }
  case UnmergeToShiftTrunc: {
    GUnmerge *Unmerge = dyn_cast<GUnmerge>(&MI);
    LLT Ty = MRI.getType(Unmerge->getSourceReg());
    if (Ty.getSizeInBits() % 32 != 0) {
      reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                         "AMDGPU RegBankLegalize: unmerge not multiple of 32",
                         MI);
      return false;
    }

    B.setInstrAndDebugLoc(MI);
    if (Ty.getSizeInBits() > 32) {
      auto UnmergeV2S16 =
          B.buildUnmerge({SgprRB, V2S16}, Unmerge->getSourceReg());
      for (unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
        auto [Dst0S32, Dst1S32] =
            unpackAExt(UnmergeV2S16->getOperand(i).getReg());
        B.buildTrunc(MI.getOperand(i * 2).getReg(), Dst0S32);
        B.buildTrunc(MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
      }
    } else {
      auto [Dst0S32, Dst1S32] = unpackAExt(MI.getOperand(2).getReg());
      B.buildTrunc(MI.getOperand(0).getReg(), Dst0S32);
      B.buildTrunc(MI.getOperand(1).getReg(), Dst1S32);
    }

    MI.eraseFromParent();
    return true;
  }
    Register Dst = MI.getOperand(0).getReg();
    Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
    B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());
    MI.getOperand(0).setReg(NewDst);
    B.buildTrunc(Dst, NewDst);

    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      Register UseReg = MI.getOperand(i).getReg();

      auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
      MachineBasicBlock *DefMBB = DefMI->getParent();

      B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));

      auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
      MI.getOperand(i).setReg(NewUse.getReg(0));
    }
    break;
  }
  case VerifyAllSgprGPHI: {
    assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
      if (Op.isMBB())
        return true;
      return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
    }));
    return true;
  }
    assert(MRI.getRegBankOrNull(MI.getOperand(0).getReg()) == VgprRB);
    assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
      if (Op.isMBB())
        return true;
      const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
      return RB == VgprRB || RB == SgprRB;
    }));
    return true;
  }
  case ApplyINTRIN_IMAGE:
    return applyRegisterBanksINTRIN_IMAGE(MI);
  case SplitBitCount64To32:
    return lowerSplitBitCount64To32(MI);
  case ExtrVecEltToSel:
    return lowerExtrVecEltToSel(MI);
  case ExtrVecEltTo32:
    return lowerExtrVecEltTo32(MI);
  }

  if (!WFI.SgprWaterfallOperandRegs.empty()) {
    if (!executeInWaterfallLoop(B, WFI))
      return false;
  }
  return true;
}

LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
  case UniInVcc:
    return LLT::scalar(1);
  case Sgpr16:
  case Vgpr16:
  case UniInVgprS16:
    return LLT::scalar(16);
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32AExtBoolInReg:
  case Sgpr32SExt:
  case Sgpr32ZExt:
  case UniInVgprS32:
  case Sgpr32ToVgprDst:
  case Vgpr32:
  case Vgpr32AExt:
  case Vgpr32SExt:
  case Vgpr32ZExt:
    return LLT::scalar(32);
  case Sgpr64:
  case Vgpr64:
  case UniInVgprS64:
  case Sgpr64ToVgprDst:
    return LLT::scalar(64);
  case Sgpr128:
  case Vgpr128:
    return LLT::scalar(128);
  case SgprP0:
  case SgprP0Call_WF:
  case VgprP0:
    return LLT::pointer(0, 64);
  case SgprP1:
  case VgprP1:
    return LLT::pointer(1, 64);
  case SgprP2:
  case VgprP2:
    return LLT::pointer(2, 32);
  case SgprP3:
  case VgprP3:
    return LLT::pointer(3, 32);
  case SgprP4:
  case SgprP4Call_WF:
  case VgprP4:
    return LLT::pointer(4, 64);
  case SgprP5:
  case VgprP5:
    return LLT::pointer(5, 32);
  case SgprP8:
    return LLT::pointer(8, 128);
  case SgprV2S16:
  case VgprV2S16:
  case UniInVgprV2S16:
    return LLT::fixed_vector(2, 16);
  case SgprV2S32:
  case VgprV2S32:
  case UniInVgprV2S32:
    return LLT::fixed_vector(2, 32);
  case VgprV3S32:
    return LLT::fixed_vector(3, 32);
  case VgprV4S16:
    return LLT::fixed_vector(4, 16);
  case SgprV4S32:
  case SgprV4S32_WF:
  case VgprV4S32:
  case UniInVgprV4S32:
    return LLT::fixed_vector(4, 32);
  case VgprV8S32:
    return LLT::fixed_vector(8, 32);
  case VgprV2S64:
  case UniInVgprV2S64:
    return LLT::fixed_vector(2, 64);
  default:
    return LLT();
  }
}

LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
  switch (ID) {
  case SgprB32:
  case VgprB32:
  case SgprB32_M0:
  case UniInVgprB32:
    if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
        isAnyPtr(Ty, 32))
      return Ty;
    return LLT();
  case SgprPtr32:
  case VgprPtr32:
    return isAnyPtr(Ty, 32) ? Ty : LLT();
  case SgprPtr64:
  case VgprPtr64:
    return isAnyPtr(Ty, 64) ? Ty : LLT();
  case SgprPtr128:
  case VgprPtr128:
    return isAnyPtr(Ty, 128) ? Ty : LLT();
  case SgprB64:
  case VgprB64:
  case SgprB64_ReadFirstLane:
  case UniInVgprB64:
    if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
        Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
      return Ty;
    return LLT();
  case SgprB96:
  case VgprB96:
  case UniInVgprB96:
    if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
        Ty == LLT::fixed_vector(6, 16))
      return Ty;
    return LLT();
  case SgprB128:
  case VgprB128:
  case UniInVgprB128:
    if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
        Ty == LLT::fixed_vector(2, 64) || Ty == LLT::fixed_vector(8, 16) ||
        isAnyPtr(Ty, 128))
      return Ty;
    return LLT();
  case VgprB160:
  case UniInVgprB160:
    if (Ty.getSizeInBits() == 160)
      return Ty;
    return LLT();
  case SgprB256:
  case VgprB256:
  case UniInVgprB256:
    if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
        Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
      return Ty;
    return LLT();
  case SgprB512:
  case VgprB512:
  case UniInVgprB512:
    if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
        Ty == LLT::fixed_vector(8, 64))
      return Ty;
    return LLT();
  case SgprBRC: {
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    unsigned LLTSize = Ty.getSizeInBits();
    if (LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize))
      return Ty;
    return LLT();
  }
  case VgprBRC: {
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->getSGPRClassForBitWidth(Ty.getSizeInBits()))
      return Ty;
    return LLT();
  }
  default:
    return LLT();
  }
}

const RegisterBank *
RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
  switch (ID) {
  case Vcc:
    return VccRB;
  case Sgpr16:
  case Sgpr32:
  case Sgpr32_WF:
  case Sgpr64:
  case Sgpr128:
  case SgprP0:
  case SgprP0Call_WF:
  case SgprP1:
  case SgprP2:
  case SgprP3:
  case SgprP4:
  case SgprP4Call_WF:
  case SgprP5:
  case SgprP8:
  case SgprPtr32:
  case SgprPtr64:
  case SgprPtr128:
  case SgprV2S16:
  case SgprV2S32:
  case SgprV4S32:
  case SgprV4S32_WF:
  case SgprB32:
  case SgprB64:
  case SgprB96:
  case SgprB128:
  case SgprB256:
  case SgprB512:
  case SgprBRC:
  case UniInVcc:
  case UniInVgprS16:
  case UniInVgprS32:
  case UniInVgprS64:
  case UniInVgprV2S16:
  case UniInVgprV2S32:
  case UniInVgprV4S32:
  case UniInVgprV2S64:
  case UniInVgprB32:
  case UniInVgprB64:
  case UniInVgprB96:
  case UniInVgprB128:
  case UniInVgprB160:
  case UniInVgprB256:
  case UniInVgprB512:
  case Sgpr32Trunc:
  case Sgpr32AExt:
  case Sgpr32AExtBoolInReg:
  case Sgpr32SExt:
  case Sgpr32ZExt:
    return SgprRB;
  case Vgpr16:
  case Vgpr32:
  case Vgpr64:
  case Vgpr128:
  case VgprP0:
  case VgprP1:
  case VgprP2:
  case VgprP3:
  case VgprP4:
  case VgprP5:
  case VgprPtr32:
  case VgprPtr64:
  case VgprPtr128:
  case VgprV2S16:
  case VgprV2S32:
  case VgprV2S64:
  case VgprV3S32:
  case VgprV4S16:
  case VgprV4S32:
  case VgprV8S32:
  case VgprB32:
  case VgprB64:
  case VgprB96:
  case VgprB128:
  case VgprB160:
  case VgprB256:
  case VgprB512:
  case VgprBRC:
  case Vgpr32AExt:
  case Vgpr32SExt:
  case Vgpr32ZExt:
  case Sgpr32ToVgprDst:
  case Sgpr64ToVgprDst:
    return VgprRB;
  default:
    return nullptr;
  }
}

bool RegBankLegalizeHelper::applyMappingDst(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
  // Defs start from operand 0.
  for (; OpIdx < MethodIDs.size(); ++OpIdx) {
    if (MethodIDs[OpIdx] == None)
      continue;
    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[OpIdx]) {
    // vcc, sgpr and vgpr scalars, pointers and vectors
    case Vcc:
    case Sgpr16:
    case Sgpr32:
    case Sgpr64:
    case Sgpr128:
    case SgprP0:
    case SgprP1:
    case SgprP3:
    case SgprP4:
    case SgprP5:
    case SgprP8:
    case SgprV2S16:
    case SgprV2S32:
    case SgprV4S32:
    case Vgpr16:
    case Vgpr32:
    case Vgpr64:
    case Vgpr128:
    case VgprP0:
    case VgprP1:
    case VgprP2:
    case VgprP3:
    case VgprP4:
    case VgprP5:
    case VgprV2S16:
    case VgprV2S32:
    case VgprV2S64:
    case VgprV3S32:
    case VgprV4S16:
    case VgprV4S32:
    case VgprV8S32: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // sgpr and vgpr B-types
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128:
    case SgprB256:
    case SgprB512:
    case SgprBRC:
    case SgprPtr32:
    case SgprPtr64:
    case SgprPtr128:
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128:
    case VgprB160:
    case VgprB256:
    case VgprB512:
    case VgprBRC:
    case VgprPtr32:
    case VgprPtr64:
    case VgprPtr128: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
      break;
    }
    // uniform in vcc/vgpr: scalars, vectors and B-types
    case UniInVcc: {
      assert(Ty == S1);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(VccRB_S1);
      Op.setReg(NewDst);
      if (!MRI.use_empty(Reg)) {
        auto CopyS32_Vcc =
            B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
        B.buildTrunc(Reg, CopyS32_Vcc);
      }
      break;
    }
    case UniInVgprS16: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
      Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
      Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
      Op.setReg(NewVgprDstS16);
      B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
      buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
      B.buildTrunc(Reg, NewSgprDstS32);
      break;
    }
    case UniInVgprS32:
    case UniInVgprS64:
    case UniInVgprV2S16:
    case UniInVgprV2S32:
    case UniInVgprV4S32:
    case UniInVgprV2S64: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }
    case UniInVgprB32:
    case UniInVgprB64:
    case UniInVgprB96:
    case UniInVgprB128:
    case UniInVgprB160:
    case UniInVgprB256:
    case UniInVgprB512: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }
    // sgpr trunc
    case Sgpr32Trunc: {
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
      Op.setReg(NewDst);
      if (!MRI.use_empty(Reg))
        B.buildTrunc(Reg, NewDst);
      break;
    }
    case Sgpr32ToVgprDst:
    case Sgpr64ToVgprDst: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == VgprRB);
      Op.setReg(MRI.createVirtualRegister({SgprRB, Ty}));
      B.buildCopy(Reg, Op.getReg());
      break;
    }
    case InvalidMapping: {
      reportGISelFailure(
          MF, MORE, "amdgpu-regbanklegalize",
          "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
      return false;
    }
    default:
      reportGISelFailure(
          MF, MORE, "amdgpu-regbanklegalize",
          "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
      return false;
    }
  }

  return true;
}
1682
1683bool RegBankLegalizeHelper::applyMappingSrc(
1684 MachineInstr &MI, unsigned &OpIdx,
1685 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
1686 WaterfallInfo &WFI) {
1687 for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
1688 if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
1689 continue;
1690
1691 MachineOperand &Op = MI.getOperand(OpIdx);
1692 Register Reg = Op.getReg();
1693 LLT Ty = MRI.getType(Reg);
1694 const RegisterBank *RB = MRI.getRegBank(Reg);
1695
1696 switch (MethodIDs[i]) {
1697 case Vcc: {
1698 assert(Ty == S1);
1699 assert(RB == VccRB || RB == SgprRB);
1700 if (RB == SgprRB) {
1701 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1702 auto CopyVcc_Scc =
1703 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
1704 Op.setReg(CopyVcc_Scc.getReg(0));
1705 }
1706 break;
1707 }
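// Illustrative repair for an S1 source that lives in an sgpr (sketch with
// made-up register names):
//   %ext:sgpr(s32) = G_ANYEXT %src:sgpr(s1)
//   %vcc:vcc(s1) = G_AMDGPU_COPY_VCC_SCC %ext:sgpr(s32)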
1708 // sgpr scalars, pointers and vectors
1709 case Sgpr16:
1710 case Sgpr32:
1711 case Sgpr64:
1712 case Sgpr128:
1713 case SgprP0:
1714 case SgprP1:
1715 case SgprP3:
1716 case SgprP4:
1717 case SgprP5:
1718 case SgprP8:
1719 case SgprV2S16:
1720 case SgprV2S32:
1721 case SgprV4S32: {
1722 assert(Ty == getTyFromID(MethodIDs[i]));
1723 assert(RB == getRegBankFromID(MethodIDs[i]));
1724 break;
1725 }
1726 // sgpr B-types
1727 case SgprB32:
1728 case SgprB64:
1729 case SgprB96:
1730 case SgprB128:
1731 case SgprB256:
1732 case SgprB512:
1733 case SgprBRC:
1734 case SgprPtr32:
1735 case SgprPtr64:
1736 case SgprPtr128: {
1737 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1738 assert(RB == getRegBankFromID(MethodIDs[i]));
1739 break;
1740 }
1741 // vgpr scalars, pointers and vectors
1742 case Vgpr16:
1743 case Vgpr32:
1744 case Vgpr64:
1745 case Vgpr128:
1746 case VgprP0:
1747 case VgprP1:
1748 case VgprP2:
1749 case VgprP3:
1750 case VgprP4:
1751 case VgprP5:
1752 case VgprV2S16:
1753 case VgprV2S32:
1754 case VgprV2S64:
1755 case VgprV3S32:
1756 case VgprV4S16:
1757 case VgprV4S32:
1758 case VgprV8S32: {
1759 assert(Ty == getTyFromID(MethodIDs[i]));
1760 if (RB != VgprRB) {
1761 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
1762 Op.setReg(CopyToVgpr.getReg(0));
1763 }
1764 break;
1765 }
1766 // vgpr B-types
1767 case VgprB32:
1768 case VgprB64:
1769 case VgprB96:
1770 case VgprB128:
1771 case VgprB160:
1772 case VgprB256:
1773 case VgprB512:
1774 case VgprBRC:
1775 case VgprPtr32:
1776 case VgprPtr64:
1777 case VgprPtr128: {
1778 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1779 if (RB != VgprRB) {
1780 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
1781 Op.setReg(CopyToVgpr.getReg(0));
1782 }
1783 break;
1784 }
1785 // sgpr waterfall, scalars, and vectors
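// These operands must end up in sgprs for selection, but a divergent value
// cannot simply be copied to an sgpr. Instead, the register is recorded in
// WFI and the surrounding instruction range is later wrapped in a waterfall
// loop (see executeInWaterfallLoop) that iterates over the active lanes.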
1786 case Sgpr32_WF:
1787 case SgprV4S32_WF: {
1788 assert(Ty == getTyFromID(MethodIDs[i]));
1789 if (RB != SgprRB) {
1790 WFI.SgprWaterfallOperandRegs.insert(Reg);
1791 if (!WFI.Start.isValid()) {
1792 WFI.Start = MI.getIterator();
1793 WFI.End = std::next(MI.getIterator());
1794 }
1795 }
1796 break;
1797 }
1798 case SgprP0Call_WF:
1799 case SgprP4Call_WF: {
1800 assert(Ty == getTyFromID(MethodIDs[i]));
1801 if (RB != SgprRB) {
1802 WFI.SgprWaterfallOperandRegs.insert(Reg);
1803
1804 // Find the ADJCALLSTACKUP before the call.
1805 MachineBasicBlock::iterator Start = MI.getIterator();
1806 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
1807 --Start;
1808
1809 // Find the ADJCALLSTACKDOWN after the call (include it in range).
1810 MachineBasicBlock::iterator End = MI.getIterator();
1811 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
1812 ++End;
1813 ++End;
1814
1815 B.setInsertPt(*MI.getParent(), Start);
1816 WFI.Start = Start;
1817 WFI.End = End;
1818 }
1819 break;
1820 }
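// Resulting waterfall range for a call (illustrative layout):
//   ADJCALLSTACKUP ...       <- WFI.Start
//   ...call sequence, including MI...
//   ADJCALLSTACKDOWN ...
//   <next instruction>       <- WFI.End (one past ADJCALLSTACKDOWN)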
1821 case SgprB32_M0:
1822 case SgprB32_ReadFirstLane:
1823 case SgprB64_ReadFirstLane: {
1824 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1825 if (RB == SgprRB)
1826 break;
1827 assert(RB == VgprRB);
1828 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
1829 buildReadFirstLane(B, NewSGPR, Op.getReg(), RBI);
1830 Op.setReg(NewSGPR);
1831 break;
1832 }
1833 // sgpr and vgpr scalars with extend
1834 case Sgpr32AExt: {
1835 // Note: this ext allows S1, and it is meant to be combined away.
1836 assert(Ty.getSizeInBits() < 32);
1837 assert(RB == SgprRB);
1838 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1839 Op.setReg(Aext.getReg(0));
1840 break;
1841 }
1842 case Sgpr32AExtBoolInReg: {
1843 // Note: this ext allows S1, and it is meant to be combined away.
1844 assert(Ty.getSizeInBits() == 1);
1845 assert(RB == SgprRB);
1846 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1847 // Zext of SgprS1 is not legal; emit AND with 1 instead. This instruction
1848 // is most of the time meant to be combined away in AMDGPURegBankCombiner.
1849 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
1850 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
1851 Op.setReg(BoolInReg.getReg(0));
1852 break;
1853 }
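// Illustrative result of Sgpr32AExtBoolInReg (sketch, made-up names):
//   %ext:sgpr(s32) = G_ANYEXT %src:sgpr(s1)
//   %one:sgpr(s32) = G_CONSTANT i32 1
//   %use:sgpr(s32) = G_AND %ext:sgpr(s32), %one:sgpr(s32)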
1854 case Sgpr32SExt: {
1855 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
1856 assert(RB == SgprRB);
1857 auto Sext = B.buildSExt(SgprRB_S32, Reg);
1858 Op.setReg(Sext.getReg(0));
1859 break;
1860 }
1861 case Sgpr32ZExt: {
1862 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
1863 assert(RB == SgprRB);
1864 auto Zext = B.buildZExt({SgprRB, S32}, Reg);
1865 Op.setReg(Zext.getReg(0));
1866 break;
1867 }
1868 case Vgpr32AExt: {
1869 assert(Ty.getSizeInBits() < 32);
1870 assert(RB == VgprRB);
1871 auto Aext = B.buildAnyExt({VgprRB, S32}, Reg);
1872 Op.setReg(Aext.getReg(0));
1873 break;
1874 }
1875 case Vgpr32SExt: {
1876 // Note: this ext allows S1, and it is meant to be combined away.
1877 assert(Ty.getSizeInBits() < 32);
1878 assert(RB == VgprRB);
1879 auto Sext = B.buildSExt({VgprRB, S32}, Reg);
1880 Op.setReg(Sext.getReg(0));
1881 break;
1882 }
1883 case Vgpr32ZExt: {
1884 // Note: this ext allows S1, and it is meant to be combined away.
1885 assert(Ty.getSizeInBits() < 32);
1886 assert(RB == VgprRB);
1887 auto Zext = B.buildZExt({VgprRB, S32}, Reg);
1888 Op.setReg(Zext.getReg(0));
1889 break;
1890 }
1891 default:
1892 reportGISelFailure(
1893 MF, MORE, "amdgpu-regbanklegalize",
1894 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
1895 return false;
1896 }
1897 }
1898 return true;
1899}
1900
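// Assert-only helper: returns true if every register operand in
// [StartOpIdx, EndOpIdx] already carries the expected register bank RB.
// [[maybe_unused]] keeps builds without assertions warning-free.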
1901[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
1902 const RegisterBank *RB,
1903 MachineRegisterInfo &MRI,
1904 unsigned StartOpIdx,
1905 unsigned EndOpIdx) {
1906 for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
1907 if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
1908 return false;
1909 }
1910 return true;
1911}
1912
1913 void RegBankLegalizeHelper::applyMappingTrivial(MachineInstr &MI) {
1914 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
1915 // Put RB on all registers
1916 unsigned NumDefs = MI.getNumDefs();
1917 unsigned NumOperands = MI.getNumOperands();
1918
1919 assert(verifyRegBankOnOperands(MI, RB, MRI, 0, NumDefs - 1));
1920 if (RB == SgprRB)
1921 assert(verifyRegBankOnOperands(MI, RB, MRI, NumDefs, NumOperands - 1));
1922
1923 if (RB == VgprRB) {
1924 B.setInstr(MI);
1925 for (unsigned i = NumDefs; i < NumOperands; ++i) {
1926 Register Reg = MI.getOperand(i).getReg();
1927 if (MRI.getRegBank(Reg) != RB) {
1928 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1929 MI.getOperand(i).setReg(Copy.getReg(0));
1930 }
1931 }
1932 }
1933}
1934
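// Image intrinsics follow a fixed bank pattern: defs and the register uses
// before RsrcIdx go to vgprs, while the descriptor operands from RsrcIdx on
// must be sgprs; descriptors that are not already sgpr are waterfalled.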
1935bool RegBankLegalizeHelper::applyRegisterBanksINTRIN_IMAGE(MachineInstr &MI) {
1936 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1937 AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI));
1938 assert(RSrcIntrin && RSrcIntrin->IsImage);
1939
1940 unsigned RsrcIdx = RSrcIntrin->RsrcArg;
1941 const unsigned NumDefs = MI.getNumExplicitDefs();
1942
1943 // The reported argument index is relative to the IR intrinsic call arguments,
1944 // so we need to shift by the number of defs and the intrinsic ID.
1945 RsrcIdx += NumDefs + 1;
1946
1947 MachineBasicBlock *MBB = MI.getParent();
1948 B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(MI.getIterator())));
1949
1950 // Defs (for image loads with return) are vgpr.
1951 for (unsigned i = 0; i < NumDefs; ++i) {
1952 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(i).getReg());
1953 if (RB == VgprRB)
1954 continue;
1955
1956 Register Reg = MI.getOperand(i).getReg();
1957 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(Reg)});
1958 MI.getOperand(i).setReg(NewVgprDst);
1959 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1960 }
1961
1962 B.setInstrAndDebugLoc(MI);
1963
1964 // Register uses (before RsrcIdx) are vgpr.
1965 for (unsigned i = 1; i < RsrcIdx; ++i) {
1966 MachineOperand &Op = MI.getOperand(i);
1967 if (!Op.isReg())
1968 continue;
1969
1970 Register Reg = Op.getReg();
1971 if (!Reg.isVirtual())
1972 continue;
1973
1974 if (MRI.getRegBank(Reg) == VgprRB)
1975 continue;
1976
1977 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1978 Op.setReg(Copy.getReg(0));
1979 }
1980
1981 SmallSet<Register, 4> OpsToWaterfall;
1982
1983 // Register use RsrcIdx (and RsrcIdx+1 in some cases) is sgpr.
1984 for (unsigned i = RsrcIdx; i < MI.getNumOperands(); ++i) {
1985 MachineOperand &Op = MI.getOperand(i);
1986 if (!Op.isReg())
1987 continue;
1988
1989 Register Reg = Op.getReg();
1990 if (MRI.getRegBank(Reg) != SgprRB)
1991 OpsToWaterfall.insert(Reg);
1992 }
1993
1994 if (!OpsToWaterfall.empty()) {
1995 MachineBasicBlock::iterator MII = MI.getIterator();
1996 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
1997 }
1998
1999 return true;
2000}