AMDGPURegBankLegalizeHelper.cpp
1//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Implements actual lowering algorithms for each ID that can be used in
10/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
11//
12//===----------------------------------------------------------------------===//
13
16#include "AMDGPUInstrInfo.h"
19#include "GCNSubtarget.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27
28#define DEBUG_TYPE "amdgpu-regbanklegalize"
29
30using namespace llvm;
31using namespace AMDGPU;
32
33RegBankLegalizeHelper::RegBankLegalizeHelper(
34 MachineIRBuilder &B, const MachineUniformityInfo &MUI,
35 const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
36 : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B),
37 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
38 RBLRules(RBLRules), IsWave32(ST.isWave32()),
39 SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
40 VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
41 VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
42
43bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
44 const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
45 if (!RuleSet) {
46 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
47 "No AMDGPU RegBankLegalize rules defined for opcode",
48 MI);
49 return false;
50 }
51
52 const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
53 if (!Mapping) {
54 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
55 "AMDGPU RegBankLegalize: none of the rules defined with "
56 "'Any' for MI's opcode matched MI",
57 MI);
58 return false;
59 }
60
61 WaterfallInfo WFI;
62 unsigned OpIdx = 0;
63 if (!Mapping->DstOpMapping.empty()) {
64 B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
65 if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping))
66 return false;
67 }
68 if (!Mapping->SrcOpMapping.empty()) {
69 B.setInstr(MI);
70 if (!applyMappingSrc(MI, OpIdx, Mapping->SrcOpMapping, WFI))
71 return false;
72 }
73
74 if (!lower(MI, *Mapping, WFI))
75 return false;
76
77 return true;
78}
79
80bool RegBankLegalizeHelper::executeInWaterfallLoop(MachineIRBuilder &B,
81 const WaterfallInfo &WFI) {
82 assert(WFI.Start.isValid() && WFI.End.isValid() &&
83 "Waterfall range not initialized");
84
85 // Track use registers which have already been expanded with a readfirstlane
86 // sequence. This may have multiple uses if moving a sequence.
87 DenseMap<Register, Register> WaterfalledRegMap;
88
89 MachineBasicBlock &MBB = B.getMBB();
90 MachineFunction &MF = B.getMF();
91
92 MachineBasicBlock::iterator BeginIt = WFI.Start;
93 MachineBasicBlock::iterator EndIt = WFI.End;
94
95 const SIRegisterInfo *TRI = ST.getRegisterInfo();
96 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
97 unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
98 if (IsWave32) {
99 MovExecOpc = AMDGPU::S_MOV_B32;
100 MovExecTermOpc = AMDGPU::S_MOV_B32_term;
101 XorTermOpc = AMDGPU::S_XOR_B32_term;
102 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
103 ExecReg = AMDGPU::EXEC_LO;
104 } else {
105 MovExecOpc = AMDGPU::S_MOV_B64;
106 MovExecTermOpc = AMDGPU::S_MOV_B64_term;
107 XorTermOpc = AMDGPU::S_XOR_B64_term;
108 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
109 ExecReg = AMDGPU::EXEC;
110 }
111
112#ifndef NDEBUG
113 const int OrigRangeSize = std::distance(BeginIt, EndIt);
114#endif
115
116 MachineRegisterInfo &MRI = *B.getMRI();
117 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
118 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
119
120 // Don't bother using generic instructions/registers for the exec mask.
121 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
122
123 Register SavedExec = MRI.createVirtualRegister(WaveRC);
124
125 // To insert the loop we need to split the block. Move everything before
126 // this point to a new block, and insert a new empty block before this
127 // instruction.
128 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
129 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
130 MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
131 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
132 MachineFunction::iterator MBBI(MBB);
133 ++MBBI;
134 MF.insert(MBBI, LoopBB);
135 MF.insert(MBBI, BodyBB);
136 MF.insert(MBBI, RestoreExecBB);
137 MF.insert(MBBI, RemainderBB);
138
139 LoopBB->addSuccessor(BodyBB);
140 BodyBB->addSuccessor(RestoreExecBB);
141 BodyBB->addSuccessor(LoopBB);
142
143 // Move the rest of the block into a new block.
144 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
145 RemainderBB->splice(RemainderBB->begin(), &MBB, EndIt, MBB.end());
146
147 MBB.addSuccessor(LoopBB);
148 RestoreExecBB->addSuccessor(RemainderBB);
149
150 B.setInsertPt(*LoopBB, LoopBB->end());
151
152 // +-MBB:------------+
153 // | ... |
154 // | %0 = G_INST_1 |
155 // | %Dst = MI %Vgpr |
156 // | %1 = G_INST_2 |
157 // | ... |
158 // +-----------------+
159 // ->
160 // +-MBB-------------------------------+
161 // | ... |
162 // | %0 = G_INST_1 |
163 // | %SaveExecReg = S_MOV_B32 $exec_lo |
164 // +----------------|------------------+
165 // | /------------------------------|
166 // V V |
167 // +-LoopBB---------------------------------------------------------------+ |
168 // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
169 // | instead of executing for each lane, see if other lanes had | |
170 // | same value for %Vgpr and execute for them also. | |
171 // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
172 // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
173 // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
174 // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
175 // +----------------|-----------------------------------------------------+ |
176 // V |
177 // +-BodyBB------------------------------------------------------------+ |
178 // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
179 // | executed only for active lanes and written to Dst | |
180 // | $exec = S_XOR_B32 $exec, %SavedExec | |
181 // | set active lanes to 0 in SavedExec, lanes that did not write to | |
182 // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
183 // | SI_WATERFALL_LOOP LoopBB |-----|
184 // +----------------|--------------------------------------------------+
185 // V
186 // +-RestoreExecBB--------------------------+
187 // | $exec_lo = S_MOV_B32_term %SaveExecReg |
188 // +----------------|-----------------------+
189 // V
190 // +-RemainderBB:----------------------+
191 // | %1 = G_INST_2 |
192 // | ... |
193 // +-----------------------------------+
194
195 // Move the instruction into the loop body. Note that we already moved
196 // everything after EndIt into RemainderBB, so EndIt is no longer valid here.
197 BodyBB->splice(BodyBB->end(), &MBB, BeginIt, MBB.end());
198
199 // Figure out the iterator range after splicing the instructions.
200 MachineBasicBlock::iterator NewBegin = BeginIt;
201 auto NewEnd = BodyBB->end();
202 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
203
204 B.setMBB(*LoopBB);
205 Register CondReg;
206
207 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
208 for (MachineOperand &Op : MI.all_uses()) {
209 Register OldReg = Op.getReg();
210 if (!WFI.SgprWaterfallOperandRegs.count(OldReg))
211 continue;
212
213 // See if we already processed this register in another instruction in
214 // the sequence.
215 auto OldVal = WaterfalledRegMap.find(OldReg);
216 if (OldVal != WaterfalledRegMap.end()) {
217 Op.setReg(OldVal->second);
218 continue;
219 }
220
221 Register OpReg = Op.getReg();
222 LLT OpTy = MRI.getType(OpReg);
223
224 // TODO: support for agpr
225 assert(MRI.getRegBank(OpReg) == VgprRB);
226 Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
227 buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
228
229 // Build the comparison(s), CurrentLaneReg == OpReg.
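 // Compare in 64-bit chunks when the operand size is a multiple of 64 bits,
 // otherwise in 32-bit chunks, and AND the per-chunk results into one condition.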
230 unsigned OpSize = OpTy.getSizeInBits();
231 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
232 LLT PartTy = LLT::scalar(PartSize);
233 unsigned NumParts = OpSize / PartSize;
234 SmallVector<Register, 8> OpParts;
235 SmallVector<Register, 8> CurrentLaneParts;
236
237 if (NumParts == 1) {
238 OpParts.push_back(OpReg);
239 CurrentLaneParts.push_back(CurrentLaneReg);
240 } else {
241 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
242 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
243 for (unsigned i = 0; i < NumParts; ++i) {
244 OpParts.push_back(UnmergeOp.getReg(i));
245 CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
246 }
247 }
248
249 for (unsigned i = 0; i < NumParts; ++i) {
250 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
251 B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
252
253 if (!CondReg)
254 CondReg = CmpReg;
255 else
256 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
257 }
258
259 Op.setReg(CurrentLaneReg);
260
261 // Make sure we don't re-process this register again.
262 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
263 }
264 }
265
266 // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
267 Register CondRegLM =
268 MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
269 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
270
271 // Update EXEC, save the original EXEC value to SavedExec.
272 B.buildInstr(AndSaveExecOpc)
273 .addDef(SavedExec)
274 .addReg(CondRegLM, RegState::Kill);
275 MRI.setSimpleHint(SavedExec, CondRegLM);
276
277 B.setInsertPt(*BodyBB, BodyBB->end());
278
279 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
280 B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);
281
282 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
283 // s_cbranch_scc0?
284
285 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
286 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
287
288 // Save the EXEC mask before the loop.
289 B.setInsertPt(MBB, MBB.end());
290 B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);
291
292 // Restore the EXEC mask after the loop.
293 B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
294 B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);
295
296 // Set the insert point after the original instruction, so any new
297 // instructions will be in the remainder.
298 B.setInsertPt(*RemainderBB, RemainderBB->begin());
299
300 return true;
301}
302
303bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
304 ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
305 MachineFunction &MF = B.getMF();
306 assert(MI.getNumMemOperands() == 1);
307 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
308 Register Dst = MI.getOperand(0).getReg();
309 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
310 Register Base = MI.getOperand(1).getReg();
311 LLT PtrTy = MRI.getType(Base);
312 const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
313 LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
314 SmallVector<Register, 4> LoadPartRegs;
315
316 unsigned ByteOffset = 0;
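 // Emit one narrowed load per part. Each part reads from Base advanced by the
 // accumulated byte offset and uses a memory operand offset to match.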
317 for (LLT PartTy : LLTBreakdown) {
318 Register BasePlusOffset;
319 if (ByteOffset == 0) {
320 BasePlusOffset = Base;
321 } else {
322 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
323 BasePlusOffset =
324 B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
325 }
326 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
327 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
328 LoadPartRegs.push_back(LoadPart.getReg(0));
329 ByteOffset += PartTy.getSizeInBytes();
330 }
331
332 if (!MergeTy.isValid()) {
333 // Loads are all of the same size; concat or merge them together.
334 B.buildMergeLikeInstr(Dst, LoadPartRegs);
335 } else {
336 // Loads are not all of the same size; unmerge them into smaller pieces
337 // of MergeTy, then merge the pieces into Dst.
338 SmallVector<Register, 4> MergeTyParts;
339 for (Register Reg : LoadPartRegs) {
340 if (MRI.getType(Reg) == MergeTy) {
341 MergeTyParts.push_back(Reg);
342 } else {
343 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
344 for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
345 MergeTyParts.push_back(Unmerge.getReg(i));
346 }
347 }
348 B.buildMergeLikeInstr(Dst, MergeTyParts);
349 }
350 MI.eraseFromParent();
351 return true;
352}
353
354bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
355 LLT MergeTy) {
356 MachineFunction &MF = B.getMF();
357 assert(MI.getNumMemOperands() == 1);
358 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
359 Register Dst = MI.getOperand(0).getReg();
360 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
361 Register Base = MI.getOperand(1).getReg();
362
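 // Perform a single load of the widened type, then cut the result back down to
 // Dst: trunc for a scalar WideTy, unmerge into MergeTy pieces and re-merge for
 // vectors.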
363 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
364 auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);
365
366 if (WideTy.isScalar()) {
367 B.buildTrunc(Dst, WideLoad);
368 } else {
369 SmallVector<Register, 4> MergeTyParts;
370 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
371
372 LLT DstTy = MRI.getType(Dst);
373 unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
374 for (unsigned i = 0; i < NumElts; ++i) {
375 MergeTyParts.push_back(Unmerge.getReg(i));
376 }
377 B.buildMergeLikeInstr(Dst, MergeTyParts);
378 }
379 MI.eraseFromParent();
380 return true;
381}
382
383bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
384 Register Dst = MI.getDstReg();
385 Register Ptr = MI.getPointerReg();
386 MachineMemOperand &MMO = MI.getMMO();
387 unsigned MemSize = 8 * MMO.getSize().getValue();
388
389 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
390
391 if (MI.getOpcode() == G_LOAD) {
392 B.buildLoad(Dst, Ptr, *WideMMO);
393 } else {
394 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
395
396 if (MI.getOpcode() == G_ZEXTLOAD) {
397 APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
398 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
399 B.buildAnd(Dst, Load, MaskCst);
400 } else {
401 assert(MI.getOpcode() == G_SEXTLOAD);
402 B.buildSExtInReg(Dst, Load, MemSize);
403 }
404 }
405
406 MI.eraseFromParent();
407 return true;
408}
409
410bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
411 Register Dst = MI.getOperand(0).getReg();
412 LLT Ty = MRI.getType(Dst);
413 Register Src = MI.getOperand(1).getReg();
414 unsigned Opc = MI.getOpcode();
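 // A set VCC bit extends to all-ones for G_SEXT and to 1 for G_ZEXT/G_ANYEXT;
 // select between that constant and 0 on the S1 condition.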
415 int TrueExtCst = Opc == G_SEXT ? -1 : 1;
416 if (Ty == S32 || Ty == S16) {
417 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
418 auto False = B.buildConstant({VgprRB, Ty}, 0);
419 B.buildSelect(Dst, Src, True, False);
420 } else if (Ty == S64) {
421 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
422 auto False = B.buildConstant({VgprRB_S32}, 0);
423 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
424 MachineInstrBuilder Hi;
425 switch (Opc) {
426 case G_SEXT:
427 Hi = Lo;
428 break;
429 case G_ZEXT:
430 Hi = False;
431 break;
432 case G_ANYEXT:
433 Hi = B.buildUndef({VgprRB_S32});
434 break;
435 default:
436 reportGISelFailure(
437 MF, MORE, "amdgpu-regbanklegalize",
438 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
439 return false;
440 }
441
442 B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
443 } else {
444 reportGISelFailure(
445 MF, MORE, "amdgpu-regbanklegalize",
446 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
447 return false;
448 }
449
450 MI.eraseFromParent();
451 return true;
452}
453
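// The unpack helpers below view a packed <2 x s16> SGPR value as an s32 and
// split it into two s32 halves, extending each 16-bit half with zero, sign or
// any extension respectively.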
454std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
455 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
456 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
457 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
458 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
459 return {Lo.getReg(0), Hi.getReg(0)};
460}
461
462std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
463 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
464 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
465 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
466 return {Lo.getReg(0), Hi.getReg(0)};
467}
468
469std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
470 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
471 auto Lo = PackedS32;
472 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
473 return {Lo.getReg(0), Hi.getReg(0)};
474}
475
476std::pair<Register, Register>
477RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
478 auto [Lo32, Hi32] = unpackAExt(Reg);
479 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
480 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
481}
482
483bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
484 Register Lo, Hi;
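 // The extension used when unpacking must match the shift: any-extend for shl
 // (the high bits shift out), zero-extend for lshr, sign-extend for ashr.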
485 switch (MI.getOpcode()) {
486 case AMDGPU::G_SHL: {
487 auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
488 auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
489 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
490 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
491 break;
492 }
493 case AMDGPU::G_LSHR: {
494 auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
495 auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
496 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
497 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
498 break;
499 }
500 case AMDGPU::G_ASHR: {
501 auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
502 auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
503 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
504 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
505 break;
506 }
507 default:
508 reportGISelFailure(
509 MF, MORE, "amdgpu-regbanklegalize",
510 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
511 MI);
512 return false;
513 }
514 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
515 MI.eraseFromParent();
516 return true;
517}
518
519bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
520 Register Lo, Hi;
521 switch (MI.getOpcode()) {
522 case AMDGPU::G_SMIN:
523 case AMDGPU::G_SMAX: {
524 // For signed operations, use sign extension
525 auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
526 auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
527 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
528 .getReg(0);
529 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
530 .getReg(0);
531 break;
532 }
533 case AMDGPU::G_UMIN:
534 case AMDGPU::G_UMAX: {
535 // For unsigned operations, use zero extension
536 auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
537 auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
538 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
539 .getReg(0);
540 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
541 .getReg(0);
542 break;
543 }
544 default:
545 reportGISelFailure(
546 MF, MORE, "amdgpu-regbanklegalize",
547 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
548 return false;
549 }
550 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
551 MI.eraseFromParent();
552 return true;
553}
554
555bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
556 auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
557 auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
558 auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
559 auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
560 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
561 {ResLo.getReg(0), ResHi.getReg(0)});
562 MI.eraseFromParent();
563 return true;
564}
565
566static bool isSignedBFE(MachineInstr &MI) {
567 if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))
568 return (GI->is(Intrinsic::amdgcn_sbfe));
569
570 return MI.getOpcode() == AMDGPU::G_SBFX;
571}
572
573bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
574 Register Dst = MI.getOperand(0).getReg();
575 assert(MRI.getType(Dst) == LLT::scalar(64));
576 bool Signed = isSignedBFE(MI);
577 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
578 // Extract bitfield from Src, LSBit is the least-significant bit for the
579 // extraction (field offset) and Width is size of bitfield.
580 Register Src = MI.getOperand(FirstOpnd).getReg();
581 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
582 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
583 // Comments are for signed bitfield extract, similar for unsigned. x is sign
584 // bit. s is sign, l is LSB and y are remaining bits of bitfield to extract.
585
586 // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
587 unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
588 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
589
590 auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);
591
592 // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
593 // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
594 // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
595 if (!ConstWidth) {
596 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
597 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
598 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
599 MI.eraseFromParent();
600 return true;
601 }
602
603 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
604 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
605 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
606 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
607 auto Zero = B.buildConstant({VgprRB, S32}, 0);
608 unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
609
610 if (WidthImm <= 32) {
611 // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
612 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
613 MachineInstrBuilder Hi;
614 if (Signed) {
615 // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
616 Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
617 } else {
618 // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
619 Hi = Zero;
620 }
621 B.buildMergeLikeInstr(Dst, {Lo, Hi});
622 } else {
623 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
624 // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
625 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
626 B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
627 }
628
629 MI.eraseFromParent();
630 return true;
631}
632
633bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
634 Register DstReg = MI.getOperand(0).getReg();
635 LLT Ty = MRI.getType(DstReg);
636 bool Signed = isSignedBFE(MI);
637 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
638 Register Src = MI.getOperand(FirstOpnd).getReg();
639 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
640 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
641 // For uniform bitfield extract there are 4 available instructions, but
642 // LSBit (field offset) and Width (bitfield size) need to be packed into S32:
643 // field offset in the low 16 bits and size in the high 16 bits.
644
645 // Src1 Hi16|Lo16 = Size|FieldOffset
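 // The field offset only occupies bits [5:0] of the packed operand, so mask it
 // to 6 bits before OR-ing in the size.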
646 auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
647 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
648 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
649 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
650 unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
651 unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
652 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
653
654 // Select the machine instruction directly; because of register class
655 // constraining, insert copies between register classes and register banks.
656 auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
657 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
658 constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
659 *ST.getRegisterInfo(), RBI);
660
661 B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
662 MI.eraseFromParent();
663 return true;
664}
665
666bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
667 Register Dst = MI.getOperand(0).getReg();
668 LLT DstTy = MRI.getType(Dst);
669 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
670 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
671 auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
672 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
673 unsigned Opc = MI.getOpcode();
674 auto Flags = MI.getFlags();
675 auto Lo =
676 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
677 auto Hi =
678 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
679 B.buildMergeLikeInstr(Dst, {Lo, Hi});
680 MI.eraseFromParent();
681 return true;
682}
683
684bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
685 Register Dst = MI.getOperand(0).getReg();
686 assert(MRI.getType(Dst) == S64);
687 auto Op1 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(1).getReg());
688 auto Op2 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(2).getReg());
689
690 // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to
691 // match GlobalISel with old regbankselect.
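 // Schoolbook 64 x 64 -> 64 multiply from 32-bit halves:
 // Lo = lo1 * lo2
 // Hi = umulh(lo1, lo2) + lo1 * hi2 + hi1 * lo2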
692 auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
693 auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
694 auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
695 auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
696 auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
697 auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
698
699 B.buildMergeLikeInstr(Dst, {Lo, Hi});
700 MI.eraseFromParent();
701 return true;
702}
703
704bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
705 Register Dst = MI.getOperand(0).getReg();
706 assert(MRI.getType(Dst) == V2S16);
707 unsigned Opc = MI.getOpcode();
708 unsigned NumOps = MI.getNumOperands();
709 auto Flags = MI.getFlags();
710
711 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
712
713 if (NumOps == 2) {
714 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
715 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
716 B.buildMergeLikeInstr(Dst, {Lo, Hi});
717 MI.eraseFromParent();
718 return true;
719 }
720
721 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());
722
723 if (NumOps == 3) {
724 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
725 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
726 B.buildMergeLikeInstr(Dst, {Lo, Hi});
727 MI.eraseFromParent();
728 return true;
729 }
730
731 assert(NumOps == 4);
732 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg());
733 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags);
734 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags);
735 B.buildMergeLikeInstr(Dst, {Lo, Hi});
736 MI.eraseFromParent();
737 return true;
738}
739
740bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
741 Register Dst0 = MI.getOperand(0).getReg();
742 Register Dst1 = MI.getOperand(1).getReg();
743 Register Src0 = MI.getOperand(2).getReg();
744 Register Src1 = MI.getOperand(3).getReg();
745 Register Src2 = MI.getOperand(4).getReg();
746
747 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
748
749 // Keep the multiplication on the SALU.
750 Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
751 Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
752 if (ST.hasScalarMulHiInsts()) {
753 B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
754 } else {
755 auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
756 auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
757 auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
758 buildReadAnyLane(B, DstHi, MulHi.getReg(0), RBI);
759 }
760
761 // Accumulate and produce the "carry-out" bit.
762
763 // The "carry-out" is defined as bit 64 of the result when computed as a
764 // big integer. For unsigned multiply-add, this matches the usual
765 // definition of carry-out.
766 if (mi_match(Src2, MRI, MIPatternMatch::m_ZeroInt())) {
767 // No accumulate: result is just the multiplication, carry is 0.
768 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
769 B.buildConstant(Dst1, 0);
770 } else {
771 // Accumulate: add Src2 to the multiplication result with carry chain.
772 Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
773 Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
774 B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
775
776 auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
777 auto AddHi =
778 B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
779 B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
780 B.buildCopy(Dst1, AddHi.getReg(1));
781 }
782
783 MI.eraseFromParent();
784 return true;
785}
786
787bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
788 Register Dst = MI.getOperand(0).getReg();
789 LLT DstTy = MRI.getType(Dst);
790 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
791 (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
792 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
793 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
794 auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
795 Register Cond = MI.getOperand(1).getReg();
796 auto Flags = MI.getFlags();
797 auto Lo =
798 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
799 auto Hi =
800 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);
801
802 B.buildMergeLikeInstr(Dst, {Lo, Hi});
803 MI.eraseFromParent();
804 return true;
805}
806
807bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
808 auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
809 int Amt = MI.getOperand(2).getImm();
810 Register Lo, Hi;
811 // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
812 if (Amt <= 32) {
813 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
814 if (Amt == 32) {
815 // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
816 Lo = Freeze.getReg(0);
817 } else {
818 // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
819 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
820 }
821
822 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
823 Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
824 } else {
825 // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
826 Lo = Op1.getReg(0);
827 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
828 }
829
830 B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
831 MI.eraseFromParent();
832 return true;
833}
834
835bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &MI) {
836 // Split 64-bit find-first-bit operations into 32-bit halves:
837 // (ffbh hi:lo) -> umin(ffbh(hi), uaddsat(ffbh(lo), 32))
838 // (ffbl hi:lo) -> umin(ffbl(lo), uaddsat(ffbl(hi), 32))
839 // (ctlz_zero_undef hi:lo) -> umin(ffbh(hi), add(ffbh(lo), 32))
840 // (cttz_zero_undef hi:lo) -> umin(ffbl(lo), add(ffbl(hi), 32))
841 unsigned Opc = MI.getOpcode();
842
843 // FFBH/FFBL return 0xFFFFFFFF on zero input, using uaddsat to avoid
844 // wrapping. CTLZ/CTTZ guarantee non-zero input (zero_undef), so plain add
845 // is fine.
846 unsigned FFBOpc;
847 unsigned AddOpc;
848 bool SearchFromMSB;
849 switch (Opc) {
850 case AMDGPU::G_AMDGPU_FFBH_U32:
851 FFBOpc = Opc;
852 AddOpc = AMDGPU::G_UADDSAT;
853 SearchFromMSB = true;
854 break;
855 case AMDGPU::G_AMDGPU_FFBL_B32:
856 FFBOpc = Opc;
857 AddOpc = AMDGPU::G_UADDSAT;
858 SearchFromMSB = false;
859 break;
860 case AMDGPU::G_CTLZ_ZERO_UNDEF:
861 FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
862 AddOpc = AMDGPU::G_ADD;
863 SearchFromMSB = true;
864 break;
865 case AMDGPU::G_CTTZ_ZERO_UNDEF:
866 FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
867 AddOpc = AMDGPU::G_ADD;
868 SearchFromMSB = false;
869 break;
870 default:
871 llvm_unreachable("unexpected opcode in lowerSplitBitCount64To32");
872 }
873
874 auto Unmerge = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
875 Register Lo = Unmerge.getReg(0);
876 Register Hi = Unmerge.getReg(1);
877
878 // MSB-first (FFBH/CTLZ) searches hi first; LSB-first (FFBL/CTTZ) searches
879 // lo first. The secondary half adds 32 to account for the primary half's
880 // width.
881 auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Hi : Lo});
882 auto Secondary =
883 B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Lo : Hi});
884
885 auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
886 {Secondary, B.buildConstant(VgprRB_S32, 32)});
887 B.buildUMin(MI.getOperand(0).getReg(), Primary, Adjusted);
888
889 MI.eraseFromParent();
890 return true;
891}
892
893bool RegBankLegalizeHelper::lowerExtrVecEltToSel(MachineInstr &MI) {
894 // Lower extract vector element to a compare-select chain:
895 // result = elt[0]
896 // for i in 1..N-1:
897 // result = (idx == i) ? elt[i] : result
898 //
899 // When the index is divergent, each lane may want a different element, so
900 // we must check every element per lane.
901 Register Dst = MI.getOperand(0).getReg();
902 Register Src = MI.getOperand(1).getReg();
903 Register Idx = MI.getOperand(2).getReg();
904
905 LLT VecTy = MRI.getType(Src);
906 LLT ScalarTy = VecTy.getScalarType();
907 unsigned NumElts = VecTy.getNumElements();
908 MachineRegisterInfo::VRegAttrs VgprRB_EltTy = {VgprRB, ScalarTy};
909
910 auto Unmerge = B.buildUnmerge(VgprRB_EltTy, Src);
911
912 if (ScalarTy.getSizeInBits() == 32) {
913 Register PrevSelect = Unmerge.getReg(0);
914 for (unsigned I = 1; I < NumElts; ++I) {
915 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)}, I);
916 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
917 PrevSelect =
918 B.buildSelect(VgprRB_EltTy, Cmp, Unmerge.getReg(I), PrevSelect)
919 .getReg(0);
920 }
921 B.buildCopy(Dst, PrevSelect);
922 } else if (ScalarTy.getSizeInBits() == 64) {
923 auto InitUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(0));
924 Register PrevLo = InitUnmerge.getReg(0);
925 Register PrevHi = InitUnmerge.getReg(1);
926 for (unsigned I = 1; I < NumElts; ++I) {
927 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)}, I);
928 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
929 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(I));
930 PrevLo = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(0), PrevLo)
931 .getReg(0);
932 PrevHi = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(1), PrevHi)
933 .getReg(0);
934 }
935 B.buildMergeLikeInstr(Dst, {PrevLo, PrevHi});
936 } else {
937 reportGISelFailure(
938 MF, MORE, "amdgpu-regbanklegalize",
939 "AMDGPU RegBankLegalize: ExtrVecEltToSel unsupported element type", MI);
940 return false;
941 }
942
943 MI.eraseFromParent();
944 return true;
945}
946
947bool RegBankLegalizeHelper::lowerExtrVecEltTo32(MachineInstr &MI) {
948 // Reduce a 64-bit element extract to two 32-bit extracts:
949 // vec32 = bitcast <N x s64> to <2N x s32>
950 // lo = vec32[idx * 2]
951 // hi = vec32[idx * 2 + 1]
952 // result = merge(lo, hi)
953 //
954 // When the index is uniform, all lanes extract the same element, so we can
955 // just split the s64 extract into two s32 extracts which lower to MOVREL.
956 Register Dst = MI.getOperand(0).getReg();
957 Register Src = MI.getOperand(1).getReg();
958 Register Idx = MI.getOperand(2).getReg();
959
960 LLT SrcTy = MRI.getType(Src);
961 LLT Vec32Ty = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
962
963 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
964 "expected VGPR src and SGPR idx");
965
966 auto CastSrc = B.buildBitcast({VgprRB, Vec32Ty}, Src);
967
968 // Calculate new Lo and Hi indices
969 auto One = B.buildConstant(SgprRB_S32, 1);
970 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
971 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
972
973 auto ExtLo = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxLo);
974 auto ExtHi = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxHi);
975
976 B.buildMergeLikeInstr(Dst, {ExtLo.getReg(0), ExtHi.getReg(0)});
977
978 MI.eraseFromParent();
979 return true;
980}
981
982bool RegBankLegalizeHelper::lower(MachineInstr &MI,
983 const RegBankLLTMapping &Mapping,
984 WaterfallInfo &WFI) {
985
986 switch (Mapping.LoweringMethod) {
987 case DoNotLower:
988 break;
989 case VccExtToSel:
990 return lowerVccExtToSel(MI);
991 case UniExtToSel: {
992 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
993 auto True = B.buildConstant({SgprRB, Ty},
994 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
995 auto False = B.buildConstant({SgprRB, Ty}, 0);
996 // The input to G_{Z|S}EXT is a 'Legalizer legal' S1; the most common case
997 // is a compare. Lower to a select. The S1 condition was already any-extended
998 // to S32 and AND-ed with 1 to clear the high bits by Sgpr32AExtBoolInReg.
999 B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
1000 False);
1001 MI.eraseFromParent();
1002 return true;
1003 }
1004 case UnpackBitShift:
1005 return lowerUnpackBitShift(MI);
1006 case UnpackMinMax:
1007 return lowerUnpackMinMax(MI);
1008 case ScalarizeToS16:
1009 return lowerSplitTo16(MI);
1010 case Ext32To64: {
1011 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
1012 MachineInstrBuilder Hi;
1013 switch (MI.getOpcode()) {
1014 case AMDGPU::G_ZEXT: {
1015 Hi = B.buildConstant({RB, S32}, 0);
1016 break;
1017 }
1018 case AMDGPU::G_SEXT: {
1019 // Replicate sign bit from 32-bit extended part.
1020 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
1021 Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
1022 break;
1023 }
1024 case AMDGPU::G_ANYEXT: {
1025 Hi = B.buildUndef({RB, S32});
1026 break;
1027 }
1028 default:
1029 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1030 "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
1031 MI);
1032 return false;
1033 }
1034
1035 B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
1036 {MI.getOperand(1).getReg(), Hi});
1037 MI.eraseFromParent();
1038 return true;
1039 }
1040 case UniCstExt: {
1041 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
1042 B.buildConstant(MI.getOperand(0).getReg(), ConstVal);
1043
1044 MI.eraseFromParent();
1045 return true;
1046 }
1047 case VgprToVccCopy: {
1048 Register Src = MI.getOperand(1).getReg();
1049 LLT Ty = MRI.getType(Src);
1050 // Take the lowest bit from each lane and put it in a lane mask.
1051 // Lower via a compare, but clear the high bits first since the compare
1052 // checks all bits in the register.
1053 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
1054 if (Ty == S64) {
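 // Only bit 0 is meaningful: keep the low bit of the low half and force the
 // high half to zero before the compare.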
1055 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
1056 auto One = B.buildConstant(VgprRB_S32, 1);
1057 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
1058 auto Zero = B.buildConstant(VgprRB_S32, 0);
1059 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
1060 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
1061 } else {
1062 assert(Ty == S32 || Ty == S16);
1063 auto One = B.buildConstant({VgprRB, Ty}, 1);
1064 B.buildAnd(BoolSrc, Src, One);
1065 }
1066 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
1067 B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
1068 MI.eraseFromParent();
1069 return true;
1070 }
1071 case V_BFE:
1072 return lowerV_BFE(MI);
1073 case S_BFE:
1074 return lowerS_BFE(MI);
1075 case UniMAD64:
1076 return lowerUniMAD64(MI);
1077 case UniMul64: {
1078 B.buildMul(MI.getOperand(0), MI.getOperand(1), MI.getOperand(2));
1079 MI.eraseFromParent();
1080 return true;
1081 }
1082 case DivSMulToMAD: {
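 // Divergent S_MUL_U64_U32/S_MUL_I64_I32: rewrite as the corresponding MAD
 // with a zero addend so it can select to V_MAD_U64_U32/V_MAD_I64_I32.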
1083 auto Op1 = B.buildTrunc(VgprRB_S32, MI.getOperand(1));
1084 auto Op2 = B.buildTrunc(VgprRB_S32, MI.getOperand(2));
1085 auto Zero = B.buildConstant({VgprRB, S64}, 0);
1086
1087 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
1088 ? AMDGPU::G_AMDGPU_MAD_U64_U32
1089 : AMDGPU::G_AMDGPU_MAD_I64_I32;
1090
1091 B.buildInstr(NewOpc, {MI.getOperand(0).getReg(), {SgprRB, S32}},
1092 {Op1, Op2, Zero});
1093 MI.eraseFromParent();
1094 return true;
1095 }
1096 case SplitTo32:
1097 return lowerSplitTo32(MI);
1098 case SplitTo32Mul:
1099 return lowerSplitTo32Mul(MI);
1100 case SplitTo32Select:
1101 return lowerSplitTo32Select(MI);
1102 case SplitTo32SExtInReg:
1103 return lowerSplitTo32SExtInReg(MI);
1104 case SplitLoad: {
1105 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
1106 unsigned Size = DstTy.getSizeInBits();
1107 // Evenly split into 128-bit loads.
1108 if (Size > 128) {
1109 LLT B128;
1110 if (DstTy.isVector()) {
1111 LLT EltTy = DstTy.getElementType();
1112 B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1113 } else {
1114 B128 = LLT::scalar(128);
1115 }
1116 if (Size / 128 == 2)
1117 splitLoad(MI, {B128, B128});
1118 else if (Size / 128 == 4)
1119 splitLoad(MI, {B128, B128, B128, B128});
1120 else {
1121 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1122 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1123 MI);
1124 return false;
1125 }
1126 }
1127 // Split into 64-bit and 32-bit loads.
1128 else if (DstTy == S96)
1129 splitLoad(MI, {S64, S32}, S32);
1130 else if (DstTy == V3S32)
1131 splitLoad(MI, {V2S32, S32}, S32);
1132 else if (DstTy == V6S16)
1133 splitLoad(MI, {V4S16, V2S16}, V2S16);
1134 else {
1135 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1136 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1137 MI);
1138 return false;
1139 }
1140 return true;
1141 }
1142 case WidenLoad: {
1143 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
1144 if (DstTy == S96)
1145 widenLoad(MI, S128);
1146 else if (DstTy == V3S32)
1147 widenLoad(MI, V4S32, S32);
1148 else if (DstTy == V6S16)
1149 widenLoad(MI, V8S16, V2S16);
1150 else {
1151 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1152 "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
1153 MI);
1154 return false;
1155 }
1156 return true;
1157 }
1158 case UnpackAExt:
1159 return lowerUnpackAExt(MI);
1160 case WidenMMOToS32:
1161 return widenMMOToS32(cast<GAnyLoad>(MI));
1162 case VerifyAllSgpr: {
1163 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1164 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1165 }));
1166 return true;
1167 }
1168 case ApplyAllVgpr: {
1169 assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) {
1170 return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
1171 }));
1172 B.setInstrAndDebugLoc(MI);
1173 for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) {
1174 MachineOperand &Op = MI.getOperand(i);
1175 if (!Op.isReg())
1176 continue;
1177 Register Reg = Op.getReg();
1178 if (MRI.getRegBank(Reg) != VgprRB) {
1179 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1180 Op.setReg(Copy.getReg(0));
1181 }
1182 }
1183 return true;
1184 }
1185 case UnmergeToShiftTrunc: {
1186 GUnmerge *Unmerge = dyn_cast<GUnmerge>(&MI);
1187 LLT Ty = MRI.getType(Unmerge->getSourceReg());
1188 if (Ty.getSizeInBits() % 32 != 0) {
1189 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1190 "AMDGPU RegBankLegalize: unmerge not multiple of 32",
1191 MI);
1192 return false;
1193 }
1194
1195 B.setInstrAndDebugLoc(MI);
1196 if (Ty.getSizeInBits() > 32) {
1197 auto UnmergeV2S16 =
1198 B.buildUnmerge({SgprRB, V2S16}, Unmerge->getSourceReg());
1199 for (unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
1200 auto [Dst0S32, Dst1S32] =
1201 unpackAExt(UnmergeV2S16->getOperand(i).getReg());
1202 B.buildTrunc(MI.getOperand(i * 2).getReg(), Dst0S32);
1203 B.buildTrunc(MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
1204 }
1205 } else {
1206 auto [Dst0S32, Dst1S32] = unpackAExt(MI.getOperand(2).getReg());
1207 B.buildTrunc(MI.getOperand(0).getReg(), Dst0S32);
1208 B.buildTrunc(MI.getOperand(1).getReg(), Dst1S32);
1209 }
1210
1211 MI.eraseFromParent();
1212 return true;
1213 }
1215 Register Dst = MI.getOperand(0).getReg();
1216 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1217 B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());
1218 MI.getOperand(0).setReg(NewDst);
1219 B.buildTrunc(Dst, NewDst);
1220
1221 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1222 Register UseReg = MI.getOperand(i).getReg();
1223
1224 auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
1225 MachineBasicBlock *DefMBB = DefMI->getParent();
1226
1227 B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
1228
1229 auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
1230 MI.getOperand(i).setReg(NewUse.getReg(0));
1231 }
1232 break;
1233 }
1234 case VerifyAllSgprGPHI: {
1235 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1236 if (Op.isMBB())
1237 return true;
1238 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1239 }));
1240 return true;
1241 }
1243 assert(MRI.getRegBankOrNull(MI.getOperand(0).getReg()) == VgprRB);
1244 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1245 if (Op.isMBB())
1246 return true;
1247 const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
1248 return RB == VgprRB || RB == SgprRB;
1249 }));
1250 return true;
1251 }
1252 case ApplyINTRIN_IMAGE:
1253 return applyRegisterBanksINTRIN_IMAGE(MI);
1255 return lowerSplitBitCount64To32(MI);
1256 case ExtrVecEltToSel:
1257 return lowerExtrVecEltToSel(MI);
1258 case ExtrVecEltTo32:
1259 return lowerExtrVecEltTo32(MI);
1260 }
1261
1262 if (!WFI.SgprWaterfallOperandRegs.empty()) {
1263 if (!executeInWaterfallLoop(B, WFI))
1264 return false;
1265 }
1266 return true;
1267}
1268
1269LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
1270 switch (ID) {
1271 case Vcc:
1272 case UniInVcc:
1273 return LLT::scalar(1);
1274 case Sgpr16:
1275 case Vgpr16:
1276 case UniInVgprS16:
1277 return LLT::scalar(16);
1278 case Sgpr32:
1279 case Sgpr32_WF:
1280 case Sgpr32Trunc:
1281 case Sgpr32AExt:
1282 case Sgpr32AExtBoolInReg:
1283 case Sgpr32SExt:
1284 case Sgpr32ZExt:
1285 case UniInVgprS32:
1286 case Sgpr32ToVgprDst:
1287 case Vgpr32:
1288 case Vgpr32AExt:
1289 case Vgpr32SExt:
1290 case Vgpr32ZExt:
1291 return LLT::scalar(32);
1292 case Sgpr64:
1293 case Vgpr64:
1294 case UniInVgprS64:
1295 case Sgpr64ToVgprDst:
1296 return LLT::scalar(64);
1297 case Sgpr128:
1298 case Vgpr128:
1299 return LLT::scalar(128);
1300 case SgprP0:
1301 case SgprP0Call_WF:
1302 case VgprP0:
1303 return LLT::pointer(0, 64);
1304 case SgprP1:
1305 case VgprP1:
1306 return LLT::pointer(1, 64);
1307 case SgprP2:
1308 case VgprP2:
1309 return LLT::pointer(2, 32);
1310 case SgprP3:
1311 case VgprP3:
1312 return LLT::pointer(3, 32);
1313 case SgprP4:
1314 case SgprP4Call_WF:
1315 case VgprP4:
1316 return LLT::pointer(4, 64);
1317 case SgprP5:
1318 case VgprP5:
1319 return LLT::pointer(5, 32);
1320 case SgprP8:
1321 return LLT::pointer(8, 128);
1322 case SgprV2S16:
1323 case VgprV2S16:
1324 case UniInVgprV2S16:
1325 return LLT::fixed_vector(2, 16);
1326 case SgprV2S32:
1327 case VgprV2S32:
1328 case UniInVgprV2S32:
1329 return LLT::fixed_vector(2, 32);
1330 case VgprV3S32:
1331 return LLT::fixed_vector(3, 32);
1332 case VgprV4S16:
1333 return LLT::fixed_vector(4, 16);
1334 case SgprV4S32:
1335 case SgprV4S32_WF:
1336 case VgprV4S32:
1337 case UniInVgprV4S32:
1338 return LLT::fixed_vector(4, 32);
1339 case VgprV8S32:
1340 return LLT::fixed_vector(8, 32);
1341 case VgprV2S64:
1342 case UniInVgprV2S64:
1343 return LLT::fixed_vector(2, 64);
1344 default:
1345 return LLT();
1346 }
1347}
1348
1349LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
1350 switch (ID) {
1351 case SgprB32:
1352 case VgprB32:
1353 case SgprB32_M0:
1355 case UniInVgprB32:
1356 if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
1357 isAnyPtr(Ty, 32))
1358 return Ty;
1359 return LLT();
1360 case SgprPtr32:
1361 case VgprPtr32:
1362 return isAnyPtr(Ty, 32) ? Ty : LLT();
1363 case SgprPtr64:
1364 case VgprPtr64:
1365 return isAnyPtr(Ty, 64) ? Ty : LLT();
1366 case SgprPtr128:
1367 case VgprPtr128:
1368 return isAnyPtr(Ty, 128) ? Ty : LLT();
1369 case SgprB64:
1370 case VgprB64:
1372 case UniInVgprB64:
1373 if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
1374 Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
1375 return Ty;
1376 return LLT();
1377 case SgprB96:
1378 case VgprB96:
1379 case UniInVgprB96:
1380 if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
1381 Ty == LLT::fixed_vector(6, 16))
1382 return Ty;
1383 return LLT();
1384 case SgprB128:
1385 case VgprB128:
1386 case UniInVgprB128:
1387 if (Ty == LLT::scalar(128) || Ty == LLT::fixed_vector(4, 32) ||
1388 Ty == LLT::fixed_vector(2, 64) || Ty == LLT::fixed_vector(8, 16) ||
1389 isAnyPtr(Ty, 128))
1390 return Ty;
1391 return LLT();
1392 case VgprB160:
1393 case UniInVgprB160:
1394 if (Ty.getSizeInBits() == 160)
1395 return Ty;
1396 return LLT();
1397 case SgprB256:
1398 case VgprB256:
1399 case UniInVgprB256:
1400 if (Ty == LLT::scalar(256) || Ty == LLT::fixed_vector(8, 32) ||
1401 Ty == LLT::fixed_vector(4, 64) || Ty == LLT::fixed_vector(16, 16))
1402 return Ty;
1403 return LLT();
1404 case SgprB512:
1405 case VgprB512:
1406 case UniInVgprB512:
1407 if (Ty == LLT::scalar(512) || Ty == LLT::fixed_vector(16, 32) ||
1408 Ty == LLT::fixed_vector(8, 64))
1409 return Ty;
1410 return LLT();
1411 case SgprBRC: {
1412 const SIRegisterInfo *TRI =
1413 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1414 unsigned LLTSize = Ty.getSizeInBits();
1415 if (LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize))
1416 return Ty;
1417 return LLT();
1418 }
1419 case VgprBRC: {
1420 const SIRegisterInfo *TRI =
1421 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1422 if (TRI->getSGPRClassForBitWidth(Ty.getSizeInBits()))
1423 return Ty;
1424 return LLT();
1425 }
1426 default:
1427 return LLT();
1428 }
1429}
1430
1431const RegisterBank *
1432RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
1433 switch (ID) {
1434 case Vcc:
1435 return VccRB;
1436 case Sgpr16:
1437 case Sgpr32:
1438 case Sgpr32_WF:
1439 case Sgpr64:
1440 case Sgpr128:
1441 case SgprP0:
1442 case SgprP0Call_WF:
1443 case SgprP1:
1444 case SgprP2:
1445 case SgprP3:
1446 case SgprP4:
1447 case SgprP4Call_WF:
1448 case SgprP5:
1449 case SgprP8:
1450 case SgprPtr32:
1451 case SgprPtr64:
1452 case SgprPtr128:
1453 case SgprV2S16:
1454 case SgprV2S32:
1455 case SgprV4S32:
1456 case SgprV4S32_WF:
1457 case SgprB32:
1458 case SgprB64:
1459 case SgprB96:
1460 case SgprB128:
1461 case SgprB256:
1462 case SgprB512:
1463 case SgprBRC:
1464 case UniInVcc:
1465 case UniInVgprS16:
1466 case UniInVgprS32:
1467 case UniInVgprS64:
1468 case UniInVgprV2S16:
1469 case UniInVgprV2S32:
1470 case UniInVgprV4S32:
1471 case UniInVgprV2S64:
1472 case UniInVgprB32:
1473 case UniInVgprB64:
1474 case UniInVgprB96:
1475 case UniInVgprB128:
1476 case UniInVgprB160:
1477 case UniInVgprB256:
1478 case UniInVgprB512:
1479 case Sgpr32Trunc:
1480 case Sgpr32AExt:
1481 case Sgpr32AExtBoolInReg:
1482 case Sgpr32SExt:
1483 case Sgpr32ZExt:
1484 return SgprRB;
1485 case Vgpr16:
1486 case Vgpr32:
1487 case Vgpr64:
1488 case Vgpr128:
1489 case VgprP0:
1490 case VgprP1:
1491 case VgprP2:
1492 case VgprP3:
1493 case VgprP4:
1494 case VgprP5:
1495 case VgprPtr32:
1496 case VgprPtr64:
1497 case VgprPtr128:
1498 case VgprV2S16:
1499 case VgprV2S32:
1500 case VgprV2S64:
1501 case VgprV3S32:
1502 case VgprV4S16:
1503 case VgprV4S32:
1504 case VgprV8S32:
1505 case VgprB32:
1506 case VgprB64:
1507 case VgprB96:
1508 case VgprB128:
1509 case VgprB160:
1510 case VgprB256:
1511 case VgprB512:
1512 case VgprBRC:
1513 case Vgpr32AExt:
1514 case Vgpr32SExt:
1515 case Vgpr32ZExt:
1516 case Sgpr32ToVgprDst:
1517 case Sgpr64ToVgprDst:
1518 return VgprRB;
1519 default:
1520 return nullptr;
1521 }
1522}
1523
1524bool RegBankLegalizeHelper::applyMappingDst(
1525 MachineInstr &MI, unsigned &OpIdx,
1526 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
1527 // Defs start from operand 0
1528 for (; OpIdx < MethodIDs.size(); ++OpIdx) {
1529 if (MethodIDs[OpIdx] == None)
1530 continue;
1531 MachineOperand &Op = MI.getOperand(OpIdx);
1532 Register Reg = Op.getReg();
1533 LLT Ty = MRI.getType(Reg);
1534 [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);
1535
1536 switch (MethodIDs[OpIdx]) {
1537 // vcc, sgpr and vgpr scalars, pointers and vectors
1538 case Vcc:
1539 case Sgpr16:
1540 case Sgpr32:
1541 case Sgpr64:
1542 case Sgpr128:
1543 case SgprP0:
1544 case SgprP1:
1545 case SgprP3:
1546 case SgprP4:
1547 case SgprP5:
1548 case SgprP8:
1549 case SgprV2S16:
1550 case SgprV2S32:
1551 case SgprV4S32:
1552 case Vgpr16:
1553 case Vgpr32:
1554 case Vgpr64:
1555 case Vgpr128:
1556 case VgprP0:
1557 case VgprP1:
1558 case VgprP2:
1559 case VgprP3:
1560 case VgprP4:
1561 case VgprP5:
1562 case VgprV2S16:
1563 case VgprV2S32:
1564 case VgprV2S64:
1565 case VgprV3S32:
1566 case VgprV4S16:
1567 case VgprV4S32:
1568 case VgprV8S32: {
1569 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1570 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1571 break;
1572 }
1573 // sgpr and vgpr B-types
1574 case SgprB32:
1575 case SgprB64:
1576 case SgprB96:
1577 case SgprB128:
1578 case SgprB256:
1579 case SgprB512:
1580 case SgprBRC:
1581 case SgprPtr32:
1582 case SgprPtr64:
1583 case SgprPtr128:
1584 case VgprB32:
1585 case VgprB64:
1586 case VgprB96:
1587 case VgprB128:
1588 case VgprB160:
1589 case VgprB256:
1590 case VgprB512:
1591 case VgprBRC:
1592 case VgprPtr32:
1593 case VgprPtr64:
1594 case VgprPtr128: {
1595 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1596 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
1597 break;
1598 }
1599 // uniform in vcc/vgpr: scalars, vectors and B-types
1600 case UniInVcc: {
1601 assert(Ty == S1);
1602 assert(RB == SgprRB);
1603 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
1604 Op.setReg(NewDst);
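 // MI now defines a VCC lane mask. If the original uniform S1 result has uses,
 // reconstruct it by copying SCC out of VCC and truncating to S1.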
1605 if (!MRI.use_empty(Reg)) {
1606 auto CopyS32_Vcc =
1607 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
1608 B.buildTrunc(Reg, CopyS32_Vcc);
1609 }
1610 break;
1611 }
1612 case UniInVgprS16: {
1613 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1614 assert(RB == SgprRB);
1615 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
1616 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
1617 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
1618 Op.setReg(NewVgprDstS16);
1619 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
1620 buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
1621 B.buildTrunc(Reg, NewSgprDstS32);
1622 break;
1623 }
1624 case UniInVgprS32:
1625 case UniInVgprS64:
1626 case UniInVgprV2S16:
1627 case UniInVgprV2S32:
1628 case UniInVgprV4S32:
1629 case UniInVgprV2S64: {
1630 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1631 assert(RB == SgprRB);
1632 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1633 Op.setReg(NewVgprDst);
1634 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1635 break;
1636 }
1637 case UniInVgprB32:
1638 case UniInVgprB64:
1639 case UniInVgprB96:
1640 case UniInVgprB128:
1641 case UniInVgprB160:
1642 case UniInVgprB256:
1643 case UniInVgprB512: {
1644 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
1645 assert(RB == SgprRB);
1646 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1647 Op.setReg(NewVgprDst);
1648 AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1649 break;
1650 }
1651 // sgpr trunc
1652 case Sgpr32Trunc: {
1653 assert(Ty.getSizeInBits() < 32);
1654 assert(RB == SgprRB);
1655 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1656 Op.setReg(NewDst);
1657 if (!MRI.use_empty(Reg))
1658 B.buildTrunc(Reg, NewDst);
1659 break;
1660 }
1661 case Sgpr32ToVgprDst:
1662 case Sgpr64ToVgprDst: {
1663 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
1664 assert(RB == VgprRB);
1665 Op.setReg(MRI.createVirtualRegister({SgprRB, Ty}));
1666 B.buildCopy(Reg, Op.getReg());
1667 break;
1668 }
1669 case InvalidMapping: {
1670 reportGISelFailure(
1671 MF, MORE, "amdgpu-regbanklegalize",
1672 "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
1673 return false;
1674 }
1675 default:
1676 reportGISelFailure(
1677 MF, MORE, "amdgpu-regbanklegalize",
1678 "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
1679 return false;
1680 }
1681 }
1682
1683 return true;
1684}
1685
1686bool RegBankLegalizeHelper::applyMappingSrc(
1687 MachineInstr &MI, unsigned &OpIdx,
1688 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
1689 WaterfallInfo &WFI) {
1690 for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
1691 if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
1692 continue;
1693
1694 MachineOperand &Op = MI.getOperand(OpIdx);
1695 Register Reg = Op.getReg();
1696 LLT Ty = MRI.getType(Reg);
1697 const RegisterBank *RB = MRI.getRegBank(Reg);
1698
1699 switch (MethodIDs[i]) {
1700 case Vcc: {
1701 assert(Ty == S1);
1702 assert(RB == VccRB || RB == SgprRB);
1703 if (RB == SgprRB) {
1704 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1705 auto CopyVcc_Scc =
1706 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
1707 Op.setReg(CopyVcc_Scc.getReg(0));
1708 }
1709 break;
1710 }
1711 // sgpr scalars, pointers and vectors
1712 case Sgpr16:
1713 case Sgpr32:
1714 case Sgpr64:
1715 case Sgpr128:
1716 case SgprP0:
1717 case SgprP1:
1718 case SgprP3:
1719 case SgprP4:
1720 case SgprP5:
1721 case SgprP8:
1722 case SgprV2S16:
1723 case SgprV2S32:
1724 case SgprV4S32: {
1725 assert(Ty == getTyFromID(MethodIDs[i]));
1726 assert(RB == getRegBankFromID(MethodIDs[i]));
1727 break;
1728 }
1729 // sgpr B-types
1730 case SgprB32:
1731 case SgprB64:
1732 case SgprB96:
1733 case SgprB128:
1734 case SgprB256:
1735 case SgprB512:
1736 case SgprBRC:
1737 case SgprPtr32:
1738 case SgprPtr64:
1739 case SgprPtr128: {
1740 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1741 assert(RB == getRegBankFromID(MethodIDs[i]));
1742 break;
1743 }
1744 // vgpr scalars, pointers and vectors
1745 case Vgpr16:
1746 case Vgpr32:
1747 case Vgpr64:
1748 case Vgpr128:
1749 case VgprP0:
1750 case VgprP1:
1751 case VgprP2:
1752 case VgprP3:
1753 case VgprP4:
1754 case VgprP5:
1755 case VgprV2S16:
1756 case VgprV2S32:
1757 case VgprV2S64:
1758 case VgprV3S32:
1759 case VgprV4S16:
1760 case VgprV4S32:
1761 case VgprV8S32: {
1762 assert(Ty == getTyFromID(MethodIDs[i]));
1763 if (RB != VgprRB) {
1764 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
1765 Op.setReg(CopyToVgpr.getReg(0));
1766 }
1767 break;
1768 }
1769 // vgpr B-types
1770 case VgprB32:
1771 case VgprB64:
1772 case VgprB96:
1773 case VgprB128:
1774 case VgprB160:
1775 case VgprB256:
1776 case VgprB512:
1777 case VgprBRC:
1778 case VgprPtr32:
1779 case VgprPtr64:
1780 case VgprPtr128: {
1781 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1782 if (RB != VgprRB) {
1783 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
1784 Op.setReg(CopyToVgpr.getReg(0));
1785 }
1786 break;
1787 }
1788 // sgpr waterfall, scalars, and vectors
1789 case Sgpr32_WF:
1790 case SgprV4S32_WF: {
1791 assert(Ty == getTyFromID(MethodIDs[i]));
1792 if (RB != SgprRB) {
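// Record the divergent operand for the waterfall loop; the loop range
// initially covers only this instruction.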
1793 WFI.SgprWaterfallOperandRegs.insert(Reg);
1794 if (!WFI.Start.isValid()) {
1795 WFI.Start = MI.getIterator();
1796 WFI.End = std::next(MI.getIterator());
1797 }
1798 }
1799 break;
1800 }
1801 case SgprP0Call_WF:
1802 case SgprP4Call_WF: {
1803 assert(Ty == getTyFromID(MethodIDs[i]));
1804 if (RB != SgprRB) {
1805 WFI.SgprWaterfallOperandRegs.insert(Reg);
1806
1807 // Find the ADJCALLSTACKUP before the call.
1808 MachineBasicBlock::iterator Start = MI.getIterator();
1809 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
1810 --Start;
1811
1812 // Find the ADJCALLSTACKDOWN after the call (include it in range).
1813 MachineBasicBlock::iterator End = MI.getIterator();
1814 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
1815 ++End;
1816 ++End;
1817
1818 B.setInsertPt(*MI.getParent(), Start);
1819 WFI.Start = Start;
1820 WFI.End = End;
1821 }
1822 break;
1823 }
1824 case SgprB32_M0:
1826 case SgprB64_ReadFirstLane: {
1827 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1828 if (RB == SgprRB)
1829 break;
1830 assert(RB == VgprRB);
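// A divergent input that must be uniform here is forced uniform by reading
// the first lane into a fresh SGPR.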
1831 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
1832 buildReadFirstLane(B, NewSGPR, Op.getReg(), RBI);
1833 Op.setReg(NewSGPR);
1834 break;
1835 }
1836 // sgpr and vgpr scalars with extend
1837 case Sgpr32AExt: {
1838 // Note: this ext allows S1, and it is meant to be combined away.
1839 assert(Ty.getSizeInBits() < 32);
1840 assert(RB == SgprRB);
1841 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1842 Op.setReg(Aext.getReg(0));
1843 break;
1844 }
1845 case Sgpr32AExtBoolInReg: {
1846 // Note: this ext allows S1, and it is meant to be combined away.
1847 assert(Ty.getSizeInBits() == 1);
1848 assert(RB == SgprRB);
1849 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
1850 // Zext SgprS1 is not legal, make AND with 1 instead. This instruction is
1851 // most of the time meant to be combined away in AMDGPURegBankCombiner.
1852 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
1853 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
1854 Op.setReg(BoolInReg.getReg(0));
1855 break;
1856 }
1857 case Sgpr32SExt: {
1858 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
1859 assert(RB == SgprRB);
1860 auto Sext = B.buildSExt(SgprRB_S32, Reg);
1861 Op.setReg(Sext.getReg(0));
1862 break;
1863 }
1864 case Sgpr32ZExt: {
1865 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
1866 assert(RB == SgprRB);
1867 auto Zext = B.buildZExt({SgprRB, S32}, Reg);
1868 Op.setReg(Zext.getReg(0));
1869 break;
1870 }
1871 case Vgpr32AExt: {
1872 assert(Ty.getSizeInBits() < 32);
1873 assert(RB == VgprRB);
1874 auto Aext = B.buildAnyExt({VgprRB, S32}, Reg);
1875 Op.setReg(Aext.getReg(0));
1876 break;
1877 }
1878 case Vgpr32SExt: {
1879 // Note this ext allows S1, and it is meant to be combined away.
1880 assert(Ty.getSizeInBits() < 32);
1881 assert(RB == VgprRB);
1882 auto Sext = B.buildSExt({VgprRB, S32}, Reg);
1883 Op.setReg(Sext.getReg(0));
1884 break;
1885 }
1886 case Vgpr32ZExt: {
1887 // Note this ext allows S1, and it is meant to be combined away.
1888 assert(Ty.getSizeInBits() < 32);
1889 assert(RB == VgprRB);
1890 auto Zext = B.buildZExt({VgprRB, S32}, Reg);
1891 Op.setReg(Zext.getReg(0));
1892 break;
1893 }
1894 default:
1895 reportGISelFailure(
1896 MF, MORE, "amdgpu-regbanklegalize",
1897 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
1898 return false;
1899 }
1900 }
1901 return true;
1902}
1903
1904[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
1905 const RegisterBank *RB,
1906 MachineRegisterInfo &MRI,
1907 unsigned StartOpIdx,
1908 unsigned EndOpIdx) {
1909 for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
1910 if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
1911 return false;
1912 }
1913 return true;
1914}
1915
1916void RegBankLegalizeHelper::applyMappingTrivial(MachineInstr &MI) {
1917 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
1918 // Put RB on all registers
1919 unsigned NumDefs = MI.getNumDefs();
1920 unsigned NumOperands = MI.getNumOperands();
1921
1922 assert(verifyRegBankOnOperands(MI, RB, MRI, 0, NumDefs - 1));
1923 if (RB == SgprRB)
1924 assert(verifyRegBankOnOperands(MI, RB, MRI, NumDefs, NumOperands - 1));
1925
1926 if (RB == VgprRB) {
1927 B.setInstr(MI);
1928 for (unsigned i = NumDefs; i < NumOperands; ++i) {
1929 Register Reg = MI.getOperand(i).getReg();
1930 if (MRI.getRegBank(Reg) != RB) {
1931 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1932 MI.getOperand(i).setReg(Copy.getReg(0));
1933 }
1934 }
1935 }
1936}
1937
1938bool RegBankLegalizeHelper::applyRegisterBanksINTRIN_IMAGE(MachineInstr &MI) {
1939 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1940 AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI));
1941 assert(RSrcIntrin && RSrcIntrin->IsImage);
1942
1943 unsigned RsrcIdx = RSrcIntrin->RsrcArg;
1944 const unsigned NumDefs = MI.getNumExplicitDefs();
1945
1946 // The reported argument index is relative to the IR intrinsic call arguments,
1947 // so we need to shift by the number of defs and the intrinsic ID.
1948 RsrcIdx += NumDefs + 1;
1949
1950 MachineBasicBlock *MBB = MI.getParent();
1951 B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(MI.getIterator())));
1952
1953 // Defs (for image loads with return) are vgpr.
1954 for (unsigned i = 0; i < NumDefs; ++i) {
1955 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(i).getReg());
1956 if (RB == VgprRB)
1957 continue;
1958
1959 Register Reg = MI.getOperand(i).getReg();
1960 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(Reg)});
1961 MI.getOperand(i).setReg(NewVgprDst);
1962 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
1963 }
1964
1965 B.setInstrAndDebugLoc(MI);
1966
1967 // Register uses (before RsrcIdx) are vgpr.
1968 for (unsigned i = 1; i < RsrcIdx; ++i) {
1969 MachineOperand &Op = MI.getOperand(i);
1970 if (!Op.isReg())
1971 continue;
1972
1973 Register Reg = Op.getReg();
1974 if (!Reg.isVirtual())
1975 continue;
1976
1977 if (MRI.getRegBank(Reg) == VgprRB)
1978 continue;
1979
1980 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1981 Op.setReg(Copy.getReg(0));
1982 }
1983
1984 SmallSet<Register, 4> OpsToWaterfall;
1985
1986 // Register use RsrcIdx (and RsrcIdx+1 in some cases) is sgpr.
1987 for (unsigned i = RsrcIdx; i < MI.getNumOperands(); ++i) {
1988 MachineOperand &Op = MI.getOperand(i);
1989 if (!Op.isReg())
1990 continue;
1991
1992 Register Reg = Op.getReg();
1993 if (MRI.getRegBank(Reg) != SgprRB)
1994 OpsToWaterfall.insert(Reg);
1995 }
1996
1997 if (!OpsToWaterfall.empty()) {
1998 MachineBasicBlock::iterator MII = MI.getIterator();
1999 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
2000 }
2001
2002 return true;
2003}