LLVM 23.0.0git
AMDGPURegBankLegalizeHelper.cpp
Go to the documentation of this file.
1//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Implements actual lowering algorithms for each ID that can be used in
10/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
11//
12//===----------------------------------------------------------------------===//
13
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
20#include "GCNSubtarget.h"
28#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30#define DEBUG_TYPE "amdgpu-regbanklegalize"
31
32using namespace llvm;
33using namespace AMDGPU;
34
// Constructor parameter tail and member-initializer list.
// NOTE(review): the opening line(s) of this definition (class-qualified name
// and the leading parameters, which must include B and MUI since both are
// used below) are not present in this listing — restore from the real file.
37 const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
// Cache the machine function, its SIMachineFunctionInfo, the GCN subtarget
// and its instruction info, the builder and its MachineRegisterInfo.
38 : MF(B.getMF()), MFI(MF.getInfo<SIMachineFunctionInfo>()),
39 ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()), B(B),
40 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
// Record the wave size once and look up the four register banks by ID.
41 RBLRules(RBLRules), IsWave32(ST.isWave32()),
42 SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
43 VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
44 AgprRB(&RBI.getRegBank(AMDGPU::AGPRRegBankID)),
45 VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
46
// Body of the "find rule, apply operand mappings, lower" entry point for one
// instruction MI. Returns false (after reporting a GlobalISel failure where
// the failure is detected here) if any step fails, true otherwise.
// NOTE(review): the function header line is not present in this listing —
// restore from the real file.
// Step 1: fetch the rule set registered for MI's opcode.
48 const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
49 if (!RuleSet) {
50 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
51 "No AMDGPU RegBankLegalize rules defined for opcode",
52 MI);
53 return false;
54 }
55
// Step 2: pick the mapping within the rule set that matches this MI.
56 const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
57 if (!Mapping) {
58 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
59 "AMDGPU RegBankLegalize: none of the rules defined with "
60 "'Any' for MI's opcode matched MI",
61 MI);
62 return false;
63 }
64
// Step 3: apply the mappings. OpIdx is shared between the destination and
// source passes; WFI is filled in by the source pass and by lower().
65 WaterfallInfo WFI;
66 unsigned OpIdx = 0;
67 if (!Mapping->DstOpMapping.empty()) {
// Destination fix-ups are inserted right after MI.
68 B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
69 if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping))
70 return false;
71 }
72 if (!Mapping->SrcOpMapping.empty()) {
// Source fix-ups are inserted at MI.
73 B.setInstr(MI);
74 if (!applyMappingSrc(MI, OpIdx, Mapping->SrcOpMapping, WFI))
75 return false;
76 }
77
// Step 4: run the lowering selected by the mapping.
78 if (!lower(MI, *Mapping, WFI))
79 return false;
80
// Step 5: if any operand was recorded as needing a waterfall loop, build it.
81 if (!WFI.SgprWaterfallOperandRegs.empty()) {
82 if (!executeInWaterfallLoop(B, WFI))
83 return false;
84 }
85
86 return true;
87}
88
// Wraps the instruction range described by WFI (WFI.Start .. WFI.End) in a
// waterfall loop: the current block is split, the range is moved into a loop
// body, and every use of a register listed in WFI.SgprWaterfallOperandRegs is
// replaced by a read-first-lane value, with EXEC narrowed to the lanes that
// share that value (see the block diagram further down). On return the
// builder's insert point is at the start of the remainder block. Always
// returns true in the code visible here.
// NOTE(review): several declarations used below (BeginIt, EndIt, LMC, LoopBB,
// BodyBB, MBBI, OpParts) are not present in this listing — the lines that
// declared them appear to have been dropped; restore from the real file.
89bool RegBankLegalizeHelper::executeInWaterfallLoop(MachineIRBuilder &B,
90 const WaterfallInfo &WFI) {
91 assert(WFI.Start.isValid() && WFI.End.isValid() &&
92 "Waterfall range not initialized");
93
94 // Track use registers which have already been expanded with a readfirstlane
95 // sequence. This may have multiple uses if moving a sequence.
96 DenseMap<Register, Register> WaterfalledRegMap;
97
98 MachineBasicBlock &MBB = B.getMBB();
99 MachineFunction &MF = B.getMF();
100
103
104 const SIRegisterInfo *TRI = ST.getRegisterInfo();
105 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
107
// Debug-only: remember the range length so it can be checked after splicing.
108#ifndef NDEBUG
109 const int OrigRangeSize = std::distance(BeginIt, EndIt);
110#endif
111
112 MachineRegisterInfo &MRI = *B.getMRI();
113 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
114 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
115
116 // Don't bother using generic instructions/registers for the exec mask.
117 B.setInstr(*WFI.Start);
118 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
119
120 Register SavedExec = MRI.createVirtualRegister(WaveRC);
121
122 // To insert the loop we need to split the block. Move everything before
123 // this point to a new block, and insert a new empty block before this
124 // instruction.
127 MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
128 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
// The four new blocks are placed, in this order, at the function position
// following MBBI's original position.
130 ++MBBI;
131 MF.insert(MBBI, LoopBB);
132 MF.insert(MBBI, BodyBB);
133 MF.insert(MBBI, RestoreExecBB);
134 MF.insert(MBBI, RemainderBB);
135
// CFG edges: LoopBB -> BodyBB, BodyBB -> {RestoreExecBB, LoopBB (back edge)}.
136 LoopBB->addSuccessor(BodyBB);
137 BodyBB->addSuccessor(RestoreExecBB);
138 BodyBB->addSuccessor(LoopBB);
139
140 // Move the rest of the block into a new block.
142 RemainderBB->splice(RemainderBB->begin(), &MBB, EndIt, MBB.end());
143
144 MBB.addSuccessor(LoopBB);
145 RestoreExecBB->addSuccessor(RemainderBB);
146
147 B.setInsertPt(*LoopBB, LoopBB->end());
148
149 // +-MBB:------------+
150 // | ... |
151 // | %0 = G_INST_1 |
152 // | %Dst = MI %Vgpr |
153 // | %1 = G_INST_2 |
154 // | ... |
155 // +-----------------+
156 // ->
157 // +-MBB-------------------------------+
158 // | ... |
159 // | %0 = G_INST_1 |
160 // | %SaveExecReg = S_MOV_B32 $exec_lo |
161 // +----------------|------------------+
162 // | /------------------------------|
163 // V V |
164 // +-LoopBB---------------------------------------------------------------+ |
165 // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
166 // | instead of executing for each lane, see if other lanes had | |
167 // | same value for %Vgpr and execute for them also. | |
168 // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
169 // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
170 // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
171 // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
172 // +----------------|-----------------------------------------------------+ |
173 // V |
174 // +-BodyBB------------------------------------------------------------+ |
175 // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
176 // | executed only for active lanes and written to Dst | |
177 // | $exec = S_XOR_B32 $exec, %SavedExec | |
178 // | set active lanes to 0 in SavedExec, lanes that did not write to | |
179 // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
180 // | SI_WATERFALL_LOOP LoopBB |-----|
181 // +----------------|--------------------------------------------------+
182 // V
183 // +-RestoreExecBB--------------------------+
184 // | $exec_lo = S_MOV_B32_term %SaveExecReg |
185 // +----------------|-----------------------+
186 // V
187 // +-RemainderBB:----------------------+
188 // | %1 = G_INST_2 |
189 // | ... |
190 // +---------------------------------- +
191
192 // Move the instruction into the loop body. Note we moved everything after
193 // Range.end() already into a new block, so Range.end() is no longer valid.
194 BodyBB->splice(BodyBB->end(), &MBB, BeginIt, MBB.end());
195
196 // Figure out the iterator range after splicing the instructions.
197 MachineBasicBlock::iterator NewBegin = BeginIt;
198 auto NewEnd = BodyBB->end();
199 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
200
// Everything built in the operand loop below goes into LoopBB. CondReg
// accumulates the AND of all per-part equality compares.
201 B.setMBB(*LoopBB);
202 Register CondReg;
203
204 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
205 for (MachineOperand &Op : MI.all_uses()) {
206 Register OldReg = Op.getReg();
// Only operands recorded as waterfall operands are rewritten.
207 if (!WFI.SgprWaterfallOperandRegs.count(OldReg))
208 continue;
209
210 // See if we already processed this register in another instruction in
211 // the sequence.
212 auto OldVal = WaterfalledRegMap.find(OldReg);
213 if (OldVal != WaterfalledRegMap.end()) {
214 Op.setReg(OldVal->second);
215 continue;
216 }
217
218 Register OpReg = Op.getReg();
219 LLT OpTy = MRI.getType(OpReg);
220
221 // TODO: support for agpr
222 assert(MRI.getRegBank(OpReg) == VgprRB);
223 Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
224 buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
225
226 // Build the comparison(s), CurrentLaneReg == OpReg.
// Compare in 64-bit parts when the size is a multiple of 64 bits, otherwise
// in 32-bit parts.
227 unsigned OpSize = OpTy.getSizeInBits();
228 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
229 LLT PartTy = LLT::scalar(PartSize);
230 unsigned NumParts = OpSize / PartSize;
232 SmallVector<Register, 8> CurrentLaneParts;
233
234 if (NumParts == 1) {
235 OpParts.push_back(OpReg);
236 CurrentLaneParts.push_back(CurrentLaneReg);
237 } else {
238 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
239 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
240 for (unsigned i = 0; i < NumParts; ++i) {
241 OpParts.push_back(UnmergeOp.getReg(i));
242 CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
243 }
244 }
245
246 for (unsigned i = 0; i < NumParts; ++i) {
247 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
248 B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
249
// The first compare seeds CondReg; later ones are AND-ed into it.
250 if (!CondReg)
251 CondReg = CmpReg;
252 else
253 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
254 }
255
256 Op.setReg(CurrentLaneReg);
257
258 // Make sure we don't re-process this register again.
259 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
260 }
261 }
262
263 // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
264 Register CondRegLM =
265 MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
266 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
267
268 // Update EXEC, save the original EXEC value to SavedExec.
269 B.buildInstr(LMC.AndSaveExecOpc)
270 .addDef(SavedExec)
271 .addReg(CondRegLM, RegState::Kill);
272 MRI.setSimpleHint(SavedExec, CondRegLM);
273
// The remaining loop-control instructions are appended to the end of BodyBB,
// after the instructions spliced in above.
274 B.setInsertPt(*BodyBB, BodyBB->end());
275
276 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
277 B.buildInstr(LMC.XorTermOpc)
278 .addDef(LMC.ExecReg)
279 .addReg(LMC.ExecReg)
280 .addReg(SavedExec);
281
282 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
283 // s_cbranch_scc0?
284
285 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
286 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
287
288 // Save the EXEC mask before the loop.
289 B.setInsertPt(MBB, MBB.end());
290 B.buildInstr(LMC.MovOpc).addDef(SaveExecReg).addReg(LMC.ExecReg);
291
292 // Restore the EXEC mask after the loop.
293 B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
294 B.buildInstr(LMC.MovTermOpc).addDef(LMC.ExecReg).addReg(SaveExecReg);
295
296 // Set the insert point after the original instruction, so any new
297 // instructions will be in the remainder.
298 B.setInsertPt(*RemainderBB, RemainderBB->begin());
299
300 return true;
301}
302
303// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
304// the three offsets (voffset, soffset and instoffset)
// Outputs are written through VOffsetReg, SOffsetReg and InstOffsetVal
// (InstOffsetVal is left untouched on the paths that do not assign it).
// Returns SOffset + ImmOffset when CombinedOffset is a constant that
// splitMUBUFOffset accepts, and 0 on every other path.
// NOTE(review): the line that declares `Base` and `Offset` (the start of the
// call whose arguments appear right after `CheckNUW`) is not present in this
// listing — restore from the real file.
305unsigned RegBankLegalizeHelper::setBufferOffsets(
306 MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
307 Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) {
// Case 1: the whole offset is a known constant.
308 if (std::optional<int64_t> Imm =
309 getIConstantVRegSExtVal(CombinedOffset, MRI)) {
310 uint32_t SOffset, ImmOffset;
311 if (TII.splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) {
312 VOffsetReg = B.buildConstant({VgprRB, S32}, 0).getReg(0);
313 SOffsetReg = B.buildConstant({SgprRB, S32}, SOffset).getReg(0);
314 InstOffsetVal = ImmOffset;
315 return SOffset + ImmOffset;
316 }
317 }
// On subtargets with GFX1250 instructions the add must carry the
// no-unsigned-wrap flag (see the G_ADD check further down).
318 const bool CheckNUW = ST.hasGFX1250Insts();
320 MRI, CombinedOffset, /*KnownBits=*/nullptr,
321 /*CheckNUW=*/CheckNUW);
// Case 2: base register plus a strictly positive constant that can be split.
322 uint32_t SOffset, ImmOffset;
323 if (static_cast<int32_t>(Offset) > 0 &&
324 TII.splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
325 if (Base.isValid() && MRI.getRegBank(Base) == VgprRB) {
326 VOffsetReg = Base;
327 SOffsetReg = B.buildConstant({SgprRB, S32}, SOffset).getReg(0);
328 InstOffsetVal = ImmOffset;
329 return 0;
330 }
331 // If we have SGPR base, we can use it for soffset.
332 if (SOffset == 0) {
333 VOffsetReg = B.buildConstant({VgprRB, S32}, 0).getReg(0);
334 SOffsetReg = Base;
335 InstOffsetVal = ImmOffset;
336 return 0;
337 }
338 }
339 // Handle the variable sgpr + vgpr case.
340 MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, MRI);
341 if (Add && static_cast<int32_t>(Offset) >= 0 &&
342 (!CheckNUW || Add->getFlag(MachineInstr::NoUWrap))) {
343 Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), MRI);
344 Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), MRI);
345 const RegisterBank *Src0Bank = MRI.getRegBank(Src0);
346 const RegisterBank *Src1Bank = MRI.getRegBank(Src1);
// Whichever addend is in the VGPR bank becomes voffset, the SGPR one soffset.
347 if (Src0Bank == VgprRB && Src1Bank == SgprRB) {
348 VOffsetReg = Src0;
349 SOffsetReg = Src1;
350 return 0;
351 }
352 if (Src0Bank == SgprRB && Src1Bank == VgprRB) {
353 VOffsetReg = Src1;
354 SOffsetReg = Src0;
355 return 0;
356 }
357 }
358 // Ensure we have a VGPR for the combined offset. This could be an issue if we
359 // have an SGPR offset and a VGPR resource.
360 if (MRI.getRegBank(CombinedOffset) == VgprRB) {
361 VOffsetReg = CombinedOffset;
362 } else {
363 VOffsetReg = B.buildCopy({VgprRB, S32}, CombinedOffset).getReg(0);
364 }
// Fallback: the entire offset lives in voffset, soffset is the constant 0.
365 SOffsetReg = B.buildConstant({SgprRB, S32}, 0).getReg(0);
366 return 0;
367}
368
369bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
370 ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
371 MachineFunction &MF = B.getMF();
372 assert(MI.getNumMemOperands() == 1);
373 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
374 Register Dst = MI.getOperand(0).getReg();
375 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
376 Register Base = MI.getOperand(1).getReg();
377 LLT PtrTy = MRI.getType(Base);
378 const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
379 LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
380 SmallVector<Register, 4> LoadPartRegs;
381
382 unsigned ByteOffset = 0;
383 for (LLT PartTy : LLTBreakdown) {
384 Register BasePlusOffset;
385 if (ByteOffset == 0) {
386 BasePlusOffset = Base;
387 } else {
388 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
389 BasePlusOffset =
390 B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
391 }
392 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
393 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
394 LoadPartRegs.push_back(LoadPart.getReg(0));
395 ByteOffset += PartTy.getSizeInBytes();
396 }
397
398 if (!MergeTy.isValid()) {
399 // Loads are of same size, concat or merge them together.
400 B.buildMergeLikeInstr(Dst, LoadPartRegs);
401 } else {
402 // Loads are not all of same size, need to unmerge them to smaller pieces
403 // of MergeTy type, then merge pieces to Dst.
404 SmallVector<Register, 4> MergeTyParts;
405 for (Register Reg : LoadPartRegs) {
406 if (MRI.getType(Reg) == MergeTy) {
407 MergeTyParts.push_back(Reg);
408 } else {
409 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
410 for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
411 MergeTyParts.push_back(Unmerge.getReg(i));
412 }
413 }
414 B.buildMergeLikeInstr(Dst, MergeTyParts);
415 }
416 MI.eraseFromParent();
417 return true;
418}
419
420bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
421 LLT MergeTy) {
422 MachineFunction &MF = B.getMF();
423 assert(MI.getNumMemOperands() == 1);
424 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
425 Register Dst = MI.getOperand(0).getReg();
426 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
427 Register Base = MI.getOperand(1).getReg();
428
429 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
430 auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);
431
432 if (WideTy.isScalar()) {
433 B.buildTrunc(Dst, WideLoad);
434 } else {
435 SmallVector<Register, 4> MergeTyParts;
436 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
437
438 LLT DstTy = MRI.getType(Dst);
439 unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
440 for (unsigned i = 0; i < NumElts; ++i) {
441 MergeTyParts.push_back(Unmerge.getReg(i));
442 }
443 B.buildMergeLikeInstr(Dst, MergeTyParts);
444 }
445 MI.eraseFromParent();
446 return true;
447}
448
449bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
450 Register Dst = MI.getDstReg();
451 Register Ptr = MI.getPointerReg();
452 MachineMemOperand &MMO = MI.getMMO();
453 unsigned MemSize = 8 * MMO.getSize().getValue();
454
455 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
456
457 if (MI.getOpcode() == G_LOAD) {
458 B.buildLoad(Dst, Ptr, *WideMMO);
459 } else {
460 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
461
462 if (MI.getOpcode() == G_ZEXTLOAD) {
463 APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
464 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
465 B.buildAnd(Dst, Load, MaskCst);
466 } else {
467 assert(MI.getOpcode() == G_SEXTLOAD);
468 B.buildSExtInReg(Dst, Load, MemSize);
469 }
470 }
471
472 MI.eraseFromParent();
473 return true;
474}
475
// Lowers an extension whose source is operand 1 of MI (used below as the
// select condition) into select(s) between constants in the VGPR bank.
// S16/S32 results use a single select; S64 results select the low half and
// derive the high half from the opcode. Erases MI and returns true on
// success; returns false for an unsupported opcode or type.
// NOTE(review): the line naming the callee for the two failure reports (the
// line directly before each `MF, MORE, "amdgpu-regbanklegalize",` argument
// list) is not present in this listing — restore from the real file.
476bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
477 Register Dst = MI.getOperand(0).getReg();
478 LLT Ty = MRI.getType(Dst);
479 Register Src = MI.getOperand(1).getReg();
480 unsigned Opc = MI.getOpcode();
// "True" value of the select: all ones for G_SEXT, 1 for every other opcode.
481 int TrueExtCst = Opc == G_SEXT ? -1 : 1;
482 if (Ty == S32 || Ty == S16) {
483 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
484 auto False = B.buildConstant({VgprRB, Ty}, 0);
485 B.buildSelect(Dst, Src, True, False);
486 } else if (Ty == S64) {
487 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
488 auto False = B.buildConstant({VgprRB_S32}, 0);
489 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
// High half: copy of Lo for G_SEXT, zero for G_ZEXT, undef for G_ANYEXT.
490 MachineInstrBuilder Hi;
491 switch (Opc) {
492 case G_SEXT:
493 Hi = Lo;
494 break;
495 case G_ZEXT:
496 Hi = False;
497 break;
498 case G_ANYEXT:
499 Hi = B.buildUndef({VgprRB_S32});
500 break;
501 default:
503 MF, MORE, "amdgpu-regbanklegalize",
504 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
505 return false;
506 }
507
508 B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
509 } else {
511 MF, MORE, "amdgpu-regbanklegalize",
512 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
513 return false;
514 }
515
516 MI.eraseFromParent();
517 return true;
518}
519
520std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
521 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
522 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
523 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
524 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
525 return {Lo.getReg(0), Hi.getReg(0)};
526}
527
528std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
529 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
530 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
531 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
532 return {Lo.getReg(0), Hi.getReg(0)};
533}
534
535std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
536 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
537 auto Lo = PackedS32;
538 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
539 return {Lo.getReg(0), Hi.getReg(0)};
540}
541
542std::pair<Register, Register>
543RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
544 auto [Lo32, Hi32] = unpackAExt(Reg);
545 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
546 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
547}
548
// Lowers a shift whose value (operand 1) and amount (operand 2) are each
// unpacked into two S32 halves: both operands are split, the shift is done
// per half on SGPR S32, and the two results are recombined into operand 0
// with a build-vector-trunc. The unpack flavour follows the opcode:
// any-extend for G_SHL, zero-extend for G_LSHR, sign-extend for G_ASHR.
// Erases MI and returns true; returns false for any other opcode.
// NOTE(review): the line naming the callee for the failure report in the
// default case is not present in this listing — restore from the real file.
549bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
550 Register Lo, Hi;
551 switch (MI.getOpcode()) {
552 case AMDGPU::G_SHL: {
553 auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
554 auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
555 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
556 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
557 break;
558 }
559 case AMDGPU::G_LSHR: {
560 auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
561 auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
562 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
563 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
564 break;
565 }
566 case AMDGPU::G_ASHR: {
567 auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
568 auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
569 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
570 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
571 break;
572 }
573 default:
575 MF, MORE, "amdgpu-regbanklegalize",
576 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
577 MI);
578 return false;
579 }
// Recombine the two S32 results into the original destination.
580 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
581 MI.eraseFromParent();
582 return true;
583}
584
// Lowers a min/max whose two sources (operands 1 and 2) are each unpacked
// into two S32 halves: the same opcode is applied per half on SGPR S32 and
// the two results are recombined into operand 0 with a build-vector-trunc.
// Erases MI and returns true; returns false for any opcode other than
// G_SMIN, G_SMAX, G_UMIN and G_UMAX.
// NOTE(review): the line naming the callee for the failure report in the
// default case is not present in this listing — restore from the real file.
585bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
586 Register Lo, Hi;
587 switch (MI.getOpcode()) {
588 case AMDGPU::G_SMIN:
589 case AMDGPU::G_SMAX: {
590 // For signed operations, use sign extension
591 auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
592 auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
593 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
594 .getReg(0);
595 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
596 .getReg(0);
597 break;
598 }
599 case AMDGPU::G_UMIN:
600 case AMDGPU::G_UMAX: {
601 // For unsigned operations, use zero extension
602 auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
603 auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
604 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
605 .getReg(0);
606 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
607 .getReg(0);
608 break;
609 }
610 default:
612 MF, MORE, "amdgpu-regbanklegalize",
613 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
614 return false;
615 }
// Recombine the two S32 results into the original destination.
616 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
617 MI.eraseFromParent();
618 return true;
619}
620
621bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
622 auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
623 auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
624 auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
625 auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
626 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
627 {ResLo.getReg(0), ResHi.getReg(0)});
628 MI.eraseFromParent();
629 return true;
630}
631
632bool RegBankLegalizeHelper::lowerSBufToBuf(MachineInstr &MI,
633 WaterfallInfo &WFI) {
634 Register Dst = MI.getOperand(0).getReg();
635 LLT Ty = MRI.getType(Dst);
636 const RegisterBank *RSrcBank = MRI.getRegBank(MI.getOperand(1).getReg());
637 unsigned LoadSize = Ty.getSizeInBits();
638 int NumLoads = 1;
639 SmallVector<Register, 4> LoadParts;
640 if (LoadSize == 256 || LoadSize == 512) {
641 NumLoads = LoadSize / 128;
642 Ty = Ty.divide(NumLoads);
643 }
644 for (int i = 0; i < NumLoads; ++i)
645 LoadParts.emplace_back(MRI.createVirtualRegister({VgprRB, Ty}));
646 MachineMemOperand *OrigMMO = *MI.memoperands_begin();
647 const Align Alignment = OrigMMO->getAlign();
648 MachineFunction &MF = B.getMF();
649 Register SOffset;
650 Register VOffset;
651 int64_t ImmOffset = 0;
652 unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset,
653 SOffset, ImmOffset, Alignment);
654 // Use the MMO size from the original instruction rather than the (possibly
655 // widened) register type. E.g. 96-bit loads are widened to 128-bit during
656 // legalization but the MMO still reflects the original 96-bit access size.
657 const unsigned MemSize = divideCeil(OrigMMO->getSize().getValue(), NumLoads);
658 MachineMemOperand *BaseMMO = MF.getMachineMemOperand(OrigMMO, 0, MemSize);
659 if (MMOOffset != 0)
660 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
661 // If only the offset is divergent, emit a MUBUF buffer load
662 // instead. We can assume that the buffer is unswizzled.
663 Register RSrc = MI.getOperand(1).getReg();
664 Register VIndex = B.buildConstant(VgprRB_S32, 0).getReg(0);
665 unsigned Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
666 switch (MI.getOpcode()) {
667 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
668 Opc = G_AMDGPU_BUFFER_LOAD_SBYTE;
669 break;
670 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
671 Opc = G_AMDGPU_BUFFER_LOAD_UBYTE;
672 break;
673 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT:
674 Opc = G_AMDGPU_BUFFER_LOAD_SSHORT;
675 break;
676 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
677 Opc = G_AMDGPU_BUFFER_LOAD_USHORT;
678 break;
679 default:
680 break;
681 }
682 for (int i = 0; i < NumLoads; ++i) {
683 B.buildInstr(Opc)
684 .addDef(LoadParts[i]) // vdata
685 .addUse(RSrc) // rsrc
686 .addUse(VIndex) // vindex
687 .addUse(VOffset) // voffset
688 .addUse(SOffset) // soffset
689 .addImm(ImmOffset + 16 * i) // offset(imm)
690 .addImm(0) // cachepolicy, swizzled buffer(imm)
691 .addImm(0) // idxen(imm)
692 .addMemOperand(MF.getMachineMemOperand(BaseMMO, 16 * i, MemSize));
693 }
694 if (NumLoads == 1)
695 B.buildCopy(Dst, LoadParts[0]);
696 else
697 B.buildMergeLikeInstr(Dst, LoadParts);
698 B.setInstr(*MRI.getVRegDef(LoadParts[0]));
699 if (RSrcBank != SgprRB) {
700 WFI.SgprWaterfallOperandRegs.insert(RSrc);
701 WFI.Start = MRI.getVRegDef(LoadParts.front());
702 WFI.End = std::next(MRI.getVRegDef(LoadParts.back())->getIterator());
703 }
704 MI.eraseFromParent();
705 return true;
706}
707
// Tail of a helper that classifies a bitfield-extract as signed: true for the
// amdgcn_sbfe intrinsic, otherwise true iff the opcode is G_SBFX.
// NOTE(review): the function header and the line that introduces `GI` (and
// guards the first return) are not present in this listing; presumably this
// is the body of isSignedBFE(MI), which the BFE lowerings call — restore from
// the real file.
710 return (GI->is(Intrinsic::amdgcn_sbfe));
711
712 return MI.getOpcode() == AMDGPU::G_SBFX;
713}
714
715bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
716 Register Dst = MI.getOperand(0).getReg();
717 assert(MRI.getType(Dst) == LLT::scalar(64));
718 bool Signed = isSignedBFE(MI);
719 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
720 // Extract bitfield from Src, LSBit is the least-significant bit for the
721 // extraction (field offset) and Width is size of bitfield.
722 Register Src = MI.getOperand(FirstOpnd).getReg();
723 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
724 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
725 // Comments are for signed bitfield extract, similar for unsigned. x is sign
726 // bit. s is sign, l is LSB and y are remaining bits of bitfield to extract.
727
728 // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
729 unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
730 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
731
732 auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);
733
734 // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
735 // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
736 // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
737 if (!ConstWidth) {
738 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
739 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
740 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
741 MI.eraseFromParent();
742 return true;
743 }
744
745 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
746 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
747 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
748 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
749 auto Zero = B.buildConstant({VgprRB, S32}, 0);
750 unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
751
752 if (WidthImm <= 32) {
753 // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
754 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
755 MachineInstrBuilder Hi;
756 if (Signed) {
757 // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
758 Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
759 } else {
760 // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
761 Hi = Zero;
762 }
763 B.buildMergeLikeInstr(Dst, {Lo, Hi});
764 } else {
765 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
766 // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
767 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
768 B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
769 }
770
771 MI.eraseFromParent();
772 return true;
773}
774
775bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
776 Register DstReg = MI.getOperand(0).getReg();
777 LLT Ty = MRI.getType(DstReg);
778 bool Signed = isSignedBFE(MI);
779 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
780 Register Src = MI.getOperand(FirstOpnd).getReg();
781 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
782 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
783 // For uniform bit field extract there are 4 available instructions, but
784 // LSBit(field offset) and Width(size of bitfield) need to be packed in S32,
785 // field offset in low and size in high 16 bits.
786
787 // Src1 Hi16|Lo16 = Size|FieldOffset
788 auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
789 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
790 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
791 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
792 unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
793 unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
794 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
795
796 // Select machine instruction, because of reg class constraining, insert
797 // copies from reg class to reg bank.
798 auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
799 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
800 constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
801 *ST.getRegisterInfo(), RBI);
802
803 B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
804 MI.eraseFromParent();
805 return true;
806}
807
808bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
809 Register Dst = MI.getOperand(0).getReg();
810 LLT DstTy = MRI.getType(Dst);
811 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
812 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
813 auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
814 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
815 unsigned Opc = MI.getOpcode();
816 auto Flags = MI.getFlags();
817 auto Lo =
818 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
819 auto Hi =
820 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
821 B.buildMergeLikeInstr(Dst, {Lo, Hi});
822 MI.eraseFromParent();
823 return true;
824}
825
826bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
827 Register Dst = MI.getOperand(0).getReg();
828 assert(MRI.getType(Dst) == S64);
829 auto Op1 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(1).getReg());
830 auto Op2 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(2).getReg());
831
832 // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to
833 // match GlobalISel with old regbankselect.
834 auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
835 auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
836 auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
837 auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
838 auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
839 auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
840
841 B.buildMergeLikeInstr(Dst, {Lo, Hi});
842 MI.eraseFromParent();
843 return true;
844}
845
846bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
847 Register Dst = MI.getOperand(0).getReg();
848 assert(MRI.getType(Dst) == V2S16);
849 unsigned Opc = MI.getOpcode();
850 unsigned NumOps = MI.getNumOperands();
851 auto Flags = MI.getFlags();
852
853 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
854
855 if (NumOps == 2) {
856 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
857 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
858 B.buildMergeLikeInstr(Dst, {Lo, Hi});
859 MI.eraseFromParent();
860 return true;
861 }
862
863 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());
864
865 if (NumOps == 3) {
866 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
867 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
868 B.buildMergeLikeInstr(Dst, {Lo, Hi});
869 MI.eraseFromParent();
870 return true;
871 }
872
873 assert(NumOps == 4);
874 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg());
875 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags);
876 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags);
877 B.buildMergeLikeInstr(Dst, {Lo, Hi});
878 MI.eraseFromParent();
879 return true;
880}
881
882bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
883 Register Dst0 = MI.getOperand(0).getReg();
884 Register Dst1 = MI.getOperand(1).getReg();
885 Register Src0 = MI.getOperand(2).getReg();
886 Register Src1 = MI.getOperand(3).getReg();
887 Register Src2 = MI.getOperand(4).getReg();
888
889 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
890
891 // Keep the multiplication on the SALU.
892 Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
893 Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
894 if (ST.hasScalarMulHiInsts()) {
895 B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
896 } else {
897 auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
898 auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
899 auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
900 buildReadAnyLane(B, DstHi, MulHi.getReg(0), RBI);
901 }
902
903 // Accumulate and produce the "carry-out" bit.
904
905 // The "carry-out" is defined as bit 64 of the result when computed as a
906 // big integer. For unsigned multiply-add, this matches the usual
907 // definition of carry-out.
908 if (mi_match(Src2, MRI, MIPatternMatch::m_ZeroInt())) {
909 // No accumulate: result is just the multiplication, carry is 0.
910 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
911 B.buildConstant(Dst1, 0);
912 } else {
913 // Accumulate: add Src2 to the multiplication result with carry chain.
914 Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
915 Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
916 B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
917
918 auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
919 auto AddHi =
920 B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
921 B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
922 B.buildCopy(Dst1, AddHi.getReg(1));
923 }
924
925 MI.eraseFromParent();
926 return true;
927}
928
929bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
930 Register Dst = MI.getOperand(0).getReg();
931 LLT DstTy = MRI.getType(Dst);
932 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
933 (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
934 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
935 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
936 auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
937 Register Cond = MI.getOperand(1).getReg();
938 auto Flags = MI.getFlags();
939 auto Lo =
940 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
941 auto Hi =
942 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);
943
944 B.buildMergeLikeInstr(Dst, {Lo, Hi});
945 MI.eraseFromParent();
946 return true;
947}
948
949bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
950 auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
951 int Amt = MI.getOperand(2).getImm();
952 Register Lo, Hi;
953 // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
954 if (Amt <= 32) {
955 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
956 if (Amt == 32) {
957 // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
958 Lo = Freeze.getReg(0);
959 } else {
960 // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
961 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
962 }
963
964 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
965 Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
966 } else {
967 // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
968 Lo = Op1.getReg(0);
969 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
970 }
971
972 B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
973 MI.eraseFromParent();
974 return true;
975}
976
977bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &MI) {
978 // Split 64-bit find-first-bit operations into 32-bit halves:
979 // (ffbh hi:lo) -> umin(ffbh(hi), uaddsat(ffbh(lo), 32))
980 // (ffbl hi:lo) -> umin(ffbl(lo), uaddsat(ffbl(hi), 32))
981 // (ctlz_zero_poison hi:lo) -> umin(ffbh(hi), add(ffbh(lo), 32))
982 // (cttz_zero_poison hi:lo) -> umin(ffbl(lo), add(ffbl(hi), 32))
983 unsigned Opc = MI.getOpcode();
984
985 // FFBH/FFBL return 0xFFFFFFFF on zero input, using uaddsat to avoid
986 // wrapping. CTLZ/CTTZ guarantee non-zero input (zero_poison), so plain add
987 // is fine.
988 unsigned FFBOpc;
989 unsigned AddOpc;
990 bool SearchFromMSB;
991 switch (Opc) {
992 case AMDGPU::G_AMDGPU_FFBH_U32:
993 FFBOpc = Opc;
994 AddOpc = AMDGPU::G_UADDSAT;
995 SearchFromMSB = true;
996 break;
997 case AMDGPU::G_AMDGPU_FFBL_B32:
998 FFBOpc = Opc;
999 AddOpc = AMDGPU::G_UADDSAT;
1000 SearchFromMSB = false;
1001 break;
1002 case AMDGPU::G_CTLZ_ZERO_POISON:
1003 FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
1004 AddOpc = AMDGPU::G_ADD;
1005 SearchFromMSB = true;
1006 break;
1007 case AMDGPU::G_CTTZ_ZERO_POISON:
1008 FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
1009 AddOpc = AMDGPU::G_ADD;
1010 SearchFromMSB = false;
1011 break;
1012 default:
1013 llvm_unreachable("unexpected opcode in lowerSplitBitCount64To32");
1014 }
1015
1016 auto Unmerge = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
1017 Register Lo = Unmerge.getReg(0);
1018 Register Hi = Unmerge.getReg(1);
1019
1020 // MSB-first (FFBH/CTLZ) searches hi first; LSB-first (FFBL/CTTZ) searches
1021 // lo first. The secondary half adds 32 to account for the primary half's
1022 // width.
1023 auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Hi : Lo});
1024 auto Secondary =
1025 B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Lo : Hi});
1026
1027 auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
1028 {Secondary, B.buildConstant(VgprRB_S32, 32)});
1029 B.buildUMin(MI.getOperand(0).getReg(), Primary, Adjusted);
1030
1031 MI.eraseFromParent();
1032 return true;
1033}
1034
1035bool RegBankLegalizeHelper::lowerExtrVecEltToSel(MachineInstr &MI) {
1036 // Lower extract vector element to a compare-select chain:
1037 // result = elt[0]
1038 // for i in 1..N-1:
1039 // result = (idx == i) ? elt[i] : result
1040 //
1041 // When the index is divergent, each lane may want a different element, so
1042 // we must check every element per lane.
1043 Register Dst = MI.getOperand(0).getReg();
1044 Register Src = MI.getOperand(1).getReg();
1045 Register Idx = MI.getOperand(2).getReg();
1046
1047 LLT VecTy = MRI.getType(Src);
1048 LLT ScalarTy = VecTy.getScalarType();
1049 unsigned NumElts = VecTy.getNumElements();
1050 MachineRegisterInfo::VRegAttrs VgprRB_EltTy = {VgprRB, ScalarTy};
1051
1052 auto Unmerge = B.buildUnmerge(VgprRB_EltTy, Src);
1053
1054 if (ScalarTy.getSizeInBits() == 32) {
1055 Register PrevSelect = Unmerge.getReg(0);
1056 for (unsigned I = 1; I < NumElts; ++I) {
1057 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)}, I);
1058 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
1059 PrevSelect =
1060 B.buildSelect(VgprRB_EltTy, Cmp, Unmerge.getReg(I), PrevSelect)
1061 .getReg(0);
1062 }
1063 B.buildCopy(Dst, PrevSelect);
1064 } else if (ScalarTy.getSizeInBits() == 64) {
1065 auto InitUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(0));
1066 Register PrevLo = InitUnmerge.getReg(0);
1067 Register PrevHi = InitUnmerge.getReg(1);
1068 for (unsigned I = 1; I < NumElts; ++I) {
1069 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)}, I);
1070 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
1071 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(I));
1072 PrevLo = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(0), PrevLo)
1073 .getReg(0);
1074 PrevHi = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(1), PrevHi)
1075 .getReg(0);
1076 }
1077 B.buildMergeLikeInstr(Dst, {PrevLo, PrevHi});
1078 } else {
1080 MF, MORE, "amdgpu-regbanklegalize",
1081 "AMDGPU RegBankLegalize: ExtrVecEltToSel unsupported element type", MI);
1082 return false;
1083 }
1084
1085 MI.eraseFromParent();
1086 return true;
1087}
1088
1089bool RegBankLegalizeHelper::lowerExtrVecEltTo32(MachineInstr &MI) {
1090 // Reduce a 64-bit element extract to two 32-bit extracts:
1091 // vec32 = bitcast <N x s64> to <2N x s32>
1092 // lo = vec32[idx * 2]
1093 // hi = vec32[idx * 2 + 1]
1094 // result = merge(lo, hi)
1095 //
1096 // When the index is uniform, all lanes extract the same element, so we can
1097 // just split the s64 extract into two s32 extracts which lower to MOVREL.
1098 Register Dst = MI.getOperand(0).getReg();
1099 Register Src = MI.getOperand(1).getReg();
1100 Register Idx = MI.getOperand(2).getReg();
1101
1102 LLT SrcTy = MRI.getType(Src);
1103 LLT Vec32Ty = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
1104
1105 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1106 "expected VGPR src and SGPR idx");
1107
1108 auto CastSrc = B.buildBitcast({VgprRB, Vec32Ty}, Src);
1109
1110 // Calculate new Lo and Hi indices
1111 auto One = B.buildConstant(SgprRB_S32, 1);
1112 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
1113 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
1114
1115 auto ExtLo = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxLo);
1116 auto ExtHi = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxHi);
1117
1118 B.buildMergeLikeInstr(Dst, {ExtLo.getReg(0), ExtHi.getReg(0)});
1119
1120 MI.eraseFromParent();
1121 return true;
1122}
1123
1124bool RegBankLegalizeHelper::lowerInsVecEltToSel(MachineInstr &MI) {
1125 // Lower insert vector element to a compare-select chain:
1126 // for i in 0..N-1:
1127 // result[i] = (idx == i) ? elt : srcVec[i]
1128 // dst = merge(result[0..N-1])
1129 //
1130 // VGPR B64 requires splitting to lo/hi s32 pairs since there is no
1131 // v_cndmask_b64. SGPR B64/B32 and VGPR B32 can be handled natively.
1132 Register Dst = MI.getOperand(0).getReg();
1133 Register Src = MI.getOperand(1).getReg();
1134 Register Elt = MI.getOperand(2).getReg();
1135 Register Idx = MI.getOperand(3).getReg();
1136
1137 LLT VecTy = MRI.getType(Src);
1138 LLT ScalarTy = VecTy.getScalarType();
1139 unsigned NumElts = VecTy.getNumElements();
1140 const RegisterBank *SrcRB = MRI.getRegBank(Src);
1141 bool IsSGPR = (SrcRB == SgprRB);
1142 SmallVector<Register, 16> Selects;
1143
1144 if (!IsSGPR && ScalarTy.getSizeInBits() == 64) {
1145 // VGPR B64: split to 32-bit lo/hi since there is no v_cndmask_b64.
1146 auto Unmerge = B.buildUnmerge(VgprRB_S32, Src);
1147 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1148 Register EltLo = EltUnmerge.getReg(0);
1149 Register EltHi = EltUnmerge.getReg(1);
1150 for (unsigned I = 0; I < NumElts; ++I) {
1151 auto IdxConst = B.buildConstant(VgprRB_S32, I);
1152 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
1153 Selects.push_back(
1154 B.buildSelect(VgprRB_S32, Cmp, EltLo, Unmerge.getReg(2 * I))
1155 .getReg(0));
1156 Selects.push_back(
1157 B.buildSelect(VgprRB_S32, Cmp, EltHi, Unmerge.getReg(2 * I + 1))
1158 .getReg(0));
1159 }
1160 LLT Vec32Ty = LLT::fixed_vector(2 * NumElts, 32);
1161 auto Vec32 = B.buildBuildVector({VgprRB, Vec32Ty}, Selects);
1162 B.buildBitcast(Dst, Vec32);
1163 } else if (ScalarTy.getSizeInBits() == 32 || ScalarTy.getSizeInBits() == 64) {
1164 // B32 (any bank) and SGPR B64: element-wise select at native width.
1165 MachineRegisterInfo::VRegAttrs SrcRB_EltTy = {SrcRB, ScalarTy};
1166 MachineRegisterInfo::VRegAttrs CmpTy = IsSGPR ? SgprRB_S32 : VccRB_S1;
1167 auto Unmerge = B.buildUnmerge(SrcRB_EltTy, Src);
1168 for (unsigned I = 0; I < NumElts; ++I) {
1169 auto IdxConst = B.buildConstant(SgprRB_S32, I);
1170 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CmpTy, Idx, IdxConst);
1171 Selects.push_back(
1172 B.buildSelect(SrcRB_EltTy, Cmp, Elt, Unmerge.getReg(I)).getReg(0));
1173 }
1174 B.buildMergeLikeInstr(Dst, Selects);
1175 } else {
1177 MF, MORE, "amdgpu-regbanklegalize",
1178 "AMDGPU RegBankLegalize: InsVecEltToSel unsupported element type", MI);
1179 return false;
1180 }
1181
1182 MI.eraseFromParent();
1183 return true;
1184}
1185
1186bool RegBankLegalizeHelper::lowerInsVecEltTo32(MachineInstr &MI) {
1187 // Reduce a 64-bit element insert to two 32-bit inserts:
1188 // vec32 = bitcast <N x s64> to <2N x s32>
1189 // lo, hi = unmerge elt
1190 // vec32[idx * 2] = lo
1191 // vec32[idx * 2 + 1] = hi
1192 // dst = bitcast <2N x s32> to <N x s64>
1193 //
1194 // When the index is uniform, all lanes insert at the same position, so we
1195 // can split the s64 insert into two s32 inserts which lower to MOVREL/GPRIDX.
1196 Register Dst = MI.getOperand(0).getReg();
1197 Register Src = MI.getOperand(1).getReg();
1198 Register Elt = MI.getOperand(2).getReg();
1199 Register Idx = MI.getOperand(3).getReg();
1200
1201 LLT SrcTy = MRI.getType(Src);
1202 LLT Vec32Ty = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
1203
1204 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1205 "expected VGPR src and SGPR idx");
1206
1207 MachineRegisterInfo::VRegAttrs VgprRB_Vec32Ty = {VgprRB, Vec32Ty};
1208
1209 auto CastSrc = B.buildBitcast(VgprRB_Vec32Ty, Src);
1210 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1211
1212 // Calculate new Lo and Hi indices
1213 auto One = B.buildConstant(SgprRB_S32, 1);
1214 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
1215 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
1216
1217 auto InsLo = B.buildInsertVectorElement(VgprRB_Vec32Ty, CastSrc,
1218 EltUnmerge.getReg(0), IdxLo);
1219 auto InsHi = B.buildInsertVectorElement(VgprRB_Vec32Ty, InsLo,
1220 EltUnmerge.getReg(1), IdxHi);
1221
1222 B.buildBitcast(Dst, InsHi);
1223
1224 MI.eraseFromParent();
1225 return true;
1226}
1227
1228bool RegBankLegalizeHelper::lowerAbsToNegMax(MachineInstr &MI) {
1229 // Lower divergent G_ABS to smax(x, 0 - x) in the VGPR bank:
1230 // zero = 0
1231 // neg = G_SUB zero, x
1232 // dst = G_SMAX x, neg
1233 //
1234 // There is no integer v_abs instruction on AMDGPU, so divergent G_ABS is
1235 // expanded to this sub/smax pair.
1236 Register DstReg = MI.getOperand(0).getReg();
1237 Register SrcReg = MI.getOperand(1).getReg();
1238 LLT Ty = MRI.getType(DstReg);
1239
1240 Register Zero;
1241 if (Ty == V2S16) {
1242 // buildConstant cannot produce a V2S16 directly; pack two S16 zeros.
1243 Register Zero16 = B.buildConstant({VgprRB, S16}, 0).getReg(0);
1244 Zero = B.buildBuildVector({VgprRB, Ty}, {Zero16, Zero16}).getReg(0);
1245 } else {
1246 assert((Ty == S32 || Ty == S16) && "unexpected type for AbsToNegMax");
1247 Zero = B.buildConstant({VgprRB, Ty}, 0).getReg(0);
1248 }
1249
1250 auto Neg = B.buildSub({VgprRB, Ty}, Zero, SrcReg);
1251 B.buildSMax(DstReg, SrcReg, Neg);
1252 MI.eraseFromParent();
1253 return true;
1254}
1255
1256bool RegBankLegalizeHelper::lowerAbsToS32(MachineInstr &MI) {
1257 // Lower uniform V2S16 abs by unpacking the values to two separate SGPR
1258 // registers and re-emitting G_ABS on each:
1259 // packed = bitcast <2 x s16> src to s32
1260 // lo = sext_inreg packed, 16
1261 // hi = ashr packed, 16
1262 // dst = build_vector_trunc G_ABS(lo), G_ABS(hi)
1263 //
1264 // SALU only has s_abs_i32, with no direct uniform V2S16 abs. The
1265 // re-emitted G_ABS(SgprRB, S32) selects to s_abs_i32 on each value.
1266 auto Bitcast = B.buildBitcast({SgprRB_S32}, MI.getOperand(1).getReg());
1267 auto SextInReg = B.buildSExtInReg({SgprRB_S32}, Bitcast, 16);
1268 auto ShiftHi =
1269 B.buildAShr({SgprRB_S32}, Bitcast, B.buildConstant({SgprRB_S32}, 16));
1270
1271 auto AbsLo = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {SextInReg});
1272 auto AbsHi = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {ShiftHi});
1273 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
1274 {AbsLo.getReg(0), AbsHi.getReg(0)});
1275
1276 MI.eraseFromParent();
1277 return true;
1278}
1279
// Apply the lowering selected by Mapping.LoweringMethod to MI.
//
// Small lowerings are expanded inline in the switch below; the others are
// forwarded to the dedicated helper and its result is returned. WFI is only
// handed on to lowerSBufToBuf. Inline cases return false after reporting a
// GlobalISel failure for an unsupported opcode or type and true otherwise;
// cases that 'break' reach the 'return true' after the switch.
//
// NOTE(review): the embedded listing numbers skip 1408, 1580, 1608, 1620,
// 1627 and 1639 inside this function, so a call argument, several case
// labels and an initializer appear to be missing from this view. The code is
// left exactly as found; see the notes at each gap.
1280bool RegBankLegalizeHelper::lower(MachineInstr &MI,
1281 const RegBankLLTMapping &Mapping,
1282 WaterfallInfo &WFI) {
1283
1284 switch (Mapping.LoweringMethod) {
1285 case DoNotLower:
1286 break;
1287 case VccExtToSel:
1288 return lowerVccExtToSel(MI);
1289 case UniExtToSel: {
1290 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
// Select between the extended 'true' value (-1 for G_SEXT, 1 otherwise) and 0.
1291 auto True = B.buildConstant({SgprRB, Ty},
1292 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
1293 auto False = B.buildConstant({SgprRB, Ty}, 0);
1294 // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is compare.
1295 // We are making select here. S1 cond was already 'any-extended to S32' +
1296 // 'AND with 1 to clean high bits' by Sgpr32AExtBoolInReg.
1297 B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
1298 False);
1299 MI.eraseFromParent();
1300 return true;
1301 }
1302 case UnpackBitShift:
1303 return lowerUnpackBitShift(MI);
1304 case UnpackMinMax:
1305 return lowerUnpackMinMax(MI);
1306 case ScalarizeToS16:
1307 return lowerSplitTo16(MI);
1308 case Ext32To64: {
// Build the high 32 bits according to the extension kind, then merge them
// with the 32-bit source to form the 64-bit result in the destination's bank.
1309 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
1310 MachineInstrBuilder Hi;
1311 switch (MI.getOpcode()) {
1312 case AMDGPU::G_ZEXT: {
1313 Hi = B.buildConstant({RB, S32}, 0);
1314 break;
1315 }
1316 case AMDGPU::G_SEXT: {
1317 // Replicate sign bit from 32-bit extended part.
1318 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
1319 Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
1320 break;
1321 }
1322 case AMDGPU::G_ANYEXT: {
1323 Hi = B.buildUndef({RB, S32});
1324 break;
1325 }
1326 default:
1327 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1328 "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
1329 MI);
1330 return false;
1331 }
1332
1333 B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
1334 {MI.getOperand(1).getReg(), Hi});
1335 MI.eraseFromParent();
1336 return true;
1337 }
1338 case UniCstExt: {
// Re-emit the constant straight into the destination register, using the
// zero-extended value of the original immediate.
1339 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
1340 B.buildConstant(MI.getOperand(0).getReg(), ConstVal);
1341
1342 MI.eraseFromParent();
1343 return true;
1344 }
1345 case VgprToVccCopy: {
1346 Register Src = MI.getOperand(1).getReg();
1347 LLT Ty = MRI.getType(Src);
1348 // Take lowest bit from each lane and put it in lane mask.
1349 // Lowering via compare, but we need to clean high bits first as compare
1350 // compares all bits in register.
1351 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
1352 if (Ty == S64) {
// 64-bit source: mask the low half with 1 and the high half with 0.
1353 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
1354 auto One = B.buildConstant(VgprRB_S32, 1);
1355 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
1356 auto Zero = B.buildConstant(VgprRB_S32, 0);
1357 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
1358 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
1359 } else {
1360 assert(Ty == S32 || Ty == S16);
1361 auto One = B.buildConstant({VgprRB, Ty}, 1);
1362 B.buildAnd(BoolSrc, Src, One);
1363 }
1364 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
1365 B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
1366 MI.eraseFromParent();
1367 return true;
1368 }
1369 case V_BFE:
1370 return lowerV_BFE(MI);
1371 case S_BFE:
1372 return lowerS_BFE(MI);
1373 case UniMAD64:
1374 return lowerUniMAD64(MI);
1375 case UniMul64: {
1376 B.buildMul(MI.getOperand(0), MI.getOperand(1), MI.getOperand(2));
1377 MI.eraseFromParent();
1378 return true;
1379 }
1380 case DivSMulToMAD: {
// Truncate both sources to 32 bits and emit the matching 64-bit MAD with a
// zero addend; the second def is a fresh SGPR S32 register.
1381 auto Op1 = B.buildTrunc(VgprRB_S32, MI.getOperand(1));
1382 auto Op2 = B.buildTrunc(VgprRB_S32, MI.getOperand(2));
1383 auto Zero = B.buildConstant({VgprRB, S64}, 0);
1384
1385 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
1386 ? AMDGPU::G_AMDGPU_MAD_U64_U32
1387 : AMDGPU::G_AMDGPU_MAD_I64_I32;
1388
1389 B.buildInstr(NewOpc, {MI.getOperand(0).getReg(), {SgprRB, S32}},
1390 {Op1, Op2, Zero});
1391 MI.eraseFromParent();
1392 return true;
1393 }
1394 case SplitTo32:
1395 return lowerSplitTo32(MI);
1396 case SplitTo32Mul:
1397 return lowerSplitTo32Mul(MI);
1398 case SplitTo32Select:
1399 return lowerSplitTo32Select(MI);
1400 case SplitTo32SExtInReg:
1401 return lowerSplitTo32SExtInReg(MI);
1402 case CtPop64To32: {
1403 auto Unmerge = B.buildUnmerge({VgprRB, S32}, MI.getOperand(1).getReg());
1404 auto LoPopCnt = B.buildCTPOP({VgprRB, S32}, Unmerge.getReg(0));
1405 auto HiPopCnt = B.buildCTPOP({VgprRB, S32}, Unmerge.getReg(1));
1406 // Max popcount of two 32-bit values is 64, so this add cannot overflow.
1407 B.buildAdd(MI.getOperand(0).getReg(), LoPopCnt, HiPopCnt,
// NOTE(review): listing number 1408 is skipped here; the final argument(s)
// and the closing of this buildAdd call appear to be missing from this view.
1409
1410 MI.eraseFromParent();
// 'break' reaches the 'return true' after the switch.
1411 break;
1412 }
1413 case S_BUF_to_BUF:
1414 return lowerSBufToBuf(MI, WFI);
1415 case SplitLoad: {
1416 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
1417 unsigned Size = DstTy.getSizeInBits();
1418 // Even split to 128-bit loads
1419 if (Size > 128) {
1420 LLT B128;
1421 if (DstTy.isVector()) {
1422 LLT EltTy = DstTy.getElementType();
1423 B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1424 } else {
1425 B128 = LLT::scalar(128);
1426 }
1427 if (Size / 128 == 2)
1428 splitLoad(MI, {B128, B128});
1429 else if (Size / 128 == 4)
1430 splitLoad(MI, {B128, B128, B128, B128});
1431 else {
1432 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1433 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1434 MI);
1435 return false;
1436 }
1437 }
1438 // 64 and 32 bit load
1439 else if (DstTy == S96)
1440 splitLoad(MI, {S64, S32}, S32);
1441 else if (DstTy == V3S32)
1442 splitLoad(MI, {V2S32, S32}, S32);
1443 else if (DstTy == V6S16)
1444 splitLoad(MI, {V4S16, V2S16}, V2S16);
1445 else {
1446 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1447 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1448 MI);
1449 return false;
1450 }
1451 return true;
1452 }
1453 case DynStackAlloc: {
1454 const auto &TFI = *ST.getFrameLowering();
1455 // Guard in case the stack growth direction ever changes with scratch
1456 // instructions.
1457 assert(TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
1458 "Stack grows upwards for AMDGPU");
1459
1460 Register Dst = MI.getOperand(0).getReg();
1461 Register AllocSize = MI.getOperand(1).getReg();
1462 Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1463
1464 // Erase before building new instrs to avoid hitting multiple Dst assert
1465 // with CSE.
1466 B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
1467 MI.eraseFromParent();
1468
// A size that is not in an SGPR is reduced to a single SGPR value with a
// wave-wide unsigned max.
1469 if (MRI.getRegBank(AllocSize) != SgprRB) {
1470 auto WaveReduction =
1471 B.buildIntrinsic(Intrinsic::amdgcn_wave_reduce_umax, {SgprRB_S32})
1472 .addUse(AllocSize)
1473 .addImm(0);
1474 AllocSize = WaveReduction.getReg(0);
1475 }
1476
1477 LLT PtrTy = MRI.getType(Dst);
1478 assert(PtrTy.getSizeInBits() == 32 &&
1479 "Expected 32-bit pointer for stack allocation");
1480 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1481 Register SPReg = Info->getStackPtrOffsetReg();
1482
1483 // When using flat-scratch, the stack offset is unscaled.
1484 const bool HasFlatScratch = ST.hasFlatScratchEnabled();
1485 const unsigned WavefrontSizeLog2 = ST.getWavefrontSizeLog2();
1486
1487 Register AdjustedSize = AllocSize;
1488 if (!HasFlatScratch) {
1489 auto WaveSize = B.buildConstant(SgprRB_S32, WavefrontSizeLog2);
1490 AdjustedSize = B.buildShl(SgprRB_S32, AllocSize, WaveSize).getReg(0);
1491 }
// Over-aligned request: round the current stack pointer up to the (scaled)
// alignment; otherwise the current stack pointer is the result.
1492 if (Alignment > TFI.getStackAlign()) {
1493 const uint64_t EffectiveAlignment =
1494 Alignment.value() << (HasFlatScratch ? 0 : WavefrontSizeLog2);
1495 auto OldSP = B.buildCopy({SgprRB, PtrTy}, SPReg);
1496 auto Tmp1 =
1497 B.buildPtrAdd({SgprRB, PtrTy}, OldSP,
1498 B.buildConstant(SgprRB_S32, EffectiveAlignment - 1));
1499 uint64_t Mask = maskTrailingZeros<uint64_t>(Log2_64(EffectiveAlignment));
1500 B.buildPtrMask(Dst, Tmp1, B.buildConstant(SgprRB_S32, Mask));
1501 } else {
1502 B.buildCopy(Dst, SPReg);
1503 }
// Bump the stack pointer past the new allocation.
1504 auto PtrAdd = B.buildPtrAdd({SgprRB, PtrTy}, Dst, AdjustedSize);
1505 B.buildCopy(SPReg, PtrAdd);
1506 return true;
1507 }
1508 case WidenLoad: {
1509 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
1510 if (DstTy == S96)
1511 widenLoad(MI, S128);
1512 else if (DstTy == V3S32)
1513 widenLoad(MI, V4S32, S32);
1514 else if (DstTy == V6S16)
1515 widenLoad(MI, V8S16, V2S16);
1516 else {
1517 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1518 "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
1519 MI);
1520 return false;
1521 }
1522 return true;
1523 }
1524 case UnpackAExt:
1525 return lowerUnpackAExt(MI);
1526 case WidenMMOToS32:
1527 return widenMMOToS32(cast<GAnyLoad>(MI));
1528 case VerifyAllSgpr: {
// Verification only: nothing is rewritten.
1529 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1530 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1531 }));
1532 return true;
1533 }
1534 case ApplyAllVgpr: {
1535 assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) {
1536 return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
1537 }));
// Copy every register use that is not already in a VGPR into a VGPR and
// rewrite the operand to use the copy.
1538 B.setInstrAndDebugLoc(MI);
1539 for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) {
1540 MachineOperand &Op = MI.getOperand(i);
1541 if (!Op.isReg())
1542 continue;
1543 Register Reg = Op.getReg();
1544 if (MRI.getRegBank(Reg) != VgprRB) {
1545 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1546 Op.setReg(Copy.getReg(0));
1547 }
1548 }
1549 return true;
1550 }
1551 case UnmergeToShiftTrunc: {
// NOTE(review): the dyn_cast result is dereferenced without a null check;
// cast<> would state the intent if MI is always an unmerge here - confirm.
1552 GUnmerge *Unmerge = dyn_cast<GUnmerge>(&MI);
1553 LLT Ty = MRI.getType(Unmerge->getSourceReg());
1554 if (Ty.getSizeInBits() % 32 != 0) {
1555 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1556 "AMDGPU RegBankLegalize: unmerge not multiple of 32",
1557 MI);
1558 return false;
1559 }
1560
1561 B.setInstrAndDebugLoc(MI);
1562 if (Ty.getSizeInBits() > 32) {
// Wider sources: first unmerge into V2S16 pieces, then unpack each piece
// into two values and truncate them into consecutive original defs.
1563 auto UnmergeV2S16 =
1564 B.buildUnmerge({SgprRB, V2S16}, Unmerge->getSourceReg());
1565 for (unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
1566 auto [Dst0S32, Dst1S32] =
1567 unpackAExt(UnmergeV2S16->getOperand(i).getReg());
1568 B.buildTrunc(MI.getOperand(i * 2).getReg(), Dst0S32);
1569 B.buildTrunc(MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
1570 }
1571 } else {
1572 auto [Dst0S32, Dst1S32] = unpackAExt(MI.getOperand(2).getReg());
1573 B.buildTrunc(MI.getOperand(0).getReg(), Dst0S32);
1574 B.buildTrunc(MI.getOperand(1).getReg(), Dst1S32);
1575 }
1576
1577 MI.eraseFromParent();
1578 return true;
1579 }
// NOTE(review): listing number 1580 is skipped here; the case label that
// opens the following block appears to be missing from this view.
// The block widens the def to an SGPR S32 (truncated back to the original
// register at the first non-PHI position) and any-extends every second
// operand, starting at operand 1, right after its definition.
1581 Register Dst = MI.getOperand(0).getReg();
1582 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1583 B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());
1584 MI.getOperand(0).setReg(NewDst);
1585 B.buildTrunc(Dst, NewDst);
1586
1587 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1588 Register UseReg = MI.getOperand(i).getReg();
1589
1590 auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
1591 MachineBasicBlock *DefMBB = DefMI->getParent();
1592
1593 B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
1594
1595 auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
1596 MI.getOperand(i).setReg(NewUse.getReg(0));
1597 }
// 'break' reaches the 'return true' after the switch.
1598 break;
1599 }
1600 case VerifyAllSgprGPHI: {
// Verification only: basic-block operands are skipped, every register
// operand must be in the SGPR bank.
1601 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1602 if (Op.isMBB())
1603 return true;
1604 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1605 }));
1606 return true;
1607 }
// NOTE(review): listing number 1608 is skipped here; the case label that
// opens the following block appears to be missing from this view.
// Verification only: the def must be a VGPR and every register operand must
// be in the VGPR or SGPR bank.
1609 assert(MRI.getRegBankOrNull(MI.getOperand(0).getReg()) == VgprRB);
1610 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1611 if (Op.isMBB())
1612 return true;
1613 const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
1614 return RB == VgprRB || RB == SgprRB;
1615 }));
1616 return true;
1617 }
1618 case ApplyINTRIN_IMAGE: {
1619 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
// NOTE(review): listing number 1620 is skipped here; the initializer of
// RSrcIntrin appears to be missing from this view.
1621 assert(RSrcIntrin && RSrcIntrin->IsImage);
1622 // The reported argument index is relative to the IR intrinsic call
1623 // arguments, so shift by the number of defs and the intrinsic ID.
1624 unsigned RsrcIdx = RSrcIntrin->RsrcArg + MI.getNumExplicitDefs() + 1;
1625 return applyRegisterBanksVgprWithSgprRsrc(MI, RsrcIdx);
1626 }
// NOTE(review): listing number 1627 is skipped here; the case label that
// opens the following block appears to be missing from this view.
1628 // Rsrc is the last register operand. Base BVH trails an A16 immediate
1629 // after rsrc; dual/BVH8 do not. Scan backwards for the last virtual
1630 // register.
1631 unsigned RsrcIdx = MI.getNumOperands();
1632 while (RsrcIdx-- > MI.getNumExplicitDefs()) {
1633 const MachineOperand &Op = MI.getOperand(RsrcIdx);
1634 if (Op.isReg() && Op.getReg().isVirtual())
1635 break;
1636 }
1637 return applyRegisterBanksVgprWithSgprRsrc(MI, RsrcIdx);
1638 }
// NOTE(review): listing number 1639 is skipped here; the case label for the
// following return appears to be missing from this view.
1640 return lowerSplitBitCount64To32(MI);
1641 case ExtrVecEltToSel:
1642 return lowerExtrVecEltToSel(MI);
1643 case ExtrVecEltTo32:
1644 return lowerExtrVecEltTo32(MI);
1645 case InsVecEltToSel:
1646 return lowerInsVecEltToSel(MI);
1647 case InsVecEltTo32:
1648 return lowerInsVecEltTo32(MI);
1649 case AbsToNegMax:
1650 return lowerAbsToNegMax(MI);
1651 case AbsToS32:
1652 return lowerAbsToS32(MI);
1653 case DeletePrefetch:
1654 MI.eraseFromParent();
1655 return true;
1656 }
1657
// Reached by the cases that 'break' out of the switch.
1658 return true;
1659}
1660
// Return the exact LLT implied by a mapping ID: a scalar, an address-space
// pointer or a fixed vector. IDs that are not listed in the switch yield an
// invalid (default-constructed) LLT.
//
// NOTE(review): the embedded listing numbers skip 1674, 1737 and 1743 inside
// this function, so three case labels appear to be missing from this view.
// The code is left exactly as found.
1661LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
1662 switch (ID) {
// 1-bit scalars.
1663 case Vcc:
1664 case UniInVcc:
1665 return LLT::scalar(1);
// 16-bit scalars.
1666 case Sgpr16:
1667 case Vgpr16:
1668 case UniInVgprS16:
1669 return LLT::scalar(16);
// 32-bit scalars.
1670 case Sgpr32:
1671 case Sgpr32_WF:
1672 case Sgpr32Trunc:
1673 case Sgpr32AExt:
// NOTE(review): listing number 1674 is skipped here.
1675 case Sgpr32SExt:
1676 case Sgpr32ZExt:
1677 case UniInVgprS32:
1678 case Sgpr32ToVgprDst:
1679 case Vgpr32:
1680 case Vgpr32AExt:
1681 case Vgpr32SExt:
1682 case Vgpr32ZExt:
1683 return LLT::scalar(32);
// 64-bit and 128-bit scalars.
1684 case Sgpr64:
1685 case Vgpr64:
1686 case UniInVgprS64:
1687 case Sgpr64ToVgprDst:
1688 return LLT::scalar(64);
1689 case Sgpr128:
1690 case Vgpr128:
1691 return LLT::scalar(128);
// Pointers: the digit in the ID is the address space passed to LLT::pointer.
1692 case SgprP0:
1693 case SgprP0Call_WF:
1694 case VgprP0:
1695 return LLT::pointer(0, 64);
1696 case SgprP1:
1697 case VgprP1:
1698 return LLT::pointer(1, 64);
1699 case SgprP2:
1700 case VgprP2:
1701 return LLT::pointer(2, 32);
1702 case SgprP3:
1703 case VgprP3:
1704 return LLT::pointer(3, 32);
1705 case SgprP4:
1706 case SgprP4Call_WF:
1707 case VgprP4:
1708 return LLT::pointer(4, 64);
1709 case SgprP5:
1710 case VgprP5:
1711 return LLT::pointer(5, 32);
1712 case SgprP6:
1713 return LLT::pointer(6, 32);
1714 case SgprP8:
1715 return LLT::pointer(8, 128);
// Fixed vectors.
1716 case SgprV2S16:
1717 case VgprV2S16:
1718 case UniInVgprV2S16:
1719 return LLT::fixed_vector(2, 16);
1720 case SgprV2S32:
1721 case VgprV2S32:
1722 case UniInVgprV2S32:
1723 return LLT::fixed_vector(2, 32);
1724 case VgprV3S32:
1725 case UniInVgprV3S32:
1726 return LLT::fixed_vector(3, 32);
1727 case VgprV4S16:
1728 return LLT::fixed_vector(4, 16);
1729 case VgprV8S16:
1730 case UniInVgprV8S16:
1731 return LLT::fixed_vector(8, 16);
1732 case VgprV16S16:
1733 case UniInVgprV16S16:
1734 return LLT::fixed_vector(16, 16);
1735 case SgprV4S32:
1736 case SgprV4S32_WF:
// NOTE(review): listing number 1737 is skipped here.
1738 case VgprV4S32:
1739 case UniInVgprV4S32:
1740 return LLT::fixed_vector(4, 32);
1741 case VgprV8S32:
1742 case UniInVgprV8S32:
// NOTE(review): listing number 1743 is skipped here.
1744 return LLT::fixed_vector(8, 32);
1745 case VgprV2S64:
1746 case UniInVgprV2S64:
1747 return LLT::fixed_vector(2, 64);
1748 case VgprV6S32:
1749 case UniInVgprV6S32:
1750 return LLT::fixed_vector(6, 32);
1751 case VgprV16S32:
1752 case UniInVgprV16S32:
1753 return LLT::fixed_vector(16, 32);
1754 case VgprV32S16:
1755 case UniInVgprV32S16:
1756 return LLT::fixed_vector(32, 16);
1757 case VgprV32S32:
1758 case UniInVgprV32S32:
1759 return LLT::fixed_vector(32, 32);
// Any other ID has no single fixed type here.
1760 default:
1761 return LLT();
1762 }
1763}
1764
// For the size-class ("B"), pointer-size ("Ptr") and register-class ("BRC")
// mapping IDs, return Ty itself when Ty is acceptable for ID and an invalid
// (default-constructed) LLT otherwise. IDs not handled by the switch also
// yield an invalid LLT.
//
// NOTE(review): the embedded listing numbers skip 1770 and 1787 inside this
// function, so two case labels appear to be missing from this view. The code
// is left exactly as found.
1765LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
1766 switch (ID) {
// 32-bit class: s32, <2 x s16> or any 32-bit pointer.
1767 case SgprB32:
1768 case VgprB32:
1769 case SgprB32_M0:
// NOTE(review): listing number 1770 is skipped here.
1771 case UniInVgprB32:
1772 if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
1773 isAnyPtr(Ty, 32))
1774 return Ty;
1775 return LLT();
// Pointer classes: the second isAnyPtr argument is the pointer width in bits.
1776 case SgprPtr32:
1777 case VgprPtr32:
1778 return isAnyPtr(Ty, 32) ? Ty : LLT();
1779 case SgprPtr64:
1780 case VgprPtr64:
1781 return isAnyPtr(Ty, 64) ? Ty : LLT();
1782 case SgprPtr128:
1783 case VgprPtr128:
1784 return isAnyPtr(Ty, 128) ? Ty : LLT();
// 64-bit class: s64, <2 x s32>, <4 x s16> or any 64-bit pointer.
1785 case SgprB64:
1786 case VgprB64:
// NOTE(review): listing number 1787 is skipped here.
1788 case UniInVgprB64:
1789 if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
1790 Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
1791 return Ty;
1792 return LLT();
// 96-bit class: s96, <3 x s32> or <6 x s16>.
1793 case SgprB96:
1794 case VgprB96:
1795 case UniInVgprB96:
1796 if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
1797 Ty == LLT::fixed_vector(6, 16))
1798 return Ty;
1799 return LLT();
// The remaining size classes accept any type of exactly that bit width.
1800 case SgprB128:
1801 case VgprB128:
1802 case UniInVgprB128:
1803 if (Ty.getSizeInBits() == 128)
1804 return Ty;
1805 return LLT();
1806 case VgprB160:
1807 case UniInVgprB160:
1808 if (Ty.getSizeInBits() == 160)
1809 return Ty;
1810 return LLT();
1811 case SgprB256:
1812 case VgprB256:
1813 case UniInVgprB256:
1814 if (Ty.getSizeInBits() == 256)
1815 return Ty;
1816 return LLT();
1817 case SgprB512:
1818 case VgprB512:
1819 case UniInVgprB512:
1820 if (Ty.getSizeInBits() == 512)
1821 return Ty;
1822 return LLT();
// Register-class based: accept Ty when it is at least 32 bits wide and an
// SGPR register class exists for its bit width.
1823 case SgprBRC: {
1824 const SIRegisterInfo *TRI =
1825 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1826 unsigned LLTSize = Ty.getSizeInBits();
1827 if (LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize))
1828 return Ty;
1829 return LLT();
1830 }
// NOTE(review): the VGPR variant also queries getSGPRClassForBitWidth and
// has no minimum-size check - confirm this is intended rather than a VGPR
// register class lookup.
1831 case VgprBRC: {
1832 const SIRegisterInfo *TRI =
1833 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1834 if (TRI->getSGPRClassForBitWidth(Ty.getSizeInBits()))
1835 return Ty;
1836 return LLT();
1837 }
1838 default:
1839 return LLT();
1840 }
1841}
1842
// Returns the register bank associated with a mapping ID: VccRB for Vcc,
// SgprRB for the sgpr group below, AgprRB for AgprAnyTy and VgprRB for the
// vgpr group. IDs that are not listed in the switch yield nullptr.
1843const RegisterBank *
1844RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
1845 switch (ID) {
1846 case Vcc:
1847 return VccRB;
// Everything in this group maps to the sgpr bank. Besides the plain Sgpr*
// IDs it contains the UniIn* IDs and the sgpr trunc/extend IDs.
1848 case Sgpr16:
1849 case Sgpr32:
1850 case Sgpr32_WF:
1851 case Sgpr64:
1852 case Sgpr128:
1853 case SgprP0:
1854 case SgprP0Call_WF:
1855 case SgprP1:
1856 case SgprP2:
1857 case SgprP3:
1858 case SgprP4:
1859 case SgprP4Call_WF:
1860 case SgprP5:
1861 case SgprP6:
1862 case SgprP8:
1863 case SgprPtr32:
1864 case SgprPtr64:
1865 case SgprPtr128:
1866 case SgprV2S16:
1867 case SgprV2S32:
1868 case SgprV4S32:
1869 case SgprV4S32_WF:
1872 case SgprB32:
1873 case SgprB64:
1874 case SgprB96:
1875 case SgprB128:
1876 case SgprB256:
1877 case SgprB512:
1878 case SgprBRC:
1879 case UniInVcc:
1880 case UniInVgprS16:
1881 case UniInVgprS32:
1882 case UniInVgprS64:
1883 case UniInVgprV2S16:
1884 case UniInVgprV2S32:
1885 case UniInVgprV3S32:
1886 case UniInVgprV4S32:
1887 case UniInVgprV2S64:
1888 case UniInVgprV6S32:
1889 case UniInVgprV8S16:
1890 case UniInVgprV8S32:
1891 case UniInVgprV16S16:
1892 case UniInVgprV16S32:
1893 case UniInVgprV32S16:
1894 case UniInVgprV32S32:
1895 case UniInVgprB32:
1896 case UniInVgprB64:
1897 case UniInVgprB96:
1898 case UniInVgprB128:
1899 case UniInVgprB160:
1900 case UniInVgprB256:
1901 case UniInVgprB512:
1902 case Sgpr32Trunc:
1903 case Sgpr32AExt:
1905 case Sgpr32SExt:
1906 case Sgpr32ZExt:
1907 return SgprRB;
1908 case AgprAnyTy:
1909 return AgprRB;
// Everything in this group maps to the vgpr bank, including the
// Sgpr32ToVgprDst / Sgpr64ToVgprDst IDs.
1910 case Vgpr16:
1911 case Vgpr32:
1912 case Vgpr64:
1913 case Vgpr128:
1914 case VgprP0:
1915 case VgprP1:
1916 case VgprP2:
1917 case VgprP3:
1918 case VgprP4:
1919 case VgprP5:
1920 case VgprPtr32:
1921 case VgprPtr64:
1922 case VgprPtr128:
1923 case VgprV2S16:
1924 case VgprV2S32:
1925 case VgprV2S64:
1926 case VgprV3S32:
1927 case VgprV4S16:
1928 case VgprV8S16:
1929 case VgprV16S16:
1930 case VgprV4S32:
1931 case VgprV6S32:
1932 case VgprV8S32:
1933 case VgprV16S32:
1934 case VgprV32S16:
1935 case VgprV32S32:
1936 case VgprB32:
1937 case VgprB64:
1938 case VgprB96:
1939 case VgprB128:
1940 case VgprB160:
1941 case VgprB256:
1942 case VgprB512:
1943 case VgprBRC:
1944 case VgprAnyTy:
1945 case Vgpr32AExt:
1946 case Vgpr32SExt:
1947 case Vgpr32ZExt:
1948 case Sgpr32ToVgprDst:
1949 case Sgpr64ToVgprDst:
1950 return VgprRB;
// No bank is associated with any other ID.
1951 default:
1952 return nullptr;
1953 }
1954}
1955
// Applies the mapping IDs in MethodIDs to the def operands of MI.
//
// OpIdx is passed by reference and is advanced by the loop itself; the loop
// stops once OpIdx reaches MethodIDs.size(). Entries equal to None are
// skipped without touching the operand.
//
// Depending on the ID, a def is either only checked with asserts or replaced
// by a fresh register in another bank, with builder instructions emitted to
// produce the original register from the new one.
//
// Returns true when every ID was handled. Returns false, after the failure
// has been reported, for InvalidMapping and for any ID not listed in the
// switch.
1956bool RegBankLegalizeHelper::applyMappingDst(
1957 MachineInstr &MI, unsigned &OpIdx,
1958 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
1959 // Defs start from operand 0
1960 for (; OpIdx < MethodIDs.size(); ++OpIdx) {
1961 if (MethodIDs[OpIdx] == None)
1962 continue;
1963 MachineOperand &Op = MI.getOperand(OpIdx);
1964 Register Reg = Op.getReg();
1965 LLT Ty = MRI.getType(Reg);
1966 [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);
1967
1968 switch (MethodIDs[OpIdx]) {
1969 // vcc, sgpr and vgpr scalars, pointers and vectors
// For these IDs the def is only checked: its type must equal
// getTyFromID() and its bank must equal getRegBankFromID(). MI is left
// unchanged.
1970 case Vcc:
1971 case Sgpr16:
1972 case Sgpr32:
1973 case Sgpr64:
1974 case Sgpr128:
1975 case SgprP0:
1976 case SgprP1:
1977 case SgprP3:
1978 case SgprP4:
1979 case SgprP5:
1980 case SgprP6:
1981 case SgprP8:
1982 case SgprV2S16:
1983 case SgprV2S32:
1984 case SgprV4S32:
1985 case Vgpr16:
1986 case Vgpr32:
1987 case Vgpr64:
1988 case Vgpr128:
1989 case VgprP0:
1990 case VgprP1:
1991 case VgprP2:
1992 case VgprP3:
1993 case VgprP4:
1994 case VgprP5:
1995 case VgprV2S16:
1996 case VgprV2S32:
1997 case VgprV2S64:
1998 case VgprV3S32:
1999 case VgprV4S16:
2000 case VgprV8S16:
2001 case VgprV16S16:
2002 case VgprV4S32:
2003 case VgprV6S32:
2004 case VgprV8S32:
2005 case VgprV16S32:
2006 case VgprV32S16:
2007 case VgprV32S32: {
2008 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
2009 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
2010 break;
2011 }
2012 // sgpr and vgpr B-types
// Same as above, but the type is validated with getBTyFromID().
2013 case SgprB32:
2014 case SgprB64:
2015 case SgprB96:
2016 case SgprB128:
2017 case SgprB256:
2018 case SgprB512:
2019 case SgprBRC:
2020 case SgprPtr32:
2021 case SgprPtr64:
2022 case SgprPtr128:
2023 case VgprB32:
2024 case VgprB64:
2025 case VgprB96:
2026 case VgprB128:
2027 case VgprB160:
2028 case VgprB256:
2029 case VgprB512:
2030 case VgprBRC:
2031 case VgprPtr32:
2032 case VgprPtr64:
2033 case VgprPtr128: {
2034 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
2035 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
2036 break;
2037 }
// Any type is accepted; only the bank is checked.
2038 case VgprAnyTy: {
2039 assert(RB == VgprRB);
2040 break;
2041 }
// If the def is not already agpr, MI is made to define a new agpr register
// of the same type. A copy from the new register into the original one is
// built only when the original register has non-debug uses.
2042 case AgprAnyTy: {
2043 if (RB == AgprRB)
2044 break;
2045 Register NewAgprDst = MRI.createVirtualRegister({AgprRB, Ty});
2046 Op.setReg(NewAgprDst);
2047 if (!MRI.use_nodbg_empty(Reg))
2048 B.buildCopy(Reg, NewAgprDst);
2049 break;
2050 }
// Like AgprAnyTy, except that the target bank (agpr or vgpr) is picked from
// MFI->selectAGPRFormMFMA(), queried with the def's size in 32-bit units.
2051 case VgprOrAgprAnyTy: {
2052 const unsigned NumRegs = Ty.getSizeInBits() / 32;
2053 const RegisterBank *DstRB =
2054 MFI->selectAGPRFormMFMA(NumRegs) ? AgprRB : VgprRB;
2055 if (RB == DstRB)
2056 break;
2057 Register NewDst = MRI.createVirtualRegister({DstRB, Ty});
2058 Op.setReg(NewDst);
2059 if (!MRI.use_nodbg_empty(Reg))
2060 B.buildCopy(Reg, NewDst);
2061 break;
2062 }
2063 // uniform in vcc/vgpr: scalars, vectors and B-types
// The original def is an sgpr S1. MI is changed to define a new VccRB_S1
// register; if the original register has uses, it is rebuilt as
// G_AMDGPU_COPY_SCC_VCC (into SgprRB_S32) followed by a trunc into Reg.
2064 case UniInVcc: {
2065 assert(Ty == S1);
2066 assert(RB == SgprRB);
2067 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
2068 Op.setReg(NewDst);
2069 if (!MRI.use_empty(Reg)) {
2070 auto CopyS32_Vcc =
2071 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
2072 B.buildTrunc(Reg, CopyS32_Vcc);
2073 }
2074 break;
2075 }
// MI is changed to define a new vgpr S16. That value is any-extended to a
// vgpr S32, moved to an sgpr S32 with buildReadAnyLane, and truncated back
// into the original sgpr register.
2076 case UniInVgprS16: {
2077 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
2078 assert(RB == SgprRB);
2079 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
2080 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
2081 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
2082 Op.setReg(NewVgprDstS16);
2083 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
2084 buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
2085 B.buildTrunc(Reg, NewSgprDstS32);
2086 break;
2087 }
// MI is changed to define a new vgpr register of the same type, and the
// original sgpr register is produced from it with buildReadAnyLane.
2088 case UniInVgprS32:
2089 case UniInVgprS64:
2090 case UniInVgprV2S16:
2091 case UniInVgprV2S32:
2092 case UniInVgprV3S32:
2093 case UniInVgprV4S32:
2094 case UniInVgprV2S64:
2095 case UniInVgprV6S32:
2096 case UniInVgprV8S16:
2097 case UniInVgprV8S32:
2098 case UniInVgprV16S16:
2099 case UniInVgprV16S32:
2100 case UniInVgprV32S16:
2101 case UniInVgprV32S32: {
2102 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
2103 assert(RB == SgprRB);
2104 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
2105 Op.setReg(NewVgprDst);
2106 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
2107 break;
2108 }
// Same rewrite as the group above, with the type validated by
// getBTyFromID() instead of getTyFromID().
2109 case UniInVgprB32:
2110 case UniInVgprB64:
2111 case UniInVgprB96:
2112 case UniInVgprB128:
2113 case UniInVgprB160:
2114 case UniInVgprB256:
2115 case UniInVgprB512: {
2116 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
2117 assert(RB == SgprRB);
2118 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
2119 Op.setReg(NewVgprDst);
2120 AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
2121 break;
2122 }
2123 // sgpr trunc
// The def is narrower than 32 bits. MI is changed to define a new
// SgprRB_S32 register, and a trunc into the original register is built
// only when that register has uses.
2124 case Sgpr32Trunc: {
2125 assert(Ty.getSizeInBits() < 32);
2126 assert(RB == SgprRB);
2127 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
2128 Op.setReg(NewDst);
2129 if (!MRI.use_empty(Reg))
2130 B.buildTrunc(Reg, NewDst);
2131 break;
2132 }
// The original def is vgpr. MI is changed to define a new sgpr register of
// the same type, which is then copied into the original vgpr register.
2133 case Sgpr32ToVgprDst:
2134 case Sgpr64ToVgprDst: {
2135 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
2136 assert(RB == VgprRB);
2137 Op.setReg(MRI.createVirtualRegister({SgprRB, Ty}));
2138 B.buildCopy(Reg, Op.getReg());
2139 break;
2140 }
2141 case InvalidMapping: {
// NOTE(review): the line naming the callee for the arguments below is absent
// from this listing; presumably reportGISelFailure( - confirm against the
// upstream file.
2143 MF, MORE, "amdgpu-regbanklegalize",
2144 "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
2145 return false;
2146 }
2147 default:
// NOTE(review): as above, the callee line for these arguments is absent from
// this listing; presumably reportGISelFailure( - confirm.
2149 MF, MORE, "amdgpu-regbanklegalize",
2150 "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
2151 return false;
2152 }
2153 }
2154
2155 return true;
2156}
2157
// Applies the mapping IDs in MethodIDs to the use operands of MI.
//
// MethodIDs is indexed by i (starting at 0) while the operand is indexed by
// OpIdx, which is passed by reference; both advance together on every
// iteration, including for skipped entries. Entries equal to None, IntrId
// or Imm are skipped without reading the operand.
//
// Depending on the ID, a use is only checked with asserts, rewritten to a
// newly built copy/extend/read-first-lane result, or recorded in WFI so that
// a waterfall loop can be created by the caller.
//
// Returns true when every ID was handled, and false (after the failure has
// been reported) for any ID not listed in the switch.
2158bool RegBankLegalizeHelper::applyMappingSrc(
2159 MachineInstr &MI, unsigned &OpIdx,
2160 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
2161 WaterfallInfo &WFI) {
2162 for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
2163 if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
2164 continue;
2165
2166 MachineOperand &Op = MI.getOperand(OpIdx);
2167 Register Reg = Op.getReg();
2168 LLT Ty = MRI.getType(Reg);
2169 const RegisterBank *RB = MRI.getRegBank(Reg);
2170
2171 switch (MethodIDs[i]) {
// An S1 use that is already vcc is left alone. An sgpr S1 is any-extended
// to SgprRB_S32 and converted with G_AMDGPU_COPY_VCC_SCC; the operand is
// rewritten to that VccRB_S1 result.
2172 case Vcc: {
2173 assert(Ty == S1);
2174 assert(RB == VccRB || RB == SgprRB);
2175 if (RB == SgprRB) {
2176 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
2177 auto CopyVcc_Scc =
2178 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
2179 Op.setReg(CopyVcc_Scc.getReg(0));
2180 }
2181 break;
2182 }
2183 // sgpr scalars, pointers and vectors
// Check only: the use must already have the exact type and bank.
2184 case Sgpr16:
2185 case Sgpr32:
2186 case Sgpr64:
2187 case Sgpr128:
2188 case SgprP0:
2189 case SgprP1:
2190 case SgprP3:
2191 case SgprP4:
2192 case SgprP5:
2193 case SgprP6:
2194 case SgprP8:
2195 case SgprV2S16:
2196 case SgprV2S32:
2197 case SgprV4S32: {
2198 assert(Ty == getTyFromID(MethodIDs[i]));
2199 assert(RB == getRegBankFromID(MethodIDs[i]));
2200 break;
2201 }
2202 // sgpr B-types
// Check only, with the type validated by getBTyFromID().
2203 case SgprB32:
2204 case SgprB64:
2205 case SgprB96:
2206 case SgprB128:
2207 case SgprB256:
2208 case SgprB512:
2209 case SgprBRC:
2210 case SgprPtr32:
2211 case SgprPtr64:
2212 case SgprPtr128: {
2213 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2214 assert(RB == getRegBankFromID(MethodIDs[i]));
2215 break;
2216 }
2217 // vgpr scalars, pointers and vectors
// The type must match exactly. A use that is not already vgpr is copied
// into a new vgpr register and the operand is rewritten to the copy.
2218 case Vgpr16:
2219 case Vgpr32:
2220 case Vgpr64:
2221 case Vgpr128:
2222 case VgprP0:
2223 case VgprP1:
2224 case VgprP2:
2225 case VgprP3:
2226 case VgprP4:
2227 case VgprP5:
2228 case VgprV2S16:
2229 case VgprV2S32:
2230 case VgprV2S64:
2231 case VgprV3S32:
2232 case VgprV4S16:
2233 case VgprV8S16:
2234 case VgprV16S16:
2235 case VgprV4S32:
2236 case VgprV6S32:
2237 case VgprV8S32:
2238 case VgprV16S32:
2239 case VgprV32S16:
2240 case VgprV32S32: {
2241 assert(Ty == getTyFromID(MethodIDs[i]));
2242 if (RB != VgprRB) {
2243 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
2244 Op.setReg(CopyToVgpr.getReg(0));
2245 }
2246 break;
2247 }
2248 // vgpr B-types
// Same copy-to-vgpr handling, with the type validated by getBTyFromID().
2249 case VgprB32:
2250 case VgprB64:
2251 case VgprB96:
2252 case VgprB128:
2253 case VgprB160:
2254 case VgprB256:
2255 case VgprB512:
2256 case VgprBRC:
2257 case VgprPtr32:
2258 case VgprPtr64:
2259 case VgprPtr128: {
2260 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2261 if (RB != VgprRB) {
2262 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
2263 Op.setReg(CopyToVgpr.getReg(0));
2264 }
2265 break;
2266 }
// No type check: any use that is not vgpr is copied to vgpr.
2267 case VgprAnyTy: {
2268 if (RB != VgprRB) {
2269 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
2270 Op.setReg(CopyToVgpr.getReg(0));
2271 }
2272 break;
2273 }
// No type check: any use that is not agpr is copied to agpr.
2274 case AgprAnyTy: {
2275 if (RB != AgprRB) {
2276 auto CopyToAgpr = B.buildCopy({AgprRB, Ty}, Reg);
2277 Op.setReg(CopyToAgpr.getReg(0));
2278 }
2279 break;
2280 }
// The required bank (agpr or vgpr) is picked from MFI->selectAGPRFormMFMA(),
// queried with the use's size in 32-bit units; a use in another bank is
// copied into the required one.
2281 case VgprOrAgprAnyTy: {
2282 const unsigned NumRegs = Ty.getSizeInBits() / 32;
2283 const RegisterBank *SrcRB =
2284 MFI->selectAGPRFormMFMA(NumRegs) ? AgprRB : VgprRB;
2285 if (RB != SrcRB)
2286 Op.setReg(B.buildCopy({SrcRB, Ty}, Reg).getReg(0));
2287 break;
2288 }
2289 // sgpr waterfall, scalars, and vectors
// A use that is not sgpr is not rewritten here; its register is recorded in
// WFI.SgprWaterfallOperandRegs. If WFI.Start has not been set yet, the
// range is set to cover MI alone: [MI, next(MI)).
2290 case Sgpr32_WF:
2291 case SgprV4S32_WF: {
2292 assert(Ty == getTyFromID(MethodIDs[i]));
2293 if (RB != SgprRB) {
2294 WFI.SgprWaterfallOperandRegs.insert(Reg);
2295 if (!WFI.Start.isValid()) {
2296 WFI.Start = MI.getIterator();
2297 WFI.End = std::next(MI.getIterator());
2298 }
2299 }
2300 break;
2301 }
// As above, but the recorded range is widened to span from the
// ADJCALLSTACKUP found by scanning backwards from MI to one past the
// ADJCALLSTACKDOWN found by scanning forwards, and it overwrites any range
// stored in WFI before.
// NOTE(review): neither scan checks for the beginning/end of the block, so
// both markers are assumed to be present around MI - confirm.
2302 case SgprP0Call_WF:
2303 case SgprP4Call_WF: {
2304 assert(Ty == getTyFromID(MethodIDs[i]));
2305 if (RB != SgprRB) {
2306 WFI.SgprWaterfallOperandRegs.insert(Reg);
2307
2308 // Find the ADJCALLSTACKUP before the call.
2309 MachineBasicBlock::iterator Start = MI.getIterator();
2310 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
2311 --Start;
2312
2313 // Find the ADJCALLSTACKDOWN after the call (include it in range).
2314 MachineBasicBlock::iterator End = MI.getIterator();
2315 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
2316 ++End;
2317 ++End;
2318
2319 WFI.Start = Start;
2320 WFI.End = End;
2321 }
2322 break;
2323 }
// An sgpr use is left alone. A vgpr use is moved into a new sgpr register
// of the same type with buildReadFirstLane and the operand is rewritten.
// NOTE(review): at least one case label between these two appears to be
// absent from this listing - confirm against the upstream file.
2324 case SgprB32_M0:
2326 case SgprB64_ReadFirstLane: {
2327 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2328 if (RB == SgprRB)
2329 break;
2330 assert(RB == VgprRB);
2331 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2332 buildReadFirstLane(B, NewSGPR, Op.getReg(), RBI);
2333 Op.setReg(NewSGPR);
2334 break;
2335 }
// NOTE(review): the case label(s) opening the block below are absent from
// this listing; the body performs the same read-first-lane rewrite as the
// block above, with the type validated by getTyFromID() - confirm which IDs
// it belongs to.
2338 assert(Ty == getTyFromID(MethodIDs[i]));
2339 if (RB == SgprRB)
2340 break;
2341 assert(RB == VgprRB);
2342 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2343 buildReadFirstLane(B, NewSGPR, Op.getReg(), RBI);
2344 Op.setReg(NewSGPR);
2345 break;
2346 }
2347 // sgpr and vgpr scalars with extend
// Each case below requires the use to be narrower than 32 bits and already
// in the stated bank, then rewrites the operand to a 32-bit extension.
2348 case Sgpr32AExt: {
2349 // Note: this ext allows S1, and it is meant to be combined away.
2350 assert(Ty.getSizeInBits() < 32);
2351 assert(RB == SgprRB);
2352 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
2353 Op.setReg(Aext.getReg(0));
2354 break;
2355 }
2356 case Sgpr32AExtBoolInReg: {
2357 // Note: this ext allows S1, and it is meant to be combined away.
2358 assert(Ty.getSizeInBits() == 1);
2359 assert(RB == SgprRB);
2360 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
2361 // Zext SgprS1 is not legal, make AND with 1 instead. This instruction is
2362 // most of times meant to be combined away in AMDGPURegBankCombiner.
2363 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
2364 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
2365 Op.setReg(BoolInReg.getReg(0));
2366 break;
2367 }
// Unlike Sgpr32AExt, the sgpr sign/zero extends reject S1 (size must be
// greater than 1 bit).
2368 case Sgpr32SExt: {
2369 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
2370 assert(RB == SgprRB);
2371 auto Sext = B.buildSExt(SgprRB_S32, Reg);
2372 Op.setReg(Sext.getReg(0));
2373 break;
2374 }
2375 case Sgpr32ZExt: {
2376 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
2377 assert(RB == SgprRB);
2378 auto Zext = B.buildZExt({SgprRB, S32}, Reg);
2379 Op.setReg(Zext.getReg(0));
2380 break;
2381 }
2382 case Vgpr32AExt: {
2383 assert(Ty.getSizeInBits() < 32);
2384 assert(RB == VgprRB);
2385 auto Aext = B.buildAnyExt({VgprRB, S32}, Reg);
2386 Op.setReg(Aext.getReg(0));
2387 break;
2388 }
2389 case Vgpr32SExt: {
2390 // Note this ext allows S1, and it is meant to be combined away.
2391 assert(Ty.getSizeInBits() < 32);
2392 assert(RB == VgprRB);
2393 auto Sext = B.buildSExt({VgprRB, S32}, Reg);
2394 Op.setReg(Sext.getReg(0));
2395 break;
2396 }
2397 case Vgpr32ZExt: {
2398 // Note this ext allows S1, and it is meant to be combined away.
2399 assert(Ty.getSizeInBits() < 32);
2400 assert(RB == VgprRB);
2401 auto Zext = B.buildZExt({VgprRB, S32}, Reg);
2402 Op.setReg(Zext.getReg(0));
2403 break;
2404 }
2405 default:
// NOTE(review): the line naming the callee for the arguments below is absent
// from this listing; presumably reportGISelFailure( - confirm.
2407 MF, MORE, "amdgpu-regbanklegalize",
2408 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
2409 return false;
2410 }
2411 }
2412 return true;
2413}
2414
2415[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
2416 const RegisterBank *RB,
2418 unsigned StartOpIdx,
2419 unsigned EndOpIdx) {
2420 for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
2421 if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
2422 return false;
2423 }
2424 return true;
2425}
2426
2427bool RegBankLegalizeHelper::applyRegisterBanksVgprWithSgprRsrc(
2428 MachineInstr &MI, unsigned RsrcIdx) {
2429 const unsigned NumDefs = MI.getNumExplicitDefs();
2430
2431 MachineBasicBlock *MBB = MI.getParent();
2432 B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(MI.getIterator())));
2433
2434 // Defs are vgpr.
2435 for (unsigned i = 0; i < NumDefs; ++i) {
2436 Register Reg = MI.getOperand(i).getReg();
2437 if (MRI.getRegBank(Reg) == VgprRB)
2438 continue;
2439
2440 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(Reg)});
2441 MI.getOperand(i).setReg(NewVgprDst);
2442 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
2443 }
2444
2445 B.setInstrAndDebugLoc(MI);
2446
2447 // Register uses before RsrcIdx are vgpr.
2448 for (unsigned i = NumDefs; i < RsrcIdx; ++i) {
2449 MachineOperand &Op = MI.getOperand(i);
2450 if (!Op.isReg())
2451 continue;
2452
2453 Register Reg = Op.getReg();
2454 if (!Reg.isVirtual())
2455 continue;
2456
2457 if (MRI.getRegBank(Reg) == VgprRB)
2458 continue;
2459
2460 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
2461 Op.setReg(Copy.getReg(0));
2462 }
2463
2464 SmallSet<Register, 4> OpsToWaterfall;
2465
2466 // Register use RsrcIdx (and later register operands) is sgpr.
2467 for (unsigned i = RsrcIdx; i < MI.getNumOperands(); ++i) {
2468 MachineOperand &Op = MI.getOperand(i);
2469 if (!Op.isReg())
2470 continue;
2471
2472 Register Reg = Op.getReg();
2473 if (MRI.getRegBank(Reg) != SgprRB)
2474 OpsToWaterfall.insert(Reg);
2475 }
2476
2477 if (!OpsToWaterfall.empty()) {
2478 MachineBasicBlock::iterator MII = MI.getIterator();
2479 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
2480 }
2481
2482 return true;
2483}
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
Provides AMDGPU specific target descriptions.
static bool isSignedBFE(MachineInstr &MI)
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB, MachineRegisterInfo &MRI, unsigned StartOpIdx, unsigned EndOpIdx)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
static Register UseReg(const MachineOperand &MO)
IRTranslator LLVM IR MI
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register Reg
Register const TargetRegisterInfo * TRI
Machine IR instance of the generic uniformity analysis.
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static const LaneMaskConstants & get(const GCNSubtarget &ST)
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
@ ICMP_NE
not equal
Definition InstrTypes.h:762
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:225
iterator end()
Definition DenseMap.h:143
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:286
const SIRegisterInfo * getRegisterInfo() const override
Represents a call to an intrinsic.
Register getSourceReg() const
Get the unmerge source register.
constexpr bool isScalar() const
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
LLT divide(int Factor) const
Return a type that is Factor times smaller.
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
TypeSize getValue() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator SkipPHIsAndLabels(iterator I)
Return the first instruction in MBB after I that is not a PHI or a label.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
BasicBlockListType::iterator iterator
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Helper class to build MachineInstr.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank * getRegBank(Register Reg) const
Return the register bank of Reg.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
Holds all the information related to register banks.
This class implements the register bank concept.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool empty() const
Definition SmallSet.h:169
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
bool isAnyPtr(LLT Ty, unsigned Width)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Bitcast
Perform the operation on a different, but equivalently sized type.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
@ Offset
Definition DWP.cpp:558
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
Definition Utils.cpp:656
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition Utils.cpp:159
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT fits in int64_t returns it.
Definition Utils.cpp:317
LLVM_ABI void reportGISelFailure(MachineFunction &MF, MachineOptimizationRemarkEmitter &MORE, MachineOptimizationRemarkMissed &R)
Report an ISel error as a missed optimization remark to the LLVMContext's diagnostic stream.
Definition Utils.cpp:261
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
constexpr T maskTrailingZeros(unsigned N)
Create a bitmask with the N right-most bits set to 0, and all other bits set to 1.
Definition MathExtras.h:94
@ Add
Sum of integers.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:436
Align assumeAligned(uint64_t Value)
Treats the value 0 as a 1, so Align is always at least 1.
Definition Alignment.h:100
LLVM_ABI Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
Definition Utils.cpp:504
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
Holds waterfall loop information: the set of SGPR operand registers that need waterfalling,...
MachineBasicBlock::iterator Start
SmallSet< Register, 4 > SgprWaterfallOperandRegs
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77