LLVM 23.0.0git
AMDGPURegBankLegalizeHelper.cpp
Go to the documentation of this file.
1//===-- AMDGPURegBankLegalizeHelper.cpp -----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Implements actual lowering algorithms for each ID that can be used in
10/// Rule.OperandMapping. Similar to legalizer helper but with register banks.
11//
12//===----------------------------------------------------------------------===//
13
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
20#include "GCNSubtarget.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30
31#define DEBUG_TYPE "amdgpu-regbanklegalize"
32
33using namespace llvm;
34using namespace AMDGPU;
35
// Constructor (the opening lines of the signature are not visible in this
// extract). The initializer list derives the machine function from the
// builder B, fetches the SIMachineFunctionInfo and GCNSubtarget from it,
// stores the MRI/MUI/VT/RBI/RBLRules arguments, records whether the subtarget
// is wave32, and caches pointers to the SGPR, VGPR, AGPR and VCC register
// banks looked up through RBI. The body is empty.
39 const RegBankLegalizeRules &RBLRules)
40 : MF(B.getMF()), MFI(MF.getInfo<SIMachineFunctionInfo>()),
41 ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()), B(B),
42 MRI(*B.getMRI()), MUI(MUI), VT(VT), RBI(RBI), MORE(MF, nullptr),
43 RBLRules(RBLRules), IsWave32(ST.isWave32()),
44 SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
45 VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
46 AgprRB(&RBI.getRegBank(AMDGPU::AGPRRegBankID)),
47 VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
48
// Body of the per-instruction driver (its signature line is not visible in
// this extract; MI is the instruction being legalized).
// Steps, each of which returns false on failure:
//   1. look up the rule set for MI's opcode,
//   2. pick the mapping within that rule set that matches MI,
//   3. apply the destination mapping (inserting after MI) and then the source
//      mapping (inserting before MI),
//   4. run the lowering for the chosen mapping,
//   5. if any operand was recorded for waterfalling, wrap the recorded range
//      in a waterfall loop.
50 const SetOfRulesForOpcode *RuleSet = RBLRules.getRulesForOpc(MI);
51 if (!RuleSet) {
52 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
53 "No AMDGPU RegBankLegalize rules defined for opcode",
54 MI);
55 return false;
56 }
57
58 const RegBankLLTMapping *Mapping = RuleSet->findMappingForMI(MI, MRI, MUI);
59 if (!Mapping) {
60 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
61 "AMDGPU RegBankLegalize: none of the rules defined with "
62 "'Any' for MI's opcode matched MI",
63 MI);
64 return false;
65 }
66
// WFI collects operands that need a waterfall loop; OpIdx is shared by the
// destination and source mapping calls below.
67 WaterfallInfo WFI;
68 unsigned OpIdx = 0;
69 if (!Mapping->DstOpMapping.empty()) {
// Code produced for destinations is inserted right after MI.
70 B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
71 if (!applyMappingDst(MI, OpIdx, Mapping->DstOpMapping))
72 return false;
73 }
74 if (!Mapping->SrcOpMapping.empty()) {
// Code produced for sources is inserted right before MI.
75 B.setInstr(MI);
76 if (!applyMappingSrc(MI, OpIdx, Mapping->SrcOpMapping, WFI))
77 return false;
78 }
79
80 if (!lower(MI, *Mapping, WFI))
81 return false;
82
// Only build the loop when at least one register was registered for it.
83 if (!WFI.SgprWaterfallOperandRegs.empty()) {
84 if (!executeInWaterfallLoop(B, WFI))
85 return false;
86 }
87
88 return true;
89}
90
// Wraps the instruction range recorded in WFI in a waterfall loop so that
// every register listed in WFI.SgprWaterfallOperandRegs is replaced, inside
// the loop body, by an SGPR value obtained with a readfirstlane. The current
// block is split into LoopBB / BodyBB / RestoreExecBB / RemainderBB as drawn
// in the diagram below. On return the builder's insertion point is at the
// start of RemainderBB. Always returns true.
//
// NOTE(review): several declarations used below (BeginIt, EndIt, LMC, LoopBB,
// BodyBB, MBBI, OpParts) sit on lines that are not visible in this extract.
91bool RegBankLegalizeHelper::executeInWaterfallLoop(MachineIRBuilder &B,
92 const WaterfallInfo &WFI) {
93 assert(WFI.Start.isValid() && WFI.End.isValid() &&
94 "Waterfall range not initialized");
95
96 // Track use registers which have already been expanded with a readfirstlane
97 // sequence. This may have multiple uses if moving a sequence.
98 DenseMap<Register, Register> WaterfalledRegMap;
99
100 MachineBasicBlock &MBB = B.getMBB();
101 MachineFunction &MF = B.getMF();
102
105
106 const SIRegisterInfo *TRI = ST.getRegisterInfo();
107 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
109
// Debug-only: remember the range length so it can be checked after splicing.
110#ifndef NDEBUG
111 const int OrigRangeSize = std::distance(BeginIt, EndIt);
112#endif
113
114 MachineRegisterInfo &MRI = *B.getMRI();
115 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
116 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
117
118 // Don't bother using generic instructions/registers for the exec mask.
119 B.setInstr(*WFI.Start);
120 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
121
122 Register SavedExec = MRI.createVirtualRegister(WaveRC);
123
124 // To insert the loop we need to split the block. Move everything before
125 // this point to a new block, and insert a new empty block before this
126 // instruction.
129 MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
130 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
132 ++MBBI;
133 MF.insert(MBBI, LoopBB);
134 MF.insert(MBBI, BodyBB);
135 MF.insert(MBBI, RestoreExecBB);
136 MF.insert(MBBI, RemainderBB);
137
// CFG edges: LoopBB -> BodyBB, BodyBB -> {RestoreExecBB, LoopBB}.
138 LoopBB->addSuccessor(BodyBB);
139 BodyBB->addSuccessor(RestoreExecBB);
140 BodyBB->addSuccessor(LoopBB);
141
142 // Move the rest of the block into a new block.
144 RemainderBB->splice(RemainderBB->begin(), &MBB, EndIt, MBB.end());
145
146 MBB.addSuccessor(LoopBB);
147 RestoreExecBB->addSuccessor(RemainderBB);
148
149 B.setInsertPt(*LoopBB, LoopBB->end());
150
151 // +-MBB:------------+
152 // | ... |
153 // | %0 = G_INST_1 |
154 // | %Dst = MI %Vgpr |
155 // | %1 = G_INST_2 |
156 // | ... |
157 // +-----------------+
158 // ->
159 // +-MBB-------------------------------+
160 // | ... |
161 // | %0 = G_INST_1 |
162 // | %SaveExecReg = S_MOV_B32 $exec_lo |
163 // +----------------|------------------+
164 // | /------------------------------|
165 // V V |
166 // +-LoopBB---------------------------------------------------------------+ |
167 // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
168 // | instead of executing for each lane, see if other lanes had | |
169 // | same value for %Vgpr and execute for them also. | |
170 // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
171 // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
172 // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
173 // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
174 // +----------------|-----------------------------------------------------+ |
175 // V |
176 // +-BodyBB------------------------------------------------------------+ |
177 // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
178 // | executed only for active lanes and written to Dst | |
179 // | $exec = S_XOR_B32 $exec, %SavedExec | |
180 // | set active lanes to 0 in SavedExec, lanes that did not write to | |
181 // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
182 // | SI_WATERFALL_LOOP LoopBB |-----|
183 // +----------------|--------------------------------------------------+
184 // V
185 // +-RestoreExecBB--------------------------+
186 // | $exec_lo = S_MOV_B32_term %SaveExecReg |
187 // +----------------|-----------------------+
188 // V
189 // +-RemainderBB:----------------------+
190 // | %1 = G_INST_2 |
191 // | ... |
192 // +---------------------------------- +
193
194 // Move the instruction into the loop body. Note we moved everything after
195 // Range.end() already into a new block, so Range.end() is no longer valid.
196 BodyBB->splice(BodyBB->end(), &MBB, BeginIt, MBB.end());
197
198 // Figure out the iterator range after splicing the instructions.
199 MachineBasicBlock::iterator NewBegin = BeginIt;
200 auto NewEnd = BodyBB->end();
201 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
202
// The readfirstlane/compare sequence is emitted into LoopBB; CondReg
// accumulates the AND of all per-part equality compares.
203 B.setMBB(*LoopBB);
204 Register CondReg;
205
206 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
207 for (MachineOperand &Op : MI.all_uses()) {
208 Register OldReg = Op.getReg();
// Only registers explicitly registered for waterfalling are rewritten.
209 if (!WFI.SgprWaterfallOperandRegs.count(OldReg))
210 continue;
211
212 // See if we already processed this register in another instruction in
213 // the sequence.
214 auto OldVal = WaterfalledRegMap.find(OldReg);
215 if (OldVal != WaterfalledRegMap.end()) {
216 Op.setReg(OldVal->second);
217 continue;
218 }
219
220 Register OpReg = Op.getReg();
221 LLT OpTy = MRI.getType(OpReg);
222
223 // TODO: support for agpr
224 assert(MRI.getRegBank(OpReg) == VgprRB);
225 Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
226 buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
227
228 // Build the comparison(s), CurrentLaneReg == OpReg.
// The value is compared in 64-bit parts when its size is a multiple of 64,
// otherwise in 32-bit parts.
229 unsigned OpSize = OpTy.getSizeInBits();
230 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
231 LLT PartTy = LLT::scalar(PartSize);
232 unsigned NumParts = OpSize / PartSize;
234 SmallVector<Register, 8> CurrentLaneParts;
235
236 if (NumParts == 1) {
237 OpParts.push_back(OpReg);
238 CurrentLaneParts.push_back(CurrentLaneReg);
239 } else {
240 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
241 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
242 for (unsigned i = 0; i < NumParts; ++i) {
243 OpParts.push_back(UnmergeOp.getReg(i));
244 CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
245 }
246 }
247
248 for (unsigned i = 0; i < NumParts; ++i) {
249 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
250 B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
251
// First compare seeds CondReg; later ones are ANDed in.
252 if (!CondReg)
253 CondReg = CmpReg;
254 else
255 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
256 }
257
258 Op.setReg(CurrentLaneReg);
259
260 // Make sure we don't re-process this register again.
261 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
262 }
263 }
264
265 // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
266 Register CondRegLM =
267 MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
268 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
269
270 // Update EXEC, save the original EXEC value to SavedExec.
271 B.buildInstr(LMC.AndSaveExecOpc)
272 .addDef(SavedExec)
273 .addReg(CondRegLM, RegState::Kill);
274 MRI.setSimpleHint(SavedExec, CondRegLM);
275
// Everything below up to SI_WATERFALL_LOOP is appended to the end of BodyBB.
276 B.setInsertPt(*BodyBB, BodyBB->end());
277
278 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
279 B.buildInstr(LMC.XorTermOpc)
280 .addDef(LMC.ExecReg)
281 .addReg(LMC.ExecReg)
282 .addReg(SavedExec);
283
284 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
285 // s_cbranch_scc0?
286
287 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
288 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
289
290 // Save the EXEC mask before the loop.
291 B.setInsertPt(MBB, MBB.end());
292 B.buildInstr(LMC.MovOpc).addDef(SaveExecReg).addReg(LMC.ExecReg);
293
294 // Restore the EXEC mask after the loop.
295 B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
296 B.buildInstr(LMC.MovTermOpc).addDef(LMC.ExecReg).addReg(SaveExecReg);
297
298 // Set the insert point after the original instruction, so any new
299 // instructions will be in the remainder.
300 B.setInsertPt(*RemainderBB, RemainderBB->begin());
301
302 return true;
303}
304
305// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
306// the three offsets (voffset, soffset and instoffset)
//
// Outputs are written through VOffsetReg, SOffsetReg and InstOffsetVal.
// Returns SOffset + ImmOffset when CombinedOffset is a constant that
// splitMUBUFOffset accepts, and 0 on every other path. InstOffsetVal is left
// untouched on the "sgpr + vgpr add" path and on the final fallback path.
307unsigned RegBankLegalizeHelper::setBufferOffsets(
308 MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
309 Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) {
// Case 1: the whole offset is a known constant.
310 if (std::optional<int64_t> Imm =
311 getIConstantVRegSExtVal(CombinedOffset, MRI)) {
312 uint32_t SOffset, ImmOffset;
313 if (TII.splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) {
314 VOffsetReg = B.buildConstant({VgprRB, S32}, 0).getReg(0);
315 SOffsetReg = B.buildConstant({SgprRB, S32}, SOffset).getReg(0);
316 InstOffsetVal = ImmOffset;
317 return SOffset + ImmOffset;
318 }
319 }
// Case 2: base register plus constant offset.
// NOTE(review): the line that declares Base and Offset (start of the call
// whose arguments follow) is not visible in this extract.
320 const bool CheckNUW = ST.hasGFX1250Insts();
322 MRI, CombinedOffset, /*KnownBits=*/nullptr,
323 /*CheckNUW=*/CheckNUW);
324 uint32_t SOffset, ImmOffset;
325 if (static_cast<int32_t>(Offset) > 0 &&
326 TII.splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
// A VGPR base becomes voffset; the split constant goes to soffset/imm.
327 if (Base.isValid() && MRI.getRegBank(Base) == VgprRB) {
328 VOffsetReg = Base;
329 SOffsetReg = B.buildConstant({SgprRB, S32}, SOffset).getReg(0);
330 InstOffsetVal = ImmOffset;
331 return 0;
332 }
333 // If we have SGPR base, we can use it for soffset.
334 if (SOffset == 0) {
335 VOffsetReg = B.buildConstant({VgprRB, S32}, 0).getReg(0);
336 SOffsetReg = Base;
337 InstOffsetVal = ImmOffset;
338 return 0;
339 }
340 }
341 // Handle the variable sgpr + vgpr case.
342 MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, MRI);
// When CheckNUW is set, the add is only decomposed if it carries NoUWrap.
343 if (Add && static_cast<int32_t>(Offset) >= 0 &&
344 (!CheckNUW || Add->getFlag(MachineInstr::NoUWrap))) {
345 Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), MRI);
346 Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), MRI);
347 const RegisterBank *Src0Bank = MRI.getRegBank(Src0);
348 const RegisterBank *Src1Bank = MRI.getRegBank(Src1);
349 if (Src0Bank == VgprRB && Src1Bank == SgprRB) {
350 VOffsetReg = Src0;
351 SOffsetReg = Src1;
352 return 0;
353 }
354 if (Src0Bank == SgprRB && Src1Bank == VgprRB) {
355 VOffsetReg = Src1;
356 SOffsetReg = Src0;
357 return 0;
358 }
359 }
360 // Ensure we have a VGPR for the combined offset. This could be an issue if we
361 // have an SGPR offset and a VGPR resource.
362 if (MRI.getRegBank(CombinedOffset) == VgprRB) {
363 VOffsetReg = CombinedOffset;
364 } else {
365 VOffsetReg = B.buildCopy({VgprRB, S32}, CombinedOffset).getReg(0);
366 }
367 SOffsetReg = B.buildConstant({SgprRB, S32}, 0).getReg(0);
368 return 0;
369}
370
371bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
372 ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
373 MachineFunction &MF = B.getMF();
374 assert(MI.getNumMemOperands() == 1);
375 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
376 Register Dst = MI.getOperand(0).getReg();
377 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
378 Register Base = MI.getOperand(1).getReg();
379 LLT PtrTy = MRI.getType(Base);
380 const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
381 LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
382 SmallVector<Register, 4> LoadPartRegs;
383
384 unsigned ByteOffset = 0;
385 for (LLT PartTy : LLTBreakdown) {
386 Register BasePlusOffset;
387 if (ByteOffset == 0) {
388 BasePlusOffset = Base;
389 } else {
390 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
391 BasePlusOffset =
392 B.buildObjectPtrOffset({PtrRB, PtrTy}, Base, Offset).getReg(0);
393 }
394 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
395 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
396 LoadPartRegs.push_back(LoadPart.getReg(0));
397 ByteOffset += PartTy.getSizeInBytes();
398 }
399
400 if (!MergeTy.isValid()) {
401 // Loads are of same size, concat or merge them together.
402 B.buildMergeLikeInstr(Dst, LoadPartRegs);
403 } else {
404 // Loads are not all of same size, need to unmerge them to smaller pieces
405 // of MergeTy type, then merge pieces to Dst.
406 SmallVector<Register, 4> MergeTyParts;
407 for (Register Reg : LoadPartRegs) {
408 if (MRI.getType(Reg) == MergeTy) {
409 MergeTyParts.push_back(Reg);
410 } else {
411 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
412 for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
413 MergeTyParts.push_back(Unmerge.getReg(i));
414 }
415 }
416 B.buildMergeLikeInstr(Dst, MergeTyParts);
417 }
418 MI.eraseFromParent();
419 return true;
420}
421
422bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
423 LLT MergeTy) {
424 MachineFunction &MF = B.getMF();
425 assert(MI.getNumMemOperands() == 1);
426 MachineMemOperand &BaseMMO = **MI.memoperands_begin();
427 Register Dst = MI.getOperand(0).getReg();
428 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
429 Register Base = MI.getOperand(1).getReg();
430
431 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
432 auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);
433
434 if (WideTy.isScalar()) {
435 B.buildTrunc(Dst, WideLoad);
436 } else {
437 SmallVector<Register, 4> MergeTyParts;
438 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
439
440 LLT DstTy = MRI.getType(Dst);
441 unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
442 for (unsigned i = 0; i < NumElts; ++i) {
443 MergeTyParts.push_back(Unmerge.getReg(i));
444 }
445 B.buildMergeLikeInstr(Dst, MergeTyParts);
446 }
447 MI.eraseFromParent();
448 return true;
449}
450
451bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
452 Register Dst = MI.getDstReg();
453 Register Ptr = MI.getPointerReg();
454 MachineMemOperand &MMO = MI.getMMO();
455 unsigned MemSize = 8 * MMO.getSize().getValue();
456
457 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
458
459 if (MI.getOpcode() == G_LOAD) {
460 B.buildLoad(Dst, Ptr, *WideMMO);
461 } else {
462 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
463
464 if (MI.getOpcode() == G_ZEXTLOAD) {
465 APInt Mask = APInt::getLowBitsSet(S32.getSizeInBits(), MemSize);
466 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
467 B.buildAnd(Dst, Load, MaskCst);
468 } else {
469 assert(MI.getOpcode() == G_SEXTLOAD);
470 B.buildSExtInReg(Dst, Load, MemSize);
471 }
472 }
473
474 MI.eraseFromParent();
475 return true;
476}
477
// Lowers an extension of the condition value in operand 1 into a select
// between two VGPR constants. The "true" constant is -1 for G_SEXT and 1 for
// every other opcode; the "false" constant is 0.
//  - S32 / S16 destination: a single select.
//  - S64 destination: select the low 32 bits, pick the high half according to
//    the opcode, then merge the two halves.
// Any other type, or an unsupported opcode in the S64 path, is reported as a
// failure and false is returned without erasing MI. On success MI is erased
// and true is returned.
478bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
479 Register Dst = MI.getOperand(0).getReg();
480 LLT Ty = MRI.getType(Dst);
481 Register Src = MI.getOperand(1).getReg();
482 unsigned Opc = MI.getOpcode();
483 int TrueExtCst = Opc == G_SEXT ? -1 : 1;
484 if (Ty == S32 || Ty == S16) {
485 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
486 auto False = B.buildConstant({VgprRB, Ty}, 0);
487 B.buildSelect(Dst, Src, True, False);
488 } else if (Ty == S64) {
489 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
490 auto False = B.buildConstant({VgprRB_S32}, 0);
491 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
492 MachineInstrBuilder Hi;
// High half: sext repeats the low half, zext uses the zero constant, anyext
// leaves it undefined.
493 switch (Opc) {
494 case G_SEXT:
495 Hi = Lo;
496 break;
497 case G_ZEXT:
498 Hi = False;
499 break;
500 case G_ANYEXT:
501 Hi = B.buildUndef({VgprRB_S32});
502 break;
503 default:
// NOTE(review): the callee name sits on a line not visible in this extract;
// the arguments match the reportGISelFailure calls elsewhere in this file.
505 MF, MORE, "amdgpu-regbanklegalize",
506 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported", MI);
507 return false;
508 }
509
510 B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
511 } else {
// NOTE(review): callee name not visible here either; see the note above.
513 MF, MORE, "amdgpu-regbanklegalize",
514 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
515 return false;
516 }
517
518 MI.eraseFromParent();
519 return true;
520}
521
522std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
523 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
524 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
525 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
526 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
527 return {Lo.getReg(0), Hi.getReg(0)};
528}
529
530std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
531 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
532 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
533 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
534 return {Lo.getReg(0), Hi.getReg(0)};
535}
536
537std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
538 auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
539 auto Lo = PackedS32;
540 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
541 return {Lo.getReg(0), Hi.getReg(0)};
542}
543
544std::pair<Register, Register>
545RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
546 auto [Lo32, Hi32] = unpackAExt(Reg);
547 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
548 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
549}
550
// Lowers a shift whose operands are unpacked into two 32-bit halves: the
// value (operand 1) and the shift amount (operand 2) are unpacked with the
// same helper, the shift is performed on each half as SGPR S32, and the two
// results are recombined into the destination with a build-vector-trunc.
// The unpack flavour depends on the opcode: any-extend for G_SHL, zero-extend
// for G_LSHR, sign-extend for G_ASHR. Other opcodes are reported as a failure
// and false is returned without erasing MI.
551bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
552 Register Lo, Hi;
553 switch (MI.getOpcode()) {
554 case AMDGPU::G_SHL: {
555 auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
556 auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
557 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
558 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
559 break;
560 }
561 case AMDGPU::G_LSHR: {
562 auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
563 auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
564 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
565 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
566 break;
567 }
568 case AMDGPU::G_ASHR: {
569 auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
570 auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
571 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
572 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
573 break;
574 }
575 default:
// NOTE(review): the callee name sits on a line not visible in this extract;
// the arguments match the reportGISelFailure calls elsewhere in this file.
577 MF, MORE, "amdgpu-regbanklegalize",
578 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
579 MI);
580 return false;
581 }
582 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
583 MI.eraseFromParent();
584 return true;
585}
586
// Lowers a min/max on a value that is unpacked into two 32-bit halves: both
// source operands are unpacked (sign-extended for G_SMIN/G_SMAX,
// zero-extended for G_UMIN/G_UMAX), the original opcode is applied to each
// pair of halves as SGPR S32, and the two results are recombined into the
// destination with a build-vector-trunc. Other opcodes are reported as a
// failure and false is returned without erasing MI.
587bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
588 Register Lo, Hi;
589 switch (MI.getOpcode()) {
590 case AMDGPU::G_SMIN:
591 case AMDGPU::G_SMAX: {
592 // For signed operations, use sign extension
593 auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
594 auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
595 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
596 .getReg(0);
597 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
598 .getReg(0);
599 break;
600 }
601 case AMDGPU::G_UMIN:
602 case AMDGPU::G_UMAX: {
603 // For unsigned operations, use zero extension
604 auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
605 auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
606 Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
607 .getReg(0);
608 Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
609 .getReg(0);
610 break;
611 }
612 default:
// NOTE(review): the callee name sits on a line not visible in this extract;
// the arguments match the reportGISelFailure calls elsewhere in this file.
614 MF, MORE, "amdgpu-regbanklegalize",
615 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
616 return false;
617 }
618 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
619 MI.eraseFromParent();
620 return true;
621}
622
623bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
624 auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
625 auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
626 auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
627 auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
628 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
629 {ResLo.getReg(0), ResHi.getReg(0)});
630 MI.eraseFromParent();
631 return true;
632}
633
634bool RegBankLegalizeHelper::lowerSBufToBuf(MachineInstr &MI,
635 WaterfallInfo &WFI) {
636 Register Dst = MI.getOperand(0).getReg();
637 LLT Ty = MRI.getType(Dst);
638 const RegisterBank *RSrcBank = MRI.getRegBank(MI.getOperand(1).getReg());
639 unsigned LoadSize = Ty.getSizeInBits();
640 int NumLoads = 1;
641 SmallVector<Register, 4> LoadParts;
642 if (LoadSize == 256 || LoadSize == 512) {
643 NumLoads = LoadSize / 128;
644 Ty = Ty.divide(NumLoads);
645 }
646 for (int i = 0; i < NumLoads; ++i)
647 LoadParts.emplace_back(MRI.createVirtualRegister({VgprRB, Ty}));
648 MachineMemOperand *OrigMMO = *MI.memoperands_begin();
649 const Align Alignment = OrigMMO->getAlign();
650 MachineFunction &MF = B.getMF();
651 Register SOffset;
652 Register VOffset;
653 int64_t ImmOffset = 0;
654 unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset,
655 SOffset, ImmOffset, Alignment);
656 // Use the MMO size from the original instruction rather than the (possibly
657 // widened) register type. E.g. 96-bit loads are widened to 128-bit during
658 // legalization but the MMO still reflects the original 96-bit access size.
659 const unsigned MemSize = divideCeil(OrigMMO->getSize().getValue(), NumLoads);
660 MachineMemOperand *BaseMMO = MF.getMachineMemOperand(OrigMMO, 0, MemSize);
661 if (MMOOffset != 0)
662 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
663 // If only the offset is divergent, emit a MUBUF buffer load
664 // instead. We can assume that the buffer is unswizzled.
665 Register RSrc = MI.getOperand(1).getReg();
666 Register VIndex = B.buildConstant(VgprRB_S32, 0).getReg(0);
667 unsigned Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
668 switch (MI.getOpcode()) {
669 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
670 Opc = G_AMDGPU_BUFFER_LOAD_SBYTE;
671 break;
672 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
673 Opc = G_AMDGPU_BUFFER_LOAD_UBYTE;
674 break;
675 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT:
676 Opc = G_AMDGPU_BUFFER_LOAD_SSHORT;
677 break;
678 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
679 Opc = G_AMDGPU_BUFFER_LOAD_USHORT;
680 break;
681 default:
682 break;
683 }
684 for (int i = 0; i < NumLoads; ++i) {
685 B.buildInstr(Opc)
686 .addDef(LoadParts[i]) // vdata
687 .addUse(RSrc) // rsrc
688 .addUse(VIndex) // vindex
689 .addUse(VOffset) // voffset
690 .addUse(SOffset) // soffset
691 .addImm(ImmOffset + 16 * i) // offset(imm)
692 .addImm(0) // cachepolicy, swizzled buffer(imm)
693 .addImm(0) // idxen(imm)
694 .addMemOperand(MF.getMachineMemOperand(BaseMMO, 16 * i, MemSize));
695 }
696 if (NumLoads == 1)
697 B.buildCopy(Dst, LoadParts[0]);
698 else
699 B.buildMergeLikeInstr(Dst, LoadParts);
700 B.setInstr(*MRI.getVRegDef(LoadParts[0]));
701 if (RSrcBank != SgprRB) {
702 WFI.SgprWaterfallOperandRegs.insert(RSrc);
703 WFI.Start = MRI.getVRegDef(LoadParts.front());
704 WFI.End = std::next(MRI.getVRegDef(LoadParts.back())->getIterator());
705 }
706 MI.eraseFromParent();
707 return true;
708}
709
// Tail of a helper that lowerV_BFE and lowerS_BFE call as isSignedBFE(MI);
// its signature and the line introducing GI are not visible in this extract.
// The first return reports whether GI is the amdgcn_sbfe intrinsic; the
// second return reports whether MI's opcode is G_SBFX.
// NOTE(review): presumably GI is MI viewed as a GIntrinsic and the first
// return is guarded by that check — confirm against the full source.
712 return (GI->is(Intrinsic::amdgcn_sbfe));
713
714 return MI.getOpcode() == AMDGPU::G_SBFX;
715}
716
717bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
718 Register Dst = MI.getOperand(0).getReg();
719 assert(MRI.getType(Dst) == LLT::scalar(64));
720 bool Signed = isSignedBFE(MI);
721 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
722 // Extract bitfield from Src, LSBit is the least-significant bit for the
723 // extraction (field offset) and Width is size of bitfield.
724 Register Src = MI.getOperand(FirstOpnd).getReg();
725 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
726 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
727 // Comments are for signed bitfield extract, similar for unsigned. x is sign
728 // bit. s is sign, l is LSB and y are remaining bits of bitfield to extract.
729
730 // Src >> LSBit Hi|Lo: x?????syyyyyyl??? -> xxxx?????syyyyyyl
731 unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
732 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
733
734 auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);
735
736 // Expand to Src >> LSBit << (64 - Width) >> (64 - Width)
737 // << (64 - Width): Hi|Lo: xxxx?????syyyyyyl -> syyyyyyl000000000
738 // >> (64 - Width): Hi|Lo: syyyyyyl000000000 -> ssssssssssyyyyyyl
739 if (!ConstWidth) {
740 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
741 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
742 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
743 MI.eraseFromParent();
744 return true;
745 }
746
747 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
748 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
749 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
750 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
751 auto Zero = B.buildConstant({VgprRB, S32}, 0);
752 unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
753
754 if (WidthImm <= 32) {
755 // SHRSrc Hi|Lo: ????????|???syyyl -> ????????|ssssyyyl
756 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
757 MachineInstrBuilder Hi;
758 if (Signed) {
759 // SHRSrc Hi|Lo: ????????|ssssyyyl -> ssssssss|ssssyyyl
760 Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
761 } else {
762 // SHRSrc Hi|Lo: ????????|000syyyl -> 00000000|000syyyl
763 Hi = Zero;
764 }
765 B.buildMergeLikeInstr(Dst, {Lo, Hi});
766 } else {
767 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
768 // SHRSrc Hi|Lo: ??????sy|yyyyyyyl -> sssssssy|yyyyyyyl
769 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
770 B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
771 }
772
773 MI.eraseFromParent();
774 return true;
775}
776
777bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
778 Register DstReg = MI.getOperand(0).getReg();
779 LLT Ty = MRI.getType(DstReg);
780 bool Signed = isSignedBFE(MI);
781 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
782 Register Src = MI.getOperand(FirstOpnd).getReg();
783 Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
784 Register Width = MI.getOperand(FirstOpnd + 2).getReg();
785 // For uniform bit field extract there are 4 available instructions, but
786 // LSBit(field offset) and Width(size of bitfield) need to be packed in S32,
787 // field offset in low and size in high 16 bits.
788
789 // Src1 Hi16|Lo16 = Size|FieldOffset
790 auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
791 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
792 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
793 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
794 unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
795 unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
796 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
797
798 // Select machine instruction, because of reg class constraining, insert
799 // copies from reg class to reg bank.
800 auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
801 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
802 constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
803 *ST.getRegisterInfo(), RBI);
804
805 B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
806 MI.eraseFromParent();
807 return true;
808}
809
810bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
811 Register Dst = MI.getOperand(0).getReg();
812 LLT DstTy = MRI.getType(Dst);
813 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
814 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
815 auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
816 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
817 unsigned Opc = MI.getOpcode();
818 auto Flags = MI.getFlags();
819 auto Lo =
820 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
821 auto Hi =
822 B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
823 B.buildMergeLikeInstr(Dst, {Lo, Hi});
824 MI.eraseFromParent();
825 return true;
826}
827
828bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &MI) {
829 Register Dst = MI.getOperand(0).getReg();
830 assert(MRI.getType(Dst) == S64);
831 auto Op1 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(1).getReg());
832 auto Op2 = B.buildUnmerge({VgprRB_S32}, MI.getOperand(2).getReg());
833
834 // TODO: G_AMDGPU_MAD_* optimizations for G_MUL divergent S64 operation to
835 // match GlobalISel with old regbankselect.
836 auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
837 auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
838 auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
839 auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
840 auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
841 auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
842
843 B.buildMergeLikeInstr(Dst, {Lo, Hi});
844 MI.eraseFromParent();
845 return true;
846}
847
848bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
849 Register Dst = MI.getOperand(0).getReg();
850 assert(MRI.getType(Dst) == V2S16);
851 unsigned Opc = MI.getOpcode();
852 unsigned NumOps = MI.getNumOperands();
853 auto Flags = MI.getFlags();
854
855 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
856
857 if (NumOps == 2) {
858 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
859 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
860 B.buildMergeLikeInstr(Dst, {Lo, Hi});
861 MI.eraseFromParent();
862 return true;
863 }
864
865 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());
866
867 if (NumOps == 3) {
868 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
869 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
870 B.buildMergeLikeInstr(Dst, {Lo, Hi});
871 MI.eraseFromParent();
872 return true;
873 }
874
875 assert(NumOps == 4);
876 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(MI.getOperand(3).getReg());
877 auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo}, Flags);
878 auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi}, Flags);
879 B.buildMergeLikeInstr(Dst, {Lo, Hi});
880 MI.eraseFromParent();
881 return true;
882}
883
884bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &MI) {
885 Register Dst0 = MI.getOperand(0).getReg();
886 Register Dst1 = MI.getOperand(1).getReg();
887 Register Src0 = MI.getOperand(2).getReg();
888 Register Src1 = MI.getOperand(3).getReg();
889 Register Src2 = MI.getOperand(4).getReg();
890
891 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
892
893 // Keep the multiplication on the SALU.
894 Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
895 Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
896 if (ST.hasScalarMulHiInsts()) {
897 B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
898 } else {
899 auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
900 auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
901 auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
902 buildReadAnyLane(B, DstHi, MulHi.getReg(0), RBI);
903 }
904
905 // Accumulate and produce the "carry-out" bit.
906
907 // The "carry-out" is defined as bit 64 of the result when computed as a
908 // big integer. For unsigned multiply-add, this matches the usual
909 // definition of carry-out.
910 if (mi_match(Src2, MRI, MIPatternMatch::m_ZeroInt())) {
911 // No accumulate: result is just the multiplication, carry is 0.
912 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
913 B.buildConstant(Dst1, 0);
914 } else {
915 // Accumulate: add Src2 to the multiplication result with carry chain.
916 Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
917 Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
918 B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
919
920 auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
921 auto AddHi =
922 B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
923 B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
924 B.buildCopy(Dst1, AddHi.getReg(1));
925 }
926
927 MI.eraseFromParent();
928 return true;
929}
930
931bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
932 Register Dst = MI.getOperand(0).getReg();
933 LLT DstTy = MRI.getType(Dst);
934 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
935 (DstTy.isPointer() && DstTy.getSizeInBits() == 64));
936 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
937 auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
938 auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
939 Register Cond = MI.getOperand(1).getReg();
940 auto Flags = MI.getFlags();
941 auto Lo =
942 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
943 auto Hi =
944 B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);
945
946 B.buildMergeLikeInstr(Dst, {Lo, Hi});
947 MI.eraseFromParent();
948 return true;
949}
950
951bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
952 auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
953 int Amt = MI.getOperand(2).getImm();
954 Register Lo, Hi;
955 // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
956 if (Amt <= 32) {
957 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
958 if (Amt == 32) {
959 // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
960 Lo = Freeze.getReg(0);
961 } else {
962 // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
963 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
964 }
965
966 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
967 Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
968 } else {
969 // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
970 Lo = Op1.getReg(0);
971 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
972 }
973
974 B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
975 MI.eraseFromParent();
976 return true;
977}
978
979bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &MI) {
980 // Split 64-bit find-first-bit operations into 32-bit halves:
981 // (ffbh hi:lo) -> umin(ffbh(hi), uaddsat(ffbh(lo), 32))
982 // (ffbl hi:lo) -> umin(ffbl(lo), uaddsat(ffbl(hi), 32))
983 // (ctlz_zero_poison hi:lo) -> umin(ffbh(hi), add(ffbh(lo), 32))
984 // (cttz_zero_poison hi:lo) -> umin(ffbl(lo), add(ffbl(hi), 32))
985 unsigned Opc = MI.getOpcode();
986
987 // FFBH/FFBL return 0xFFFFFFFF on zero input, using uaddsat to avoid
988 // wrapping. CTLZ/CTTZ guarantee non-zero input (zero_poison), so plain add
989 // is fine.
990 unsigned FFBOpc;
991 unsigned AddOpc;
992 bool SearchFromMSB;
993 switch (Opc) {
994 case AMDGPU::G_AMDGPU_FFBH_U32:
995 FFBOpc = Opc;
996 AddOpc = AMDGPU::G_UADDSAT;
997 SearchFromMSB = true;
998 break;
999 case AMDGPU::G_AMDGPU_FFBL_B32:
1000 FFBOpc = Opc;
1001 AddOpc = AMDGPU::G_UADDSAT;
1002 SearchFromMSB = false;
1003 break;
1004 case AMDGPU::G_CTLZ_ZERO_POISON:
1005 FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
1006 AddOpc = AMDGPU::G_ADD;
1007 SearchFromMSB = true;
1008 break;
1009 case AMDGPU::G_CTTZ_ZERO_POISON:
1010 FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
1011 AddOpc = AMDGPU::G_ADD;
1012 SearchFromMSB = false;
1013 break;
1014 default:
1015 llvm_unreachable("unexpected opcode in lowerSplitBitCount64To32");
1016 }
1017
1018 auto Unmerge = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
1019 Register Lo = Unmerge.getReg(0);
1020 Register Hi = Unmerge.getReg(1);
1021
1022 // MSB-first (FFBH/CTLZ) searches hi first; LSB-first (FFBL/CTTZ) searches
1023 // lo first. The secondary half adds 32 to account for the primary half's
1024 // width.
1025 auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Hi : Lo});
1026 auto Secondary =
1027 B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Lo : Hi});
1028
1029 auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
1030 {Secondary, B.buildConstant(VgprRB_S32, 32)});
1031 B.buildUMin(MI.getOperand(0).getReg(), Primary, Adjusted);
1032
1033 MI.eraseFromParent();
1034 return true;
1035}
1036
1037bool RegBankLegalizeHelper::lowerExtrVecEltToSel(MachineInstr &MI) {
1038 // Lower extract vector element to a compare-select chain:
1039 // result = elt[0]
1040 // for i in 1..N-1:
1041 // result = (idx == i) ? elt[i] : result
1042 //
1043 // When the index is divergent, each lane may want a different element, so
1044 // we must check every element per lane.
1045 Register Dst = MI.getOperand(0).getReg();
1046 Register Src = MI.getOperand(1).getReg();
1047 Register Idx = MI.getOperand(2).getReg();
1048
1049 LLT VecTy = MRI.getType(Src);
1050 LLT ScalarTy = VecTy.getScalarType();
1051 unsigned NumElts = VecTy.getNumElements();
1052 MachineRegisterInfo::VRegAttrs VgprRB_EltTy = {VgprRB, ScalarTy};
1053
1054 auto Unmerge = B.buildUnmerge(VgprRB_EltTy, Src);
1055
1056 if (ScalarTy.getSizeInBits() == 32) {
1057 Register PrevSelect = Unmerge.getReg(0);
1058 for (unsigned I = 1; I < NumElts; ++I) {
1059 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)}, I);
1060 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
1061 PrevSelect =
1062 B.buildSelect(VgprRB_EltTy, Cmp, Unmerge.getReg(I), PrevSelect)
1063 .getReg(0);
1064 }
1065 B.buildCopy(Dst, PrevSelect);
1066 } else if (ScalarTy.getSizeInBits() == 64) {
1067 auto InitUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(0));
1068 Register PrevLo = InitUnmerge.getReg(0);
1069 Register PrevHi = InitUnmerge.getReg(1);
1070 for (unsigned I = 1; I < NumElts; ++I) {
1071 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)}, I);
1072 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
1073 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(I));
1074 PrevLo = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(0), PrevLo)
1075 .getReg(0);
1076 PrevHi = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(1), PrevHi)
1077 .getReg(0);
1078 }
1079 B.buildMergeLikeInstr(Dst, {PrevLo, PrevHi});
1080 } else {
1082 MF, MORE, "amdgpu-regbanklegalize",
1083 "AMDGPU RegBankLegalize: ExtrVecEltToSel unsupported element type", MI);
1084 return false;
1085 }
1086
1087 MI.eraseFromParent();
1088 return true;
1089}
1090
1091bool RegBankLegalizeHelper::lowerExtrVecEltTo32(MachineInstr &MI) {
1092 // Reduce a 64-bit element extract to two 32-bit extracts:
1093 // vec32 = bitcast <N x s64> to <2N x s32>
1094 // lo = vec32[idx * 2]
1095 // hi = vec32[idx * 2 + 1]
1096 // result = merge(lo, hi)
1097 //
1098 // When the index is uniform, all lanes extract the same element, so we can
1099 // just split the s64 extract into two s32 extracts which lower to MOVREL.
1100 Register Dst = MI.getOperand(0).getReg();
1101 Register Src = MI.getOperand(1).getReg();
1102 Register Idx = MI.getOperand(2).getReg();
1103
1104 LLT SrcTy = MRI.getType(Src);
1105 LLT Vec32Ty = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
1106
1107 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1108 "expected VGPR src and SGPR idx");
1109
1110 auto CastSrc = B.buildBitcast({VgprRB, Vec32Ty}, Src);
1111
1112 // Calculate new Lo and Hi indices
1113 auto One = B.buildConstant(SgprRB_S32, 1);
1114 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
1115 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
1116
1117 auto ExtLo = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxLo);
1118 auto ExtHi = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxHi);
1119
1120 B.buildMergeLikeInstr(Dst, {ExtLo.getReg(0), ExtHi.getReg(0)});
1121
1122 MI.eraseFromParent();
1123 return true;
1124}
1125
1126bool RegBankLegalizeHelper::lowerInsVecEltToSel(MachineInstr &MI) {
1127 // Lower insert vector element to a compare-select chain:
1128 // for i in 0..N-1:
1129 // result[i] = (idx == i) ? elt : srcVec[i]
1130 // dst = merge(result[0..N-1])
1131 //
1132 // VGPR B64 requires splitting to lo/hi s32 pairs since there is no
1133 // v_cndmask_b64. SGPR B64/B32 and VGPR B32 can be handled natively.
1134 Register Dst = MI.getOperand(0).getReg();
1135 Register Src = MI.getOperand(1).getReg();
1136 Register Elt = MI.getOperand(2).getReg();
1137 Register Idx = MI.getOperand(3).getReg();
1138
1139 LLT VecTy = MRI.getType(Src);
1140 LLT ScalarTy = VecTy.getScalarType();
1141 unsigned NumElts = VecTy.getNumElements();
1142 const RegisterBank *SrcRB = MRI.getRegBank(Src);
1143 bool IsSGPR = (SrcRB == SgprRB);
1144 SmallVector<Register, 16> Selects;
1145
1146 if (!IsSGPR && ScalarTy.getSizeInBits() == 64) {
1147 // VGPR B64: split to 32-bit lo/hi since there is no v_cndmask_b64.
1148 auto Unmerge = B.buildUnmerge(VgprRB_S32, Src);
1149 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1150 Register EltLo = EltUnmerge.getReg(0);
1151 Register EltHi = EltUnmerge.getReg(1);
1152 for (unsigned I = 0; I < NumElts; ++I) {
1153 auto IdxConst = B.buildConstant(VgprRB_S32, I);
1154 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, VccRB_S1, Idx, IdxConst);
1155 Selects.push_back(
1156 B.buildSelect(VgprRB_S32, Cmp, EltLo, Unmerge.getReg(2 * I))
1157 .getReg(0));
1158 Selects.push_back(
1159 B.buildSelect(VgprRB_S32, Cmp, EltHi, Unmerge.getReg(2 * I + 1))
1160 .getReg(0));
1161 }
1162 LLT Vec32Ty = LLT::fixed_vector(2 * NumElts, 32);
1163 auto Vec32 = B.buildBuildVector({VgprRB, Vec32Ty}, Selects);
1164 B.buildBitcast(Dst, Vec32);
1165 } else if (ScalarTy.getSizeInBits() == 32 || ScalarTy.getSizeInBits() == 64) {
1166 // B32 (any bank) and SGPR B64: element-wise select at native width.
1167 MachineRegisterInfo::VRegAttrs SrcRB_EltTy = {SrcRB, ScalarTy};
1168 MachineRegisterInfo::VRegAttrs CmpTy = IsSGPR ? SgprRB_S32 : VccRB_S1;
1169 auto Unmerge = B.buildUnmerge(SrcRB_EltTy, Src);
1170 for (unsigned I = 0; I < NumElts; ++I) {
1171 auto IdxConst = B.buildConstant(SgprRB_S32, I);
1172 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CmpTy, Idx, IdxConst);
1173 Selects.push_back(
1174 B.buildSelect(SrcRB_EltTy, Cmp, Elt, Unmerge.getReg(I)).getReg(0));
1175 }
1176 B.buildMergeLikeInstr(Dst, Selects);
1177 } else {
1179 MF, MORE, "amdgpu-regbanklegalize",
1180 "AMDGPU RegBankLegalize: InsVecEltToSel unsupported element type", MI);
1181 return false;
1182 }
1183
1184 MI.eraseFromParent();
1185 return true;
1186}
1187
1188bool RegBankLegalizeHelper::lowerInsVecEltTo32(MachineInstr &MI) {
1189 // Reduce a 64-bit element insert to two 32-bit inserts:
1190 // vec32 = bitcast <N x s64> to <2N x s32>
1191 // lo, hi = unmerge elt
1192 // vec32[idx * 2] = lo
1193 // vec32[idx * 2 + 1] = hi
1194 // dst = bitcast <2N x s32> to <N x s64>
1195 //
1196 // When the index is uniform, all lanes insert at the same position, so we
1197 // can split the s64 insert into two s32 inserts which lower to MOVREL/GPRIDX.
1198 Register Dst = MI.getOperand(0).getReg();
1199 Register Src = MI.getOperand(1).getReg();
1200 Register Elt = MI.getOperand(2).getReg();
1201 Register Idx = MI.getOperand(3).getReg();
1202
1203 LLT SrcTy = MRI.getType(Src);
1204 LLT Vec32Ty = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
1205
1206 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1207 "expected VGPR src and SGPR idx");
1208
1209 MachineRegisterInfo::VRegAttrs VgprRB_Vec32Ty = {VgprRB, Vec32Ty};
1210
1211 auto CastSrc = B.buildBitcast(VgprRB_Vec32Ty, Src);
1212 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1213
1214 // Calculate new Lo and Hi indices
1215 auto One = B.buildConstant(SgprRB_S32, 1);
1216 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
1217 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
1218
1219 auto InsLo = B.buildInsertVectorElement(VgprRB_Vec32Ty, CastSrc,
1220 EltUnmerge.getReg(0), IdxLo);
1221 auto InsHi = B.buildInsertVectorElement(VgprRB_Vec32Ty, InsLo,
1222 EltUnmerge.getReg(1), IdxHi);
1223
1224 B.buildBitcast(Dst, InsHi);
1225
1226 MI.eraseFromParent();
1227 return true;
1228}
1229
1230bool RegBankLegalizeHelper::lowerAbsToNegMax(MachineInstr &MI) {
1231 // Lower divergent G_ABS to smax(x, 0 - x) in the VGPR bank:
1232 // zero = 0
1233 // neg = G_SUB zero, x
1234 // dst = G_SMAX x, neg
1235 //
1236 // There is no integer v_abs instruction on AMDGPU, so divergent G_ABS is
1237 // expanded to this sub/smax pair.
1238 Register DstReg = MI.getOperand(0).getReg();
1239 Register SrcReg = MI.getOperand(1).getReg();
1240 LLT Ty = MRI.getType(DstReg);
1241
1242 Register Zero;
1243 if (Ty == V2S16) {
1244 // buildConstant cannot produce a V2S16 directly; pack two S16 zeros.
1245 Register Zero16 = B.buildConstant({VgprRB, S16}, 0).getReg(0);
1246 Zero = B.buildBuildVector({VgprRB, Ty}, {Zero16, Zero16}).getReg(0);
1247 } else {
1248 assert((Ty == S32 || Ty == S16) && "unexpected type for AbsToNegMax");
1249 Zero = B.buildConstant({VgprRB, Ty}, 0).getReg(0);
1250 }
1251
1252 auto Neg = B.buildSub({VgprRB, Ty}, Zero, SrcReg);
1253 B.buildSMax(DstReg, SrcReg, Neg);
1254 MI.eraseFromParent();
1255 return true;
1256}
1257
1258bool RegBankLegalizeHelper::lowerAbsToS32(MachineInstr &MI) {
1259 // Lower uniform V2S16 abs by unpacking the values to two separate SGPR
1260 // registers and re-emitting G_ABS on each:
1261 // packed = bitcast <2 x s16> src to s32
1262 // lo = sext_inreg packed, 16
1263 // hi = ashr packed, 16
1264 // dst = build_vector_trunc G_ABS(lo), G_ABS(hi)
1265 //
1266 // SALU only has s_abs_i32, with no direct uniform V2S16 abs. The
1267 // re-emitted G_ABS(SgprRB, S32) selects to s_abs_i32 on each value.
1268 auto Bitcast = B.buildBitcast({SgprRB_S32}, MI.getOperand(1).getReg());
1269 auto SextInReg = B.buildSExtInReg({SgprRB_S32}, Bitcast, 16);
1270 auto ShiftHi =
1271 B.buildAShr({SgprRB_S32}, Bitcast, B.buildConstant({SgprRB_S32}, 16));
1272
1273 auto AbsLo = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {SextInReg});
1274 auto AbsHi = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {ShiftHi});
1275 B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
1276 {AbsLo.getReg(0), AbsHi.getReg(0)});
1277
1278 MI.eraseFromParent();
1279 return true;
1280}
1281
1282// Ported from SITargetLowering::lowerSET_ROUNDING in SIISelLowering.cpp.
1283// Keep the mapping logic and conversion tables aligned with the SDAG lowering.
1284bool RegBankLegalizeHelper::lowerSetRounding(MachineInstr &MI) {
1285 Register NewMode = MI.getOperand(0).getReg();
1286
1287 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
1288 // hardware MODE.fp_round values.
1289 if (auto ConstMode = getIConstantVRegValWithLookThrough(NewMode, MRI)) {
1290 uint32_t ClampedVal = std::min(
1291 static_cast<uint32_t>(ConstMode->Value.getZExtValue()),
1292 static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
1293 uint32_t DecodedVal = AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal);
1294 NewMode = B.buildConstant(SgprRB_S32, DecodedVal).getReg(0);
1295 } else {
1296 // If we know the input can only be one of the supported standard modes in
1297 // the range 0-3, we can use a simplified mapping to hardware values.
1298 KnownBits Known = VT->getKnownBits(NewMode);
1299 const bool UseReducedTable = Known.countMinLeadingZeros() >= 30;
1300 // The supported standard values are 0-3. The extended values start at 8. We
1301 // need to offset by 4 if the value is in the extended range.
1302
1303 if (UseReducedTable) {
1304 // Truncate to the low 32-bits.
1305 auto BitTable = B.buildConstant(
1306 SgprRB_S32, AMDGPU::FltRoundToHWConversionTable & 0xffff);
1307
1308 auto Two = B.buildConstant(SgprRB_S32, 2);
1309 auto RoundModeTimesNumBits = B.buildShl(SgprRB_S32, NewMode, Two);
1310
1311 NewMode =
1312 B.buildLShr(SgprRB_S32, BitTable, RoundModeTimesNumBits).getReg(0);
1313
1314 // TODO: A demanded-bits simplification on the setreg source here could
1315 // likely reduce the table extracted bits into inline immediates.
1316 } else {
1317 // table_index = umin(value, value - 4)
1318 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
1319 auto NegFour = B.buildConstant(SgprRB_S32, -4);
1320 auto OffsetEnum = B.buildAdd(SgprRB_S32, NewMode, NegFour);
1321 auto IndexVal = B.buildUMin(SgprRB_S32, NewMode, OffsetEnum);
1322
1323 auto Two = B.buildConstant(SgprRB_S32, 2);
1324 auto RoundModeTimesNumBits = B.buildShl(SgprRB_S32, IndexVal, Two);
1325
1326 auto BitTable =
1327 B.buildConstant({SgprRB, S64}, AMDGPU::FltRoundToHWConversionTable);
1328 auto TableValue =
1329 B.buildLShr({SgprRB, S64}, BitTable, RoundModeTimesNumBits);
1330 // No need to mask out the high bits since the setreg will ignore them
1331 // anyway.
1332 NewMode = B.buildTrunc(SgprRB_S32, TableValue).getReg(0);
1333 }
1334 }
1335
1336 // N.B. The setreg will be later folded into s_round_mode on supported
1337 // targets.
1338 uint32_t BothRoundHwReg =
1340 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
1341 /*HasSideEffects=*/true, /*isConvergent=*/false)
1342 .addImm(static_cast<int16_t>(BothRoundHwReg))
1343 .addReg(NewMode);
1344
1345 MI.eraseFromParent();
1346 return true;
1347}
1348
1349// Ported from SITargetLowering::lowerGET_ROUNDING in SIISelLowering.cpp.
1350// Keep the mapping logic and conversion tables aligned with the SDAG lowering.
1351bool RegBankLegalizeHelper::lowerGetRounding(MachineInstr &MI) {
1352 Register Dst = MI.getOperand(0).getReg();
1353
1354 uint32_t BothRoundHwReg =
1356 auto GetReg =
1357 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {SgprRB_S32},
1358 /*HasSideEffects=*/true, /*isConvergent=*/false)
1359 .addImm(BothRoundHwReg);
1360
1361 // There are two rounding modes, one for f32 and one for f64/f16. We only
1362 // report in the standard value range if both are the same.
1363 //
1364 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
1365 // ties away from zero is not supported, and the other values are rotated by
1366 // 1.
1367 //
1368 // If the two rounding modes are not the same, report a target defined value.
1369
1370 // Mode register rounding mode fields:
1371 //
1372 // [1:0] Single-precision round mode.
1373 // [3:2] Double/Half-precision round mode.
1374 //
1375 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
1376 //
1377 // Hardware Spec
1378 // Toward-0 3 0
1379 // Nearest Even 0 1
1380 // +Inf 1 2
1381 // -Inf 2 3
1382 // NearestAway0 N/A 4
1383 //
1384 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
1385 // table we can index by the raw hardware mode.
1386 //
1387 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
1388 auto BitTable =
1389 B.buildConstant({SgprRB, S64}, AMDGPU::FltRoundConversionTable);
1390
1391 auto Two = B.buildConstant(SgprRB_S32, 2);
1392 auto RoundModeTimesNumBits = B.buildShl(SgprRB_S32, GetReg, Two);
1393
1394 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
1395 // knew only one mode was demanded.
1396 auto TableValue = B.buildLShr({SgprRB, S64}, BitTable, RoundModeTimesNumBits);
1397 auto TruncTable = B.buildTrunc(SgprRB_S32, TableValue);
1398
1399 auto EntryMask = B.buildConstant(SgprRB_S32, 0xf);
1400 auto TableEntry = B.buildAnd(SgprRB_S32, TruncTable, EntryMask);
1401
1402 // There's a gap in the 4-bit encoded table and actual enum values, so offset
1403 // if it's an extended value.
1404 auto Four = B.buildConstant(SgprRB_S32, 4);
1405 auto EnumOffset = B.buildAdd(SgprRB_S32, TableEntry, Four);
1406 auto IsStandardMode =
1407 B.buildICmp(CmpInst::ICMP_ULT, SgprRB_S32, TableEntry, Four);
1408 B.buildSelect(Dst, IsStandardMode, TableEntry, EnumOffset);
1409
1410 MI.eraseFromParent();
1411 return true;
1412}
1413
1414bool RegBankLegalizeHelper::lower(MachineInstr &MI,
1415 const RegBankLLTMapping &Mapping,
1416 WaterfallInfo &WFI) {
1417
1418 switch (Mapping.LoweringMethod) {
1419 case DoNotLower:
1420 break;
1421 case VccExtToSel:
1422 return lowerVccExtToSel(MI);
1423 case UniExtToSel: {
1424 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1425 auto True = B.buildConstant({SgprRB, Ty},
1426 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
1427 auto False = B.buildConstant({SgprRB, Ty}, 0);
1428 // Input to G_{Z|S}EXT is 'Legalizer legal' S1. Most common case is compare.
1429 // We are making select here. S1 cond was already 'any-extended to S32' +
1430 // 'AND with 1 to clean high bits' by Sgpr32AExtBoolInReg.
1431 B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
1432 False);
1433 MI.eraseFromParent();
1434 return true;
1435 }
1436 case UnpackBitShift:
1437 return lowerUnpackBitShift(MI);
1438 case UnpackMinMax:
1439 return lowerUnpackMinMax(MI);
1440 case ScalarizeToS16:
1441 return lowerSplitTo16(MI);
1442 case Ext32To64: {
1443 const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
1444 MachineInstrBuilder Hi;
1445 switch (MI.getOpcode()) {
1446 case AMDGPU::G_ZEXT: {
1447 Hi = B.buildConstant({RB, S32}, 0);
1448 break;
1449 }
1450 case AMDGPU::G_SEXT: {
1451 // Replicate sign bit from 32-bit extended part.
1452 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
1453 Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
1454 break;
1455 }
1456 case AMDGPU::G_ANYEXT: {
1457 Hi = B.buildUndef({RB, S32});
1458 break;
1459 }
1460 default:
1461 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1462 "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
1463 MI);
1464 return false;
1465 }
1466
1467 B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
1468 {MI.getOperand(1).getReg(), Hi});
1469 MI.eraseFromParent();
1470 return true;
1471 }
1472 case UniCstExt: {
1473 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
1474 B.buildConstant(MI.getOperand(0).getReg(), ConstVal);
1475
1476 MI.eraseFromParent();
1477 return true;
1478 }
1479 case VgprToVccCopy: {
1480 Register Src = MI.getOperand(1).getReg();
1481 LLT Ty = MRI.getType(Src);
1482 // Take lowest bit from each lane and put it in lane mask.
1483 // Lowering via compare, but we need to clean high bits first as compare
1484 // compares all bits in register.
1485 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
1486 if (Ty == S64) {
1487 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
1488 auto One = B.buildConstant(VgprRB_S32, 1);
1489 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
1490 auto Zero = B.buildConstant(VgprRB_S32, 0);
1491 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
1492 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
1493 } else {
1494 assert(Ty == S32 || Ty == S16);
1495 auto One = B.buildConstant({VgprRB, Ty}, 1);
1496 B.buildAnd(BoolSrc, Src, One);
1497 }
1498 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
1499 B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
1500 MI.eraseFromParent();
1501 return true;
1502 }
1503 case V_BFE:
1504 return lowerV_BFE(MI);
1505 case S_BFE:
1506 return lowerS_BFE(MI);
1507 case UniMAD64:
1508 return lowerUniMAD64(MI);
1509 case UniMul64: {
1510 B.buildMul(MI.getOperand(0), MI.getOperand(1), MI.getOperand(2));
1511 MI.eraseFromParent();
1512 return true;
1513 }
1514 case DivSMulToMAD: {
1515 auto Op1 = B.buildTrunc(VgprRB_S32, MI.getOperand(1));
1516 auto Op2 = B.buildTrunc(VgprRB_S32, MI.getOperand(2));
1517 auto Zero = B.buildConstant({VgprRB, S64}, 0);
1518
1519 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
1520 ? AMDGPU::G_AMDGPU_MAD_U64_U32
1521 : AMDGPU::G_AMDGPU_MAD_I64_I32;
1522
1523 B.buildInstr(NewOpc, {MI.getOperand(0).getReg(), {SgprRB, S32}},
1524 {Op1, Op2, Zero});
1525 MI.eraseFromParent();
1526 return true;
1527 }
1528 case SplitTo32:
1529 return lowerSplitTo32(MI);
1530 case SplitTo32Mul:
1531 return lowerSplitTo32Mul(MI);
1532 case SplitTo32Select:
1533 return lowerSplitTo32Select(MI);
1534 case SplitTo32SExtInReg:
1535 return lowerSplitTo32SExtInReg(MI);
1536 case CtPop64To32: {
1537 auto Unmerge = B.buildUnmerge({VgprRB, S32}, MI.getOperand(1).getReg());
1538 auto LoPopCnt = B.buildCTPOP({VgprRB, S32}, Unmerge.getReg(0));
1539 auto HiPopCnt = B.buildCTPOP({VgprRB, S32}, Unmerge.getReg(1));
1540 // Max popcount of two 32-bit values is 64, so this add cannot overflow.
1541 B.buildAdd(MI.getOperand(0).getReg(), LoPopCnt, HiPopCnt,
1543
1544 MI.eraseFromParent();
1545 break;
1546 }
1547 case S_BUF_to_BUF:
1548 return lowerSBufToBuf(MI, WFI);
1549 case SplitLoad: {
1550 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
1551 unsigned Size = DstTy.getSizeInBits();
1552 // Even split to 128-bit loads
1553 if (Size > 128) {
1554 LLT B128;
1555 if (DstTy.isVector()) {
1556 LLT EltTy = DstTy.getElementType();
1557 B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1558 } else {
1559 B128 = LLT::scalar(128);
1560 }
1561 if (Size / 128 == 2)
1562 splitLoad(MI, {B128, B128});
1563 else if (Size / 128 == 4)
1564 splitLoad(MI, {B128, B128, B128, B128});
1565 else {
1566 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1567 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1568 MI);
1569 return false;
1570 }
1571 }
1572 // 64 and 32 bit load
1573 else if (DstTy == S96)
1574 splitLoad(MI, {S64, S32}, S32);
1575 else if (DstTy == V3S32)
1576 splitLoad(MI, {V2S32, S32}, S32);
1577 else if (DstTy == V6S16)
1578 splitLoad(MI, {V4S16, V2S16}, V2S16);
1579 else {
1580 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1581 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1582 MI);
1583 return false;
1584 }
1585 return true;
1586 }
1587 case DynStackAlloc: {
1588 const auto &TFI = *ST.getFrameLowering();
1589 // Guard in case the stack growth direction ever changes with scratch
1590 // instructions.
1591 assert(TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
1592 "Stack grows upwards for AMDGPU");
1593
1594 Register Dst = MI.getOperand(0).getReg();
1595 Register AllocSize = MI.getOperand(1).getReg();
1596 Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1597
1598 // Erase before building new instrs to avoid hitting multiple Dst assert
1599 // with CSE.
1600 B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
1601 MI.eraseFromParent();
1602
1603 if (MRI.getRegBank(AllocSize) != SgprRB) {
1604 auto WaveReduction =
1605 B.buildIntrinsic(Intrinsic::amdgcn_wave_reduce_umax, {SgprRB_S32})
1606 .addUse(AllocSize)
1607 .addImm(0);
1608 AllocSize = WaveReduction.getReg(0);
1609 }
1610
1611 LLT PtrTy = MRI.getType(Dst);
1612 assert(PtrTy.getSizeInBits() == 32 &&
1613 "Expected 32-bit pointer for stack allocation");
1614 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1615 Register SPReg = Info->getStackPtrOffsetReg();
1616
1617 // When using flat-scratch, the stack offset is unscaled.
1618 const bool HasFlatScratch = ST.hasFlatScratchEnabled();
1619 const unsigned WavefrontSizeLog2 = ST.getWavefrontSizeLog2();
1620
1621 Register AdjustedSize = AllocSize;
1622 if (!HasFlatScratch) {
1623 auto WaveSize = B.buildConstant(SgprRB_S32, WavefrontSizeLog2);
1624 AdjustedSize = B.buildShl(SgprRB_S32, AllocSize, WaveSize).getReg(0);
1625 }
1626 if (Alignment > TFI.getStackAlign()) {
1627 const uint64_t EffectiveAlignment =
1628 Alignment.value() << (HasFlatScratch ? 0 : WavefrontSizeLog2);
1629 auto OldSP = B.buildCopy({SgprRB, PtrTy}, SPReg);
1630 auto Tmp1 =
1631 B.buildPtrAdd({SgprRB, PtrTy}, OldSP,
1632 B.buildConstant(SgprRB_S32, EffectiveAlignment - 1));
1633 uint64_t Mask = maskTrailingZeros<uint64_t>(Log2_64(EffectiveAlignment));
1634 B.buildPtrMask(Dst, Tmp1, B.buildConstant(SgprRB_S32, Mask));
1635 } else {
1636 B.buildCopy(Dst, SPReg);
1637 }
1638 auto PtrAdd = B.buildPtrAdd({SgprRB, PtrTy}, Dst, AdjustedSize);
1639 B.buildCopy(SPReg, PtrAdd);
1640 return true;
1641 }
1642 case WidenLoad: {
1643 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
1644 if (DstTy == S96)
1645 widenLoad(MI, S128);
1646 else if (DstTy == V3S32)
1647 widenLoad(MI, V4S32, S32);
1648 else if (DstTy == V6S16)
1649 widenLoad(MI, V8S16, V2S16);
1650 else {
1651 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1652 "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
1653 MI);
1654 return false;
1655 }
1656 return true;
1657 }
1658 case UnpackAExt:
1659 return lowerUnpackAExt(MI);
1660 case WidenMMOToS32:
1661 return widenMMOToS32(cast<GAnyLoad>(MI));
1662 case VerifyAllSgpr: {
1663 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1664 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1665 }));
1666 return true;
1667 }
1668 case ApplyAllVgpr: {
1669 assert(llvm::all_of(MI.defs(), [&](const MachineOperand &Op) {
1670 return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
1671 }));
1672 B.setInstrAndDebugLoc(MI);
1673 for (unsigned i = MI.getNumDefs(); i < MI.getNumOperands(); ++i) {
1674 MachineOperand &Op = MI.getOperand(i);
1675 if (!Op.isReg())
1676 continue;
1677 Register Reg = Op.getReg();
1678 if (MRI.getRegBank(Reg) != VgprRB) {
1679 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
1680 Op.setReg(Copy.getReg(0));
1681 }
1682 }
1683 return true;
1684 }
1685 case UnmergeToShiftTrunc: {
1686 GUnmerge *Unmerge = dyn_cast<GUnmerge>(&MI);
1687 LLT Ty = MRI.getType(Unmerge->getSourceReg());
1688 if (Ty.getSizeInBits() % 32 != 0) {
1689 reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
1690 "AMDGPU RegBankLegalize: unmerge not multiple of 32",
1691 MI);
1692 return false;
1693 }
1694
1695 B.setInstrAndDebugLoc(MI);
1696 if (Ty.getSizeInBits() > 32) {
1697 auto UnmergeV2S16 =
1698 B.buildUnmerge({SgprRB, V2S16}, Unmerge->getSourceReg());
1699 for (unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
1700 auto [Dst0S32, Dst1S32] =
1701 unpackAExt(UnmergeV2S16->getOperand(i).getReg());
1702 B.buildTrunc(MI.getOperand(i * 2).getReg(), Dst0S32);
1703 B.buildTrunc(MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
1704 }
1705 } else {
1706 auto [Dst0S32, Dst1S32] = unpackAExt(MI.getOperand(2).getReg());
1707 B.buildTrunc(MI.getOperand(0).getReg(), Dst0S32);
1708 B.buildTrunc(MI.getOperand(1).getReg(), Dst1S32);
1709 }
1710
1711 MI.eraseFromParent();
1712 return true;
1713 }
1715 Register Dst = MI.getOperand(0).getReg();
1716 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1717 B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());
1718 MI.getOperand(0).setReg(NewDst);
1719 B.buildTrunc(Dst, NewDst);
1720
1721 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1722 Register UseReg = MI.getOperand(i).getReg();
1723
1724 auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
1725 MachineBasicBlock *DefMBB = DefMI->getParent();
1726
1727 B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
1728
1729 auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
1730 MI.getOperand(i).setReg(NewUse.getReg(0));
1731 }
1732 break;
1733 }
1734 case VerifyAllSgprGPHI: {
1735 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1736 if (Op.isMBB())
1737 return true;
1738 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1739 }));
1740 return true;
1741 }
1743 assert(MRI.getRegBankOrNull(MI.getOperand(0).getReg()) == VgprRB);
1744 assert(llvm::all_of(MI.operands(), [&](const MachineOperand &Op) {
1745 if (Op.isMBB())
1746 return true;
1747 const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
1748 return RB == VgprRB || RB == SgprRB;
1749 }));
1750 return true;
1751 }
1752 case ApplyINTRIN_IMAGE: {
1753 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1755 assert(RSrcIntrin && RSrcIntrin->IsImage);
1756 // The reported argument index is relative to the IR intrinsic call
1757 // arguments, so shift by the number of defs and the intrinsic ID.
1758 unsigned RsrcIdx = RSrcIntrin->RsrcArg + MI.getNumExplicitDefs() + 1;
1759 return applyRegisterBanksVgprWithSgprRsrc(MI, RsrcIdx);
1760 }
1762 // Rsrc is the last register operand. Base BVH trails an A16 immediate
1763 // after rsrc; dual/BVH8 do not. Scan backwards for the last virtual
1764 // register.
1765 unsigned RsrcIdx = MI.getNumOperands();
1766 while (RsrcIdx-- > MI.getNumExplicitDefs()) {
1767 const MachineOperand &Op = MI.getOperand(RsrcIdx);
1768 if (Op.isReg() && Op.getReg().isVirtual())
1769 break;
1770 }
1771 return applyRegisterBanksVgprWithSgprRsrc(MI, RsrcIdx);
1772 }
1774 return lowerSplitBitCount64To32(MI);
1775 case ExtrVecEltToSel:
1776 return lowerExtrVecEltToSel(MI);
1777 case ExtrVecEltTo32:
1778 return lowerExtrVecEltTo32(MI);
1779 case InsVecEltToSel:
1780 return lowerInsVecEltToSel(MI);
1781 case InsVecEltTo32:
1782 return lowerInsVecEltTo32(MI);
1783 case AbsToNegMax:
1784 return lowerAbsToNegMax(MI);
1785 case AbsToS32:
1786 return lowerAbsToS32(MI);
1787 case DeletePrefetch:
1788 MI.eraseFromParent();
1789 return true;
1790 case LowerSetRounding:
1791 return lowerSetRounding(MI);
1792 case LowerGetRounding:
1793 return lowerGetRounding(MI);
1794 }
1795
1796 return true;
1797}
1798
/// Map a fixed-type mapping ID to the single LLT it stands for.
///
/// Scalar IDs yield LLT::scalar(N), pointer IDs yield
/// LLT::pointer(AddressSpace, Size) and vector IDs yield
/// LLT::fixed_vector(NumElts, EltSize). The bank/lowering flavours of an ID
/// (Sgpr*, Vgpr*, UniInVgpr*, *_WF, *ToVgprDst) share the type of their group.
/// For Sgpr32Trunc and the 32-bit *Ext IDs the result is the 32-bit type,
/// while applyMappingDst/applyMappingSrc assert that the operand itself is
/// narrower than 32 bits.
///
/// \param ID mapping ID to translate.
/// \returns the type for \p ID, or an invalid LLT() for any ID not listed.
///
/// NOTE(review): the embedded numbering of this listing skips 1812, 1875 and
/// 1881 inside the switch, so some case labels appear to be missing here -
/// confirm against the upstream file before treating the list as complete.
1799LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
1800 switch (ID) {
 // 1-bit booleans.
1801 case Vcc:
1802 case UniInVcc:
1803 return LLT::scalar(1);
1804 case Sgpr16:
1805 case Vgpr16:
1806 case UniInVgprS16:
1807 return LLT::scalar(16);
1808 case Sgpr32:
1809 case Sgpr32_WF:
1810 case Sgpr32Trunc:
1811 case Sgpr32AExt:
1813 case Sgpr32SExt:
1814 case Sgpr32ZExt:
1815 case UniInVgprS32:
1816 case Sgpr32ToVgprDst:
1817 case Vgpr32:
1818 case Vgpr32AExt:
1819 case Vgpr32SExt:
1820 case Vgpr32ZExt:
1821 return LLT::scalar(32);
1822 case Sgpr64:
1823 case Vgpr64:
1824 case UniInVgprS64:
1825 case Sgpr64ToVgprDst:
1826 return LLT::scalar(64);
1827 case Sgpr128:
1828 case Vgpr128:
1829 return LLT::scalar(128);
 // Pointers: the P<N> suffix is the address space passed to LLT::pointer.
1830 case SgprP0:
1831 case SgprP0Call_WF:
1832 case VgprP0:
1833 return LLT::pointer(0, 64);
1834 case SgprP1:
1835 case VgprP1:
1836 return LLT::pointer(1, 64);
1837 case SgprP2:
1838 case VgprP2:
1839 return LLT::pointer(2, 32);
1840 case SgprP3:
1841 case VgprP3:
1842 return LLT::pointer(3, 32);
1843 case SgprP4:
1844 case SgprP4Call_WF:
1845 case VgprP4:
1846 return LLT::pointer(4, 64);
1847 case SgprP5:
1848 case VgprP5:
1849 return LLT::pointer(5, 32);
1850 case SgprP6:
1851 return LLT::pointer(6, 32);
1852 case SgprP8:
1853 return LLT::pointer(8, 128);
 // Fixed vectors: V<N>S<M> is N elements of M bits each.
1854 case SgprV2S16:
1855 case VgprV2S16:
1856 case UniInVgprV2S16:
1857 return LLT::fixed_vector(2, 16);
1858 case SgprV2S32:
1859 case VgprV2S32:
1860 case UniInVgprV2S32:
1861 return LLT::fixed_vector(2, 32);
1862 case VgprV3S32:
1863 case UniInVgprV3S32:
1864 return LLT::fixed_vector(3, 32);
1865 case VgprV4S16:
1866 return LLT::fixed_vector(4, 16);
1867 case VgprV8S16:
1868 case UniInVgprV8S16:
1869 return LLT::fixed_vector(8, 16);
1870 case VgprV16S16:
1871 case UniInVgprV16S16:
1872 return LLT::fixed_vector(16, 16);
1873 case SgprV4S32:
1874 case SgprV4S32_WF:
1876 case VgprV4S32:
1877 case UniInVgprV4S32:
1878 return LLT::fixed_vector(4, 32);
1879 case VgprV8S32:
1880 case UniInVgprV8S32:
1882 return LLT::fixed_vector(8, 32);
1883 case VgprV2S64:
1884 case UniInVgprV2S64:
1885 return LLT::fixed_vector(2, 64);
1886 case VgprV6S32:
1887 case UniInVgprV6S32:
1888 return LLT::fixed_vector(6, 32);
1889 case VgprV16S32:
1890 case UniInVgprV16S32:
1891 return LLT::fixed_vector(16, 32);
1892 case VgprV32S16:
1893 case UniInVgprV32S16:
1894 return LLT::fixed_vector(32, 16);
1895 case VgprV32S32:
1896 case UniInVgprV32S32:
1897 return LLT::fixed_vector(32, 32);
 // Any other ID has no fixed type; callers get an invalid LLT.
1898 default:
1899 return LLT();
1900 }
1901}
1902
/// Check \p Ty against a size-class ("B-type") or pointer-class mapping ID.
///
/// Unlike getTyFromID, these IDs do not name one type, so the candidate type
/// is passed in and handed back unchanged when it is acceptable. B32, B64 and
/// B96 accept an explicit list of types; B128 and larger accept any type of
/// that total bit size; Ptr32/64/128 accept whatever isAnyPtr(Ty, Width)
/// accepts; the BRC IDs accept any size for which SIRegisterInfo reports a
/// register class.
///
/// \param ID mapping ID to check against.
/// \param Ty type of the operand being checked.
/// \returns \p Ty if it is accepted for \p ID, otherwise an invalid LLT().
///          IDs not handled by this switch also return LLT().
///
/// NOTE(review): the embedded numbering of this listing skips 1908 and 1925
/// inside the switch, so case labels appear to be missing here - confirm
/// against the upstream file.
1903LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
1904 switch (ID) {
1905 case SgprB32:
1906 case VgprB32:
1907 case SgprB32_M0:
1909 case UniInVgprB32:
1910 if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
1911 isAnyPtr(Ty, 32))
1912 return Ty;
1913 return LLT();
1914 case SgprPtr32:
1915 case VgprPtr32:
1916 return isAnyPtr(Ty, 32) ? Ty : LLT();
1917 case SgprPtr64:
1918 case VgprPtr64:
1919 return isAnyPtr(Ty, 64) ? Ty : LLT();
1920 case SgprPtr128:
1921 case VgprPtr128:
1922 return isAnyPtr(Ty, 128) ? Ty : LLT();
1923 case SgprB64:
1924 case VgprB64:
1926 case UniInVgprB64:
1927 if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
1928 Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
1929 return Ty;
1930 return LLT();
1931 case SgprB96:
1932 case VgprB96:
1933 case UniInVgprB96:
1934 if (Ty == LLT::scalar(96) || Ty == LLT::fixed_vector(3, 32) ||
1935 Ty == LLT::fixed_vector(6, 16))
1936 return Ty;
1937 return LLT();
 // From 128 bits up only the total size is checked, not the type's shape.
1938 case SgprB128:
1939 case VgprB128:
1940 case UniInVgprB128:
1941 if (Ty.getSizeInBits() == 128)
1942 return Ty;
1943 return LLT();
1944 case VgprB160:
1945 case UniInVgprB160:
1946 if (Ty.getSizeInBits() == 160)
1947 return Ty;
1948 return LLT();
1949 case SgprB256:
1950 case VgprB256:
1951 case UniInVgprB256:
1952 if (Ty.getSizeInBits() == 256)
1953 return Ty;
1954 return LLT();
1955 case SgprB512:
1956 case VgprB512:
1957 case UniInVgprB512:
1958 if (Ty.getSizeInBits() == 512)
1959 return Ty;
1960 return LLT();
 // Accept any type of at least 32 bits whose size has an SGPR register class.
1961 case SgprBRC: {
1962 const SIRegisterInfo *TRI =
1963 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1964 unsigned LLTSize = Ty.getSizeInBits();
1965 if (LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize))
1966 return Ty;
1967 return LLT();
1968 }
 // NOTE(review): this VGPR ID is validated with getSGPRClassForBitWidth and,
 // unlike SgprBRC above, without the ">= 32" lower bound - confirm that both
 // differences are intended.
1969 case VgprBRC: {
1970 const SIRegisterInfo *TRI =
1971 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1972 if (TRI->getSGPRClassForBitWidth(Ty.getSizeInBits()))
1973 return Ty;
1974 return LLT();
1975 }
1976 default:
1977 return LLT();
1978 }
1979}
1980
/// Map a mapping ID to the register bank associated with it.
///
/// The result is one of VccRB, SgprRB, AgprRB or VgprRB. For IDs whose
/// lowering moves the value to another bank, the bank returned is the one the
/// operand has before the mapping is applied: UniInVcc and the UniInVgpr* IDs
/// map to SgprRB, and Sgpr32ToVgprDst/Sgpr64ToVgprDst map to VgprRB. This
/// matches the bank asserts on those IDs in applyMappingDst.
///
/// \param ID mapping ID to translate.
/// \returns the bank for \p ID, or nullptr for any ID not listed.
///
/// NOTE(review): the embedded numbering of this listing skips 2008, 2009 and
/// 2042 inside the switch, so case labels appear to be missing here - confirm
/// against the upstream file.
1981const RegisterBank *
1982RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
1983 switch (ID) {
1984 case Vcc:
1985 return VccRB;
1986 case Sgpr16:
1987 case Sgpr32:
1988 case Sgpr32_WF:
1989 case Sgpr64:
1990 case Sgpr128:
1991 case SgprP0:
1992 case SgprP0Call_WF:
1993 case SgprP1:
1994 case SgprP2:
1995 case SgprP3:
1996 case SgprP4:
1997 case SgprP4Call_WF:
1998 case SgprP5:
1999 case SgprP6:
2000 case SgprP8:
2001 case SgprPtr32:
2002 case SgprPtr64:
2003 case SgprPtr128:
2004 case SgprV2S16:
2005 case SgprV2S32:
2006 case SgprV4S32:
2007 case SgprV4S32_WF:
2010 case SgprB32:
2011 case SgprB64:
2012 case SgprB96:
2013 case SgprB128:
2014 case SgprB256:
2015 case SgprB512:
2016 case SgprBRC:
 // "Uniform in vcc/vgpr" IDs: the operand being mapped is still an sgpr.
2017 case UniInVcc:
2018 case UniInVgprS16:
2019 case UniInVgprS32:
2020 case UniInVgprS64:
2021 case UniInVgprV2S16:
2022 case UniInVgprV2S32:
2023 case UniInVgprV3S32:
2024 case UniInVgprV4S32:
2025 case UniInVgprV2S64:
2026 case UniInVgprV6S32:
2027 case UniInVgprV8S16:
2028 case UniInVgprV8S32:
2029 case UniInVgprV16S16:
2030 case UniInVgprV16S32:
2031 case UniInVgprV32S16:
2032 case UniInVgprV32S32:
2033 case UniInVgprB32:
2034 case UniInVgprB64:
2035 case UniInVgprB96:
2036 case UniInVgprB128:
2037 case UniInVgprB160:
2038 case UniInVgprB256:
2039 case UniInVgprB512:
2040 case Sgpr32Trunc:
2041 case Sgpr32AExt:
2043 case Sgpr32SExt:
2044 case Sgpr32ZExt:
2045 return SgprRB;
2046 case AgprAnyTy:
2047 return AgprRB;
2048 case Vgpr16:
2049 case Vgpr32:
2050 case Vgpr64:
2051 case Vgpr128:
2052 case VgprP0:
2053 case VgprP1:
2054 case VgprP2:
2055 case VgprP3:
2056 case VgprP4:
2057 case VgprP5:
2058 case VgprPtr32:
2059 case VgprPtr64:
2060 case VgprPtr128:
2061 case VgprV2S16:
2062 case VgprV2S32:
2063 case VgprV2S64:
2064 case VgprV3S32:
2065 case VgprV4S16:
2066 case VgprV8S16:
2067 case VgprV16S16:
2068 case VgprV4S32:
2069 case VgprV6S32:
2070 case VgprV8S32:
2071 case VgprV16S32:
2072 case VgprV32S16:
2073 case VgprV32S32:
2074 case VgprB32:
2075 case VgprB64:
2076 case VgprB96:
2077 case VgprB128:
2078 case VgprB160:
2079 case VgprB256:
2080 case VgprB512:
2081 case VgprBRC:
2082 case VgprAnyTy:
2083 case Vgpr32AExt:
2084 case Vgpr32SExt:
2085 case Vgpr32ZExt:
 // The def is a vgpr before the mapping is applied (see applyMappingDst).
2086 case Sgpr32ToVgprDst:
2087 case Sgpr64ToVgprDst:
2088 return VgprRB;
2089 default:
2090 return nullptr;
2091 }
2092}
2093
/// Apply the def-side mapping IDs in \p MethodIDs to the defs of \p MI.
///
/// Entry MethodIDs[OpIdx] is applied to operand OpIdx of \p MI. Plain
/// Sgpr/Vgpr/Vcc IDs only assert that the def already has the expected type
/// and bank. The remaining IDs replace the def with a fresh virtual register
/// and emit, through the builder B, the instructions that carry the value
/// back into the original register. This function does not set B's insertion
/// point itself.
///
/// \param MI        instruction whose defs are rewritten in place.
/// \param OpIdx     in/out operand cursor; advanced once per entry of
///                  \p MethodIDs, so it equals MethodIDs.size() on success.
/// \param MethodIDs one mapping ID per def; None entries are skipped.
/// \returns true on success, false for InvalidMapping or an unhandled ID.
///
/// NOTE(review): the embedded numbering of this listing skips 2280 and 2286,
/// right before the two argument lists that precede `return false`. Those
/// arguments match the reportGISelFailure calls used elsewhere in this file,
/// so the call heads appear to have been dropped from the listing - confirm
/// against the upstream file.
2094bool RegBankLegalizeHelper::applyMappingDst(
2095 MachineInstr &MI, unsigned &OpIdx,
2096 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
2097 // Defs start from operand 0
2098 for (; OpIdx < MethodIDs.size(); ++OpIdx) {
2099 if (MethodIDs[OpIdx] == None)
2100 continue;
2101 MachineOperand &Op = MI.getOperand(OpIdx);
2102 Register Reg = Op.getReg();
2103 LLT Ty = MRI.getType(Reg);
 // RB is read only by asserts in several cases, hence [[maybe_unused]].
2104 [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);
2105
2106 switch (MethodIDs[OpIdx]) {
2107 // vcc, sgpr and vgpr scalars, pointers and vectors
 // NOTE(review): SgprP2 has an entry in getTyFromID/getRegBankFromID but is
 // not listed here, so it would reach the default case - confirm intended.
2108 case Vcc:
2109 case Sgpr16:
2110 case Sgpr32:
2111 case Sgpr64:
2112 case Sgpr128:
2113 case SgprP0:
2114 case SgprP1:
2115 case SgprP3:
2116 case SgprP4:
2117 case SgprP5:
2118 case SgprP6:
2119 case SgprP8:
2120 case SgprV2S16:
2121 case SgprV2S32:
2122 case SgprV4S32:
2123 case Vgpr16:
2124 case Vgpr32:
2125 case Vgpr64:
2126 case Vgpr128:
2127 case VgprP0:
2128 case VgprP1:
2129 case VgprP2:
2130 case VgprP3:
2131 case VgprP4:
2132 case VgprP5:
2133 case VgprV2S16:
2134 case VgprV2S32:
2135 case VgprV2S64:
2136 case VgprV3S32:
2137 case VgprV4S16:
2138 case VgprV8S16:
2139 case VgprV16S16:
2140 case VgprV4S32:
2141 case VgprV6S32:
2142 case VgprV8S32:
2143 case VgprV16S32:
2144 case VgprV32S16:
2145 case VgprV32S32: {
 // Nothing to rewrite: the def must already have the exact type and bank.
2146 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
2147 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
2148 break;
2149 }
2150 // sgpr and vgpr B-types
2151 case SgprB32:
2152 case SgprB64:
2153 case SgprB96:
2154 case SgprB128:
2155 case SgprB256:
2156 case SgprB512:
2157 case SgprBRC:
2158 case SgprPtr32:
2159 case SgprPtr64:
2160 case SgprPtr128:
2161 case VgprB32:
2162 case VgprB64:
2163 case VgprB96:
2164 case VgprB128:
2165 case VgprB160:
2166 case VgprB256:
2167 case VgprB512:
2168 case VgprBRC:
2169 case VgprPtr32:
2170 case VgprPtr64:
2171 case VgprPtr128: {
2172 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
2173 assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
2174 break;
2175 }
2176 case VgprAnyTy: {
2177 assert(RB == VgprRB);
2178 break;
2179 }
2180 case AgprAnyTy: {
2181 if (RB == AgprRB)
2182 break;
 // Define a new agpr instead and copy it back to the original register,
 // but only when that register has non-debug uses.
2183 Register NewAgprDst = MRI.createVirtualRegister({AgprRB, Ty});
2184 Op.setReg(NewAgprDst);
2185 if (!MRI.use_nodbg_empty(Reg))
2186 B.buildCopy(Reg, NewAgprDst);
2187 break;
2188 }
2189 case VgprOrAgprAnyTy: {
 // The destination bank is chosen by selectAGPRFormMFMA for the number of
 // 32-bit registers the type occupies.
2190 const unsigned NumRegs = Ty.getSizeInBits() / 32;
2191 const RegisterBank *DstRB =
2192 MFI->selectAGPRFormMFMA(NumRegs) ? AgprRB : VgprRB;
2193 if (RB == DstRB)
2194 break;
2195 Register NewDst = MRI.createVirtualRegister({DstRB, Ty});
2196 Op.setReg(NewDst);
2197 if (!MRI.use_nodbg_empty(Reg))
2198 B.buildCopy(Reg, NewDst);
2199 break;
2200 }
2201 // uniform in vcc/vgpr: scalars, vectors and B-types
2202 case UniInVcc: {
2203 assert(Ty == S1);
2204 assert(RB == SgprRB);
 // MI now defines a vcc S1; the original sgpr S1 is rebuilt from it via
 // G_AMDGPU_COPY_SCC_VCC to an sgpr S32 followed by a trunc.
2205 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
2206 Op.setReg(NewDst);
2207 if (!MRI.use_empty(Reg)) {
2208 auto CopyS32_Vcc =
2209 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
2210 B.buildTrunc(Reg, CopyS32_Vcc);
2211 }
2212 break;
2213 }
2214 case UniInVgprS16: {
2215 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
2216 assert(RB == SgprRB);
 // The S16 result is any-extended to S32 in vgpr, moved to an sgpr S32 with
 // buildReadAnyLane and truncated back into the original register.
2217 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
2218 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
2219 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
2220 Op.setReg(NewVgprDstS16);
2221 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
2222 buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
2223 B.buildTrunc(Reg, NewSgprDstS32);
2224 break;
2225 }
2226 case UniInVgprS32:
2227 case UniInVgprS64:
2228 case UniInVgprV2S16:
2229 case UniInVgprV2S32:
2230 case UniInVgprV3S32:
2231 case UniInVgprV4S32:
2232 case UniInVgprV2S64:
2233 case UniInVgprV6S32:
2234 case UniInVgprV8S16:
2235 case UniInVgprV8S32:
2236 case UniInVgprV16S16:
2237 case UniInVgprV16S32:
2238 case UniInVgprV32S16:
2239 case UniInVgprV32S32: {
2240 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
2241 assert(RB == SgprRB);
 // MI defines a vgpr of the same type; the original sgpr register is
 // produced from it with buildReadAnyLane.
2242 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
2243 Op.setReg(NewVgprDst);
2244 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
2245 break;
2246 }
2247 case UniInVgprB32:
2248 case UniInVgprB64:
2249 case UniInVgprB96:
2250 case UniInVgprB128:
2251 case UniInVgprB160:
2252 case UniInVgprB256:
2253 case UniInVgprB512: {
2254 assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
2255 assert(RB == SgprRB);
2256 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
2257 Op.setReg(NewVgprDst);
2258 AMDGPU::buildReadAnyLane(B, Reg, NewVgprDst, RBI);
2259 break;
2260 }
2261 // sgpr trunc
2262 case Sgpr32Trunc: {
2263 assert(Ty.getSizeInBits() < 32);
2264 assert(RB == SgprRB);
 // MI defines an sgpr S32; the narrower original is a trunc of it, built
 // only when the original register has uses.
2265 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
2266 Op.setReg(NewDst);
2267 if (!MRI.use_empty(Reg))
2268 B.buildTrunc(Reg, NewDst);
2269 break;
2270 }
2271 case Sgpr32ToVgprDst:
2272 case Sgpr64ToVgprDst: {
2273 assert(Ty == getTyFromID(MethodIDs[OpIdx]));
2274 assert(RB == VgprRB);
 // MI defines a new sgpr; the original vgpr register becomes a copy of it.
2275 Op.setReg(MRI.createVirtualRegister({SgprRB, Ty}));
2276 B.buildCopy(Reg, Op.getReg());
2277 break;
2278 }
2279 case InvalidMapping: {
2281 MF, MORE, "amdgpu-regbanklegalize",
2282 "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for", MI);
2283 return false;
2284 }
2285 default:
2287 MF, MORE, "amdgpu-regbanklegalize",
2288 "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
2289 return false;
2290 }
2291 }
2292
2293 return true;
2294}
2295
/// Apply the use-side mapping IDs in \p MethodIDs to the uses of \p MI.
///
/// MethodIDs[i] is applied to operand OpIdx of \p MI, with i starting at 0 and
/// OpIdx continuing from the value passed in; both advance together. Sgpr IDs
/// assert type and bank. Vgpr/Agpr IDs insert a copy into the required bank
/// when the operand is in a different one. The *_WF IDs record the operand in
/// \p WFI instead of rewriting it, the read-first-lane IDs rewrite a vgpr
/// operand to a new sgpr, and the *Ext IDs replace the operand with its 32-bit
/// extension. New instructions are emitted through the builder B; this
/// function does not set B's insertion point itself.
///
/// \param MI        instruction whose uses are rewritten in place.
/// \param OpIdx     in/out operand cursor, advanced once per entry of
///                  \p MethodIDs.
/// \param MethodIDs one mapping ID per use; None, IntrId and Imm entries are
///                  skipped without touching the operand.
/// \param WFI       receives the registers that need a waterfall loop and the
///                  instruction range [Start, End) that loop has to cover.
/// \returns true on success, false for an unhandled ID.
///
/// NOTE(review): the embedded numbering of this listing skips 2463, 2474,
/// 2475 and 2544, so case labels and the head of the failure-report call in
/// the default case appear to be missing here - confirm against the upstream
/// file.
2296bool RegBankLegalizeHelper::applyMappingSrc(
2297 MachineInstr &MI, unsigned &OpIdx,
2298 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
2299 WaterfallInfo &WFI) {
2300 for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
2301 if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
2302 continue;
2303
2304 MachineOperand &Op = MI.getOperand(OpIdx);
2305 Register Reg = Op.getReg();
2306 LLT Ty = MRI.getType(Reg);
2307 const RegisterBank *RB = MRI.getRegBank(Reg);
2308
2309 switch (MethodIDs[i]) {
2310 case Vcc: {
2311 assert(Ty == S1);
2312 assert(RB == VccRB || RB == SgprRB);
 // An sgpr S1 is any-extended to S32 and turned into a vcc S1 with
 // G_AMDGPU_COPY_VCC_SCC.
2313 if (RB == SgprRB) {
2314 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
2315 auto CopyVcc_Scc =
2316 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
2317 Op.setReg(CopyVcc_Scc.getReg(0));
2318 }
2319 break;
2320 }
2321 // sgpr scalars, pointers and vectors
2322 case Sgpr16:
2323 case Sgpr32:
2324 case Sgpr64:
2325 case Sgpr128:
2326 case SgprP0:
2327 case SgprP1:
2328 case SgprP3:
2329 case SgprP4:
2330 case SgprP5:
2331 case SgprP6:
2332 case SgprP8:
2333 case SgprV2S16:
2334 case SgprV2S32:
2335 case SgprV4S32: {
 // Nothing to rewrite: the use must already have the exact type and bank.
2336 assert(Ty == getTyFromID(MethodIDs[i]));
2337 assert(RB == getRegBankFromID(MethodIDs[i]));
2338 break;
2339 }
2340 // sgpr B-types
2341 case SgprB32:
2342 case SgprB64:
2343 case SgprB96:
2344 case SgprB128:
2345 case SgprB256:
2346 case SgprB512:
2347 case SgprBRC:
2348 case SgprPtr32:
2349 case SgprPtr64:
2350 case SgprPtr128: {
2351 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2352 assert(RB == getRegBankFromID(MethodIDs[i]));
2353 break;
2354 }
2355 // vgpr scalars, pointers and vectors
2356 case Vgpr16:
2357 case Vgpr32:
2358 case Vgpr64:
2359 case Vgpr128:
2360 case VgprP0:
2361 case VgprP1:
2362 case VgprP2:
2363 case VgprP3:
2364 case VgprP4:
2365 case VgprP5:
2366 case VgprV2S16:
2367 case VgprV2S32:
2368 case VgprV2S64:
2369 case VgprV3S32:
2370 case VgprV4S16:
2371 case VgprV8S16:
2372 case VgprV16S16:
2373 case VgprV4S32:
2374 case VgprV6S32:
2375 case VgprV8S32:
2376 case VgprV16S32:
2377 case VgprV32S16:
2378 case VgprV32S32: {
2379 assert(Ty == getTyFromID(MethodIDs[i]));
 // Any other bank is accepted as input and copied into a vgpr.
2380 if (RB != VgprRB) {
2381 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
2382 Op.setReg(CopyToVgpr.getReg(0));
2383 }
2384 break;
2385 }
2386 // vgpr B-types
2387 case VgprB32:
2388 case VgprB64:
2389 case VgprB96:
2390 case VgprB128:
2391 case VgprB160:
2392 case VgprB256:
2393 case VgprB512:
2394 case VgprBRC:
2395 case VgprPtr32:
2396 case VgprPtr64:
2397 case VgprPtr128: {
2398 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2399 if (RB != VgprRB) {
2400 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
2401 Op.setReg(CopyToVgpr.getReg(0));
2402 }
2403 break;
2404 }
2405 case VgprAnyTy: {
2406 if (RB != VgprRB) {
2407 auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
2408 Op.setReg(CopyToVgpr.getReg(0));
2409 }
2410 break;
2411 }
2412 case AgprAnyTy: {
2413 if (RB != AgprRB) {
2414 auto CopyToAgpr = B.buildCopy({AgprRB, Ty}, Reg);
2415 Op.setReg(CopyToAgpr.getReg(0));
2416 }
2417 break;
2418 }
2419 case VgprOrAgprAnyTy: {
 // The source bank is chosen by selectAGPRFormMFMA for the number of 32-bit
 // registers the type occupies.
2420 const unsigned NumRegs = Ty.getSizeInBits() / 32;
2421 const RegisterBank *SrcRB =
2422 MFI->selectAGPRFormMFMA(NumRegs) ? AgprRB : VgprRB;
2423 if (RB != SrcRB)
2424 Op.setReg(B.buildCopy({SrcRB, Ty}, Reg).getReg(0));
2425 break;
2426 }
2427 // sgpr waterfall, scalars, and vectors
2428 case Sgpr32_WF:
2429 case SgprV4S32_WF: {
2430 assert(Ty == getTyFromID(MethodIDs[i]));
 // The operand is left untouched here; it is only queued in WFI. The range
 // is set once, by the first operand that needs it, and covers MI alone.
2431 if (RB != SgprRB) {
2432 WFI.SgprWaterfallOperandRegs.insert(Reg);
2433 if (!WFI.Start.isValid()) {
2434 WFI.Start = MI.getIterator();
2435 WFI.End = std::next(MI.getIterator());
2436 }
2437 }
2438 break;
2439 }
2440 case SgprP0Call_WF:
2441 case SgprP4Call_WF: {
2442 assert(Ty == getTyFromID(MethodIDs[i]));
2443 if (RB != SgprRB) {
2444 WFI.SgprWaterfallOperandRegs.insert(Reg);
2445
 // NOTE(review): both searches below walk the block with no begin()/end()
 // check, so they rely on MI being bracketed by ADJCALLSTACKUP and
 // ADJCALLSTACKDOWN in the same block - confirm that always holds here.
2446 // Find the ADJCALLSTACKUP before the call.
2447 MachineBasicBlock::iterator Start = MI.getIterator();
2448 while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
2449 --Start;
2450
2451 // Find the ADJCALLSTACKDOWN after the call (include it in range).
2452 MachineBasicBlock::iterator End = MI.getIterator();
2453 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
2454 ++End;
2455 ++End;
2456
 // Unlike the plain *_WF case, the range is overwritten unconditionally.
2457 WFI.Start = Start;
2458 WFI.End = End;
2459 }
2460 break;
2461 }
2462 case SgprB32_M0:
2464 case SgprB64_ReadFirstLane: {
2465 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2466 if (RB == SgprRB)
2467 break;
2468 assert(RB == VgprRB);
 // A vgpr operand is replaced by a new sgpr filled with buildReadFirstLane.
2469 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2470 buildReadFirstLane(B, NewSGPR, Op.getReg(), RBI);
2471 Op.setReg(NewSGPR);
2472 break;
2473 }
 // Same read-first-lane rewrite, for IDs whose type comes from getTyFromID.
2476 assert(Ty == getTyFromID(MethodIDs[i]));
2477 if (RB == SgprRB)
2478 break;
2479 assert(RB == VgprRB);
2480 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2481 buildReadFirstLane(B, NewSGPR, Op.getReg(), RBI);
2482 Op.setReg(NewSGPR);
2483 break;
2484 }
2485 // sgpr and vgpr scalars with extend
2486 case Sgpr32AExt: {
2487 // Note: this ext allows S1, and it is meant to be combined away.
2488 assert(Ty.getSizeInBits() < 32);
2489 assert(RB == SgprRB);
2490 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
2491 Op.setReg(Aext.getReg(0));
2492 break;
2493 }
2494 case Sgpr32AExtBoolInReg: {
2495 // Note: this ext allows S1, and it is meant to be combined away.
2496 assert(Ty.getSizeInBits() == 1);
2497 assert(RB == SgprRB);
2498 auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
2499 // Zext SgprS1 is not legal, make AND with 1 instead. This instruction is
2500 // most of times meant to be combined away in AMDGPURegBankCombiner.
2501 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
2502 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
2503 Op.setReg(BoolInReg.getReg(0));
2504 break;
2505 }
2506 case Sgpr32SExt: {
 // Unlike the Vgpr32SExt/ZExt cases below, S1 is rejected here.
2507 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
2508 assert(RB == SgprRB);
2509 auto Sext = B.buildSExt(SgprRB_S32, Reg);
2510 Op.setReg(Sext.getReg(0));
2511 break;
2512 }
2513 case Sgpr32ZExt: {
2514 assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
2515 assert(RB == SgprRB);
2516 auto Zext = B.buildZExt({SgprRB, S32}, Reg);
2517 Op.setReg(Zext.getReg(0));
2518 break;
2519 }
2520 case Vgpr32AExt: {
2521 assert(Ty.getSizeInBits() < 32);
2522 assert(RB == VgprRB);
2523 auto Aext = B.buildAnyExt({VgprRB, S32}, Reg);
2524 Op.setReg(Aext.getReg(0));
2525 break;
2526 }
2527 case Vgpr32SExt: {
2528 // Note this ext allows S1, and it is meant to be combined away.
2529 assert(Ty.getSizeInBits() < 32);
2530 assert(RB == VgprRB);
2531 auto Sext = B.buildSExt({VgprRB, S32}, Reg);
2532 Op.setReg(Sext.getReg(0));
2533 break;
2534 }
2535 case Vgpr32ZExt: {
2536 // Note this ext allows S1, and it is meant to be combined away.
2537 assert(Ty.getSizeInBits() < 32);
2538 assert(RB == VgprRB);
2539 auto Zext = B.buildZExt({VgprRB, S32}, Reg);
2540 Op.setReg(Zext.getReg(0));
2541 break;
2542 }
2543 default:
2545 MF, MORE, "amdgpu-regbanklegalize",
2546 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
2547 return false;
2548 }
2549 }
2550 return true;
2551}
2552
2553[[maybe_unused]] static bool verifyRegBankOnOperands(MachineInstr &MI,
2554 const RegisterBank *RB,
2556 unsigned StartOpIdx,
2557 unsigned EndOpIdx) {
2558 for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
2559 if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
2560 return false;
2561 }
2562 return true;
2563}
2564
2565bool RegBankLegalizeHelper::applyRegisterBanksVgprWithSgprRsrc(
2566 MachineInstr &MI, unsigned RsrcIdx) {
2567 const unsigned NumDefs = MI.getNumExplicitDefs();
2568
2569 MachineBasicBlock *MBB = MI.getParent();
2570 B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(MI.getIterator())));
2571
2572 // Defs are vgpr.
2573 for (unsigned i = 0; i < NumDefs; ++i) {
2574 Register Reg = MI.getOperand(i).getReg();
2575 if (MRI.getRegBank(Reg) == VgprRB)
2576 continue;
2577
2578 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(Reg)});
2579 MI.getOperand(i).setReg(NewVgprDst);
2580 buildReadAnyLane(B, Reg, NewVgprDst, RBI);
2581 }
2582
2583 B.setInstrAndDebugLoc(MI);
2584
2585 // Register uses before RsrcIdx are vgpr.
2586 for (unsigned i = NumDefs; i < RsrcIdx; ++i) {
2587 MachineOperand &Op = MI.getOperand(i);
2588 if (!Op.isReg())
2589 continue;
2590
2591 Register Reg = Op.getReg();
2592 if (!Reg.isVirtual())
2593 continue;
2594
2595 if (MRI.getRegBank(Reg) == VgprRB)
2596 continue;
2597
2598 auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
2599 Op.setReg(Copy.getReg(0));
2600 }
2601
2602 SmallSet<Register, 4> OpsToWaterfall;
2603
2604 // Register use RsrcIdx (and later register operands) is sgpr.
2605 for (unsigned i = RsrcIdx; i < MI.getNumOperands(); ++i) {
2606 MachineOperand &Op = MI.getOperand(i);
2607 if (!Op.isReg())
2608 continue;
2609
2610 Register Reg = Op.getReg();
2611 if (MRI.getRegBank(Reg) != SgprRB)
2612 OpsToWaterfall.insert(Reg);
2613 }
2614
2615 if (!OpsToWaterfall.empty()) {
2616 MachineBasicBlock::iterator MII = MI.getIterator();
2617 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
2618 }
2619
2620 return true;
2621}
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
Provides AMDGPU specific target descriptions.
static bool isSignedBFE(MachineInstr &MI)
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB, MachineRegisterInfo &MRI, unsigned StartOpIdx, unsigned EndOpIdx)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
static Register UseReg(const MachineOperand &MO)
IRTranslator LLVM IR MI
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register Reg
Register const TargetRegisterInfo * TRI
Machine IR instance of the generic uniformity analysis.
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static const LaneMaskConstants & get(const GCNSubtarget &ST)
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, GISelValueTracking *VT, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ ICMP_NE
not equal
Definition InstrTypes.h:762
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:225
iterator end()
Definition DenseMap.h:143
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:286
const SIRegisterInfo * getRegisterInfo() const override
Represents a call to an intrinsic.
Register getSourceReg() const
Get the unmerge source register.
constexpr bool isScalar() const
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
LLT divide(int Factor) const
Return a type that is Factor times smaller.
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
TypeSize getValue() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator SkipPHIsAndLabels(iterator I)
Return the first instruction in MBB after I that is not a PHI or a label.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
BasicBlockListType::iterator iterator
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Helper class to build MachineInstr.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank * getRegBank(Register Reg) const
Return the register bank of Reg.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
Holds all the information related to register banks.
This class implements the register bank concept.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool empty() const
Definition SmallSet.h:169
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
const uint64_t FltRoundToHWConversionTable
bool isAnyPtr(LLT Ty, unsigned Width)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Bitcast
Perform the operation on a different, but equivalently sized type.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
@ Offset
Definition DWP.cpp:558
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction whose opcode is Opcode.
Definition Utils.cpp:656
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition Utils.cpp:159
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT whose value fits in int64_t, returns it.
Definition Utils.cpp:317
LLVM_ABI void reportGISelFailure(MachineFunction &MF, MachineOptimizationRemarkEmitter &MORE, MachineOptimizationRemarkMissed &R)
Report an ISel error as a missed optimization remark to the LLVMContext's diagnostic stream.
Definition Utils.cpp:261
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
constexpr T maskTrailingZeros(unsigned N)
Create a bitmask with the N right-most bits set to 0, and all other bits set to 1.
Definition MathExtras.h:94
@ Add
Sum of integers.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:436
Align assumeAligned(uint64_t Value)
Treats the value 0 as a 1, so Align is always at least 1.
Definition Alignment.h:100
LLVM_ABI Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
Definition Utils.cpp:504
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
static constexpr uint64_t encode(Fields... Values)
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
Holds waterfall loop information: the set of SGPR operand registers that need waterfalling,...
MachineBasicBlock::iterator Start
SmallSet< Register, 4 > SgprWaterfallOperandRegs
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:262