#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-regbanklegalize"
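// The helper rewrites generic MIR so that every virtual register ends up
// with a register bank (SGPR, VGPR or VCC) and an LLT that AMDGPU
// instruction selection can handle. The constructor caches the subtarget,
// the wave size and the three register banks used throughout.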
RegBankLegalizeHelper::RegBankLegalizeHelper(
    MachineIRBuilder &B, const MachineUniformityInfo &MUI,
    const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
    : ST(B.getMF().getSubtarget<GCNSubtarget>()), B(B), MRI(*B.getMRI()),
      MUI(MUI), RBI(RBI), RBLRules(RBLRules), IsWave32(ST.isWave32()),
      SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
      VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
      VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
  // ... (look up the RegBankLLTMapping for MI; the elided code produces
  // Mapping and collects WaterfallSgprs while applying it to defs and uses)
  B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
  // ...
  lower(MI, Mapping, WaterfallSgprs);
}
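// executeInWaterfallLoop: some instructions need an operand in an SGPR even
// though the value is divergent (per-lane). The waterfall loop repeatedly
// reads the value of the first active lane into an SGPR, uses ballot plus
// S_AND_SAVEEXEC to keep only the lanes that hold the same value, runs the
// instruction for them, and XORs those lanes out of EXEC until every lane
// has been serviced; afterwards EXEC is restored.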
bool RegBankLegalizeHelper::executeInWaterfallLoop(
    MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
    SmallSet<Register, 4> &SGPROperandRegs) {
  // ...
  // Pick wave-size-dependent opcodes for the EXEC mask manipulation.
  unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
  if (IsWave32) {
    MovExecOpc = AMDGPU::S_MOV_B32;
    MovExecTermOpc = AMDGPU::S_MOV_B32_term;
    XorTermOpc = AMDGPU::S_XOR_B32_term;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    ExecReg = AMDGPU::EXEC_LO;
  } else {
    MovExecOpc = AMDGPU::S_MOV_B64;
    MovExecTermOpc = AMDGPU::S_MOV_B64_term;
    XorTermOpc = AMDGPU::S_XOR_B64_term;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    ExecReg = AMDGPU::EXEC;
  }
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
  // ...
  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
  // ...
  B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
  // ...
  Register SavedExec = MRI.createVirtualRegister(WaveRC);
  // ... (LoopBB, BodyBB, RestoreExecBB and RemainderBB are created here and
  // the instructions in Range are spliced into BodyBB)
  MBB.addSuccessor(LoopBB);
  // ...
  B.setInsertPt(*LoopBB, LoopBB->end());
  // ...
  auto NewEnd = BodyBB->end();
  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
    // ...
    if (!SGPROperandRegs.count(OldReg))
      continue;

    // If the operand was already waterfalled, reuse the SGPR it got.
    auto OldVal = WaterfalledRegMap.find(OldReg);
    if (OldVal != WaterfalledRegMap.end()) {
      Op.setReg(OldVal->second);
      continue;
    }
    // ...
    LLT OpTy = MRI.getType(OpReg);
    // ...
    assert(MRI.getRegBank(OpReg) == VgprRB);
    Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
    // Compare the operand with the first active lane's value in 32- or
    // 64-bit parts.
    unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
    // ...
    unsigned NumParts = OpSize / PartSize;
    // ...
    if (NumParts == 1) {
      // ...
      CurrentLaneParts.push_back(CurrentLaneReg);
    } else {
      auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
      auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
      for (unsigned i = 0; i < NumParts; ++i) {
        // ...
        CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
      }
    }

    for (unsigned i = 0; i < NumParts; ++i) {
      Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
      // ... (icmp eq of the VGPR part against the current-lane SGPR part)
      CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
    }
    Op.setReg(CurrentLaneReg);

    // Remember the SGPR so other uses of the same register can reuse it.
    WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
  }

  // Combine the per-lane condition into a wave-wide lane mask.
  Register CondRegLM =
      MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
  B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
  // Restrict EXEC to the lanes that match the first active lane, remembering
  // the set handled in this iteration.
  B.buildInstr(AndSaveExecOpc).addDef(SavedExec).addReg(CondRegLM,
                                                        RegState::Kill);
  MRI.setSimpleHint(SavedExec, CondRegLM);
  // ...
  B.setInsertPt(*BodyBB, BodyBB->end());

  // Turn the serviced lanes off; loop while any remain.
  B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);
  // ...
  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

  // Save EXEC before entering the loop.
  // ...
  B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);
  // ...
  B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
  B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);
  // ...
  B.setInsertPt(*RemainderBB, RemainderBB->begin());
  // ...
  return true;
}
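// Loads whose result type has no direct machine support are split into
// several smaller loads of the types in LLTBreakdown, issued at increasing
// byte offsets from the original base pointer, then merged back into the
// original destination register.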
void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
                                      ArrayRef<LLT> LLTBreakdown,
                                      LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(Base);
  const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
  LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
  SmallVector<Register, 4> LoadPartRegs;

  unsigned ByteOffset = 0;
  for (LLT PartTy : LLTBreakdown) {
    Register BasePlusOffset;
    if (ByteOffset == 0) {
      BasePlusOffset = Base;
    } else {
      auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
      BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0);
    }
    MachineMemOperand *OffsetMMO =
        MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
    auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
    LoadPartRegs.push_back(LoadPart.getReg(0));
    ByteOffset += PartTy.getSizeInBytes();
  }

  // All parts already have the type expected by the merge: rebuild Dst
  // directly (condition elided).
  B.buildMergeLikeInstr(Dst, LoadPartRegs);
  // Otherwise, unmerge mismatched parts into MergeTy pieces first.
  SmallVector<Register, 4> MergeTyParts;
  for (Register Reg : LoadPartRegs) {
    if (MRI.getType(Reg) == MergeTy) {
      MergeTyParts.push_back(Reg);
    } else {
      auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
      for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
        MergeTyParts.push_back(Unmerge.getReg(i));
    }
  }
  B.buildMergeLikeInstr(Dst, MergeTyParts);
  MI.eraseFromParent();
}
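// Alternative to splitLoad: widen the load to a supported type (e.g. V3S32
// to V4S32) and discard the extra bits, either by truncating a scalar wide
// load or by unmerging a vector one and keeping only the leading elements.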
void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
                                      LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();
  // ...
  MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
  auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);

  if (WideTy.isScalar()) {
    B.buildTrunc(Dst, WideLoad);
  } else {
    SmallVector<Register, 4> MergeTyParts;
    auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);

    LLT DstTy = MRI.getType(Dst);
    unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
    for (unsigned i = 0; i < NumElts; ++i) {
      MergeTyParts.push_back(Unmerge.getReg(i));
    }
    B.buildMergeLikeInstr(Dst, MergeTyParts);
  }
  MI.eraseFromParent();
}
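// Sub-32-bit scalar loads (s8/s16) are widened to a 32-bit load by widening
// only the memory operand. The extra loaded bits are then stripped with an
// AND mask (zextload) or G_SEXT_INREG (sextload), or simply ignored for a
// plain G_LOAD.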
void RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
  Register Dst = MI.getDstReg();
  Register Ptr = MI.getPointerReg();
  MachineMemOperand &MMO = MI.getMMO();
  unsigned MemSize = 8 * MMO.getSize().getValue();
  // ...
  MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);

  if (MI.getOpcode() == G_LOAD) {
    B.buildLoad(Dst, Ptr, *WideMMO);
  } else {
    auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
    if (MI.getOpcode() == G_ZEXTLOAD) {
      APInt Mask = APInt::getLowBitsSet(32, MemSize);
      auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
      B.buildAnd(Dst, Load, MaskCst);
    } else {
      assert(MI.getOpcode() == G_SEXTLOAD);
      B.buildSExtInReg(Dst, Load, MemSize);
    }
  }
  MI.eraseFromParent();
}
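// G_ZEXT/G_SEXT/G_ANYEXT of a vcc (divergent i1) source becomes a select on
// the lane mask, roughly:
//   %ext:vgpr(s32) = G_SELECT %src:vcc(s1), %true(1 or -1), %false(0)
// For s64 results only the low half is selected; the high half is derived
// from it (sign bits for sext, zero for zext, undef for anyext).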
void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);
  Register Src = MI.getOperand(1).getReg();
  unsigned Opc = MI.getOpcode();
  int TrueExtCst = Opc == G_SEXT ? -1 : 1;
  if (Ty == S32 || Ty == S16) {
    auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
    auto False = B.buildConstant({VgprRB, Ty}, 0);
    B.buildSelect(Dst, Src, True, False);
  } else if (Ty == S64) {
    auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
    auto False = B.buildConstant({VgprRB_S32}, 0);
    auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
    MachineInstrBuilder Hi;
    switch (Opc) {
    // ... (sext can reuse Lo for the high half, zext uses False)
    case G_ANYEXT: {
      Hi = B.buildUndef({VgprRB_S32});
      break;
    }
    // ...
    }
    B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
  }
  MI.eraseFromParent();
}
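// V2S16 on SALU: the two 16-bit halves live packed in one 32-bit SGPR.
// These helpers bitcast the packed value to s32 and return the two halves
// widened to s32 with zero-, sign- or any-extension respectively, so the
// operation can run on 32-bit scalar instructions.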
std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
  auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}
std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
  auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}
std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Lo = PackedS32;
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}
void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
  Register Lo, Hi;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SHL: {
    auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_LSHR: {
    auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_ASHR: {
    auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
    Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
    Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
    break;
  }
  default:
    llvm_unreachable("Unpack lowering not implemented");
  }
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
}
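// The same unpack trick for V2S16 min/max: signed variants sign-extend the
// halves, unsigned variants zero-extend, the s32 min/max runs on SALU and
// G_BUILD_VECTOR_TRUNC repacks the result.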
void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
  Register Lo, Hi;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX: {
    // Unpack and sign-extend.
    auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
    auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
             .getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
             .getReg(0);
    break;
  }
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    // Unpack and zero-extend.
    auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
    auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
             .getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
             .getReg(0);
    break;
  }
  // ...
  }
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
}
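// For ops where garbage in the high bits of each half cannot leak into the
// result (presumably simple arithmetic whose low 16 bits only depend on the
// inputs' low 16 bits), any-extended halves are enough: run the op twice in
// s32 and repack with G_BUILD_VECTOR_TRUNC.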
void RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
  auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
  auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
  auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
  auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
                          {ResLo.getReg(0), ResHi.getReg(0)});
  MI.eraseFromParent();
}
static bool isSignedBFE(MachineInstr &MI) {
  if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))
    return (GI->is(Intrinsic::amdgcn_sbfe));
  return MI.getOpcode() == AMDGPU::G_SBFX;
}
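// Divergent 64-bit bitfield extract. There is no 64-bit VALU BFE, so the
// field is first shifted down to bit 0 with a 64-bit shift and then either
// re-extended with a shl/shr pair (variable width) or assembled from 32-bit
// G_SBFX/G_UBFX halves (constant width).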
void RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  // Only 64-bit results take this path.
  bool Signed = isSignedBFE(MI);
  // Intrinsics carry an extra intrinsic-ID operand before the sources.
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // Move the field to the bottom; ashr keeps sign bits for the signed
  // variant.
  unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
  auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});

  auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);

  // Width not known at compile time: clear the bits above the field with a
  // shl/shr pair by '64 - Width'.
  if (!ConstWidth) {
    auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
    auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
    B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
    MI.eraseFromParent();
    return;
  }

  uint64_t WidthImm = ConstWidth->Value.getZExtValue();
  auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
  Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
  Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
  auto Zero = B.buildConstant({VgprRB, S32}, 0);
  unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;

  if (WidthImm <= 32) {
    // The field fits in the low half: extract it there and derive the high
    // half from it (sign bits or zero).
    auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
    MachineInstrBuilder Hi;
    if (Signed) {
      Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
    } else {
      Hi = Zero;
    }
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  } else {
    // The low half is used in full; extract the remaining bits from the
    // high half.
    auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
    auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
    B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
  }

  MI.eraseFromParent();
}
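// Uniform bitfield extract maps straight onto S_BFE_{I|U}{32|64}: the
// machine instruction takes the offset in bits [5:0] and the width in bits
// [22:16] of its second source, which is why offset and width are packed
// into a single SGPR below.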
void RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  bool Signed = isSignedBFE(MI);
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // Pack the bitfield descriptor: offset in [5:0], width in [22:16].
  auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
  auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
  auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
  auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
  unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
  unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
  unsigned Opc = Ty == S32 ? Opc32 : Opc64;

  // Select the machine instruction directly; copies bridge between the
  // reg-bank world and the constrained register classes.
  auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
                            {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
  if (!constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
                                        *ST.getRegisterInfo(), RBI))
    llvm_unreachable("failed to constrain S_BFE");

  B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
  MI.eraseFromParent();
}
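// 64-bit (or double-width vector) ops that only exist as 32-bit VALU
// instructions: unmerge both inputs into halves, apply the opcode per half,
// and merge the results back.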
void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  unsigned Opc = MI.getOpcode();
  unsigned Flags = MI.getFlags();
  auto Lo =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
  auto Hi =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
}
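// True 16-bit SALU ops on V2S16: unpack the input into two any-extended s32
// halves, truncate each back to s16, run the op per half, and repack.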
void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == V2S16);
  auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(1).getReg());
  auto [Op2Lo32, Op2Hi32] = unpackAExt(MI.getOperand(2).getReg());
  unsigned Opc = MI.getOpcode();
  unsigned Flags = MI.getFlags();
  auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32);
  auto Op1Hi = B.buildTrunc(SgprRB_S16, Op1Hi32);
  auto Op2Lo = B.buildTrunc(SgprRB_S16, Op2Lo32);
  auto Op2Hi = B.buildTrunc(SgprRB_S16, Op2Hi32);
  auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
  auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
}
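// G_SELECT with a wide result shares its condition: select each 32-bit (or
// v2s16) half separately under the same lane mask, then merge.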
void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
         isAnyPtr(DstTy, 64));
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
  Register Cond = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  auto Lo =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
  auto Hi =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);
  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
}
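// 64-bit G_SEXT_INREG split into halves. If the sign position is in the low
// half, the high half is just an arithmetic shift of the sign-extended low
// half by 31; if it is in the high half, the low half passes through
// untouched.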
void RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
  auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
  int Amt = MI.getOperand(2).getImm();
  Register Lo, Hi;
  if (Amt <= 32) {
    auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));

    if (Amt == 32) {
      // The whole low half is already the field being sign-extended.
      Lo = Freeze.getReg(0);
    } else {
      Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
    }

    auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
    Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
  } else {
    Lo = Op1.getReg(0);
    Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
  }

  B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
}
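// Central dispatcher: Mapping.LoweringMethod, chosen by the rules in
// RegBankLegalizeRules, names one of the strategies above or an inline
// lowering below. Any divergent value that must nevertheless sit in an SGPR
// operand ends up in WaterfallSgprs and gets wrapped in a waterfall loop at
// the end.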
void RegBankLegalizeHelper::lower(MachineInstr &MI,
                                  const RegBankLLTMapping &Mapping,
                                  SmallSet<Register, 4> &WaterfallSgprs) {
  switch (Mapping.LoweringMethod) {
  // ...
  case VccExtToSel:
    return lowerVccExtToSel(MI);
  case UniExtToSel: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    auto True = B.buildConstant({SgprRB, Ty},
                                MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
    auto False = B.buildConstant({SgprRB, Ty}, 0);
    // ...
    B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
                  False);
    MI.eraseFromParent();
    break;
  }
  case UnpackBitShift:
    return lowerUnpackBitShift(MI);
  case UnpackMinMax:
    return lowerUnpackMinMax(MI);
  // ... (case label elided)
    return lowerSplitTo16(MI);
  case Ext32To64: {
    const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
    MachineInstrBuilder Hi;
    switch (MI.getOpcode()) {
    case AMDGPU::G_ZEXT: {
      Hi = B.buildConstant({RB, S32}, 0);
      break;
    }
    case AMDGPU::G_SEXT: {
      // Replicate the sign bit of the 32-bit source into the high half.
      auto ShiftAmt = B.buildConstant({RB, S32}, 31);
      Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
      break;
    }
    case AMDGPU::G_ANYEXT: {
      Hi = B.buildUndef({RB, S32});
      break;
    }
    // ...
    }

    B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
                          {MI.getOperand(1).getReg(), Hi});
    MI.eraseFromParent();
    break;
  }
  case UniCstExt: {
    uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
    B.buildConstant(MI.getOperand(0).getReg(), ConstVal);
    MI.eraseFromParent();
    break;
  }
  case VgprToVccCopy: {
    Register Src = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(Src);
    // Keep only the lowest bit of each lane, then compare it against zero
    // to form the lane mask.
    Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
    if (Ty == S64) {
      auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
      auto One = B.buildConstant(VgprRB_S32, 1);
      auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
      auto Zero = B.buildConstant(VgprRB_S32, 0);
      auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
      B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
    } else {
      assert(Ty == S32 || Ty == S16);
      auto One = B.buildConstant({VgprRB, Ty}, 1);
      B.buildAnd(BoolSrc, Src, One);
    }
    auto Zero = B.buildConstant({VgprRB, Ty}, 0);
    // ... (icmp ne of BoolSrc against Zero into the vcc destination)
    MI.eraseFromParent();
    break;
  }
  case V_BFE:
    return lowerV_BFE(MI);
  case S_BFE:
    return lowerS_BFE(MI);
  case SplitTo32:
    return lowerSplitTo32(MI);
  case SplitTo32Select:
    return lowerSplitTo32Select(MI);
  case SplitTo32SExtInReg:
    return lowerSplitTo32SExtInReg(MI);
  case SplitLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = DstTy.getSizeInBits();
    // ... (breakdown chosen from total size and element type)
    else if (Size / 128 == 4)
      // ... (split into four 128-bit parts)
    else if (DstTy == S96)
      splitLoad(MI, {S64, S32}, S32);
    else if (DstTy == V3S32)
      splitLoad(MI, {V2S32, S32}, S32);
    else if (DstTy == V6S16)
      splitLoad(MI, {V4S16, V2S16}, V2S16);
    // ...
    break;
  }
  case WidenLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    // if (...) ... (first cases elided)
    else if (DstTy == V3S32)
      widenLoad(MI, V4S32, S32);
    else if (DstTy == V6S16)
      widenLoad(MI, V8S16, V2S16);
    // ...
    break;
  }
  case UnpackAExt:
    return lowerUnpackAExt(MI);
  // ... (remaining cases elided)
  }

  if (!WaterfallSgprs.empty()) {
    MachineBasicBlock::iterator I = MI.getIterator();
    executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs);
  }
}
    return isAnyPtr(Ty, 32) ? Ty : LLT();
  // ...
    return isAnyPtr(Ty, 64) ? Ty : LLT();
  // ...
    return isAnyPtr(Ty, 128) ? Ty : LLT();
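// applyMappingDst rewrites each def according to its mapping ID: the def is
// redirected to a fresh register of the mapped bank/type, and the original
// register is rebuilt right behind the instruction (trunc, copy,
// read-any-lane, ...) so all existing users stay valid.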
void RegBankLegalizeHelper::applyMappingDst(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
  // ... (loop over the def operands, advancing OpIdx)
    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[OpIdx]) {
    // ... (trivial cases elided)
    // Def that must be a lane mask: redirect it to a fresh vcc(s1), then
    // rebuild the original register via a vcc-to-scc copy plus trunc.
    {
      Register NewDst = MRI.createVirtualRegister(VccRB_S1);
      Op.setReg(NewDst);
      auto CopyS32_Vcc =
          B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
      B.buildTrunc(Reg, CopyS32_Vcc);
      break;
    }
    // ...
    // s16 def computed on VALU: any-extend to s32, read into an SGPR,
    // truncate back into the original register.
    {
      Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
      Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
      Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
      Op.setReg(NewVgprDstS16);
      B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
      buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
      B.buildTrunc(Reg, NewSgprDstS32);
      break;
    }
    // ...
    // Uniform defs produced on VALU: def a fresh VGPR here; the
    // read-any-lane back into the original SGPR follows in elided code.
    {
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      // ...
    }
    // ...
    {
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      // ...
    }
    // ...
    // Sub-32-bit uniform def: widen to sgpr(s32); truncate for remaining
    // users, if any.
    {
      Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
      Op.setReg(NewDst);
      if (!MRI.use_empty(Reg))
        B.buildTrunc(Reg, NewDst);
      break;
    }
void RegBankLegalizeHelper::applyMappingSrc(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
    SmallSet<Register, 4> &SgprWaterfallOperandRegs) {
  for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
    if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
      continue;

    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[i]) {
    // Lane-mask (vcc) source: an s1 produced on SALU is any-extended to s32
    // and copied into vcc via G_AMDGPU_COPY_VCC_SCC.
    {
      assert(RB == VccRB || RB == SgprRB);
      if (RB == SgprRB) {
        auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
        auto CopyVcc_Scc =
            B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
        Op.setReg(CopyVcc_Scc.getReg(0));
      }
      break;
    }
    // ...
    // SGPR sources: type and bank must already match the mapping.
      assert(Ty == getTyFromID(MethodIDs[i]));
      assert(RB == getRegBankFromID(MethodIDs[i]));
    // ...
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      assert(RB == getRegBankFromID(MethodIDs[i]));
    // ...
    // VGPR sources: insert an SGPR-to-VGPR copy when the value is uniform.
    {
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // ...
    {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    // ...
      assert(Ty == getTyFromID(MethodIDs[i]));
    // ...
    // Sub-32-bit scalar sources: widen to s32 with the requested extension.
    {
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      Op.setReg(Aext.getReg(0));
      break;
    }
    // ...
    // Any-extend, then normalize the bool to 0/1 with an AND against 1.
    {
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      // ...
      auto Cst1 = B.buildConstant(SgprRB_S32, 1);
      auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
      Op.setReg(BoolInReg.getReg(0));
      break;
    }
    // ...
    {
      auto Sext = B.buildSExt(SgprRB_S32, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    // ...
    {
      auto Zext = B.buildZExt({SgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    // ...
    {
      auto Sext = B.buildSExt({VgprRB, S32}, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    // ...
    {
      auto Zext = B.buildZExt({VgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    // ...
    }
  }
}
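// Uniform s1 phis are widened to s32: the phi def becomes sgpr(s32), with a
// trunc back to s1 for its users, and every incoming value is any-extended
// right after its own definition. Divergent s1 phis must already have been
// lowered to lane masks by an earlier pass.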
void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);

  if (Ty == LLT::scalar(1) && MUI.isUniform(Dst)) {
    B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());

    Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
    MI.getOperand(0).setReg(NewDst);
    B.buildTrunc(Dst, NewDst);

    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      Register UseReg = MI.getOperand(i).getReg();

      auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
      // ... (set the insert point right after the def, skipping phis and
      // labels in the defining block)

      auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
      MI.getOperand(i).setReg(NewUse.getReg(0));
    }

    return;
  }

  // All divergent s1 phis should have been lowered to lane masks already.
  if (Ty == LLT::scalar(1) && MUI.isDivergent(Dst)) {
    // ...
    llvm_unreachable("Make sure to run AMDGPUGlobalISelDivergenceLowering "
                     "before RegBankLegalize to lower lane mask(vcc) phis");
  }
  // ...
}
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB,
                                    MachineRegisterInfo &MRI,
                                    unsigned StartOpIdx, unsigned EndOpIdx) {
  for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
    if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
      return false;
  }
  return true;
}

void RegBankLegalizeHelper::applyMappingTrivial(MachineInstr &MI) {
  const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
  // Put RB on all registers.
  unsigned NumDefs = MI.getNumDefs();
  unsigned NumOperands = MI.getNumOperands();
  // ... (assert the defs/uses banks via verifyRegBankOnOperands)

  for (unsigned i = NumDefs; i < NumOperands; ++i) {
    Register Reg = MI.getOperand(i).getReg();
    if (MRI.getRegBank(Reg) != RB) {
      auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
      MI.getOperand(i).setReg(Copy.getReg(0));
    }
  }
}