#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-regbanklegalize"
RegBankLegalizeHelper::RegBankLegalizeHelper(
    MachineIRBuilder &B, const MachineUniformityInfo &MUI,
    const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
    : MF(B.getMF()), ST(MF.getSubtarget<GCNSubtarget>()), B(B),
      MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
      RBLRules(RBLRules), IsWave32(ST.isWave32()),
      SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
      VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
      VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
46 "No AMDGPU RegBankLegalize rules defined for opcode",
54 "AMDGPU RegBankLegalize: none of the rules defined with "
55 "'Any' for MI's opcode matched MI",
63 B.setInsertPt(*
MI.getParent(), std::next(
MI.getIterator()));
73 if (!lower(
MI, *Mapping, WaterfallSgprs))
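// Waterfall-loop lowering for instructions that need uniform (SGPR) operands
// but were given divergent values: each iteration reads the value held by the
// first active lane with readfirstlane and runs the instruction with exec
// restricted to the lanes that hold that same value, until all lanes are
// done. Rough sketch of the emitted control flow (block names match the code
// below; the pseudo-MIR is illustrative, not exact output):
//
//   MBB:           SaveExecReg = s_mov exec            ; fall through
//   LoopBB:        CurrentLaneReg = readfirstlane Op
//                  CondReg   = icmp eq CurrentLaneReg, Op  ; AND-ed per part
//                  CondRegLM = ballot CondReg
//                  SavedExec = s_and_saveexec CondRegLM
//   BodyBB:        ... the waterfalled range ...
//                  exec = s_xor_term exec, SavedExec   ; clear finished lanes
//                  SI_WATERFALL_LOOP LoopBB            ; repeat while exec != 0
//   RestoreExecBB: exec = s_mov_term SaveExecReg
//   RemainderBB:   rest of the original block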
bool RegBankLegalizeHelper::executeInWaterfallLoop(
    MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
    SmallSet<Register, 4> &SGPROperandRegs) {
  // Track use registers that were already expanded with a readfirstlane
  // sequence, so each divergent value is waterfalled only once.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction &MF = B.getMF();

  const TargetRegisterClass *WaveRC =
      ST.getRegisterInfo()->getWaveMaskRegClass();
  unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
  if (IsWave32) {
    MovExecOpc = AMDGPU::S_MOV_B32;
    MovExecTermOpc = AMDGPU::S_MOV_B32_term;
    XorTermOpc = AMDGPU::S_XOR_B32_term;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    ExecReg = AMDGPU::EXEC_LO;
  } else {
    MovExecOpc = AMDGPU::S_MOV_B64;
    MovExecTermOpc = AMDGPU::S_MOV_B64_term;
    XorTermOpc = AMDGPU::S_XOR_B64_term;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    ExecReg = AMDGPU::EXEC;
  }

  [[maybe_unused]] const int OrigRangeSize =
      std::distance(Range.begin(), Range.end());

  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);

  Register SavedExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block: move everything after
  // Range into RemainderBB and insert LoopBB/BodyBB/RestoreExecBB before it.
  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF.insert(MBBI, LoopBB);
  MF.insert(MBBI, BodyBB);
  MF.insert(MBBI, RestoreExecBB);
  MF.insert(MBBI, RemainderBB);

  LoopBB->addSuccessor(BodyBB);
  BodyBB->addSuccessor(RestoreExecBB);
  BodyBB->addSuccessor(LoopBB);

  // Move the rest of the block into the new remainder block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  // ... (phi of the saved exec mask across loop iterations)

  // Move the instruction range into the loop body; the bounds of Range are
  // invalidated by the splice.
  BodyBB->splice(BodyBB->begin(), &MBB, Range.begin(), MBB.end());
  MachineBasicBlock::iterator NewBegin = Range.begin()->getIterator();
  auto NewEnd = BodyBB->end();
  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  Register CondReg;

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.all_uses()) {
      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if this register was already processed by another instruction in
      // the sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      // TODO: support for agpr
      assert(MRI.getRegBank(OpReg) == VgprRB);
      Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
      buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);

      // Build the comparison(s), splitting wide values into 64- or 32-bit
      // pieces.
      unsigned OpSize = OpTy.getSizeInBits();
      unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
      LLT PartTy = LLT::scalar(PartSize);
      unsigned NumParts = OpSize / PartSize;
      SmallVector<Register, 8> OpParts;
      SmallVector<Register, 8> CurrentLaneParts;

      if (NumParts == 1) {
        OpParts.push_back(OpReg);
        CurrentLaneParts.push_back(CurrentLaneReg);
      } else {
        auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
        auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
        for (unsigned i = 0; i < NumParts; ++i) {
          OpParts.push_back(UnmergeOp.getReg(i));
          CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
        }
      }

      for (unsigned i = 0; i < NumParts; ++i) {
        Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
        B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);

        if (!CondReg.isValid())
          CondReg = CmpReg;
        else
          CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
      }

      Op.setReg(CurrentLaneReg);

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
    }
  }

  // Get a lane mask that covers all lanes holding the same value, then
  // restrict exec to them.
  Register CondRegLM =
      MRI.createVirtualRegister({WaveRC, LLT::scalar(IsWave32 ? 32 : 64)});
  B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);

  B.buildInstr(AndSaveExecOpc)
      .addDef(SavedExec)
      .addReg(CondRegLM, RegState::Kill);
  MRI.setSimpleHint(SavedExec, CondRegLM);

  B.setInsertPt(*BodyBB, BodyBB->end());

  // Update exec: switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);

  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

  // Save the exec mask before the loop.
  B.setInsertPt(MBB, MBB.end());
  B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);

  // Restore the exec mask after the loop.
  B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
  B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);

  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}
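// Split a load whose result type has no direct lowering into several smaller
// loads. For example (a sketch, assuming the S96 breakdown used by lower()
// below), a s96 load becomes an s64 load at offset 0 plus an s32 load at
// offset 8; when the part sizes differ, the pieces are re-unmerged to MergeTy
// (s32) before the final merge:
//   %lo:_(s64) = G_LOAD %base           ; offset 0
//   %hi:_(s32) = G_LOAD %base + 8       ; offset 8
//   %l0:_(s32), %l1:_(s32) = G_UNMERGE_VALUES %lo
//   %dst:_(s96) = G_MERGE_VALUES %l0, %l1, %hi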
bool RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
                                      ArrayRef<LLT> LLTBreakdown,
                                      LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(Base);
  const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base);
  LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
  SmallVector<Register, 4> LoadPartRegs;

  unsigned ByteOffset = 0;
  for (LLT PartTy : LLTBreakdown) {
    Register BasePlusOffset;
    if (ByteOffset == 0) {
      BasePlusOffset = Base;
    } else {
      auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
      BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0);
    }
    auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
    auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
    LoadPartRegs.push_back(LoadPart.getReg(0));
    ByteOffset += PartTy.getSizeInBytes();
  }

  if (!MergeTy.isValid()) {
    // Loads are all of the same size: concat or merge them together.
    B.buildMergeLikeInstr(Dst, LoadPartRegs);
  } else {
    // Loads are not all of the same size: unmerge them to smaller pieces of
    // MergeTy type, then merge the pieces into Dst.
    SmallVector<Register, 4> MergeTyParts;
    for (Register Reg : LoadPartRegs) {
      if (MRI.getType(Reg) == MergeTy) {
        MergeTyParts.push_back(Reg);
      } else {
        auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg);
        for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
          MergeTyParts.push_back(Unmerge.getReg(i));
      }
    }
    B.buildMergeLikeInstr(Dst, MergeTyParts);
  }
  MI.eraseFromParent();
  return true;
}
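// Widen a load to the next type that has a lowering and discard the extra
// bits: for a scalar the wide result is truncated, for a vector it is
// unmerged into MergeTy elements and only the elements covering the original
// type are merged back (e.g., sketch: load v4s32 in place of v3s32 and drop
// the fourth element).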
bool RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
                                      LLT MergeTy) {
  MachineFunction &MF = B.getMF();
  assert(MI.getNumMemOperands() == 1);
  MachineMemOperand &BaseMMO = **MI.memoperands_begin();
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
  Register Base = MI.getOperand(1).getReg();

  MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
  auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO);

  if (WideTy.isScalar()) {
    B.buildTrunc(Dst, WideLoad);
  } else {
    SmallVector<Register, 4> MergeTyParts;
    auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);

    LLT DstTy = MRI.getType(Dst);
    unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits();
    for (unsigned i = 0; i < NumElts; ++i) {
      MergeTyParts.push_back(Unmerge.getReg(i));
    }
    B.buildMergeLikeInstr(Dst, MergeTyParts);
  }
  MI.eraseFromParent();
  return true;
}
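// Uniform (scalar) loads can't load less than 32 bits. Widen the MMO to 32
// bits and emulate the original extending-load semantics: G_ZEXTLOAD masks
// the low MemSize bits, G_SEXTLOAD sign-extends them in-register. A sketch
// for an 8-bit zextload:
//   %wide:sgpr(s32) = G_LOAD %ptr        ; 4-byte MMO
//   %mask:sgpr(s32) = G_CONSTANT i32 255
//   %dst:sgpr(s32)  = G_AND %wide, %mask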
bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &MI) const {
  Register Dst = MI.getDstReg();
  Register Ptr = MI.getPointerReg();
  MachineMemOperand &MMO = MI.getMMO();
  unsigned MemSize = 8 * MMO.getSize().getValue();

  MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);

  if (MI.getOpcode() == G_LOAD) {
    B.buildLoad(Dst, Ptr, *WideMMO);
  } else {
    auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);

    if (MI.getOpcode() == G_ZEXTLOAD) {
      APInt Mask = APInt::getLowBitsSet(32, MemSize);
      auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
      B.buildAnd(Dst, Load, MaskCst);
    } else {
      assert(MI.getOpcode() == G_SEXTLOAD);
      B.buildSExtInReg(Dst, Load, MemSize);
    }
  }

  MI.eraseFromParent();
  return true;
}
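// Extending a vcc (divergent lane-mask S1) value can't be done with a plain
// G_*EXT, since vcc lives in a wave mask register. Lower it to a select
// between the extended 'true' and 'false' constants; sketch for G_SEXT to
// s32:
//   %true:vgpr(s32)  = G_CONSTANT i32 -1
//   %false:vgpr(s32) = G_CONSTANT i32 0
//   %dst:vgpr(s32)   = G_SELECT %src:vcc(s1), %true, %false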
bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);
  Register Src = MI.getOperand(1).getReg();
  unsigned Opc = MI.getOpcode();
  int TrueExtCst = Opc == G_SEXT ? -1 : 1;
  if (Ty == S32 || Ty == S16) {
    auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
    auto False = B.buildConstant({VgprRB, Ty}, 0);
    B.buildSelect(Dst, Src, True, False);
  } else if (Ty == S64) {
    auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
    auto False = B.buildConstant({VgprRB_S32}, 0);
    auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
    MachineInstrBuilder Hi;
    if (Opc == G_SEXT) {
      // Sign bit fills all 64 bits: the high half equals the low half.
      Hi = Lo;
    } else if (Opc == G_ZEXT) {
      Hi = B.buildConstant({VgprRB_S32}, 0);
    } else if (Opc == G_ANYEXT) {
      Hi = B.buildUndef({VgprRB_S32});
    } else {
      reportGISelFailure(
          MF, MORE, "amdgpu-regbanklegalize",
          "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported",
          MI);
      return false;
    }
    B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
  } else {
    reportGISelFailure(
        MF, MORE, "amdgpu-regbanklegalize",
        "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported", MI);
    return false;
  }
  MI.eraseFromParent();
  return true;
}
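// Unpack helpers for uniform packed 16-bit operations: the SALU has no 16-bit
// or v2s16 instructions, so bitcast the v2s16 value to s32 and extract both
// halves as s32, using whichever extension (zero, sign, or any) the consuming
// operation needs. Results are repacked with G_BUILD_VECTOR_TRUNC.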
std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
  auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}
std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
  auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}
std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
  auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
  // Low half needs no masking: the consumer ignores the bits above bit 15.
  auto Lo = PackedS32;
  auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
  return {Lo.getReg(0), Hi.getReg(0)};
}
std::pair<Register, Register>
RegBankLegalizeHelper::unpackAExtTruncS16(Register Reg) {
  auto [Lo32, Hi32] = unpackAExt(Reg);
  return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
          B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
}
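// The extension choice mirrors shift semantics: G_SHL only consumes the low
// 16 bits of the value (any-extend is enough), G_LSHR must see zeros above
// bit 15 (zero-extend), and G_ASHR must see copies of the sign bit
// (sign-extend).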
bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
  Register Lo, Hi;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SHL: {
    auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_LSHR: {
    auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
    break;
  }
  case AMDGPU::G_ASHR: {
    auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
    auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
    Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
    Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
    break;
  }
  default:
    reportGISelFailure(
        MF, MORE, "amdgpu-regbanklegalize",
        "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
        MI);
    return false;
  }
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
  return true;
}
bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
  Register Lo, Hi;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX: {
    // Signed min/max compare correctly on sign-extended halves.
    auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
    auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
             .getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
             .getReg(0);
    break;
  }
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    // Unsigned min/max compare correctly on zero-extended halves.
    auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
    auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
    Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
             .getReg(0);
    Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
             .getReg(0);
    break;
  }
  default:
    reportGISelFailure(
        MF, MORE, "amdgpu-regbanklegalize",
        "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented", MI);
    return false;
  }
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
  return true;
}
bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &MI) {
  auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
  auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
  auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
  auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
  B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
                          {ResLo.getReg(0), ResHi.getReg(0)});
  MI.eraseFromParent();
  return true;
}
static bool isSignedBFE(MachineInstr &MI) {
  if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))
    return (GI->is(Intrinsic::amdgcn_sbfe));

  return MI.getOpcode() == AMDGPU::G_SBFX;
}
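// Divergent 64-bit bitfield extract. With a constant width the extract is
// done with 32-bit V_BFE (G_SBFX/G_UBFX) pieces; otherwise it falls back to
// the double-shift sequence also used by SelectionDAG. Sketch for signed,
// variable width:
//   %shr:vgpr(s64) = G_ASHR %src, %lsbit      ; field now at bit 0
//   %amt:vgpr(s32) = G_SUB 64, %width
//   %shl:vgpr(s64) = G_SHL %shr, %amt         ; field at the MSB
//   %dst:vgpr(s64) = G_ASHR %shl, %amt        ; back down, sign-filled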
bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == S64);
  bool Signed = isSignedBFE(MI);
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  // Extract a bitfield from Src: LSBit is the least-significant bit of the
  // field, Width the number of bits to extract.
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();
  // Move the field down to bit 0; the bits above it are cleaned up below.
  unsigned SHROpc = Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
  auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});

  auto ConstWidth = getIConstantVRegValWithLookThrough(Width, MRI);

  // Non-constant width: clear the bits above the field by shifting it up to
  // the MSB and back down, sign- or zero-filling as required.
  if (!ConstWidth) {
    auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
    auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
    B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
    MI.eraseFromParent();
    return true;
  }

  // Constant width: use 32-bit bitfield extracts on the two halves.
  uint64_t WidthImm = ConstWidth->Value.getZExtValue();
  auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
  Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
  Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
  auto Zero = B.buildConstant({VgprRB, S32}, 0);
  unsigned BFXOpc = Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;

  if (WidthImm <= 32) {
    // The whole field lives in the low half; the high half is a copy of the
    // sign bit or zero.
    auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo, Zero, Width});
    MachineInstrBuilder Hi;
    if (Signed) {
      Hi = B.buildAShr(VgprRB_S32, Lo, B.buildConstant(VgprRB_S32, 31));
    } else {
      Hi = Zero;
    }
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  } else {
    // The low half of the field is SHRSrcLo as-is; extract the remaining
    // bits from the high half.
    auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
    auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi, Zero, Amt});
    B.buildMergeLikeInstr(Dst, {SHRSrcLo, Hi});
  }
  MI.eraseFromParent();
  return true;
}
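// Uniform bitfield extract maps to S_BFE_{I|U}{32|64}, which packs both
// control values into its second source operand: field offset in the low six
// bits and field width in bits [22:16]. FieldOffset and Size below build
// that packed operand.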
bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  bool Signed = isSignedBFE(MI);
  unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
  Register Src = MI.getOperand(FirstOpnd).getReg();
  Register LSBit = MI.getOperand(FirstOpnd + 1).getReg();
  Register Width = MI.getOperand(FirstOpnd + 2).getReg();

  auto Mask = B.buildConstant(SgprRB_S32, maskTrailingOnes<unsigned>(6));
  auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
  auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
  auto Src1 = B.buildOr(SgprRB_S32, FieldOffset, Size);
  unsigned Opc32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
  unsigned Opc64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
  unsigned Opc = Ty == S32 ? Opc32 : Opc64;

  // Select the machine instruction directly; because of reg class
  // constraining, insert copies between reg class and reg bank.
  auto S_BFE = B.buildInstr(Opc, {{SgprRB, Ty}},
                            {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
  if (!constrainSelectedInstRegOperands(*S_BFE, *ST.getInstrInfo(),
                                        *ST.getRegisterInfo(), RBI)) {
    reportGISelFailure(
        MF, MORE, "amdgpu-regbanklegalize",
        "AMDGPU RegBankLegalize: lowerS_BFE, failed to constrain BFE", MI);
    return false;
  }

  B.buildCopy(DstReg, S_BFE->getOperand(0).getReg());
  MI.eraseFromParent();
  return true;
}
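// Scalarize an operation on S64/V2S32/V4S16 into two 32-bit (or v2s16)
// halves: unmerge both sources, redo the operation per half with the
// original flags, then merge the halves back.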
bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op1 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(1).getReg());
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  unsigned Opc = MI.getOpcode();
  auto Flags = MI.getFlags();
  auto Lo =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)}, Flags);
  auto Hi =
      B.buildInstr(Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)}, Flags);
  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  return true;
}
bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == V2S16);
  unsigned Opc = MI.getOpcode();
  auto Flags = MI.getFlags();

  // Unary operation: unpack the single source.
  if (MI.getNumOperands() == 2) {
    auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
    auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo}, Flags);
    auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi}, Flags);
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
    MI.eraseFromParent();
    return true;
  }

  // Binary operation: unpack both sources.
  auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(MI.getOperand(1).getReg());
  auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(MI.getOperand(2).getReg());
  auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags);
  auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags);
  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  return true;
}
bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
         isAnyPtr(DstTy, 64));
  LLT Ty = DstTy == V4S16 ? V2S16 : S32;
  auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
  auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
  Register Cond = MI.getOperand(1).getReg();
  auto Flags = MI.getFlags();
  auto Lo =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
  auto Hi =
      B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);

  B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  return true;
}
bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
  auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
  int Amt = MI.getOperand(2).getImm();
  Register Lo, Hi;
  // Hi|Lo: 's' sign bit, '?'/'x' bits changed/not changed by sign-extend.
  if (Amt <= 32) {
    auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
    if (Amt == 32) {
      // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
      Lo = Freeze.getReg(0);
    } else {
      // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
      Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
    }

    auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
    Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
  } else {
    // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
    Lo = Op1.getReg(0);
    Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
  }

  B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
  MI.eraseFromParent();
  return true;
}
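// Central dispatch for the non-trivial lowerings above. The lowering method
// comes from the matched rule's RegBankLLTMapping; after the switch, any
// registers collected into WaterfallSgprs get a waterfall loop wrapped
// around MI (see executeInWaterfallLoop).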
bool RegBankLegalizeHelper::lower(MachineInstr &MI,
                                  const RegBankLLTMapping &Mapping,
                                  SmallSet<Register, 4> &WaterfallSgprs) {
  switch (Mapping.LoweringMethod) {
  case DoNotLower:
    break;
  case VccExtToSel:
    return lowerVccExtToSel(MI);
  case UniExtToSel: {
    LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    auto True = B.buildConstant({SgprRB, Ty},
                                MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
    auto False = B.buildConstant({SgprRB, Ty}, 0);
    // The uniform S1 input is already legal as a select condition.
    B.buildSelect(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), True,
                  False);
    MI.eraseFromParent();
    break;
  }
  case UnpackBitShift:
    return lowerUnpackBitShift(MI);
  case UnpackMinMax:
    return lowerUnpackMinMax(MI);
  case SplitTo16:
    return lowerSplitTo16(MI);
  case Ext32To64: {
    const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
    MachineInstrBuilder Hi;
    switch (MI.getOpcode()) {
    case AMDGPU::G_ZEXT: {
      Hi = B.buildConstant({RB, S32}, 0);
      break;
    }
    case AMDGPU::G_SEXT: {
      // Replicate sign bit from the 32-bit extended part.
      auto ShiftAmt = B.buildConstant({RB, S32}, 31);
      Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
      break;
    }
    case AMDGPU::G_ANYEXT: {
      Hi = B.buildUndef({RB, S32});
      break;
    }
    default:
      reportGISelFailure(
          MF, MORE, "amdgpu-regbanklegalize",
          "AMDGPU RegBankLegalize: Ext32To64, unsupported opcode", MI);
      return false;
    }

    B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
                          {MI.getOperand(1).getReg(), Hi});
    MI.eraseFromParent();
    break;
  }
  case UniCstExt: {
    uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
    B.buildConstant(MI.getOperand(0).getReg(), ConstVal);
    MI.eraseFromParent();
    break;
  }
  case VgprToVccCopy: {
    Register Src = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(Src);
    // Take the lowest bit from each lane and put it into a lane mask.
    // Lowering via compare, but the artifacts have to be cleaned out first.
    Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
    if (Ty == S64) {
      auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
      auto One = B.buildConstant(VgprRB_S32, 1);
      auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
      auto Zero = B.buildConstant(VgprRB_S32, 0);
      auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
      B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
    } else {
      assert(Ty == S32 || Ty == S16);
      auto One = B.buildConstant({VgprRB, Ty}, 1);
      B.buildAnd(BoolSrc, Src, One);
    }
    auto Zero = B.buildConstant({VgprRB, Ty}, 0);
    B.buildICmp(CmpInst::ICMP_NE, MI.getOperand(0).getReg(), BoolSrc, Zero);
    MI.eraseFromParent();
    break;
  }
  case V_BFE:
    return lowerV_BFE(MI);
  case S_BFE:
    return lowerS_BFE(MI);
  case SplitTo32:
    return lowerSplitTo32(MI);
  case SplitTo32Select:
    return lowerSplitTo32Select(MI);
  case SplitTo32SExtInReg:
    return lowerSplitTo32SExtInReg(MI);
  case SplitLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = DstTy.getSizeInBits();
    // Even split to 128-bit loads.
    if (Size > 128) {
      LLT B128;
      if (DstTy.isVector()) {
        LLT EltTy = DstTy.getElementType();
        B128 = LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
      } else {
        B128 = LLT::scalar(128);
      }
      if (Size / 128 == 2)
        splitLoad(MI, {B128, B128});
      else if (Size / 128 == 4)
        splitLoad(MI, {B128, B128, B128, B128});
      else {
        reportGISelFailure(
            MF, MORE, "amdgpu-regbanklegalize",
            "AMDGPU RegBankLegalize: SplitLoad, unsupported type", MI);
        return false;
      }
    }
    // Split into a 64- and a 32-bit load.
    else if (DstTy == S96)
      splitLoad(MI, {S64, S32}, S32);
    else if (DstTy == V3S32)
      splitLoad(MI, {V2S32, S32}, S32);
    else if (DstTy == V6S16)
      splitLoad(MI, {V4S16, V2S16}, V2S16);
    else {
      reportGISelFailure(
          MF, MORE, "amdgpu-regbanklegalize",
          "AMDGPU RegBankLegalize: SplitLoad, unsupported type", MI);
      return false;
    }
    break;
  }
  case WidenLoad: {
    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
    if (DstTy == S96)
      widenLoad(MI, S128);
    else if (DstTy == V3S32)
      widenLoad(MI, V4S32, S32);
    else if (DstTy == V6S16)
      widenLoad(MI, V8S16, V2S16);
    else {
      reportGISelFailure(
          MF, MORE, "amdgpu-regbanklegalize",
          "AMDGPU RegBankLegalize: WidenLoad, unsupported type", MI);
      return false;
    }
    break;
  }
  case UnpackAExt:
    return lowerUnpackAExt(MI);
  default:
    break;
  }

  if (!WaterfallSgprs.empty()) {
    MachineBasicBlock::iterator I = MI.getIterator();
    if (!executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                WaterfallSgprs))
      return false;
  }
  return true;
}
// In getBTyFromID, the pointer-width mapping IDs accept a pointer of any
// address space with the matching size:
  // ...
    return isAnyPtr(Ty, 32) ? Ty : LLT();
  // ...
    return isAnyPtr(Ty, 64) ? Ty : LLT();
  // ...
    return isAnyPtr(Ty, 128) ? Ty : LLT();
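// Apply the mapping to def operands. IDs whose bank and type already match
// need no code; the interesting cases route a def through a temporary
// register. Sketch for a uniform S1 def that the selected instruction
// produces in vcc:
//   %vcc_dst:vcc(s1)   = <MI>                           ; MI now defines vcc
//   %scc:sgpr(s32)     = G_AMDGPU_COPY_SCC_VCC %vcc_dst
//   %orig_dst:sgpr(s1) = G_TRUNC %scc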
bool RegBankLegalizeHelper::applyMappingDst(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
  // Defs start from operand 0; OpIdx carries over into applyMappingSrc.
  for (; OpIdx < MethodIDs.size(); ++OpIdx) {
    if (MethodIDs[OpIdx] == None)
      continue;

    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    [[maybe_unused]] const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[OpIdx]) {
    // IDs whose bank and type already match only carry asserts; elided here.
    // ...

    // Uniform S1 def produced in vcc: give MI a vcc def, then rebuild the
    // original sgpr S1 through scc.
    case UniInVcc: {
      assert(Ty == S1);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(VccRB_S1);
      Op.setReg(NewDst);
      if (!MRI.use_empty(Reg)) {
        auto CopyS32_Vcc =
            B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
        B.buildTrunc(Reg, CopyS32_Vcc);
      }
      break;
    }

    // Uniform S16 def computed on the VALU: compute into a vgpr S16, any-ext
    // to S32, read the uniform value back to sgpr and truncate.
    case UniInVgprS16: {
      assert(Ty == S16);
      assert(RB == SgprRB);
      Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
      Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
      Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
      Op.setReg(NewVgprDstS16);
      B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
      buildReadAnyLane(B, NewSgprDstS32, NewVgprDstS32, RBI);
      B.buildTrunc(Reg, NewSgprDstS32);
      break;
    }

    // Uniform def computed on the VALU: compute into a vgpr and read the
    // (uniform) value back to the original sgpr def.
    case UniInVgprS32: {
      assert(Ty == getTyFromID(MethodIDs[OpIdx]));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }
    case UniInVgprB32:
    case UniInVgprB64:
    case UniInVgprB96:
    case UniInVgprB128: {
      assert(Ty == getBTyFromID(MethodIDs[OpIdx], Ty));
      assert(RB == SgprRB);
      Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
      Op.setReg(NewVgprDst);
      buildReadAnyLane(B, Reg, NewVgprDst, RBI);
      break;
    }

    // Sub-32-bit uniform def: sgpr S1/S16 are not selectable, keep an S32
    // def instead and truncate back for the original uses.
    case Sgpr32Trunc: {
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
      Op.setReg(NewDst);
      if (!MRI.use_empty(Reg))
        B.buildTrunc(Reg, NewDst);
      break;
    }

    case InvalidMapping: {
      reportGISelFailure(
          MF, MORE, "amdgpu-regbanklegalize",
          "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for",
          MI);
      return false;
    }
    default:
      reportGISelFailure(
          MF, MORE, "amdgpu-regbanklegalize",
          "AMDGPU RegBankLegalize: applyMappingDst, ID not supported", MI);
      return false;
    }
  }
  return true;
}
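// Apply the mapping to use operands, inserting extends, sgpr-to-vgpr copies,
// or scc-to-vcc copies as needed. Divergent values in operands that require
// sgpr are only recorded in SgprWaterfallOperandRegs here; the waterfall loop
// itself is built later, in lower().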
bool RegBankLegalizeHelper::applyMappingSrc(
    MachineInstr &MI, unsigned &OpIdx,
    const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
    SmallSet<Register, 4> &SgprWaterfallOperandRegs) {
  for (unsigned i = 0; i < MethodIDs.size(); ++OpIdx, ++i) {
    if (MethodIDs[i] == None || MethodIDs[i] == IntrId || MethodIDs[i] == Imm)
      continue;

    MachineOperand &Op = MI.getOperand(OpIdx);
    Register Reg = Op.getReg();
    LLT Ty = MRI.getType(Reg);
    const RegisterBank *RB = MRI.getRegBank(Reg);

    switch (MethodIDs[i]) {
    case Vcc: {
      assert(Ty == S1);
      assert(RB == VccRB || RB == SgprRB);
      // An sgpr S1 has to be any-extended and copied to vcc through scc.
      if (RB == SgprRB) {
        auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
        auto CopyVcc_Scc =
            B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
        Op.setReg(CopyVcc_Scc.getReg(0));
      }
      break;
    }

    // Sgpr IDs: the operand must already be uniform with a matching type.
    case Sgpr16:
    case Sgpr32:
    case Sgpr64: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }
    case SgprB32:
    case SgprB64:
    case SgprB96:
    case SgprB128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      assert(RB == getRegBankFromID(MethodIDs[i]));
      break;
    }

    // Vgpr IDs: a uniform value is legal in a vgpr operand, just copy it.
    case Vgpr16:
    case Vgpr32:
    case Vgpr64: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }
    case VgprB32:
    case VgprB64:
    case VgprB96:
    case VgprB128: {
      assert(Ty == getBTyFromID(MethodIDs[i], Ty));
      if (RB != VgprRB) {
        auto CopyToVgpr = B.buildCopy({VgprRB, Ty}, Reg);
        Op.setReg(CopyToVgpr.getReg(0));
      }
      break;
    }

    // Sgpr-with-waterfall IDs: a divergent value here is handled by wrapping
    // MI in a waterfall loop; just record the register.
    case Sgpr32_WF: {
      assert(Ty == getTyFromID(MethodIDs[i]));
      if (RB != SgprRB)
        SgprWaterfallOperandRegs.insert(Reg);
      break;
    }

    case Sgpr32AExt: {
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      Op.setReg(Aext.getReg(0));
      break;
    }
    case Sgpr32AExtBoolInReg: {
      // An S1 stored in a wider uniform register: any-extend, then clear all
      // bits above bit zero.
      assert(Ty.getSizeInBits() == 1);
      assert(RB == SgprRB);
      auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
      auto Cst1 = B.buildConstant(SgprRB_S32, 1);
      auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
      Op.setReg(BoolInReg.getReg(0));
      break;
    }
    case Sgpr32SExt: {
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Sext = B.buildSExt(SgprRB_S32, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    case Sgpr32ZExt: {
      assert(Ty.getSizeInBits() < 32);
      assert(RB == SgprRB);
      auto Zext = B.buildZExt({SgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    case Vgpr32SExt: {
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Sext = B.buildSExt({VgprRB, S32}, Reg);
      Op.setReg(Sext.getReg(0));
      break;
    }
    case Vgpr32ZExt: {
      assert(Ty.getSizeInBits() < 32);
      assert(RB == VgprRB);
      auto Zext = B.buildZExt({VgprRB, S32}, Reg);
      Op.setReg(Zext.getReg(0));
      break;
    }
    default:
      reportGISelFailure(
          MF, MORE, "amdgpu-regbanklegalize",
          "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported", MI);
      return false;
    }
  }
  return true;
}
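// G_PHI handling: uniform S1 phis are widened to S32 on the SALU, since
// there is no way to select an sgpr S1 phi. Each incoming value is
// any-extended right after its def and the phi result is truncated back.
// Sketch:
//   bb.1: %in_ext:sgpr(s32) = G_ANYEXT %in:sgpr(s1)
//   bb.2: %phi:sgpr(s32)    = G_PHI %in_ext, %bb.1, ...
//         %dst:sgpr(s1)     = G_TRUNC %phi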
bool RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(Dst);

  if (Ty == LLT::scalar(1) && MUI.isUniform(Dst)) {
    B.setInsertPt(*MI.getParent(), MI.getParent()->getFirstNonPHI());

    Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
    MI.getOperand(0).setReg(NewDst);
    B.buildTrunc(Dst, NewDst);

    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      Register UseReg = MI.getOperand(i).getReg();

      auto DefMI = MRI.getVRegDef(UseReg)->getIterator();
      MachineBasicBlock *DefMBB = DefMI->getParent();

      B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));

      auto NewUse = B.buildAnyExt(SgprRB_S32, UseReg);
      MI.getOperand(i).setReg(NewUse.getReg(0));
    }

    return true;
  }

  // All divergent S1 phis should have been lowered to lane-mask phis before
  // this pass; they are not expected here.
  if (Ty == LLT::scalar(1) && MUI.isDivergent(Dst)) {
    reportGISelFailure(
        MF, MORE, "amdgpu-regbanklegalize",
        "AMDGPU RegBankLegalize: Can't lower divergent S1 G_PHI", MI);
    return false;
  }

  // ... (phis of all remaining supported types are accepted and left as-is;
  // only unsupported types reach the failure below)
  reportGISelFailure(MF, MORE, "amdgpu-regbanklegalize",
                     "AMDGPU RegBankLegalize: type not supported for G_PHI",
                     MI);
  return false;
}
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB,
                                    MachineRegisterInfo &MRI,
                                    unsigned StartOpIdx, unsigned EndOpIdx) {
  for (unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
    if (MRI.getRegBankOrNull(MI.getOperand(i).getReg()) != RB)
      return false;
  }
  return true;
}

void RegBankLegalizeHelper::applyMappingTrivial(MachineInstr &MI) {
  const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
  // Put RB on all registers.
  unsigned NumDefs = MI.getNumDefs();
  unsigned NumOperands = MI.getNumOperands();

  assert(verifyRegBankOnOperands(MI, RB, MRI, 0, NumDefs - 1));
  if (RB == SgprRB)
    assert(verifyRegBankOnOperands(MI, RB, MRI, NumDefs, NumOperands - 1));

  // Vgpr instructions accept uniform inputs through plain copies.
  if (RB == VgprRB) {
    B.setInsertPt(*MI.getParent(), MI.getIterator());
    for (unsigned i = NumDefs; i < NumOperands; ++i) {
      Register Reg = MI.getOperand(i).getReg();
      if (MRI.getRegBank(Reg) != RB) {
        auto Copy = B.buildCopy({VgprRB, MRI.getType(Reg)}, Reg);
        MI.getOperand(i).setReg(Copy.getReg(0));
      }
    }
  }
}