27#include "llvm/IR/IntrinsicsAMDGPU.h"
29#define DEBUG_TYPE "amdgpu-regbanklegalize"
37 : MF(B.getMF()), ST(MF.getSubtarget<
GCNSubtarget>()), B(B),
38 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
39 RBLRules(RBLRules), IsWave32(ST.isWave32()),
40 SgprRB(&RBI.getRegBank(
AMDGPU::SGPRRegBankID)),
41 VgprRB(&RBI.getRegBank(
AMDGPU::VGPRRegBankID)),
42 AgprRB(&RBI.getRegBank(
AMDGPU::AGPRRegBankID)),
43 VccRB(&RBI.getRegBank(
AMDGPU::VCCRegBankID)) {}
49 "No AMDGPU RegBankLegalize rules defined for opcode",
57 "AMDGPU RegBankLegalize: none of the rules defined with "
58 "'Any' for MI's opcode matched MI",
66 B.setInsertPt(*
MI.getParent(), std::next(
MI.getIterator()));
76 if (!lower(
MI, *Mapping, WFI))
85 "Waterfall range not initialized");
102 const int OrigRangeSize = std::distance(BeginIt, EndIt);
110 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
136 MBB.addSuccessor(LoopBB);
139 B.setInsertPt(*LoopBB, LoopBB->
end());
190 auto NewEnd = BodyBB->
end();
191 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
204 auto OldVal = WaterfalledRegMap.
find(OldReg);
205 if (OldVal != WaterfalledRegMap.
end()) {
206 Op.setReg(OldVal->second);
220 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
222 unsigned NumParts = OpSize / PartSize;
228 CurrentLaneParts.
push_back(CurrentLaneReg);
230 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
231 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
232 for (
unsigned i = 0; i < NumParts; ++i) {
234 CurrentLaneParts.
push_back(UnmergeCurrLane.getReg(i));
238 for (
unsigned i = 0; i < NumParts; ++i) {
239 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
245 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
248 Op.setReg(CurrentLaneReg);
251 WaterfalledRegMap.
insert(std::pair(OldReg,
Op.getReg()));
257 MRI.createVirtualRegister({WaveRC,
LLT::scalar(IsWave32 ? 32 : 64)});
258 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
264 MRI.setSimpleHint(SavedExec, CondRegLM);
266 B.setInsertPt(*BodyBB, BodyBB->
end());
278 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
282 B.buildInstr(LMC.
MovOpc).addDef(SaveExecReg).addReg(LMC.
ExecReg);
285 B.setInsertPt(*RestoreExecBB, RestoreExecBB->
begin());
290 B.setInsertPt(*RemainderBB, RemainderBB->
begin());
295bool RegBankLegalizeHelper::splitLoad(MachineInstr &
MI,
297 MachineFunction &MF = B.getMF();
298 assert(
MI.getNumMemOperands() == 1);
299 MachineMemOperand &BaseMMO = **
MI.memoperands_begin();
301 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
303 LLT PtrTy = MRI.getType(
Base);
304 const RegisterBank *PtrRB = MRI.getRegBankOrNull(
Base);
308 unsigned ByteOffset = 0;
309 for (LLT PartTy : LLTBreakdown) {
311 if (ByteOffset == 0) {
312 BasePlusOffset =
Base;
314 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
318 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
319 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
320 LoadPartRegs.
push_back(LoadPart.getReg(0));
326 B.buildMergeLikeInstr(Dst, LoadPartRegs);
332 if (MRI.getType(
Reg) == MergeTy) {
335 auto Unmerge = B.buildUnmerge({DstRB, MergeTy},
Reg);
336 for (
unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
337 MergeTyParts.
push_back(Unmerge.getReg(i));
340 B.buildMergeLikeInstr(Dst, MergeTyParts);
342 MI.eraseFromParent();
346bool RegBankLegalizeHelper::widenLoad(MachineInstr &
MI, LLT WideTy,
348 MachineFunction &MF = B.getMF();
349 assert(
MI.getNumMemOperands() == 1);
350 MachineMemOperand &BaseMMO = **
MI.memoperands_begin();
352 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
355 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
356 auto WideLoad = B.buildLoad({DstRB, WideTy},
Base, *WideMMO);
359 B.buildTrunc(Dst, WideLoad);
362 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
364 LLT DstTy = MRI.getType(Dst);
366 for (
unsigned i = 0; i < NumElts; ++i) {
367 MergeTyParts.
push_back(Unmerge.getReg(i));
369 B.buildMergeLikeInstr(Dst, MergeTyParts);
371 MI.eraseFromParent();
375bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &
MI)
const {
378 MachineMemOperand &MMO =
MI.getMMO();
381 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
383 if (
MI.getOpcode() == G_LOAD) {
384 B.buildLoad(Dst, Ptr, *WideMMO);
386 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
388 if (
MI.getOpcode() == G_ZEXTLOAD) {
390 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
391 B.buildAnd(Dst, Load, MaskCst);
393 assert(
MI.getOpcode() == G_SEXTLOAD);
394 B.buildSExtInReg(Dst, Load, MemSize);
398 MI.eraseFromParent();
402bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &
MI) {
404 LLT Ty = MRI.getType(Dst);
406 unsigned Opc =
MI.getOpcode();
407 int TrueExtCst =
Opc == G_SEXT ? -1 : 1;
408 if (Ty == S32 || Ty == S16) {
409 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
410 auto False = B.buildConstant({VgprRB, Ty}, 0);
411 B.buildSelect(Dst, Src, True, False);
412 }
else if (Ty == S64) {
413 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
414 auto False = B.buildConstant({VgprRB_S32}, 0);
415 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
416 MachineInstrBuilder
Hi;
425 Hi = B.buildUndef({VgprRB_S32});
429 MF, MORE,
"amdgpu-regbanklegalize",
430 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported",
MI);
434 B.buildMergeValues(Dst, {
Lo.getReg(0),
Hi.getReg(0)});
437 MF, MORE,
"amdgpu-regbanklegalize",
438 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported",
MI);
442 MI.eraseFromParent();
446std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(
Register Reg) {
447 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
448 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
449 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
450 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
451 return {
Lo.getReg(0),
Hi.getReg(0)};
454std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(
Register Reg) {
455 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
456 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
457 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
458 return {
Lo.getReg(0),
Hi.getReg(0)};
461std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(
Register Reg) {
462 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
464 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
465 return {
Lo.getReg(0),
Hi.getReg(0)};
468std::pair<Register, Register>
469RegBankLegalizeHelper::unpackAExtTruncS16(
Register Reg) {
470 auto [Lo32, Hi32] = unpackAExt(
Reg);
471 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
472 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
475bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &
MI) {
477 switch (
MI.getOpcode()) {
478 case AMDGPU::G_SHL: {
479 auto [Val0, Val1] = unpackAExt(
MI.getOperand(1).getReg());
480 auto [Amt0, Amt1] = unpackAExt(
MI.getOperand(2).getReg());
481 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
482 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
485 case AMDGPU::G_LSHR: {
486 auto [Val0, Val1] = unpackZExt(
MI.getOperand(1).getReg());
487 auto [Amt0, Amt1] = unpackZExt(
MI.getOperand(2).getReg());
488 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
489 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
492 case AMDGPU::G_ASHR: {
493 auto [Val0, Val1] = unpackSExt(
MI.getOperand(1).getReg());
494 auto [Amt0, Amt1] = unpackSExt(
MI.getOperand(2).getReg());
495 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
496 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
501 MF, MORE,
"amdgpu-regbanklegalize",
502 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
506 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(), {Lo, Hi});
507 MI.eraseFromParent();
511bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &
MI) {
513 switch (
MI.getOpcode()) {
515 case AMDGPU::G_SMAX: {
517 auto [Val0_Lo, Val0_Hi] = unpackSExt(
MI.getOperand(1).getReg());
518 auto [Val1_Lo, Val1_Hi] = unpackSExt(
MI.getOperand(2).getReg());
519 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
521 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
526 case AMDGPU::G_UMAX: {
528 auto [Val0_Lo, Val0_Hi] = unpackZExt(
MI.getOperand(1).getReg());
529 auto [Val1_Lo, Val1_Hi] = unpackZExt(
MI.getOperand(2).getReg());
530 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
532 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
538 MF, MORE,
"amdgpu-regbanklegalize",
539 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented",
MI);
542 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(), {Lo, Hi});
543 MI.eraseFromParent();
547bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &
MI) {
548 auto [Op1Lo, Op1Hi] = unpackAExt(
MI.getOperand(1).getReg());
549 auto [Op2Lo, Op2Hi] = unpackAExt(
MI.getOperand(2).getReg());
550 auto ResLo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
551 auto ResHi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
552 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(),
553 {ResLo.getReg(0), ResHi.getReg(0)});
554 MI.eraseFromParent();
560 return (GI->is(Intrinsic::amdgcn_sbfe));
562 return MI.getOpcode() == AMDGPU::G_SBFX;
565bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &
MI) {
572 Register Src =
MI.getOperand(FirstOpnd).getReg();
573 Register LSBit =
MI.getOperand(FirstOpnd + 1).getReg();
574 Register Width =
MI.getOperand(FirstOpnd + 2).getReg();
579 unsigned SHROpc =
Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
580 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
588 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
589 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
590 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
591 MI.eraseFromParent();
595 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
596 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
597 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
598 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
599 auto Zero = B.buildConstant({VgprRB, S32}, 0);
600 unsigned BFXOpc =
Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
602 if (WidthImm <= 32) {
604 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo,
Zero, Width});
605 MachineInstrBuilder
Hi;
608 Hi = B.buildAShr(VgprRB_S32,
Lo, B.buildConstant(VgprRB_S32, 31));
613 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
615 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
617 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi,
Zero, Amt});
618 B.buildMergeLikeInstr(Dst, {SHRSrcLo,
Hi});
621 MI.eraseFromParent();
625bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &
MI) {
627 LLT Ty = MRI.getType(DstReg);
630 Register Src =
MI.getOperand(FirstOpnd).getReg();
631 Register LSBit =
MI.getOperand(FirstOpnd + 1).getReg();
632 Register Width =
MI.getOperand(FirstOpnd + 2).getReg();
639 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
640 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
641 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset,
Size);
642 unsigned Opc32 =
Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
643 unsigned Opc64 =
Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
644 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
648 auto S_BFE = B.buildInstr(
Opc, {{SgprRB, Ty}},
649 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
651 *ST.getRegisterInfo(), RBI);
653 B.buildCopy(DstReg,
S_BFE->getOperand(0).getReg());
654 MI.eraseFromParent();
658bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &
MI) {
660 LLT DstTy = MRI.getType(Dst);
661 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
662 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
663 auto Op1 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(1).
getReg());
664 auto Op2 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(2).
getReg());
665 unsigned Opc =
MI.getOpcode();
668 B.buildInstr(
Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)},
Flags);
670 B.buildInstr(
Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)},
Flags);
671 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
672 MI.eraseFromParent();
676bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &
MI) {
678 assert(MRI.getType(Dst) == S64);
679 auto Op1 = B.buildUnmerge({VgprRB_S32},
MI.getOperand(1).
getReg());
680 auto Op2 = B.buildUnmerge({VgprRB_S32},
MI.getOperand(2).
getReg());
684 auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
685 auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
686 auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
687 auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
688 auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
689 auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
691 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
692 MI.eraseFromParent();
696bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &
MI) {
698 assert(MRI.getType(Dst) == V2S16);
699 unsigned Opc =
MI.getOpcode();
700 unsigned NumOps =
MI.getNumOperands();
703 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(
MI.getOperand(1).getReg());
706 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo},
Flags);
707 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi},
Flags);
708 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
709 MI.eraseFromParent();
713 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(
MI.getOperand(2).getReg());
716 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo, Op2Lo},
Flags);
717 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi, Op2Hi},
Flags);
718 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
719 MI.eraseFromParent();
724 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(
MI.getOperand(3).getReg());
725 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo},
Flags);
726 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi},
Flags);
727 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
728 MI.eraseFromParent();
732bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &
MI) {
739 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
742 Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
743 Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
744 if (ST.hasScalarMulHiInsts()) {
745 B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
747 auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
748 auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
749 auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
760 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
761 B.buildConstant(Dst1, 0);
764 Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
765 Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
766 B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
768 auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
770 B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
771 B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
772 B.buildCopy(Dst1, AddHi.getReg(1));
775 MI.eraseFromParent();
779bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &
MI) {
781 LLT DstTy = MRI.getType(Dst);
782 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
784 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
785 auto Op2 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(2).
getReg());
786 auto Op3 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(3).
getReg());
790 B.buildSelect({VgprRB, Ty},
Cond, Op2.getReg(0), Op3.getReg(0), Flags);
792 B.buildSelect({VgprRB, Ty},
Cond, Op2.getReg(1), Op3.getReg(1), Flags);
794 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
795 MI.eraseFromParent();
799bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &
MI) {
800 auto Op1 = B.buildUnmerge(VgprRB_S32,
MI.getOperand(1).getReg());
801 int Amt =
MI.getOperand(2).getImm();
805 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
808 Lo = Freeze.getReg(0);
811 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
814 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
815 Hi = B.buildAShr(VgprRB_S32,
Lo, SignExtCst).getReg(0);
819 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
822 B.buildMergeLikeInstr(
MI.getOperand(0).getReg(), {Lo, Hi});
823 MI.eraseFromParent();
827bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &
MI) {
833 unsigned Opc =
MI.getOpcode();
842 case AMDGPU::G_AMDGPU_FFBH_U32:
844 AddOpc = AMDGPU::G_UADDSAT;
845 SearchFromMSB =
true;
847 case AMDGPU::G_AMDGPU_FFBL_B32:
849 AddOpc = AMDGPU::G_UADDSAT;
850 SearchFromMSB =
false;
852 case AMDGPU::G_CTLZ_ZERO_UNDEF:
853 FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
854 AddOpc = AMDGPU::G_ADD;
855 SearchFromMSB =
true;
857 case AMDGPU::G_CTTZ_ZERO_UNDEF:
858 FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
859 AddOpc = AMDGPU::G_ADD;
860 SearchFromMSB =
false;
866 auto Unmerge = B.buildUnmerge(VgprRB_S32,
MI.getOperand(1).getReg());
873 auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ?
Hi :
Lo});
875 B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ?
Lo :
Hi});
877 auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
878 {Secondary, B.buildConstant(VgprRB_S32, 32)});
879 B.buildUMin(
MI.getOperand(0).getReg(), Primary, Adjusted);
881 MI.eraseFromParent();
885bool RegBankLegalizeHelper::lowerExtrVecEltToSel(MachineInstr &
MI) {
897 LLT VecTy = MRI.getType(Src);
900 MachineRegisterInfo::VRegAttrs VgprRB_EltTy = {VgprRB, ScalarTy};
902 auto Unmerge = B.buildUnmerge(VgprRB_EltTy, Src);
905 Register PrevSelect = Unmerge.getReg(0);
906 for (
unsigned I = 1;
I < NumElts; ++
I) {
907 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)},
I);
910 B.buildSelect(VgprRB_EltTy, Cmp, Unmerge.getReg(
I), PrevSelect)
913 B.buildCopy(Dst, PrevSelect);
915 auto InitUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(0));
916 Register PrevLo = InitUnmerge.getReg(0);
917 Register PrevHi = InitUnmerge.getReg(1);
918 for (
unsigned I = 1;
I < NumElts; ++
I) {
919 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)},
I);
921 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(
I));
922 PrevLo = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(0), PrevLo)
924 PrevHi = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(1), PrevHi)
927 B.buildMergeLikeInstr(Dst, {PrevLo, PrevHi});
930 MF, MORE,
"amdgpu-regbanklegalize",
931 "AMDGPU RegBankLegalize: ExtrVecEltToSel unsupported element type",
MI);
935 MI.eraseFromParent();
939bool RegBankLegalizeHelper::lowerExtrVecEltTo32(MachineInstr &
MI) {
952 LLT SrcTy = MRI.getType(Src);
955 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
956 "expected VGPR src and SGPR idx");
958 auto CastSrc = B.buildBitcast({VgprRB, Vec32Ty}, Src);
961 auto One = B.buildConstant(SgprRB_S32, 1);
962 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
963 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
965 auto ExtLo = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxLo);
966 auto ExtHi = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxHi);
968 B.buildMergeLikeInstr(Dst, {ExtLo.getReg(0), ExtHi.getReg(0)});
970 MI.eraseFromParent();
974bool RegBankLegalizeHelper::lowerInsVecEltToSel(MachineInstr &
MI) {
987 LLT VecTy = MRI.getType(Src);
990 const RegisterBank *SrcRB = MRI.getRegBank(Src);
991 bool IsSGPR = (SrcRB == SgprRB);
992 SmallVector<Register, 16> Selects;
996 auto Unmerge = B.buildUnmerge(VgprRB_S32, Src);
997 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
998 Register EltLo = EltUnmerge.getReg(0);
999 Register EltHi = EltUnmerge.getReg(1);
1000 for (
unsigned I = 0;
I < NumElts; ++
I) {
1001 auto IdxConst = B.buildConstant(VgprRB_S32,
I);
1004 B.buildSelect(VgprRB_S32, Cmp, EltLo, Unmerge.getReg(2 *
I))
1007 B.buildSelect(VgprRB_S32, Cmp, EltHi, Unmerge.getReg(2 *
I + 1))
1011 auto Vec32 = B.buildBuildVector({VgprRB, Vec32Ty}, Selects);
1012 B.buildBitcast(Dst, Vec32);
1015 MachineRegisterInfo::VRegAttrs SrcRB_EltTy = {SrcRB, ScalarTy};
1016 MachineRegisterInfo::VRegAttrs CmpTy = IsSGPR ? SgprRB_S32 : VccRB_S1;
1017 auto Unmerge = B.buildUnmerge(SrcRB_EltTy, Src);
1018 for (
unsigned I = 0;
I < NumElts; ++
I) {
1019 auto IdxConst = B.buildConstant(SgprRB_S32,
I);
1022 B.buildSelect(SrcRB_EltTy, Cmp, Elt, Unmerge.getReg(
I)).getReg(0));
1024 B.buildMergeLikeInstr(Dst, Selects);
1027 MF, MORE,
"amdgpu-regbanklegalize",
1028 "AMDGPU RegBankLegalize: InsVecEltToSel unsupported element type",
MI);
1032 MI.eraseFromParent();
1036bool RegBankLegalizeHelper::lowerInsVecEltTo32(MachineInstr &
MI) {
1051 LLT SrcTy = MRI.getType(Src);
1054 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1055 "expected VGPR src and SGPR idx");
1057 MachineRegisterInfo::VRegAttrs VgprRB_Vec32Ty = {VgprRB, Vec32Ty};
1059 auto CastSrc = B.buildBitcast(VgprRB_Vec32Ty, Src);
1060 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1063 auto One = B.buildConstant(SgprRB_S32, 1);
1064 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
1065 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
1067 auto InsLo = B.buildInsertVectorElement(VgprRB_Vec32Ty, CastSrc,
1068 EltUnmerge.getReg(0), IdxLo);
1069 auto InsHi = B.buildInsertVectorElement(VgprRB_Vec32Ty, InsLo,
1070 EltUnmerge.getReg(1), IdxHi);
1072 B.buildBitcast(Dst, InsHi);
1074 MI.eraseFromParent();
1078bool RegBankLegalizeHelper::lowerAbsToNegMax(MachineInstr &
MI) {
1088 LLT Ty = MRI.getType(DstReg);
1094 Zero = B.buildBuildVector({VgprRB, Ty}, {Zero16, Zero16}).
getReg(0);
1096 assert((Ty == S32 || Ty == S16) &&
"unexpected type for AbsToNegMax");
1097 Zero = B.buildConstant({VgprRB, Ty}, 0).
getReg(0);
1100 auto Neg = B.buildSub({VgprRB, Ty},
Zero, SrcReg);
1101 B.buildSMax(DstReg, SrcReg, Neg);
1102 MI.eraseFromParent();
1106bool RegBankLegalizeHelper::lowerAbsToS32(MachineInstr &
MI) {
1116 auto Bitcast = B.buildBitcast({SgprRB_S32},
MI.getOperand(1).
getReg());
1117 auto SextInReg = B.buildSExtInReg({SgprRB_S32},
Bitcast, 16);
1119 B.buildAShr({SgprRB_S32},
Bitcast, B.buildConstant({SgprRB_S32}, 16));
1121 auto AbsLo = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {SextInReg});
1122 auto AbsHi = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {ShiftHi});
1123 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(),
1124 {AbsLo.getReg(0), AbsHi.getReg(0)});
1126 MI.eraseFromParent();
1130bool RegBankLegalizeHelper::lower(MachineInstr &
MI,
1138 return lowerVccExtToSel(
MI);
1140 LLT Ty = MRI.getType(
MI.getOperand(0).getReg());
1141 auto True = B.buildConstant({SgprRB, Ty},
1142 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
1143 auto False = B.buildConstant({SgprRB, Ty}, 0);
1147 B.buildSelect(
MI.getOperand(0).getReg(),
MI.getOperand(1).getReg(), True,
1149 MI.eraseFromParent();
1153 return lowerUnpackBitShift(
MI);
1155 return lowerUnpackMinMax(
MI);
1157 return lowerSplitTo16(
MI);
1159 const RegisterBank *RB = MRI.getRegBank(
MI.getOperand(0).getReg());
1160 MachineInstrBuilder
Hi;
1161 switch (
MI.getOpcode()) {
1162 case AMDGPU::G_ZEXT: {
1163 Hi = B.buildConstant({RB, S32}, 0);
1166 case AMDGPU::G_SEXT: {
1168 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
1169 Hi = B.buildAShr({RB, S32},
MI.getOperand(1).
getReg(), ShiftAmt);
1172 case AMDGPU::G_ANYEXT: {
1173 Hi = B.buildUndef({RB, S32});
1178 "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
1183 B.buildMergeLikeInstr(
MI.getOperand(0).getReg(),
1184 {MI.getOperand(1).getReg(), Hi});
1185 MI.eraseFromParent();
1189 uint64_t ConstVal =
MI.getOperand(1).getCImm()->getZExtValue();
1190 B.buildConstant(
MI.getOperand(0).getReg(), ConstVal);
1192 MI.eraseFromParent();
1197 LLT Ty = MRI.getType(Src);
1201 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
1203 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
1204 auto One = B.buildConstant(VgprRB_S32, 1);
1205 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
1206 auto Zero = B.buildConstant(VgprRB_S32, 0);
1207 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
1208 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
1210 assert(Ty == S32 || Ty == S16);
1211 auto One = B.buildConstant({VgprRB, Ty}, 1);
1212 B.buildAnd(BoolSrc, Src, One);
1214 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
1216 MI.eraseFromParent();
1220 return lowerV_BFE(
MI);
1222 return lowerS_BFE(
MI);
1224 return lowerUniMAD64(
MI);
1226 B.buildMul(
MI.getOperand(0),
MI.getOperand(1),
MI.getOperand(2));
1227 MI.eraseFromParent();
1231 auto Op1 = B.buildTrunc(VgprRB_S32,
MI.getOperand(1));
1232 auto Op2 = B.buildTrunc(VgprRB_S32,
MI.getOperand(2));
1233 auto Zero = B.buildConstant({VgprRB, S64}, 0);
1235 unsigned NewOpc =
MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
1236 ? AMDGPU::G_AMDGPU_MAD_U64_U32
1237 : AMDGPU::G_AMDGPU_MAD_I64_I32;
1239 B.buildInstr(NewOpc, {
MI.getOperand(0).getReg(), {SgprRB, S32}},
1241 MI.eraseFromParent();
1245 return lowerSplitTo32(
MI);
1247 return lowerSplitTo32Mul(
MI);
1249 return lowerSplitTo32Select(
MI);
1251 return lowerSplitTo32SExtInReg(
MI);
1253 LLT DstTy = MRI.getType(
MI.getOperand(0).getReg());
1264 if (
Size / 128 == 2)
1266 else if (
Size / 128 == 4)
1270 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1276 else if (DstTy == S96)
1277 splitLoad(
MI, {S64, S32}, S32);
1278 else if (DstTy == V3S32)
1279 splitLoad(
MI, {V2S32, S32}, S32);
1280 else if (DstTy == V6S16)
1281 splitLoad(
MI, {V4S16, V2S16}, V2S16);
1284 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1291 LLT DstTy = MRI.getType(
MI.getOperand(0).getReg());
1293 widenLoad(
MI, S128);
1294 else if (DstTy == V3S32)
1295 widenLoad(
MI, V4S32, S32);
1296 else if (DstTy == V6S16)
1297 widenLoad(
MI, V8S16, V2S16);
1300 "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
1307 return lowerUnpackAExt(
MI);
1312 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1318 return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
1320 B.setInstrAndDebugLoc(
MI);
1321 for (
unsigned i =
MI.getNumDefs(); i <
MI.getNumOperands(); ++i) {
1322 MachineOperand &
Op =
MI.getOperand(i);
1326 if (MRI.getRegBank(
Reg) != VgprRB) {
1327 auto Copy = B.buildCopy({VgprRB, MRI.getType(
Reg)},
Reg);
1328 Op.setReg(
Copy.getReg(0));
1338 "AMDGPU RegBankLegalize: unmerge not multiple of 32",
1343 B.setInstrAndDebugLoc(
MI);
1346 B.buildUnmerge({SgprRB, V2S16}, Unmerge->
getSourceReg());
1347 for (
unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
1348 auto [Dst0S32, Dst1S32] =
1349 unpackAExt(UnmergeV2S16->getOperand(i).getReg());
1350 B.buildTrunc(
MI.getOperand(i * 2).getReg(), Dst0S32);
1351 B.buildTrunc(
MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
1354 auto [Dst0S32, Dst1S32] = unpackAExt(
MI.getOperand(2).getReg());
1355 B.buildTrunc(
MI.getOperand(0).getReg(), Dst0S32);
1356 B.buildTrunc(
MI.getOperand(1).getReg(), Dst1S32);
1359 MI.eraseFromParent();
1364 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1365 B.setInsertPt(*
MI.getParent(),
MI.getParent()->getFirstNonPHI());
1366 MI.getOperand(0).setReg(NewDst);
1367 B.buildTrunc(Dst, NewDst);
1369 for (
unsigned i = 1; i <
MI.getNumOperands(); i += 2) {
1377 auto NewUse = B.buildAnyExt(SgprRB_S32,
UseReg);
1378 MI.getOperand(i).setReg(NewUse.getReg(0));
1386 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1391 assert(MRI.getRegBankOrNull(
MI.getOperand(0).getReg()) == VgprRB);
1395 const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
1396 return RB == VgprRB || RB == SgprRB;
1401 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1406 unsigned RsrcIdx = RSrcIntrin->
RsrcArg +
MI.getNumExplicitDefs() + 1;
1407 return applyRegisterBanksVgprWithSgprRsrc(
MI, RsrcIdx);
1413 unsigned RsrcIdx =
MI.getNumOperands();
1414 while (RsrcIdx-- >
MI.getNumExplicitDefs()) {
1415 const MachineOperand &
Op =
MI.getOperand(RsrcIdx);
1416 if (
Op.isReg() &&
Op.getReg().isVirtual())
1419 return applyRegisterBanksVgprWithSgprRsrc(
MI, RsrcIdx);
1422 return lowerSplitBitCount64To32(
MI);
1424 return lowerExtrVecEltToSel(
MI);
1426 return lowerExtrVecEltTo32(
MI);
1428 return lowerInsVecEltToSel(
MI);
1430 return lowerInsVecEltTo32(
MI);
1432 return lowerAbsToNegMax(
MI);
1434 return lowerAbsToS32(
MI);
1438 if (!executeInWaterfallLoop(B, WFI))
1544 return isAnyPtr(Ty, 32) ? Ty : LLT();
1547 return isAnyPtr(Ty, 64) ? Ty : LLT();
1550 return isAnyPtr(Ty, 128) ? Ty : LLT();
1594 const SIRegisterInfo *
TRI =
1595 static_cast<const SIRegisterInfo *
>(MRI.getTargetRegisterInfo());
1597 if (LLTSize >= 32 &&
TRI->getSGPRClassForBitWidth(LLTSize))
1602 const SIRegisterInfo *
TRI =
1603 static_cast<const SIRegisterInfo *
>(MRI.getTargetRegisterInfo());
1712bool RegBankLegalizeHelper::applyMappingDst(
1713 MachineInstr &
MI,
unsigned &
OpIdx,
1714 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
1719 MachineOperand &
Op =
MI.getOperand(
OpIdx);
1721 LLT Ty = MRI.getType(
Reg);
1722 [[maybe_unused]]
const RegisterBank *RB = MRI.getRegBank(
Reg);
1724 switch (MethodIDs[
OpIdx]) {
1796 Register NewAgprDst = MRI.createVirtualRegister({AgprRB, Ty});
1797 Op.setReg(NewAgprDst);
1798 if (!MRI.use_nodbg_empty(
Reg))
1799 B.buildCopy(
Reg, NewAgprDst);
1806 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
1808 if (!MRI.use_empty(
Reg)) {
1810 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
1811 B.buildTrunc(
Reg, CopyS32_Vcc);
1818 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
1819 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
1820 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
1821 Op.setReg(NewVgprDstS16);
1822 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
1824 B.buildTrunc(
Reg, NewSgprDstS32);
1835 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1836 Op.setReg(NewVgprDst);
1849 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1850 Op.setReg(NewVgprDst);
1858 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1860 if (!MRI.use_empty(
Reg))
1861 B.buildTrunc(
Reg, NewDst);
1868 Op.setReg(MRI.createVirtualRegister({SgprRB, Ty}));
1869 B.buildCopy(
Reg,
Op.getReg());
1874 MF, MORE,
"amdgpu-regbanklegalize",
1875 "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for",
MI);
1880 MF, MORE,
"amdgpu-regbanklegalize",
1881 "AMDGPU RegBankLegalize: applyMappingDst, ID not supported",
MI);
1889bool RegBankLegalizeHelper::applyMappingSrc(
1890 MachineInstr &
MI,
unsigned &
OpIdx,
1891 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
1893 for (
unsigned i = 0; i < MethodIDs.
size(); ++
OpIdx, ++i) {
1894 if (MethodIDs[i] ==
None || MethodIDs[i] ==
IntrId || MethodIDs[i] ==
Imm)
1897 MachineOperand &
Op =
MI.getOperand(
OpIdx);
1899 LLT Ty = MRI.getType(
Reg);
1900 const RegisterBank *RB = MRI.getRegBank(
Reg);
1902 switch (MethodIDs[i]) {
1905 assert(RB == VccRB || RB == SgprRB);
1907 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
1909 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
1910 Op.setReg(CopyVcc_Scc.getReg(0));
1928 assert(Ty == getTyFromID(MethodIDs[i]));
1929 assert(RB == getRegBankFromID(MethodIDs[i]));
1943 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1944 assert(RB == getRegBankFromID(MethodIDs[i]));
1968 assert(Ty == getTyFromID(MethodIDs[i]));
1970 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
1971 Op.setReg(CopyToVgpr.getReg(0));
1987 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1989 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
1990 Op.setReg(CopyToVgpr.getReg(0));
1996 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
1997 Op.setReg(CopyToVgpr.getReg(0));
2003 auto CopyToAgpr = B.buildCopy({AgprRB, Ty},
Reg);
2004 Op.setReg(CopyToAgpr.getReg(0));
2011 assert(Ty == getTyFromID(MethodIDs[i]));
2016 WFI.
End = std::next(
MI.getIterator());
2023 assert(Ty == getTyFromID(MethodIDs[i]));
2029 while (
Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
2034 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
2038 B.setInsertPt(*
MI.getParent(), Start);
2047 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2051 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2057 assert(Ty == getTyFromID(MethodIDs[i]));
2061 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2071 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
2072 Op.setReg(Aext.getReg(0));
2079 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
2082 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
2083 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
2084 Op.setReg(BoolInReg.getReg(0));
2090 auto Sext = B.buildSExt(SgprRB_S32,
Reg);
2091 Op.setReg(Sext.getReg(0));
2097 auto Zext = B.buildZExt({SgprRB, S32},
Reg);
2098 Op.setReg(Zext.getReg(0));
2104 auto Aext = B.buildAnyExt({VgprRB, S32},
Reg);
2105 Op.setReg(Aext.getReg(0));
2112 auto Sext = B.buildSExt({VgprRB, S32},
Reg);
2113 Op.setReg(Sext.getReg(0));
2120 auto Zext = B.buildZExt({VgprRB, S32},
Reg);
2121 Op.setReg(Zext.getReg(0));
2126 MF, MORE,
"amdgpu-regbanklegalize",
2127 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported",
MI);
2137 unsigned StartOpIdx,
2138 unsigned EndOpIdx) {
2139 for (
unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
2146bool RegBankLegalizeHelper::applyRegisterBanksVgprWithSgprRsrc(
2147 MachineInstr &
MI,
unsigned RsrcIdx) {
2148 const unsigned NumDefs =
MI.getNumExplicitDefs();
2150 MachineBasicBlock *
MBB =
MI.getParent();
2154 for (
unsigned i = 0; i < NumDefs; ++i) {
2156 if (MRI.getRegBank(
Reg) == VgprRB)
2159 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(
Reg)});
2160 MI.getOperand(i).setReg(NewVgprDst);
2164 B.setInstrAndDebugLoc(
MI);
2167 for (
unsigned i = NumDefs; i < RsrcIdx; ++i) {
2168 MachineOperand &
Op =
MI.getOperand(i);
2176 if (MRI.getRegBank(
Reg) == VgprRB)
2179 auto Copy = B.buildCopy({VgprRB, MRI.getType(
Reg)},
Reg);
2180 Op.setReg(
Copy.getReg(0));
2183 SmallSet<Register, 4> OpsToWaterfall;
2186 for (
unsigned i = RsrcIdx; i <
MI.getNumOperands(); ++i) {
2187 MachineOperand &
Op =
MI.getOperand(i);
2192 if (MRI.getRegBank(
Reg) != SgprRB)
2196 if (!OpsToWaterfall.
empty()) {
2198 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
Provides AMDGPU specific target descriptions.
static bool isSignedBFE(MachineInstr &MI)
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB, MachineRegisterInfo &MRI, unsigned StartOpIdx, unsigned EndOpIdx)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic operations.
static Register UseReg(const MachineOperand &MO)
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
const SmallVectorImpl< MachineOperand > & Cond
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned MovTermOpc
const unsigned AndSaveExecOpc
bool findRuleAndApplyMapping(MachineInstr &MI)
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
const SIRegisterInfo * getRegisterInfo() const override
Represents a call to an intrinsic.
Register getSourceReg() const
Get the unmerge source register.
constexpr bool isScalar() const
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e. the number of whole bytes needed to represent the size in bits.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
TypeSize getValue() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor blocks which refer to FromMBB to refer to this block.
LLVM_ABI iterator SkipPHIsAndLabels(iterator I)
Return the first instruction in MBB after I that is not a PHI or a label.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before 'Where'.
MachineInstrBundleIterator< MachineInstr > iterator
BasicBlockListType::iterator iterator
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Helper class to build MachineInstr.
bool isValid() const
Check for null.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank * getRegBank(Register Reg) const
Return the register bank of Reg.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified register class.
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
Holds all the information related to register banks.
This class implements the register bank concept.
Wrapper class representing virtual and physical registers.
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ SgprV4S32_ReadFirstLane
bool isAnyPtr(LLT Ty, unsigned Width)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
@ VerifyAllSgprOrVgprGPHI
@ AextToS32InIncomingBlockGPHI
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI void reportGISelFailure(MachineFunction &MF, MachineOptimizationRemarkEmitter &MORE, MachineOptimizationRemarkMissed &R)
Report an ISel error as a missed optimization remark to the LLVMContext's diagnostic stream.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
LoweringMethodID LoweringMethod
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
Holds waterfall loop information: the set of SGPR operand registers that need waterfalling, and the Start and End iterators of the instruction range to execute inside the loop.
MachineBasicBlock::iterator Start
SmallSet< Register, 4 > SgprWaterfallOperandRegs
MachineBasicBlock::iterator End