27#include "llvm/IR/IntrinsicsAMDGPU.h"
29#define DEBUG_TYPE "amdgpu-regbanklegalize"
37 : MF(B.getMF()), ST(MF.getSubtarget<
GCNSubtarget>()), B(B),
38 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
39 RBLRules(RBLRules), IsWave32(ST.isWave32()),
40 SgprRB(&RBI.getRegBank(
AMDGPU::SGPRRegBankID)),
41 VgprRB(&RBI.getRegBank(
AMDGPU::VGPRRegBankID)),
42 AgprRB(&RBI.getRegBank(
AMDGPU::AGPRRegBankID)),
43 VccRB(&RBI.getRegBank(
AMDGPU::VCCRegBankID)) {}
49 "No AMDGPU RegBankLegalize rules defined for opcode",
57 "AMDGPU RegBankLegalize: none of the rules defined with "
58 "'Any' for MI's opcode matched MI",
66 B.setInsertPt(*
MI.getParent(), std::next(
MI.getIterator()));
76 if (!lower(
MI, *Mapping, WFI))
85 "Waterfall range not initialized");
102 const int OrigRangeSize = std::distance(BeginIt, EndIt);
110 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
136 MBB.addSuccessor(LoopBB);
139 B.setInsertPt(*LoopBB, LoopBB->
end());
190 auto NewEnd = BodyBB->
end();
191 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
204 auto OldVal = WaterfalledRegMap.
find(OldReg);
205 if (OldVal != WaterfalledRegMap.
end()) {
206 Op.setReg(OldVal->second);
220 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
222 unsigned NumParts = OpSize / PartSize;
228 CurrentLaneParts.
push_back(CurrentLaneReg);
230 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
231 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
232 for (
unsigned i = 0; i < NumParts; ++i) {
234 CurrentLaneParts.
push_back(UnmergeCurrLane.getReg(i));
238 for (
unsigned i = 0; i < NumParts; ++i) {
239 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
245 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
248 Op.setReg(CurrentLaneReg);
251 WaterfalledRegMap.
insert(std::pair(OldReg,
Op.getReg()));
257 MRI.createVirtualRegister({WaveRC,
LLT::scalar(IsWave32 ? 32 : 64)});
258 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
264 MRI.setSimpleHint(SavedExec, CondRegLM);
266 B.setInsertPt(*BodyBB, BodyBB->
end());
278 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
282 B.buildInstr(LMC.
MovOpc).addDef(SaveExecReg).addReg(LMC.
ExecReg);
285 B.setInsertPt(*RestoreExecBB, RestoreExecBB->
begin());
290 B.setInsertPt(*RemainderBB, RemainderBB->
begin());
295bool RegBankLegalizeHelper::splitLoad(MachineInstr &
MI,
297 MachineFunction &MF = B.getMF();
298 assert(
MI.getNumMemOperands() == 1);
299 MachineMemOperand &BaseMMO = **
MI.memoperands_begin();
301 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
303 LLT PtrTy = MRI.getType(
Base);
304 const RegisterBank *PtrRB = MRI.getRegBankOrNull(
Base);
308 unsigned ByteOffset = 0;
309 for (LLT PartTy : LLTBreakdown) {
311 if (ByteOffset == 0) {
312 BasePlusOffset =
Base;
314 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
318 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
319 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
320 LoadPartRegs.
push_back(LoadPart.getReg(0));
326 B.buildMergeLikeInstr(Dst, LoadPartRegs);
332 if (MRI.getType(
Reg) == MergeTy) {
335 auto Unmerge = B.buildUnmerge({DstRB, MergeTy},
Reg);
336 for (
unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
337 MergeTyParts.
push_back(Unmerge.getReg(i));
340 B.buildMergeLikeInstr(Dst, MergeTyParts);
342 MI.eraseFromParent();
346bool RegBankLegalizeHelper::widenLoad(MachineInstr &
MI, LLT WideTy,
348 MachineFunction &MF = B.getMF();
349 assert(
MI.getNumMemOperands() == 1);
350 MachineMemOperand &BaseMMO = **
MI.memoperands_begin();
352 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
355 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
356 auto WideLoad = B.buildLoad({DstRB, WideTy},
Base, *WideMMO);
359 B.buildTrunc(Dst, WideLoad);
362 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
364 LLT DstTy = MRI.getType(Dst);
366 for (
unsigned i = 0; i < NumElts; ++i) {
367 MergeTyParts.
push_back(Unmerge.getReg(i));
369 B.buildMergeLikeInstr(Dst, MergeTyParts);
371 MI.eraseFromParent();
375bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &
MI)
const {
378 MachineMemOperand &MMO =
MI.getMMO();
381 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
383 if (
MI.getOpcode() == G_LOAD) {
384 B.buildLoad(Dst, Ptr, *WideMMO);
386 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
388 if (
MI.getOpcode() == G_ZEXTLOAD) {
390 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
391 B.buildAnd(Dst, Load, MaskCst);
393 assert(
MI.getOpcode() == G_SEXTLOAD);
394 B.buildSExtInReg(Dst, Load, MemSize);
398 MI.eraseFromParent();
402bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &
MI) {
404 LLT Ty = MRI.getType(Dst);
406 unsigned Opc =
MI.getOpcode();
407 int TrueExtCst =
Opc == G_SEXT ? -1 : 1;
408 if (Ty == S32 || Ty == S16) {
409 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
410 auto False = B.buildConstant({VgprRB, Ty}, 0);
411 B.buildSelect(Dst, Src, True, False);
412 }
else if (Ty == S64) {
413 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
414 auto False = B.buildConstant({VgprRB_S32}, 0);
415 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
416 MachineInstrBuilder
Hi;
425 Hi = B.buildUndef({VgprRB_S32});
429 MF, MORE,
"amdgpu-regbanklegalize",
430 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported",
MI);
434 B.buildMergeValues(Dst, {
Lo.getReg(0),
Hi.getReg(0)});
437 MF, MORE,
"amdgpu-regbanklegalize",
438 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported",
MI);
442 MI.eraseFromParent();
// Splits Reg into a {Lo, Hi} pair of registers, each created with the
// SgprRB_S32 attributes:
//   Lo = bitcast(Reg) & 0x0000ffff   (bits above 15 cleared)
//   Hi = bitcast(Reg) >> 16          (logical shift, zero-filled from the top)
// Returns the two result registers in that order.
// NOTE(review): presumably Reg is a packed 32-bit value such as <2 x s16> and
// SgprRB_S32 means "SGPR bank, s32"; neither is defined in this view - confirm.
// NOTE(review): the leading numbers on these lines and the absent closing
// brace appear to be artifacts of the listing this text was extracted from.
446std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(
Register Reg) {
// Reinterpret the whole input as a single value so it can be masked/shifted.
447 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
// Constant used to keep only the low 16 bits.
448 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
// Low half: everything above bit 15 is masked away.
449 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
// High half: logical shift right by 16 moves it into the low bits.
450 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
451 return {
Lo.getReg(0),
Hi.getReg(0)};
// Splits Reg into a {Lo, Hi} pair of registers, each created with the
// SgprRB_S32 attributes:
//   Lo = sext_inreg(bitcast(Reg), 16)   (sign-extended from bit 15)
//   Hi = bitcast(Reg) >> 16             (arithmetic shift, sign-filled)
// Returns the two result registers in that order.
// NOTE(review): presumably Reg is a packed 32-bit value such as <2 x s16> and
// SgprRB_S32 means "SGPR bank, s32"; neither is defined in this view - confirm.
// NOTE(review): the leading numbers on these lines and the absent closing
// brace appear to be artifacts of the listing this text was extracted from.
454std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(
Register Reg) {
// Reinterpret the whole input as a single value so it can be shifted.
455 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
// Low half: sign-extend in place from a 16-bit field.
456 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
// High half: arithmetic shift right by 16 keeps the sign of the upper field.
457 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
458 return {
Lo.getReg(0),
Hi.getReg(0)};
461std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(
Register Reg) {
462 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
464 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
465 return {
Lo.getReg(0),
Hi.getReg(0)};
// Unpacks Reg with unpackAExt() and then truncates each of the two results
// with the SgprRB_S16 attributes. Returns the registers of the two truncs as
// {truncated Lo, truncated Hi}.
// NOTE(review): presumably SgprRB_S16 means "SGPR bank, s16"; it is defined
// outside this view - confirm.
// NOTE(review): the leading numbers on these lines and the absent closing
// brace appear to be artifacts of the listing this text was extracted from.
468std::pair<Register, Register>
469RegBankLegalizeHelper::unpackAExtTruncS16(
Register Reg) {
// Get the two halves from unpackAExt().
470 auto [Lo32, Hi32] = unpackAExt(
Reg);
// Truncate each half and hand back the resulting registers.
471 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
472 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
475bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &
MI) {
477 switch (
MI.getOpcode()) {
478 case AMDGPU::G_SHL: {
479 auto [Val0, Val1] = unpackAExt(
MI.getOperand(1).getReg());
480 auto [Amt0, Amt1] = unpackAExt(
MI.getOperand(2).getReg());
481 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
482 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
485 case AMDGPU::G_LSHR: {
486 auto [Val0, Val1] = unpackZExt(
MI.getOperand(1).getReg());
487 auto [Amt0, Amt1] = unpackZExt(
MI.getOperand(2).getReg());
488 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
489 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
492 case AMDGPU::G_ASHR: {
493 auto [Val0, Val1] = unpackSExt(
MI.getOperand(1).getReg());
494 auto [Amt0, Amt1] = unpackSExt(
MI.getOperand(2).getReg());
495 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
496 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
501 MF, MORE,
"amdgpu-regbanklegalize",
502 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
506 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(), {Lo, Hi});
507 MI.eraseFromParent();
511bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &
MI) {
513 switch (
MI.getOpcode()) {
515 case AMDGPU::G_SMAX: {
517 auto [Val0_Lo, Val0_Hi] = unpackSExt(
MI.getOperand(1).getReg());
518 auto [Val1_Lo, Val1_Hi] = unpackSExt(
MI.getOperand(2).getReg());
519 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
521 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
526 case AMDGPU::G_UMAX: {
528 auto [Val0_Lo, Val0_Hi] = unpackZExt(
MI.getOperand(1).getReg());
529 auto [Val1_Lo, Val1_Hi] = unpackZExt(
MI.getOperand(2).getReg());
530 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
532 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
538 MF, MORE,
"amdgpu-regbanklegalize",
539 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented",
MI);
542 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(), {Lo, Hi});
543 MI.eraseFromParent();
547bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &
MI) {
548 auto [Op1Lo, Op1Hi] = unpackAExt(
MI.getOperand(1).getReg());
549 auto [Op2Lo, Op2Hi] = unpackAExt(
MI.getOperand(2).getReg());
550 auto ResLo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
551 auto ResHi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
552 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(),
553 {ResLo.getReg(0), ResHi.getReg(0)});
554 MI.eraseFromParent();
560 return (GI->is(Intrinsic::amdgcn_sbfe));
562 return MI.getOpcode() == AMDGPU::G_SBFX;
565bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &
MI) {
572 Register Src =
MI.getOperand(FirstOpnd).getReg();
573 Register LSBit =
MI.getOperand(FirstOpnd + 1).getReg();
574 Register Width =
MI.getOperand(FirstOpnd + 2).getReg();
579 unsigned SHROpc =
Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
580 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
588 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
589 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
590 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
591 MI.eraseFromParent();
595 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
596 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
597 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
598 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
599 auto Zero = B.buildConstant({VgprRB, S32}, 0);
600 unsigned BFXOpc =
Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
602 if (WidthImm <= 32) {
604 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo,
Zero, Width});
605 MachineInstrBuilder
Hi;
608 Hi = B.buildAShr(VgprRB_S32,
Lo, B.buildConstant(VgprRB_S32, 31));
613 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
615 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
617 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi,
Zero, Amt});
618 B.buildMergeLikeInstr(Dst, {SHRSrcLo,
Hi});
621 MI.eraseFromParent();
625bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &
MI) {
627 LLT Ty = MRI.getType(DstReg);
630 Register Src =
MI.getOperand(FirstOpnd).getReg();
631 Register LSBit =
MI.getOperand(FirstOpnd + 1).getReg();
632 Register Width =
MI.getOperand(FirstOpnd + 2).getReg();
639 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
640 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
641 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset,
Size);
642 unsigned Opc32 =
Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
643 unsigned Opc64 =
Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
644 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
648 auto S_BFE = B.buildInstr(
Opc, {{SgprRB, Ty}},
649 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
651 *ST.getRegisterInfo(), RBI);
653 B.buildCopy(DstReg,
S_BFE->getOperand(0).getReg());
654 MI.eraseFromParent();
658bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &
MI) {
660 LLT DstTy = MRI.getType(Dst);
661 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
662 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
663 auto Op1 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(1).
getReg());
664 auto Op2 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(2).
getReg());
665 unsigned Opc =
MI.getOpcode();
668 B.buildInstr(
Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)},
Flags);
670 B.buildInstr(
Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)},
Flags);
671 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
672 MI.eraseFromParent();
676bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &
MI) {
678 assert(MRI.getType(Dst) == S64);
679 auto Op1 = B.buildUnmerge({VgprRB_S32},
MI.getOperand(1).
getReg());
680 auto Op2 = B.buildUnmerge({VgprRB_S32},
MI.getOperand(2).
getReg());
684 auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
685 auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
686 auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
687 auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
688 auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
689 auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
691 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
692 MI.eraseFromParent();
696bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &
MI) {
698 assert(MRI.getType(Dst) == V2S16);
699 unsigned Opc =
MI.getOpcode();
700 unsigned NumOps =
MI.getNumOperands();
703 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(
MI.getOperand(1).getReg());
706 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo},
Flags);
707 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi},
Flags);
708 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
709 MI.eraseFromParent();
713 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(
MI.getOperand(2).getReg());
716 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo, Op2Lo},
Flags);
717 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi, Op2Hi},
Flags);
718 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
719 MI.eraseFromParent();
724 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(
MI.getOperand(3).getReg());
725 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo},
Flags);
726 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi},
Flags);
727 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
728 MI.eraseFromParent();
732bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &
MI) {
739 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
742 Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
743 Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
744 if (ST.hasScalarMulHiInsts()) {
745 B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
747 auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
748 auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
749 auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
760 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
761 B.buildConstant(Dst1, 0);
764 Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
765 Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
766 B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
768 auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
770 B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
771 B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
772 B.buildCopy(Dst1, AddHi.getReg(1));
775 MI.eraseFromParent();
779bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &
MI) {
781 LLT DstTy = MRI.getType(Dst);
782 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
784 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
785 auto Op2 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(2).
getReg());
786 auto Op3 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(3).
getReg());
790 B.buildSelect({VgprRB, Ty},
Cond, Op2.getReg(0), Op3.getReg(0), Flags);
792 B.buildSelect({VgprRB, Ty},
Cond, Op2.getReg(1), Op3.getReg(1), Flags);
794 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
795 MI.eraseFromParent();
799bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &
MI) {
800 auto Op1 = B.buildUnmerge(VgprRB_S32,
MI.getOperand(1).getReg());
801 int Amt =
MI.getOperand(2).getImm();
805 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
808 Lo = Freeze.getReg(0);
811 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
814 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
815 Hi = B.buildAShr(VgprRB_S32,
Lo, SignExtCst).getReg(0);
819 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
822 B.buildMergeLikeInstr(
MI.getOperand(0).getReg(), {Lo, Hi});
823 MI.eraseFromParent();
827bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &
MI) {
833 unsigned Opc =
MI.getOpcode();
842 case AMDGPU::G_AMDGPU_FFBH_U32:
844 AddOpc = AMDGPU::G_UADDSAT;
845 SearchFromMSB =
true;
847 case AMDGPU::G_AMDGPU_FFBL_B32:
849 AddOpc = AMDGPU::G_UADDSAT;
850 SearchFromMSB =
false;
852 case AMDGPU::G_CTLZ_ZERO_POISON:
853 FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
854 AddOpc = AMDGPU::G_ADD;
855 SearchFromMSB =
true;
857 case AMDGPU::G_CTTZ_ZERO_POISON:
858 FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
859 AddOpc = AMDGPU::G_ADD;
860 SearchFromMSB =
false;
866 auto Unmerge = B.buildUnmerge(VgprRB_S32,
MI.getOperand(1).getReg());
873 auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ?
Hi :
Lo});
875 B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ?
Lo :
Hi});
877 auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
878 {Secondary, B.buildConstant(VgprRB_S32, 32)});
879 B.buildUMin(
MI.getOperand(0).getReg(), Primary, Adjusted);
881 MI.eraseFromParent();
885bool RegBankLegalizeHelper::lowerExtrVecEltToSel(MachineInstr &
MI) {
897 LLT VecTy = MRI.getType(Src);
900 MachineRegisterInfo::VRegAttrs VgprRB_EltTy = {VgprRB, ScalarTy};
902 auto Unmerge = B.buildUnmerge(VgprRB_EltTy, Src);
905 Register PrevSelect = Unmerge.getReg(0);
906 for (
unsigned I = 1;
I < NumElts; ++
I) {
907 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)},
I);
910 B.buildSelect(VgprRB_EltTy, Cmp, Unmerge.getReg(
I), PrevSelect)
913 B.buildCopy(Dst, PrevSelect);
915 auto InitUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(0));
916 Register PrevLo = InitUnmerge.getReg(0);
917 Register PrevHi = InitUnmerge.getReg(1);
918 for (
unsigned I = 1;
I < NumElts; ++
I) {
919 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)},
I);
921 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(
I));
922 PrevLo = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(0), PrevLo)
924 PrevHi = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(1), PrevHi)
927 B.buildMergeLikeInstr(Dst, {PrevLo, PrevHi});
930 MF, MORE,
"amdgpu-regbanklegalize",
931 "AMDGPU RegBankLegalize: ExtrVecEltToSel unsupported element type",
MI);
935 MI.eraseFromParent();
939bool RegBankLegalizeHelper::lowerExtrVecEltTo32(MachineInstr &
MI) {
952 LLT SrcTy = MRI.getType(Src);
955 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
956 "expected VGPR src and SGPR idx");
958 auto CastSrc = B.buildBitcast({VgprRB, Vec32Ty}, Src);
961 auto One = B.buildConstant(SgprRB_S32, 1);
962 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
963 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
965 auto ExtLo = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxLo);
966 auto ExtHi = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxHi);
968 B.buildMergeLikeInstr(Dst, {ExtLo.getReg(0), ExtHi.getReg(0)});
970 MI.eraseFromParent();
974bool RegBankLegalizeHelper::lowerInsVecEltToSel(MachineInstr &
MI) {
987 LLT VecTy = MRI.getType(Src);
990 const RegisterBank *SrcRB = MRI.getRegBank(Src);
991 bool IsSGPR = (SrcRB == SgprRB);
992 SmallVector<Register, 16> Selects;
996 auto Unmerge = B.buildUnmerge(VgprRB_S32, Src);
997 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
998 Register EltLo = EltUnmerge.getReg(0);
999 Register EltHi = EltUnmerge.getReg(1);
1000 for (
unsigned I = 0;
I < NumElts; ++
I) {
1001 auto IdxConst = B.buildConstant(VgprRB_S32,
I);
1004 B.buildSelect(VgprRB_S32, Cmp, EltLo, Unmerge.getReg(2 *
I))
1007 B.buildSelect(VgprRB_S32, Cmp, EltHi, Unmerge.getReg(2 *
I + 1))
1011 auto Vec32 = B.buildBuildVector({VgprRB, Vec32Ty}, Selects);
1012 B.buildBitcast(Dst, Vec32);
1015 MachineRegisterInfo::VRegAttrs SrcRB_EltTy = {SrcRB, ScalarTy};
1016 MachineRegisterInfo::VRegAttrs CmpTy = IsSGPR ? SgprRB_S32 : VccRB_S1;
1017 auto Unmerge = B.buildUnmerge(SrcRB_EltTy, Src);
1018 for (
unsigned I = 0;
I < NumElts; ++
I) {
1019 auto IdxConst = B.buildConstant(SgprRB_S32,
I);
1022 B.buildSelect(SrcRB_EltTy, Cmp, Elt, Unmerge.getReg(
I)).getReg(0));
1024 B.buildMergeLikeInstr(Dst, Selects);
1027 MF, MORE,
"amdgpu-regbanklegalize",
1028 "AMDGPU RegBankLegalize: InsVecEltToSel unsupported element type",
MI);
1032 MI.eraseFromParent();
1036bool RegBankLegalizeHelper::lowerInsVecEltTo32(MachineInstr &
MI) {
1051 LLT SrcTy = MRI.getType(Src);
1054 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1055 "expected VGPR src and SGPR idx");
1057 MachineRegisterInfo::VRegAttrs VgprRB_Vec32Ty = {VgprRB, Vec32Ty};
1059 auto CastSrc = B.buildBitcast(VgprRB_Vec32Ty, Src);
1060 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1063 auto One = B.buildConstant(SgprRB_S32, 1);
1064 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
1065 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
1067 auto InsLo = B.buildInsertVectorElement(VgprRB_Vec32Ty, CastSrc,
1068 EltUnmerge.getReg(0), IdxLo);
1069 auto InsHi = B.buildInsertVectorElement(VgprRB_Vec32Ty, InsLo,
1070 EltUnmerge.getReg(1), IdxHi);
1072 B.buildBitcast(Dst, InsHi);
1074 MI.eraseFromParent();
1078bool RegBankLegalizeHelper::lowerAbsToNegMax(MachineInstr &
MI) {
1088 LLT Ty = MRI.getType(DstReg);
1094 Zero = B.buildBuildVector({VgprRB, Ty}, {Zero16, Zero16}).
getReg(0);
1096 assert((Ty == S32 || Ty == S16) &&
"unexpected type for AbsToNegMax");
1097 Zero = B.buildConstant({VgprRB, Ty}, 0).
getReg(0);
1100 auto Neg = B.buildSub({VgprRB, Ty},
Zero, SrcReg);
1101 B.buildSMax(DstReg, SrcReg, Neg);
1102 MI.eraseFromParent();
1106bool RegBankLegalizeHelper::lowerAbsToS32(MachineInstr &
MI) {
1116 auto Bitcast = B.buildBitcast({SgprRB_S32},
MI.getOperand(1).
getReg());
1117 auto SextInReg = B.buildSExtInReg({SgprRB_S32},
Bitcast, 16);
1119 B.buildAShr({SgprRB_S32},
Bitcast, B.buildConstant({SgprRB_S32}, 16));
1121 auto AbsLo = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {SextInReg});
1122 auto AbsHi = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {ShiftHi});
1123 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(),
1124 {AbsLo.getReg(0), AbsHi.getReg(0)});
1126 MI.eraseFromParent();
1130bool RegBankLegalizeHelper::lower(MachineInstr &
MI,
1138 return lowerVccExtToSel(
MI);
1140 LLT Ty = MRI.getType(
MI.getOperand(0).getReg());
1141 auto True = B.buildConstant({SgprRB, Ty},
1142 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
1143 auto False = B.buildConstant({SgprRB, Ty}, 0);
1147 B.buildSelect(
MI.getOperand(0).getReg(),
MI.getOperand(1).getReg(), True,
1149 MI.eraseFromParent();
1153 return lowerUnpackBitShift(
MI);
1155 return lowerUnpackMinMax(
MI);
1157 return lowerSplitTo16(
MI);
1159 const RegisterBank *RB = MRI.getRegBank(
MI.getOperand(0).getReg());
1160 MachineInstrBuilder
Hi;
1161 switch (
MI.getOpcode()) {
1162 case AMDGPU::G_ZEXT: {
1163 Hi = B.buildConstant({RB, S32}, 0);
1166 case AMDGPU::G_SEXT: {
1168 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
1169 Hi = B.buildAShr({RB, S32},
MI.getOperand(1).
getReg(), ShiftAmt);
1172 case AMDGPU::G_ANYEXT: {
1173 Hi = B.buildUndef({RB, S32});
1178 "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
1183 B.buildMergeLikeInstr(
MI.getOperand(0).getReg(),
1184 {MI.getOperand(1).getReg(), Hi});
1185 MI.eraseFromParent();
1189 uint64_t ConstVal =
MI.getOperand(1).getCImm()->getZExtValue();
1190 B.buildConstant(
MI.getOperand(0).getReg(), ConstVal);
1192 MI.eraseFromParent();
1197 LLT Ty = MRI.getType(Src);
1201 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
1203 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
1204 auto One = B.buildConstant(VgprRB_S32, 1);
1205 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
1206 auto Zero = B.buildConstant(VgprRB_S32, 0);
1207 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
1208 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
1210 assert(Ty == S32 || Ty == S16);
1211 auto One = B.buildConstant({VgprRB, Ty}, 1);
1212 B.buildAnd(BoolSrc, Src, One);
1214 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
1216 MI.eraseFromParent();
1220 return lowerV_BFE(
MI);
1222 return lowerS_BFE(
MI);
1224 return lowerUniMAD64(
MI);
1226 B.buildMul(
MI.getOperand(0),
MI.getOperand(1),
MI.getOperand(2));
1227 MI.eraseFromParent();
1231 auto Op1 = B.buildTrunc(VgprRB_S32,
MI.getOperand(1));
1232 auto Op2 = B.buildTrunc(VgprRB_S32,
MI.getOperand(2));
1233 auto Zero = B.buildConstant({VgprRB, S64}, 0);
1235 unsigned NewOpc =
MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
1236 ? AMDGPU::G_AMDGPU_MAD_U64_U32
1237 : AMDGPU::G_AMDGPU_MAD_I64_I32;
1239 B.buildInstr(NewOpc, {
MI.getOperand(0).getReg(), {SgprRB, S32}},
1241 MI.eraseFromParent();
1245 return lowerSplitTo32(
MI);
1247 return lowerSplitTo32Mul(
MI);
1249 return lowerSplitTo32Select(
MI);
1251 return lowerSplitTo32SExtInReg(
MI);
1253 auto Unmerge = B.buildUnmerge({VgprRB, S32},
MI.getOperand(1).
getReg());
1254 auto LoPopCnt = B.buildCTPOP({VgprRB, S32}, Unmerge.getReg(0));
1255 auto HiPopCnt = B.buildCTPOP({VgprRB, S32}, Unmerge.getReg(1));
1257 B.buildAdd(
MI.getOperand(0).getReg(), LoPopCnt, HiPopCnt,
1260 MI.eraseFromParent();
1264 LLT DstTy = MRI.getType(
MI.getOperand(0).getReg());
1275 if (
Size / 128 == 2)
1277 else if (
Size / 128 == 4)
1281 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1287 else if (DstTy == S96)
1288 splitLoad(
MI, {S64, S32}, S32);
1289 else if (DstTy == V3S32)
1290 splitLoad(
MI, {V2S32, S32}, S32);
1291 else if (DstTy == V6S16)
1292 splitLoad(
MI, {V4S16, V2S16}, V2S16);
1295 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1302 LLT DstTy = MRI.getType(
MI.getOperand(0).getReg());
1304 widenLoad(
MI, S128);
1305 else if (DstTy == V3S32)
1306 widenLoad(
MI, V4S32, S32);
1307 else if (DstTy == V6S16)
1308 widenLoad(
MI, V8S16, V2S16);
1311 "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
1318 return lowerUnpackAExt(
MI);
1323 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1329 return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
1331 B.setInstrAndDebugLoc(
MI);
1332 for (
unsigned i =
MI.getNumDefs(); i <
MI.getNumOperands(); ++i) {
1333 MachineOperand &
Op =
MI.getOperand(i);
1337 if (MRI.getRegBank(
Reg) != VgprRB) {
1338 auto Copy = B.buildCopy({VgprRB, MRI.getType(
Reg)},
Reg);
1339 Op.setReg(
Copy.getReg(0));
1349 "AMDGPU RegBankLegalize: unmerge not multiple of 32",
1354 B.setInstrAndDebugLoc(
MI);
1357 B.buildUnmerge({SgprRB, V2S16}, Unmerge->
getSourceReg());
1358 for (
unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
1359 auto [Dst0S32, Dst1S32] =
1360 unpackAExt(UnmergeV2S16->getOperand(i).getReg());
1361 B.buildTrunc(
MI.getOperand(i * 2).getReg(), Dst0S32);
1362 B.buildTrunc(
MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
1365 auto [Dst0S32, Dst1S32] = unpackAExt(
MI.getOperand(2).getReg());
1366 B.buildTrunc(
MI.getOperand(0).getReg(), Dst0S32);
1367 B.buildTrunc(
MI.getOperand(1).getReg(), Dst1S32);
1370 MI.eraseFromParent();
1375 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1376 B.setInsertPt(*
MI.getParent(),
MI.getParent()->getFirstNonPHI());
1377 MI.getOperand(0).setReg(NewDst);
1378 B.buildTrunc(Dst, NewDst);
1380 for (
unsigned i = 1; i <
MI.getNumOperands(); i += 2) {
1388 auto NewUse = B.buildAnyExt(SgprRB_S32,
UseReg);
1389 MI.getOperand(i).setReg(NewUse.getReg(0));
1397 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1402 assert(MRI.getRegBankOrNull(
MI.getOperand(0).getReg()) == VgprRB);
1406 const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
1407 return RB == VgprRB || RB == SgprRB;
1412 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1417 unsigned RsrcIdx = RSrcIntrin->
RsrcArg +
MI.getNumExplicitDefs() + 1;
1418 return applyRegisterBanksVgprWithSgprRsrc(
MI, RsrcIdx);
1424 unsigned RsrcIdx =
MI.getNumOperands();
1425 while (RsrcIdx-- >
MI.getNumExplicitDefs()) {
1426 const MachineOperand &
Op =
MI.getOperand(RsrcIdx);
1427 if (
Op.isReg() &&
Op.getReg().isVirtual())
1430 return applyRegisterBanksVgprWithSgprRsrc(
MI, RsrcIdx);
1433 return lowerSplitBitCount64To32(
MI);
1435 return lowerExtrVecEltToSel(
MI);
1437 return lowerExtrVecEltTo32(
MI);
1439 return lowerInsVecEltToSel(
MI);
1441 return lowerInsVecEltTo32(
MI);
1443 return lowerAbsToNegMax(
MI);
1445 return lowerAbsToS32(
MI);
1449 if (!executeInWaterfallLoop(B, WFI))
1555 return isAnyPtr(Ty, 32) ? Ty : LLT();
1558 return isAnyPtr(Ty, 64) ? Ty : LLT();
1561 return isAnyPtr(Ty, 128) ? Ty : LLT();
1605 const SIRegisterInfo *
TRI =
1606 static_cast<const SIRegisterInfo *
>(MRI.getTargetRegisterInfo());
1608 if (LLTSize >= 32 &&
TRI->getSGPRClassForBitWidth(LLTSize))
1613 const SIRegisterInfo *
TRI =
1614 static_cast<const SIRegisterInfo *
>(MRI.getTargetRegisterInfo());
1723bool RegBankLegalizeHelper::applyMappingDst(
1724 MachineInstr &
MI,
unsigned &
OpIdx,
1725 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
1730 MachineOperand &
Op =
MI.getOperand(
OpIdx);
1732 LLT Ty = MRI.getType(
Reg);
1733 [[maybe_unused]]
const RegisterBank *RB = MRI.getRegBank(
Reg);
1735 switch (MethodIDs[
OpIdx]) {
1807 Register NewAgprDst = MRI.createVirtualRegister({AgprRB, Ty});
1808 Op.setReg(NewAgprDst);
1809 if (!MRI.use_nodbg_empty(
Reg))
1810 B.buildCopy(
Reg, NewAgprDst);
1817 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
1819 if (!MRI.use_empty(
Reg)) {
1821 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
1822 B.buildTrunc(
Reg, CopyS32_Vcc);
1829 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
1830 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
1831 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
1832 Op.setReg(NewVgprDstS16);
1833 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
1835 B.buildTrunc(
Reg, NewSgprDstS32);
1846 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1847 Op.setReg(NewVgprDst);
1860 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
1861 Op.setReg(NewVgprDst);
1869 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1871 if (!MRI.use_empty(
Reg))
1872 B.buildTrunc(
Reg, NewDst);
1879 Op.setReg(MRI.createVirtualRegister({SgprRB, Ty}));
1880 B.buildCopy(
Reg,
Op.getReg());
1885 MF, MORE,
"amdgpu-regbanklegalize",
1886 "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for",
MI);
1891 MF, MORE,
"amdgpu-regbanklegalize",
1892 "AMDGPU RegBankLegalize: applyMappingDst, ID not supported",
MI);
1900bool RegBankLegalizeHelper::applyMappingSrc(
1901 MachineInstr &
MI,
unsigned &
OpIdx,
1902 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
1904 for (
unsigned i = 0; i < MethodIDs.
size(); ++
OpIdx, ++i) {
1905 if (MethodIDs[i] ==
None || MethodIDs[i] ==
IntrId || MethodIDs[i] ==
Imm)
1908 MachineOperand &
Op =
MI.getOperand(
OpIdx);
1910 LLT Ty = MRI.getType(
Reg);
1911 const RegisterBank *RB = MRI.getRegBank(
Reg);
1913 switch (MethodIDs[i]) {
1916 assert(RB == VccRB || RB == SgprRB);
1918 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
1920 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
1921 Op.setReg(CopyVcc_Scc.getReg(0));
1939 assert(Ty == getTyFromID(MethodIDs[i]));
1940 assert(RB == getRegBankFromID(MethodIDs[i]));
1954 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
1955 assert(RB == getRegBankFromID(MethodIDs[i]));
1979 assert(Ty == getTyFromID(MethodIDs[i]));
1981 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
1982 Op.setReg(CopyToVgpr.getReg(0));
1998 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2000 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
2001 Op.setReg(CopyToVgpr.getReg(0));
2007 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
2008 Op.setReg(CopyToVgpr.getReg(0));
2014 auto CopyToAgpr = B.buildCopy({AgprRB, Ty},
Reg);
2015 Op.setReg(CopyToAgpr.getReg(0));
2022 assert(Ty == getTyFromID(MethodIDs[i]));
2027 WFI.
End = std::next(
MI.getIterator());
2034 assert(Ty == getTyFromID(MethodIDs[i]));
2040 while (
Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
2045 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
2049 B.setInsertPt(*
MI.getParent(), Start);
2058 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2062 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2068 assert(Ty == getTyFromID(MethodIDs[i]));
2072 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2082 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
2083 Op.setReg(Aext.getReg(0));
2090 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
2093 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
2094 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
2095 Op.setReg(BoolInReg.getReg(0));
2101 auto Sext = B.buildSExt(SgprRB_S32,
Reg);
2102 Op.setReg(Sext.getReg(0));
2108 auto Zext = B.buildZExt({SgprRB, S32},
Reg);
2109 Op.setReg(Zext.getReg(0));
2115 auto Aext = B.buildAnyExt({VgprRB, S32},
Reg);
2116 Op.setReg(Aext.getReg(0));
2123 auto Sext = B.buildSExt({VgprRB, S32},
Reg);
2124 Op.setReg(Sext.getReg(0));
2131 auto Zext = B.buildZExt({VgprRB, S32},
Reg);
2132 Op.setReg(Zext.getReg(0));
2137 MF, MORE,
"amdgpu-regbanklegalize",
2138 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported",
MI);
2148 unsigned StartOpIdx,
2149 unsigned EndOpIdx) {
2150 for (
unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
2157bool RegBankLegalizeHelper::applyRegisterBanksVgprWithSgprRsrc(
2158 MachineInstr &
MI,
unsigned RsrcIdx) {
2159 const unsigned NumDefs =
MI.getNumExplicitDefs();
2161 MachineBasicBlock *
MBB =
MI.getParent();
2165 for (
unsigned i = 0; i < NumDefs; ++i) {
2167 if (MRI.getRegBank(
Reg) == VgprRB)
2170 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(
Reg)});
2171 MI.getOperand(i).setReg(NewVgprDst);
2175 B.setInstrAndDebugLoc(
MI);
2178 for (
unsigned i = NumDefs; i < RsrcIdx; ++i) {
2179 MachineOperand &
Op =
MI.getOperand(i);
2187 if (MRI.getRegBank(
Reg) == VgprRB)
2190 auto Copy = B.buildCopy({VgprRB, MRI.getType(
Reg)},
Reg);
2191 Op.setReg(
Copy.getReg(0));
2194 SmallSet<Register, 4> OpsToWaterfall;
2197 for (
unsigned i = RsrcIdx; i <
MI.getNumOperands(); ++i) {
2198 MachineOperand &
Op =
MI.getOperand(i);
2203 if (MRI.getRegBank(
Reg) != SgprRB)
2207 if (!OpsToWaterfall.
empty()) {
2209 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
Provides AMDGPU specific target descriptions.
static bool isSignedBFE(MachineInstr &MI)
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB, MachineRegisterInfo &MRI, unsigned StartOpIdx, unsigned EndOpIdx)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
static Register UseReg(const MachineOperand &MO)
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
const SmallVectorImpl< MachineOperand > & Cond
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned MovTermOpc
const unsigned AndSaveExecOpc
bool findRuleAndApplyMapping(MachineInstr &MI)
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
const SIRegisterInfo * getRegisterInfo() const override
Represents a call to an intrinsic.
Register getSourceReg() const
Get the unmerge source register.
constexpr bool isScalar() const
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
TypeSize getValue() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator SkipPHIsAndLabels(iterator I)
Return the first instruction in MBB after I that is not a PHI or a label.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
BasicBlockListType::iterator iterator
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Helper class to build MachineInstr.
bool isValid() const
Check for null.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
LocationSize getSize() const
Return the size in bytes of the memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank * getRegBank(Register Reg) const
Return the register bank of Reg.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
Holds all the information related to register banks.
This class implements the register bank concept.
Wrapper class representing virtual and physical registers.
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ SgprV4S32_ReadFirstLane
bool isAnyPtr(LLT Ty, unsigned Width)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
@ VerifyAllSgprOrVgprGPHI
@ AextToS32InIncomingBlockGPHI
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI void reportGISelFailure(MachineFunction &MF, MachineOptimizationRemarkEmitter &MORE, MachineOptimizationRemarkMissed &R)
Report an ISel error as a missed optimization remark to the LLVMContext's diagnostic stream.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
LoweringMethodID LoweringMethod
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
Holds waterfall loop information: the set of SGPR operand registers that need waterfalling,...
MachineBasicBlock::iterator Start
SmallSet< Register, 4 > SgprWaterfallOperandRegs
MachineBasicBlock::iterator End