28#include "llvm/IR/IntrinsicsAMDGPU.h"
30#define DEBUG_TYPE "amdgpu-regbanklegalize"
39 ST(MF.getSubtarget<
GCNSubtarget>()), TII(*ST.getInstrInfo()), B(B),
40 MRI(*B.getMRI()), MUI(MUI), RBI(RBI), MORE(MF, nullptr),
41 RBLRules(RBLRules), IsWave32(ST.isWave32()),
42 SgprRB(&RBI.getRegBank(
AMDGPU::SGPRRegBankID)),
43 VgprRB(&RBI.getRegBank(
AMDGPU::VGPRRegBankID)),
44 AgprRB(&RBI.getRegBank(
AMDGPU::AGPRRegBankID)),
45 VccRB(&RBI.getRegBank(
AMDGPU::VCCRegBankID)) {}
51 "No AMDGPU RegBankLegalize rules defined for opcode",
59 "AMDGPU RegBankLegalize: none of the rules defined with "
60 "'Any' for MI's opcode matched MI",
68 B.setInsertPt(*
MI.getParent(), std::next(
MI.getIterator()));
78 if (!lower(
MI, *Mapping, WFI))
82 if (!executeInWaterfallLoop(B, WFI))
92 "Waterfall range not initialized");
109 const int OrigRangeSize = std::distance(BeginIt, EndIt);
118 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
144 MBB.addSuccessor(LoopBB);
147 B.setInsertPt(*LoopBB, LoopBB->
end());
198 auto NewEnd = BodyBB->
end();
199 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
212 auto OldVal = WaterfalledRegMap.
find(OldReg);
213 if (OldVal != WaterfalledRegMap.
end()) {
214 Op.setReg(OldVal->second);
228 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
230 unsigned NumParts = OpSize / PartSize;
236 CurrentLaneParts.
push_back(CurrentLaneReg);
238 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
239 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
240 for (
unsigned i = 0; i < NumParts; ++i) {
242 CurrentLaneParts.
push_back(UnmergeCurrLane.getReg(i));
246 for (
unsigned i = 0; i < NumParts; ++i) {
247 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
253 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
256 Op.setReg(CurrentLaneReg);
259 WaterfalledRegMap.
insert(std::pair(OldReg,
Op.getReg()));
265 MRI.createVirtualRegister({WaveRC,
LLT::scalar(IsWave32 ? 32 : 64)});
266 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
272 MRI.setSimpleHint(SavedExec, CondRegLM);
274 B.setInsertPt(*BodyBB, BodyBB->
end());
286 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
290 B.buildInstr(LMC.
MovOpc).addDef(SaveExecReg).addReg(LMC.
ExecReg);
293 B.setInsertPt(*RestoreExecBB, RestoreExecBB->
begin());
298 B.setInsertPt(*RemainderBB, RemainderBB->
begin());
305unsigned RegBankLegalizeHelper::setBufferOffsets(
307 Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) {
308 if (std::optional<int64_t>
Imm =
310 uint32_t SOffset, ImmOffset;
311 if (TII.splitMUBUFOffset(*
Imm, SOffset, ImmOffset, Alignment)) {
312 VOffsetReg = B.buildConstant({VgprRB, S32}, 0).
getReg(0);
313 SOffsetReg = B.buildConstant({SgprRB, S32}, SOffset).
getReg(0);
314 InstOffsetVal = ImmOffset;
315 return SOffset + ImmOffset;
318 const bool CheckNUW = ST.hasGFX1250Insts();
320 MRI, CombinedOffset,
nullptr,
322 uint32_t SOffset, ImmOffset;
323 if (
static_cast<int32_t
>(
Offset) > 0 &&
324 TII.splitMUBUFOffset(
Offset, SOffset, ImmOffset, Alignment)) {
325 if (
Base.isValid() && MRI.getRegBank(
Base) == VgprRB) {
327 SOffsetReg = B.buildConstant({SgprRB, S32}, SOffset).
getReg(0);
328 InstOffsetVal = ImmOffset;
333 VOffsetReg = B.buildConstant({VgprRB, S32}, 0).
getReg(0);
335 InstOffsetVal = ImmOffset;
341 if (
Add &&
static_cast<int32_t
>(
Offset) >= 0 &&
345 const RegisterBank *Src0Bank = MRI.getRegBank(Src0);
346 const RegisterBank *Src1Bank = MRI.getRegBank(Src1);
347 if (Src0Bank == VgprRB && Src1Bank == SgprRB) {
352 if (Src0Bank == SgprRB && Src1Bank == VgprRB) {
360 if (MRI.getRegBank(CombinedOffset) == VgprRB) {
361 VOffsetReg = CombinedOffset;
363 VOffsetReg = B.buildCopy({VgprRB, S32}, CombinedOffset).
getReg(0);
365 SOffsetReg = B.buildConstant({SgprRB, S32}, 0).
getReg(0);
369bool RegBankLegalizeHelper::splitLoad(MachineInstr &
MI,
371 MachineFunction &MF = B.getMF();
372 assert(
MI.getNumMemOperands() == 1);
373 MachineMemOperand &BaseMMO = **
MI.memoperands_begin();
375 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
377 LLT PtrTy = MRI.getType(
Base);
378 const RegisterBank *PtrRB = MRI.getRegBankOrNull(
Base);
382 unsigned ByteOffset = 0;
383 for (LLT PartTy : LLTBreakdown) {
385 if (ByteOffset == 0) {
386 BasePlusOffset =
Base;
388 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
392 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
393 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
394 LoadPartRegs.
push_back(LoadPart.getReg(0));
400 B.buildMergeLikeInstr(Dst, LoadPartRegs);
406 if (MRI.getType(
Reg) == MergeTy) {
409 auto Unmerge = B.buildUnmerge({DstRB, MergeTy},
Reg);
410 for (
unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
411 MergeTyParts.
push_back(Unmerge.getReg(i));
414 B.buildMergeLikeInstr(Dst, MergeTyParts);
416 MI.eraseFromParent();
420bool RegBankLegalizeHelper::widenLoad(MachineInstr &
MI, LLT WideTy,
422 MachineFunction &MF = B.getMF();
423 assert(
MI.getNumMemOperands() == 1);
424 MachineMemOperand &BaseMMO = **
MI.memoperands_begin();
426 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
429 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
430 auto WideLoad = B.buildLoad({DstRB, WideTy},
Base, *WideMMO);
433 B.buildTrunc(Dst, WideLoad);
436 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
438 LLT DstTy = MRI.getType(Dst);
440 for (
unsigned i = 0; i < NumElts; ++i) {
441 MergeTyParts.
push_back(Unmerge.getReg(i));
443 B.buildMergeLikeInstr(Dst, MergeTyParts);
445 MI.eraseFromParent();
// Re-emits the load MI using a memory operand widened to S32 and then erases
// MI. Three opcodes are handled in the visible code: G_LOAD, G_ZEXTLOAD and
// G_SEXTLOAD.
// NOTE(review): the declarations of Dst, Ptr, Mask and MemSize are not
// visible in this excerpt.
449bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &
MI)
const {
452 MachineMemOperand &MMO =
MI.getMMO();
// New memory operand derived from the original one, at offset 0, typed S32.
455 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
// Plain load: load straight into Dst through the S32 memory operand.
457 if (
MI.getOpcode() == G_LOAD) {
458 B.buildLoad(Dst, Ptr, *WideMMO);
// Extending loads: load into a fresh SgprRB_S32 value first, then apply the
// extension explicitly.
460 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
// Zero-extending load: AND the loaded value with the constant Mask.
462 if (
MI.getOpcode() == G_ZEXTLOAD) {
464 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
465 B.buildAnd(Dst, Load, MaskCst);
// The only remaining opcode expected here is G_SEXTLOAD: sign-extend in
// register from MemSize bits.
467 assert(
MI.getOpcode() == G_SEXTLOAD);
468 B.buildSExtInReg(Dst, Load, MemSize);
472 MI.eraseFromParent();
// Lowers an extend instruction into select(s) between two VGPR constants,
// using Src as the select condition, then erases MI.
// The "true" constant is -1 when the opcode is G_SEXT and 1 otherwise; the
// "false" constant is 0.
// NOTE(review): the declarations of Dst and Src, and the conditions guarding
// the S64 high-half cases, are not visible in this excerpt.
476bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &
MI) {
478 LLT Ty = MRI.getType(Dst);
480 unsigned Opc =
MI.getOpcode();
481 int TrueExtCst =
Opc == G_SEXT ? -1 : 1;
// S32 / S16 result: a single select of type Ty directly into Dst.
482 if (Ty == S32 || Ty == S16) {
483 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
484 auto False = B.buildConstant({VgprRB, Ty}, 0);
485 B.buildSelect(Dst, Src, True, False);
486 }
// S64 result: the low half is a 32-bit select; the high half Hi is built
// separately and both halves are merged into Dst below.
else if (Ty == S64) {
487 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
488 auto False = B.buildConstant({VgprRB_S32}, 0);
489 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
490 MachineInstrBuilder
Hi;
// One of the high-half cases uses an undefined value (the selecting
// condition is not visible here).
499 Hi = B.buildUndef({VgprRB_S32});
// Diagnostic arguments for an opcode this lowering does not support.
503 MF, MORE,
"amdgpu-regbanklegalize",
504 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported",
MI);
508 B.buildMergeValues(Dst, {
Lo.getReg(0),
Hi.getReg(0)});
// Diagnostic arguments for a destination type other than S16, S32 or S64.
511 MF, MORE,
"amdgpu-regbanklegalize",
512 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported",
MI);
516 MI.eraseFromParent();
// Splits Reg into two 32-bit values, each zero-extended from a 16-bit half.
// Returns {Lo, Hi}: Lo comes from bits [15:0] and Hi from bits [31:16] of Reg
// after it is bitcast to a single S32 value. All new values are created with
// the SgprRB_S32 attributes.
520std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(
Register Reg) {
521 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
// Low half: mask with 0xffff so the upper 16 bits become zero.
522 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
523 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
// High half: logical shift right by 16 shifts zeros into the top bits.
524 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
525 return {
Lo.getReg(0),
Hi.getReg(0)};
// Splits Reg into two 32-bit values, each sign-extended from a 16-bit half.
// Returns {Lo, Hi}: Lo comes from bits [15:0] and Hi from bits [31:16] of Reg
// after it is bitcast to a single S32 value. All new values are created with
// the SgprRB_S32 attributes.
528std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(
Register Reg) {
529 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
// Low half: sign-extend in register from bit 15.
530 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
// High half: arithmetic shift right by 16 replicates the sign bit.
531 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
532 return {
Lo.getReg(0),
Hi.getReg(0)};
// Splits Reg into two 32-bit values {Lo, Hi} after bitcasting it to a single
// S32 value. Judging by the name, this is the "any-extend" variant of the
// unpack helpers, i.e. the upper 16 bits of each result are not meaningful
// to callers - TODO confirm.
// NOTE(review): the statement that defines Lo is not visible in this excerpt.
535std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(
Register Reg) {
536 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
// High half: logical shift right by 16 moves bits [31:16] down to [15:0].
538 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
539 return {
Lo.getReg(0),
Hi.getReg(0)};
// Unpacks Reg with unpackAExt and truncates each of the two 32-bit results to
// a value created with the SgprRB_S16 attributes. Returns {Lo, Hi} as
// registers of the two truncated values.
542std::pair<Register, Register>
543RegBankLegalizeHelper::unpackAExtTruncS16(
Register Reg) {
544 auto [Lo32, Hi32] = unpackAExt(
Reg);
545 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
546 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
// Lowers a shift whose operands are unpacked into two halves: operands 1
// (value) and 2 (amount) are each split into a low and a high 32-bit value,
// the shift is performed separately on each half with SgprRB_S32 results, and
// the two results are recombined into operand 0 with a build-vector-trunc.
// MI is erased afterwards.
// The unpack helper is chosen per opcode:
//   G_SHL  -> unpackAExt
//   G_LSHR -> unpackZExt
//   G_ASHR -> unpackSExt
// NOTE(review): the declarations of Lo and Hi, the break/default labels and
// the return statements are not visible in this excerpt.
549bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &
MI) {
551 switch (
MI.getOpcode()) {
552 case AMDGPU::G_SHL: {
553 auto [Val0, Val1] = unpackAExt(
MI.getOperand(1).getReg());
554 auto [Amt0, Amt1] = unpackAExt(
MI.getOperand(2).getReg());
555 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
556 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
559 case AMDGPU::G_LSHR: {
560 auto [Val0, Val1] = unpackZExt(
MI.getOperand(1).getReg());
561 auto [Amt0, Amt1] = unpackZExt(
MI.getOperand(2).getReg());
562 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
563 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
566 case AMDGPU::G_ASHR: {
567 auto [Val0, Val1] = unpackSExt(
MI.getOperand(1).getReg());
568 auto [Amt0, Amt1] = unpackSExt(
MI.getOperand(2).getReg());
569 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
570 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
// Diagnostic arguments for an opcode that has no case above.
575 MF, MORE,
"amdgpu-regbanklegalize",
576 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
// Recombine the two 32-bit results into the original destination.
580 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(), {Lo, Hi});
581 MI.eraseFromParent();
// Lowers a min/max instruction by unpacking operands 1 and 2 into low and
// high 32-bit values, applying the same opcode to each half with SgprRB_S32
// results, and recombining the two results into operand 0 with a
// build-vector-trunc. MI is erased afterwards.
// The G_SMAX case unpacks with unpackSExt and the G_UMAX case with
// unpackZExt.
// NOTE(review): further case labels may precede G_SMAX / G_UMAX; they, the
// declarations of Lo and Hi, and the tails of the buildInstr statements are
// not visible in this excerpt.
585bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &
MI) {
587 switch (
MI.getOpcode()) {
589 case AMDGPU::G_SMAX: {
591 auto [Val0_Lo, Val0_Hi] = unpackSExt(
MI.getOperand(1).getReg());
592 auto [Val1_Lo, Val1_Hi] = unpackSExt(
MI.getOperand(2).getReg());
593 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
595 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
600 case AMDGPU::G_UMAX: {
602 auto [Val0_Lo, Val0_Hi] = unpackZExt(
MI.getOperand(1).getReg());
603 auto [Val1_Lo, Val1_Hi] = unpackZExt(
MI.getOperand(2).getReg());
604 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
606 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
// Diagnostic arguments for an opcode that has no case above.
612 MF, MORE,
"amdgpu-regbanklegalize",
613 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented",
MI);
// Recombine the two 32-bit results into the original destination.
616 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(), {Lo, Hi});
617 MI.eraseFromParent();
// Lowers a two-source instruction by unpacking operands 1 and 2 with
// unpackAExt, applying MI's own opcode separately to the low halves and the
// high halves (SgprRB_S32 results), and recombining both results into
// operand 0 with a build-vector-trunc. MI is erased afterwards.
621bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &
MI) {
622 auto [Op1Lo, Op1Hi] = unpackAExt(
MI.getOperand(1).getReg());
623 auto [Op2Lo, Op2Hi] = unpackAExt(
MI.getOperand(2).getReg());
624 auto ResLo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
625 auto ResHi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
626 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(),
627 {ResLo.getReg(0), ResHi.getReg(0)});
628 MI.eraseFromParent();
632bool RegBankLegalizeHelper::lowerSBufToBuf(MachineInstr &
MI,
635 LLT Ty = MRI.getType(Dst);
636 const RegisterBank *RSrcBank = MRI.getRegBank(
MI.getOperand(1).getReg());
640 if (LoadSize == 256 || LoadSize == 512) {
641 NumLoads = LoadSize / 128;
644 for (
int i = 0; i < NumLoads; ++i)
645 LoadParts.
emplace_back(MRI.createVirtualRegister({VgprRB, Ty}));
646 MachineMemOperand *OrigMMO = *
MI.memoperands_begin();
648 MachineFunction &MF = B.getMF();
651 int64_t ImmOffset = 0;
652 unsigned MMOOffset = setBufferOffsets(B,
MI.getOperand(2).getReg(), VOffset,
653 SOffset, ImmOffset, Alignment);
658 MachineMemOperand *BaseMMO = MF.getMachineMemOperand(OrigMMO, 0, MemSize);
660 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
664 Register VIndex = B.buildConstant(VgprRB_S32, 0).getReg(0);
665 unsigned Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
666 switch (
MI.getOpcode()) {
667 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
668 Opc = G_AMDGPU_BUFFER_LOAD_SBYTE;
670 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
671 Opc = G_AMDGPU_BUFFER_LOAD_UBYTE;
673 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT:
674 Opc = G_AMDGPU_BUFFER_LOAD_SSHORT;
676 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
677 Opc = G_AMDGPU_BUFFER_LOAD_USHORT;
682 for (
int i = 0; i < NumLoads; ++i) {
684 .addDef(LoadParts[i])
689 .addImm(ImmOffset + 16 * i)
692 .addMemOperand(MF.getMachineMemOperand(BaseMMO, 16 * i, MemSize));
695 B.buildCopy(Dst, LoadParts[0]);
697 B.buildMergeLikeInstr(Dst, LoadParts);
698 B.setInstr(*MRI.getVRegDef(LoadParts[0]));
699 if (RSrcBank != SgprRB) {
701 WFI.
Start = MRI.getVRegDef(LoadParts.
front());
702 WFI.
End = std::next(MRI.getVRegDef(LoadParts.
back())->getIterator());
704 MI.eraseFromParent();
710 return (GI->is(Intrinsic::amdgcn_sbfe));
712 return MI.getOpcode() == AMDGPU::G_SBFX;
715bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &
MI) {
722 Register Src =
MI.getOperand(FirstOpnd).getReg();
723 Register LSBit =
MI.getOperand(FirstOpnd + 1).getReg();
724 Register Width =
MI.getOperand(FirstOpnd + 2).getReg();
729 unsigned SHROpc =
Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
730 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
738 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
739 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
740 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
741 MI.eraseFromParent();
745 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
746 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
747 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
748 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
749 auto Zero = B.buildConstant({VgprRB, S32}, 0);
750 unsigned BFXOpc =
Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
752 if (WidthImm <= 32) {
754 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo,
Zero, Width});
755 MachineInstrBuilder
Hi;
758 Hi = B.buildAShr(VgprRB_S32,
Lo, B.buildConstant(VgprRB_S32, 31));
763 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
765 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
767 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi,
Zero, Amt});
768 B.buildMergeLikeInstr(Dst, {SHRSrcLo,
Hi});
771 MI.eraseFromParent();
775bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &
MI) {
777 LLT Ty = MRI.getType(DstReg);
780 Register Src =
MI.getOperand(FirstOpnd).getReg();
781 Register LSBit =
MI.getOperand(FirstOpnd + 1).getReg();
782 Register Width =
MI.getOperand(FirstOpnd + 2).getReg();
789 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
790 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
791 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset,
Size);
792 unsigned Opc32 =
Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
793 unsigned Opc64 =
Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
794 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
798 auto S_BFE = B.buildInstr(
Opc, {{SgprRB, Ty}},
799 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
801 *ST.getRegisterInfo(), RBI);
803 B.buildCopy(DstReg,
S_BFE->getOperand(0).getReg());
804 MI.eraseFromParent();
// Splits a two-source instruction into two half-width instructions on the
// VGPR bank and merges the results into Dst; MI is erased afterwards.
// The destination type must be V4S16, V2S32 or S64 (asserted). The piece type
// is V2S16 for a V4S16 destination and S32 otherwise.
// NOTE(review): the declarations of Dst, Flags, Lo and Hi are not visible in
// this excerpt.
808bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &
MI) {
810 LLT DstTy = MRI.getType(Dst);
811 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
812 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
// Break both source operands into pieces of type Ty.
813 auto Op1 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(1).
getReg());
814 auto Op2 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(2).
getReg());
815 unsigned Opc =
MI.getOpcode();
// Same opcode applied to piece 0 of each source ...
818 B.buildInstr(
Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)},
Flags);
// ... and to piece 1 of each source.
820 B.buildInstr(
Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)},
Flags);
821 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
822 MI.eraseFromParent();
// Expands a 64-bit multiply (destination type asserted to be S64) into 32-bit
// VGPR operations and merges the two 32-bit results into Dst; MI is erased
// afterwards.
// With each source split into 32-bit pieces 0 and 1, the visible code
// computes:
//   Lo = mul(Op1[0], Op2[0])
//   Hi = mul(Op1[0], Op2[1]) + mul(Op1[1], Op2[0]) + umulh(Op1[0], Op2[0])
// NOTE(review): the declaration of Dst is not visible in this excerpt.
826bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &
MI) {
828 assert(MRI.getType(Dst) == S64);
829 auto Op1 = B.buildUnmerge({VgprRB_S32},
MI.getOperand(1).
getReg());
830 auto Op2 = B.buildUnmerge({VgprRB_S32},
MI.getOperand(2).
getReg());
834 auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
// High 32 bits of the piece-0 product, added into the high result below.
835 auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
// Cross products; only their low 32 bits contribute to the high result.
836 auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
837 auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
838 auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
839 auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
841 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
842 MI.eraseFromParent();
846bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &
MI) {
848 assert(MRI.getType(Dst) == V2S16);
849 unsigned Opc =
MI.getOpcode();
850 unsigned NumOps =
MI.getNumOperands();
853 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(
MI.getOperand(1).getReg());
856 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo},
Flags);
857 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi},
Flags);
858 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
859 MI.eraseFromParent();
863 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(
MI.getOperand(2).getReg());
866 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo, Op2Lo},
Flags);
867 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi, Op2Hi},
Flags);
868 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
869 MI.eraseFromParent();
874 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(
MI.getOperand(3).getReg());
875 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo},
Flags);
876 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi},
Flags);
877 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
878 MI.eraseFromParent();
// Lowers a multiply-add that produces a 64-bit result (Dst0) plus a second
// result (Dst1) into 32-bit SGPR operations, then erases MI.
// NOTE(review): the declarations of Dst0, Dst1, Src0, Src1, Src2 and AddHi,
// and the condition that selects between the two result paths below, are not
// visible in this excerpt.
882bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &
MI) {
889 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
// Low 32 bits of Src0 * Src1.
892 Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
// High 32 bits of Src0 * Src1: computed directly into DstHi when the
// subtarget reports scalar mul-hi instructions.
893 Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
894 if (ST.hasScalarMulHiInsts()) {
895 B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
// Otherwise both sources are copied to VGPR values and G_UMULH is built
// there (how the result reaches DstHi is not visible in this excerpt).
897 auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
898 auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
899 auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
// Path 1: the product alone is the result; Dst1 is set to constant 0.
910 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
911 B.buildConstant(Dst1, 0);
// Path 2: add Src2 to the product as two 32-bit halves, chaining the carry
// from the low add into the high add.
914 Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
915 Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
916 B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
918 auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
920 B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
921 B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
// The carry-out of the high add becomes Dst1.
922 B.buildCopy(Dst1, AddHi.getReg(1));
925 MI.eraseFromParent();
929bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &
MI) {
931 LLT DstTy = MRI.getType(Dst);
932 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
934 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
935 auto Op2 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(2).
getReg());
936 auto Op3 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(3).
getReg());
940 B.buildSelect({VgprRB, Ty},
Cond, Op2.getReg(0), Op3.getReg(0), Flags);
942 B.buildSelect({VgprRB, Ty},
Cond, Op2.getReg(1), Op3.getReg(1), Flags);
944 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
945 MI.eraseFromParent();
// Lowers a sign-extend-in-register by splitting operand 1 into 32-bit VGPR
// pieces, computing a low and a high 32-bit result, and merging them into
// operand 0. MI is erased afterwards. Amt is the immediate in operand 2.
// NOTE(review): the declarations of Lo and Hi and the conditions on Amt that
// choose between the alternatives below are not visible in this excerpt.
949bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &
MI) {
950 auto Op1 = B.buildUnmerge(VgprRB_S32,
MI.getOperand(1).getReg());
951 int Amt =
MI.getOperand(2).getImm();
// Piece 0 of the source is frozen before it is used to form the low result.
955 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
// Low result alternative: the frozen piece 0 unchanged.
958 Lo = Freeze.getReg(0);
// Low result alternative: the frozen piece 0 sign-extended from Amt bits.
961 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
// High result alternative: Lo shifted right arithmetically by 31, i.e. every
// bit equals the sign bit of Lo.
964 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
965 Hi = B.buildAShr(VgprRB_S32,
Lo, SignExtCst).getReg(0);
// High result alternative: piece 1 of the source sign-extended from
// (Amt - 32) bits.
969 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
972 B.buildMergeLikeInstr(
MI.getOperand(0).getReg(), {Lo, Hi});
973 MI.eraseFromParent();
977bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &
MI) {
983 unsigned Opc =
MI.getOpcode();
992 case AMDGPU::G_AMDGPU_FFBH_U32:
994 AddOpc = AMDGPU::G_UADDSAT;
995 SearchFromMSB =
true;
997 case AMDGPU::G_AMDGPU_FFBL_B32:
999 AddOpc = AMDGPU::G_UADDSAT;
1000 SearchFromMSB =
false;
1002 case AMDGPU::G_CTLZ_ZERO_POISON:
1003 FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
1004 AddOpc = AMDGPU::G_ADD;
1005 SearchFromMSB =
true;
1007 case AMDGPU::G_CTTZ_ZERO_POISON:
1008 FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
1009 AddOpc = AMDGPU::G_ADD;
1010 SearchFromMSB =
false;
1016 auto Unmerge = B.buildUnmerge(VgprRB_S32,
MI.getOperand(1).getReg());
1023 auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ?
Hi :
Lo});
1025 B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ?
Lo :
Hi});
1027 auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
1028 {Secondary, B.buildConstant(VgprRB_S32, 32)});
1029 B.buildUMin(
MI.getOperand(0).getReg(), Primary, Adjusted);
1031 MI.eraseFromParent();
1035bool RegBankLegalizeHelper::lowerExtrVecEltToSel(MachineInstr &
MI) {
1047 LLT VecTy = MRI.getType(Src);
1050 MachineRegisterInfo::VRegAttrs VgprRB_EltTy = {VgprRB, ScalarTy};
1052 auto Unmerge = B.buildUnmerge(VgprRB_EltTy, Src);
1055 Register PrevSelect = Unmerge.getReg(0);
1056 for (
unsigned I = 1;
I < NumElts; ++
I) {
1057 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)},
I);
1060 B.buildSelect(VgprRB_EltTy, Cmp, Unmerge.getReg(
I), PrevSelect)
1063 B.buildCopy(Dst, PrevSelect);
1065 auto InitUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(0));
1066 Register PrevLo = InitUnmerge.getReg(0);
1067 Register PrevHi = InitUnmerge.getReg(1);
1068 for (
unsigned I = 1;
I < NumElts; ++
I) {
1069 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)},
I);
1071 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(
I));
1072 PrevLo = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(0), PrevLo)
1074 PrevHi = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(1), PrevHi)
1077 B.buildMergeLikeInstr(Dst, {PrevLo, PrevHi});
1080 MF, MORE,
"amdgpu-regbanklegalize",
1081 "AMDGPU RegBankLegalize: ExtrVecEltToSel unsupported element type",
MI);
1085 MI.eraseFromParent();
// Lowers an extract-vector-element into two 32-bit extracts: Src is bitcast
// to a vector of type Vec32Ty, the two 32-bit parts of the requested element
// are extracted at indices 2*Idx and 2*Idx+1, and they are merged into Dst.
// MI is erased afterwards. Src is asserted to be on the VGPR bank and Idx on
// the SGPR bank.
// NOTE(review): the declarations of Dst, Src, Idx and Vec32Ty are not visible
// in this excerpt.
1089bool RegBankLegalizeHelper::lowerExtrVecEltTo32(MachineInstr &
MI) {
1102 LLT SrcTy = MRI.getType(Src);
1105 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1106 "expected VGPR src and SGPR idx");
1108 auto CastSrc = B.buildBitcast({VgprRB, Vec32Ty}, Src);
// IdxLo = Idx << 1 (i.e. 2 * Idx); IdxHi = IdxLo + 1.
1111 auto One = B.buildConstant(SgprRB_S32, 1);
1112 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
1113 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
1115 auto ExtLo = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxLo);
1116 auto ExtHi = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxHi);
1118 B.buildMergeLikeInstr(Dst, {ExtLo.getReg(0), ExtHi.getReg(0)});
1120 MI.eraseFromParent();
1124bool RegBankLegalizeHelper::lowerInsVecEltToSel(MachineInstr &
MI) {
1137 LLT VecTy = MRI.getType(Src);
1140 const RegisterBank *SrcRB = MRI.getRegBank(Src);
1141 bool IsSGPR = (SrcRB == SgprRB);
1142 SmallVector<Register, 16> Selects;
1146 auto Unmerge = B.buildUnmerge(VgprRB_S32, Src);
1147 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1148 Register EltLo = EltUnmerge.getReg(0);
1149 Register EltHi = EltUnmerge.getReg(1);
1150 for (
unsigned I = 0;
I < NumElts; ++
I) {
1151 auto IdxConst = B.buildConstant(VgprRB_S32,
I);
1154 B.buildSelect(VgprRB_S32, Cmp, EltLo, Unmerge.getReg(2 *
I))
1157 B.buildSelect(VgprRB_S32, Cmp, EltHi, Unmerge.getReg(2 *
I + 1))
1161 auto Vec32 = B.buildBuildVector({VgprRB, Vec32Ty}, Selects);
1162 B.buildBitcast(Dst, Vec32);
1165 MachineRegisterInfo::VRegAttrs SrcRB_EltTy = {SrcRB, ScalarTy};
1166 MachineRegisterInfo::VRegAttrs CmpTy = IsSGPR ? SgprRB_S32 : VccRB_S1;
1167 auto Unmerge = B.buildUnmerge(SrcRB_EltTy, Src);
1168 for (
unsigned I = 0;
I < NumElts; ++
I) {
1169 auto IdxConst = B.buildConstant(SgprRB_S32,
I);
1172 B.buildSelect(SrcRB_EltTy, Cmp, Elt, Unmerge.getReg(
I)).getReg(0));
1174 B.buildMergeLikeInstr(Dst, Selects);
1177 MF, MORE,
"amdgpu-regbanklegalize",
1178 "AMDGPU RegBankLegalize: InsVecEltToSel unsupported element type",
MI);
1182 MI.eraseFromParent();
// Lowers an insert-vector-element into two 32-bit inserts: Src is bitcast to
// a vector of type Vec32Ty, Elt is split into two 32-bit VGPR pieces, the
// pieces are inserted at indices 2*Idx and 2*Idx+1, and the result is bitcast
// back into Dst. MI is erased afterwards. Src is asserted to be on the VGPR
// bank and Idx on the SGPR bank.
// NOTE(review): the declarations of Dst, Src, Elt, Idx and Vec32Ty are not
// visible in this excerpt.
1186bool RegBankLegalizeHelper::lowerInsVecEltTo32(MachineInstr &
MI) {
1201 LLT SrcTy = MRI.getType(Src);
1204 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1205 "expected VGPR src and SGPR idx");
1207 MachineRegisterInfo::VRegAttrs VgprRB_Vec32Ty = {VgprRB, Vec32Ty};
1209 auto CastSrc = B.buildBitcast(VgprRB_Vec32Ty, Src);
1210 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
// IdxLo = Idx << 1 (i.e. 2 * Idx); IdxHi = IdxLo + 1.
1213 auto One = B.buildConstant(SgprRB_S32, 1);
1214 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
1215 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
// The second insert operates on the result of the first, so both pieces end
// up in the same vector value.
1217 auto InsLo = B.buildInsertVectorElement(VgprRB_Vec32Ty, CastSrc,
1218 EltUnmerge.getReg(0), IdxLo);
1219 auto InsHi = B.buildInsertVectorElement(VgprRB_Vec32Ty, InsLo,
1220 EltUnmerge.getReg(1), IdxHi);
1222 B.buildBitcast(Dst, InsHi);
1224 MI.eraseFromParent();
1228bool RegBankLegalizeHelper::lowerAbsToNegMax(MachineInstr &
MI) {
1238 LLT Ty = MRI.getType(DstReg);
1244 Zero = B.buildBuildVector({VgprRB, Ty}, {Zero16, Zero16}).
getReg(0);
1246 assert((Ty == S32 || Ty == S16) &&
"unexpected type for AbsToNegMax");
1247 Zero = B.buildConstant({VgprRB, Ty}, 0).
getReg(0);
1250 auto Neg = B.buildSub({VgprRB, Ty},
Zero, SrcReg);
1251 B.buildSMax(DstReg, SrcReg, Neg);
1252 MI.eraseFromParent();
// Lowers an abs by handling two 16-bit halves as separate 32-bit SGPR values:
// operand 1 is bitcast to S32, each half is sign-extended to 32 bits, G_ABS
// is applied to each, and the results are recombined into operand 0 with a
// build-vector-trunc. MI is erased afterwards.
// NOTE(review): the start of the statement that defines ShiftHi is not
// visible in this excerpt; the buildAShr call below is presumably its
// initializer - TODO confirm.
1256bool RegBankLegalizeHelper::lowerAbsToS32(MachineInstr &
MI) {
1266 auto Bitcast = B.buildBitcast({SgprRB_S32},
MI.getOperand(1).
getReg());
// Low half: sign-extend in register from bit 15.
1267 auto SextInReg = B.buildSExtInReg({SgprRB_S32},
Bitcast, 16);
// High half: arithmetic shift right by 16.
1269 B.buildAShr({SgprRB_S32},
Bitcast, B.buildConstant({SgprRB_S32}, 16));
1271 auto AbsLo = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {SextInReg});
1272 auto AbsHi = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {ShiftHi});
1273 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(),
1274 {AbsLo.getReg(0), AbsHi.getReg(0)});
1276 MI.eraseFromParent();
1280bool RegBankLegalizeHelper::lower(MachineInstr &
MI,
1288 return lowerVccExtToSel(
MI);
1290 LLT Ty = MRI.getType(
MI.getOperand(0).getReg());
1291 auto True = B.buildConstant({SgprRB, Ty},
1292 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
1293 auto False = B.buildConstant({SgprRB, Ty}, 0);
1297 B.buildSelect(
MI.getOperand(0).getReg(),
MI.getOperand(1).getReg(), True,
1299 MI.eraseFromParent();
1303 return lowerUnpackBitShift(
MI);
1305 return lowerUnpackMinMax(
MI);
1307 return lowerSplitTo16(
MI);
1309 const RegisterBank *RB = MRI.getRegBank(
MI.getOperand(0).getReg());
1310 MachineInstrBuilder
Hi;
1311 switch (
MI.getOpcode()) {
1312 case AMDGPU::G_ZEXT: {
1313 Hi = B.buildConstant({RB, S32}, 0);
1316 case AMDGPU::G_SEXT: {
1318 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
1319 Hi = B.buildAShr({RB, S32},
MI.getOperand(1).
getReg(), ShiftAmt);
1322 case AMDGPU::G_ANYEXT: {
1323 Hi = B.buildUndef({RB, S32});
1328 "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
1333 B.buildMergeLikeInstr(
MI.getOperand(0).getReg(),
1334 {MI.getOperand(1).getReg(), Hi});
1335 MI.eraseFromParent();
1339 uint64_t ConstVal =
MI.getOperand(1).getCImm()->getZExtValue();
1340 B.buildConstant(
MI.getOperand(0).getReg(), ConstVal);
1342 MI.eraseFromParent();
1347 LLT Ty = MRI.getType(Src);
1351 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
1353 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
1354 auto One = B.buildConstant(VgprRB_S32, 1);
1355 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
1356 auto Zero = B.buildConstant(VgprRB_S32, 0);
1357 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
1358 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
1360 assert(Ty == S32 || Ty == S16);
1361 auto One = B.buildConstant({VgprRB, Ty}, 1);
1362 B.buildAnd(BoolSrc, Src, One);
1364 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
1366 MI.eraseFromParent();
1370 return lowerV_BFE(
MI);
1372 return lowerS_BFE(
MI);
1374 return lowerUniMAD64(
MI);
1376 B.buildMul(
MI.getOperand(0),
MI.getOperand(1),
MI.getOperand(2));
1377 MI.eraseFromParent();
1381 auto Op1 = B.buildTrunc(VgprRB_S32,
MI.getOperand(1));
1382 auto Op2 = B.buildTrunc(VgprRB_S32,
MI.getOperand(2));
1383 auto Zero = B.buildConstant({VgprRB, S64}, 0);
1385 unsigned NewOpc =
MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
1386 ? AMDGPU::G_AMDGPU_MAD_U64_U32
1387 : AMDGPU::G_AMDGPU_MAD_I64_I32;
1389 B.buildInstr(NewOpc, {
MI.getOperand(0).getReg(), {SgprRB, S32}},
1391 MI.eraseFromParent();
1395 return lowerSplitTo32(
MI);
1397 return lowerSplitTo32Mul(
MI);
1399 return lowerSplitTo32Select(
MI);
1401 return lowerSplitTo32SExtInReg(
MI);
1403 auto Unmerge = B.buildUnmerge({VgprRB, S32},
MI.getOperand(1).
getReg());
1404 auto LoPopCnt = B.buildCTPOP({VgprRB, S32}, Unmerge.getReg(0));
1405 auto HiPopCnt = B.buildCTPOP({VgprRB, S32}, Unmerge.getReg(1));
1407 B.buildAdd(
MI.getOperand(0).getReg(), LoPopCnt, HiPopCnt,
1410 MI.eraseFromParent();
1414 return lowerSBufToBuf(
MI, WFI);
1416 LLT DstTy = MRI.getType(
MI.getOperand(0).getReg());
1427 if (
Size / 128 == 2)
1429 else if (
Size / 128 == 4)
1433 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1439 else if (DstTy == S96)
1440 splitLoad(
MI, {S64, S32}, S32);
1441 else if (DstTy == V3S32)
1442 splitLoad(
MI, {V2S32, S32}, S32);
1443 else if (DstTy == V6S16)
1444 splitLoad(
MI, {V4S16, V2S16}, V2S16);
1447 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1454 const auto &TFI = *ST.getFrameLowering();
1458 "Stack grows upwards for AMDGPU");
1461 Register AllocSize =
MI.getOperand(1).getReg();
1466 B.setInsertPt(*
MI.getParent(), std::next(
MI.getIterator()));
1467 MI.eraseFromParent();
1469 if (MRI.getRegBank(AllocSize) != SgprRB) {
1470 auto WaveReduction =
1471 B.buildIntrinsic(Intrinsic::amdgcn_wave_reduce_umax, {SgprRB_S32})
1474 AllocSize = WaveReduction.getReg(0);
1477 LLT PtrTy = MRI.getType(Dst);
1479 "Expected 32-bit pointer for stack allocation");
1480 const SIMachineFunctionInfo *
Info = MF.getInfo<SIMachineFunctionInfo>();
1484 const bool HasFlatScratch = ST.hasFlatScratchEnabled();
1485 const unsigned WavefrontSizeLog2 = ST.getWavefrontSizeLog2();
1488 if (!HasFlatScratch) {
1489 auto WaveSize = B.buildConstant(SgprRB_S32, WavefrontSizeLog2);
1490 AdjustedSize = B.buildShl(SgprRB_S32, AllocSize, WaveSize).getReg(0);
1492 if (Alignment > TFI.getStackAlign()) {
1493 const uint64_t EffectiveAlignment =
1494 Alignment.
value() << (HasFlatScratch ? 0 : WavefrontSizeLog2);
1495 auto OldSP = B.buildCopy({SgprRB, PtrTy},
SPReg);
1497 B.buildPtrAdd({SgprRB, PtrTy}, OldSP,
1498 B.buildConstant(SgprRB_S32, EffectiveAlignment - 1));
1500 B.buildPtrMask(Dst, Tmp1, B.buildConstant(SgprRB_S32, Mask));
1502 B.buildCopy(Dst,
SPReg);
1504 auto PtrAdd = B.buildPtrAdd({SgprRB, PtrTy}, Dst, AdjustedSize);
1505 B.buildCopy(
SPReg, PtrAdd);
1509 LLT DstTy = MRI.getType(
MI.getOperand(0).getReg());
1511 widenLoad(
MI, S128);
1512 else if (DstTy == V3S32)
1513 widenLoad(
MI, V4S32, S32);
1514 else if (DstTy == V6S16)
1515 widenLoad(
MI, V8S16, V2S16);
1518 "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
1525 return lowerUnpackAExt(
MI);
1530 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1536 return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
1538 B.setInstrAndDebugLoc(
MI);
1539 for (
unsigned i =
MI.getNumDefs(); i <
MI.getNumOperands(); ++i) {
1540 MachineOperand &
Op =
MI.getOperand(i);
1544 if (MRI.getRegBank(
Reg) != VgprRB) {
1545 auto Copy = B.buildCopy({VgprRB, MRI.getType(
Reg)},
Reg);
1546 Op.setReg(
Copy.getReg(0));
1556 "AMDGPU RegBankLegalize: unmerge not multiple of 32",
1561 B.setInstrAndDebugLoc(
MI);
1564 B.buildUnmerge({SgprRB, V2S16}, Unmerge->
getSourceReg());
1565 for (
unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
1566 auto [Dst0S32, Dst1S32] =
1567 unpackAExt(UnmergeV2S16->getOperand(i).getReg());
1568 B.buildTrunc(
MI.getOperand(i * 2).getReg(), Dst0S32);
1569 B.buildTrunc(
MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
1572 auto [Dst0S32, Dst1S32] = unpackAExt(
MI.getOperand(2).getReg());
1573 B.buildTrunc(
MI.getOperand(0).getReg(), Dst0S32);
1574 B.buildTrunc(
MI.getOperand(1).getReg(), Dst1S32);
1577 MI.eraseFromParent();
1582 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1583 B.setInsertPt(*
MI.getParent(),
MI.getParent()->getFirstNonPHI());
1584 MI.getOperand(0).setReg(NewDst);
1585 B.buildTrunc(Dst, NewDst);
1587 for (
unsigned i = 1; i <
MI.getNumOperands(); i += 2) {
1595 auto NewUse = B.buildAnyExt(SgprRB_S32,
UseReg);
1596 MI.getOperand(i).setReg(NewUse.getReg(0));
1604 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1609 assert(MRI.getRegBankOrNull(
MI.getOperand(0).getReg()) == VgprRB);
1613 const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
1614 return RB == VgprRB || RB == SgprRB;
1619 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1624 unsigned RsrcIdx = RSrcIntrin->
RsrcArg +
MI.getNumExplicitDefs() + 1;
1625 return applyRegisterBanksVgprWithSgprRsrc(
MI, RsrcIdx);
1631 unsigned RsrcIdx =
MI.getNumOperands();
1632 while (RsrcIdx-- >
MI.getNumExplicitDefs()) {
1633 const MachineOperand &
Op =
MI.getOperand(RsrcIdx);
1634 if (
Op.isReg() &&
Op.getReg().isVirtual())
1637 return applyRegisterBanksVgprWithSgprRsrc(
MI, RsrcIdx);
1640 return lowerSplitBitCount64To32(
MI);
1642 return lowerExtrVecEltToSel(
MI);
1644 return lowerExtrVecEltTo32(
MI);
1646 return lowerInsVecEltToSel(
MI);
1648 return lowerInsVecEltTo32(
MI);
1650 return lowerAbsToNegMax(
MI);
1652 return lowerAbsToS32(
MI);
1654 MI.eraseFromParent();
1778 return isAnyPtr(Ty, 32) ? Ty : LLT();
1781 return isAnyPtr(Ty, 64) ? Ty : LLT();
1784 return isAnyPtr(Ty, 128) ? Ty : LLT();
1824 const SIRegisterInfo *
TRI =
1825 static_cast<const SIRegisterInfo *
>(MRI.getTargetRegisterInfo());
1827 if (LLTSize >= 32 &&
TRI->getSGPRClassForBitWidth(LLTSize))
1832 const SIRegisterInfo *
TRI =
1833 static_cast<const SIRegisterInfo *
>(MRI.getTargetRegisterInfo());
1956bool RegBankLegalizeHelper::applyMappingDst(
1957 MachineInstr &
MI,
unsigned &
OpIdx,
1958 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
1963 MachineOperand &
Op =
MI.getOperand(
OpIdx);
1965 LLT Ty = MRI.getType(
Reg);
1966 [[maybe_unused]]
const RegisterBank *RB = MRI.getRegBank(
Reg);
1968 switch (MethodIDs[
OpIdx]) {
2045 Register NewAgprDst = MRI.createVirtualRegister({AgprRB, Ty});
2046 Op.setReg(NewAgprDst);
2047 if (!MRI.use_nodbg_empty(
Reg))
2048 B.buildCopy(
Reg, NewAgprDst);
2053 const RegisterBank *DstRB =
2054 MFI->selectAGPRFormMFMA(NumRegs) ? AgprRB : VgprRB;
2057 Register NewDst = MRI.createVirtualRegister({DstRB, Ty});
2059 if (!MRI.use_nodbg_empty(
Reg))
2060 B.buildCopy(
Reg, NewDst);
2067 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
2069 if (!MRI.use_empty(
Reg)) {
2071 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
2072 B.buildTrunc(
Reg, CopyS32_Vcc);
2079 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
2080 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
2081 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
2082 Op.setReg(NewVgprDstS16);
2083 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
2085 B.buildTrunc(
Reg, NewSgprDstS32);
2104 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
2105 Op.setReg(NewVgprDst);
2118 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
2119 Op.setReg(NewVgprDst);
2127 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
2129 if (!MRI.use_empty(
Reg))
2130 B.buildTrunc(
Reg, NewDst);
2137 Op.setReg(MRI.createVirtualRegister({SgprRB, Ty}));
2138 B.buildCopy(
Reg,
Op.getReg());
2143 MF, MORE,
"amdgpu-regbanklegalize",
2144 "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for",
MI);
2149 MF, MORE,
"amdgpu-regbanklegalize",
2150 "AMDGPU RegBankLegalize: applyMappingDst, ID not supported",
MI);
2158bool RegBankLegalizeHelper::applyMappingSrc(
2159 MachineInstr &
MI,
unsigned &
OpIdx,
2160 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
2162 for (
unsigned i = 0; i < MethodIDs.
size(); ++
OpIdx, ++i) {
2163 if (MethodIDs[i] ==
None || MethodIDs[i] ==
IntrId || MethodIDs[i] ==
Imm)
2166 MachineOperand &
Op =
MI.getOperand(
OpIdx);
2168 LLT Ty = MRI.getType(
Reg);
2169 const RegisterBank *RB = MRI.getRegBank(
Reg);
2171 switch (MethodIDs[i]) {
2174 assert(RB == VccRB || RB == SgprRB);
2176 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
2178 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
2179 Op.setReg(CopyVcc_Scc.getReg(0));
2198 assert(Ty == getTyFromID(MethodIDs[i]));
2199 assert(RB == getRegBankFromID(MethodIDs[i]));
2213 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2214 assert(RB == getRegBankFromID(MethodIDs[i]));
2241 assert(Ty == getTyFromID(MethodIDs[i]));
2243 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
2244 Op.setReg(CopyToVgpr.getReg(0));
2260 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2262 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
2263 Op.setReg(CopyToVgpr.getReg(0));
2269 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
2270 Op.setReg(CopyToVgpr.getReg(0));
2276 auto CopyToAgpr = B.buildCopy({AgprRB, Ty},
Reg);
2277 Op.setReg(CopyToAgpr.getReg(0));
2283 const RegisterBank *SrcRB =
2284 MFI->selectAGPRFormMFMA(NumRegs) ? AgprRB : VgprRB;
2286 Op.setReg(B.buildCopy({SrcRB, Ty},
Reg).getReg(0));
2292 assert(Ty == getTyFromID(MethodIDs[i]));
2297 WFI.
End = std::next(
MI.getIterator());
2304 assert(Ty == getTyFromID(MethodIDs[i]));
2310 while (
Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
2315 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
2327 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2331 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2338 assert(Ty == getTyFromID(MethodIDs[i]));
2342 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2352 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
2353 Op.setReg(Aext.getReg(0));
2360 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
2363 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
2364 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
2365 Op.setReg(BoolInReg.getReg(0));
2371 auto Sext = B.buildSExt(SgprRB_S32,
Reg);
2372 Op.setReg(Sext.getReg(0));
2378 auto Zext = B.buildZExt({SgprRB, S32},
Reg);
2379 Op.setReg(Zext.getReg(0));
2385 auto Aext = B.buildAnyExt({VgprRB, S32},
Reg);
2386 Op.setReg(Aext.getReg(0));
2393 auto Sext = B.buildSExt({VgprRB, S32},
Reg);
2394 Op.setReg(Sext.getReg(0));
2401 auto Zext = B.buildZExt({VgprRB, S32},
Reg);
2402 Op.setReg(Zext.getReg(0));
2407 MF, MORE,
"amdgpu-regbanklegalize",
2408 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported",
MI);
2418 unsigned StartOpIdx,
2419 unsigned EndOpIdx) {
2420 for (
unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
2427bool RegBankLegalizeHelper::applyRegisterBanksVgprWithSgprRsrc(
2428 MachineInstr &
MI,
unsigned RsrcIdx) {
2429 const unsigned NumDefs =
MI.getNumExplicitDefs();
2431 MachineBasicBlock *
MBB =
MI.getParent();
2435 for (
unsigned i = 0; i < NumDefs; ++i) {
2437 if (MRI.getRegBank(
Reg) == VgprRB)
2440 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(
Reg)});
2441 MI.getOperand(i).setReg(NewVgprDst);
2445 B.setInstrAndDebugLoc(
MI);
2448 for (
unsigned i = NumDefs; i < RsrcIdx; ++i) {
2449 MachineOperand &
Op =
MI.getOperand(i);
2457 if (MRI.getRegBank(
Reg) == VgprRB)
2460 auto Copy = B.buildCopy({VgprRB, MRI.getType(
Reg)},
Reg);
2461 Op.setReg(
Copy.getReg(0));
2464 SmallSet<Register, 4> OpsToWaterfall;
2467 for (
unsigned i = RsrcIdx; i <
MI.getNumOperands(); ++i) {
2468 MachineOperand &
Op =
MI.getOperand(i);
2473 if (MRI.getRegBank(
Reg) != SgprRB)
2477 if (!OpsToWaterfall.
empty()) {
2479 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
Provides AMDGPU specific target descriptions.
static bool isSignedBFE(MachineInstr &MI)
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB, MachineRegisterInfo &MRI, unsigned StartOpIdx, unsigned EndOpIdx)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
static Register UseReg(const MachineOperand &MO)
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned MovTermOpc
const unsigned AndSaveExecOpc
bool findRuleAndApplyMapping(MachineInstr &MI)
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
const SIRegisterInfo * getRegisterInfo() const override
Represents a call to an intrinsic.
Register getSourceReg() const
Get the unmerge source register.
constexpr bool isScalar() const
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
LLT divide(int Factor) const
Return a type that is Factor times smaller.
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
TypeSize getValue() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator SkipPHIsAndLabels(iterator I)
Return the first instruction in MBB after I that is not a PHI or a label.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
BasicBlockListType::iterator iterator
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Helper class to build MachineInstr.
bool isValid() const
Check for null.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank * getRegBank(Register Reg) const
Return the register bank of Reg.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
Holds all the information related to register banks.
This class implements the register bank concept.
Wrapper class representing virtual and physical registers.
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
@ SgprV4S32_ReadFirstLane
@ SgprV8S32_ReadFirstLane
bool isAnyPtr(LLT Ty, unsigned Width)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
@ VerifyAllSgprOrVgprGPHI
@ AextToS32InIncomingBlockGPHI
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT whose value fits in int64_t, returns it.
LLVM_ABI void reportGISelFailure(MachineFunction &MF, MachineOptimizationRemarkEmitter &MORE, MachineOptimizationRemarkMissed &R)
Report an ISel error as a missed optimization remark to the LLVMContext's diagnostic stream.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
constexpr T maskTrailingZeros(unsigned N)
Create a bitmask with the N right-most bits set to 0, and all other bits set to 1.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Align assumeAligned(uint64_t Value)
Treats the value 0 as a 1, so Align is always at least 1.
LLVM_ABI Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
LoweringMethodID LoweringMethod
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
Holds waterfall loop information: the set of SGPR operand registers that need waterfalling,...
MachineBasicBlock::iterator Start
SmallSet< Register, 4 > SgprWaterfallOperandRegs
MachineBasicBlock::iterator End
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.