29#include "llvm/IR/IntrinsicsAMDGPU.h"
31#define DEBUG_TYPE "amdgpu-regbanklegalize"
41 ST(MF.getSubtarget<
GCNSubtarget>()), TII(*ST.getInstrInfo()), B(B),
42 MRI(*B.getMRI()), MUI(MUI), VT(VT), RBI(RBI), MORE(MF, nullptr),
43 RBLRules(RBLRules), IsWave32(ST.isWave32()),
44 SgprRB(&RBI.getRegBank(
AMDGPU::SGPRRegBankID)),
45 VgprRB(&RBI.getRegBank(
AMDGPU::VGPRRegBankID)),
46 AgprRB(&RBI.getRegBank(
AMDGPU::AGPRRegBankID)),
47 VccRB(&RBI.getRegBank(
AMDGPU::VCCRegBankID)) {}
53 "No AMDGPU RegBankLegalize rules defined for opcode",
61 "AMDGPU RegBankLegalize: none of the rules defined with "
62 "'Any' for MI's opcode matched MI",
70 B.setInsertPt(*
MI.getParent(), std::next(
MI.getIterator()));
80 if (!lower(
MI, *Mapping, WFI))
84 if (!executeInWaterfallLoop(B, WFI))
94 "Waterfall range not initialized");
111 const int OrigRangeSize = std::distance(BeginIt, EndIt);
120 B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
146 MBB.addSuccessor(LoopBB);
149 B.setInsertPt(*LoopBB, LoopBB->
end());
200 auto NewEnd = BodyBB->
end();
201 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
214 auto OldVal = WaterfalledRegMap.
find(OldReg);
215 if (OldVal != WaterfalledRegMap.
end()) {
216 Op.setReg(OldVal->second);
230 unsigned PartSize = (OpSize % 64 == 0) ? 64 : 32;
232 unsigned NumParts = OpSize / PartSize;
238 CurrentLaneParts.
push_back(CurrentLaneReg);
240 auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
241 auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
242 for (
unsigned i = 0; i < NumParts; ++i) {
244 CurrentLaneParts.
push_back(UnmergeCurrLane.getReg(i));
248 for (
unsigned i = 0; i < NumParts; ++i) {
249 Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
255 CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
258 Op.setReg(CurrentLaneReg);
261 WaterfalledRegMap.
insert(std::pair(OldReg,
Op.getReg()));
267 MRI.createVirtualRegister({WaveRC,
LLT::scalar(IsWave32 ? 32 : 64)});
268 B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
274 MRI.setSimpleHint(SavedExec, CondRegLM);
276 B.setInsertPt(*BodyBB, BodyBB->
end());
288 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
292 B.buildInstr(LMC.
MovOpc).addDef(SaveExecReg).addReg(LMC.
ExecReg);
295 B.setInsertPt(*RestoreExecBB, RestoreExecBB->
begin());
300 B.setInsertPt(*RemainderBB, RemainderBB->
begin());
307unsigned RegBankLegalizeHelper::setBufferOffsets(
309 Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) {
310 if (std::optional<int64_t>
Imm =
312 uint32_t SOffset, ImmOffset;
313 if (TII.splitMUBUFOffset(*
Imm, SOffset, ImmOffset, Alignment)) {
314 VOffsetReg = B.buildConstant({VgprRB, S32}, 0).
getReg(0);
315 SOffsetReg = B.buildConstant({SgprRB, S32}, SOffset).
getReg(0);
316 InstOffsetVal = ImmOffset;
317 return SOffset + ImmOffset;
320 const bool CheckNUW = ST.hasGFX1250Insts();
322 MRI, CombinedOffset,
nullptr,
324 uint32_t SOffset, ImmOffset;
325 if (
static_cast<int32_t
>(
Offset) > 0 &&
326 TII.splitMUBUFOffset(
Offset, SOffset, ImmOffset, Alignment)) {
327 if (
Base.isValid() && MRI.getRegBank(
Base) == VgprRB) {
329 SOffsetReg = B.buildConstant({SgprRB, S32}, SOffset).
getReg(0);
330 InstOffsetVal = ImmOffset;
335 VOffsetReg = B.buildConstant({VgprRB, S32}, 0).
getReg(0);
337 InstOffsetVal = ImmOffset;
343 if (
Add &&
static_cast<int32_t
>(
Offset) >= 0 &&
347 const RegisterBank *Src0Bank = MRI.getRegBank(Src0);
348 const RegisterBank *Src1Bank = MRI.getRegBank(Src1);
349 if (Src0Bank == VgprRB && Src1Bank == SgprRB) {
354 if (Src0Bank == SgprRB && Src1Bank == VgprRB) {
362 if (MRI.getRegBank(CombinedOffset) == VgprRB) {
363 VOffsetReg = CombinedOffset;
365 VOffsetReg = B.buildCopy({VgprRB, S32}, CombinedOffset).
getReg(0);
367 SOffsetReg = B.buildConstant({SgprRB, S32}, 0).
getReg(0);
371bool RegBankLegalizeHelper::splitLoad(MachineInstr &
MI,
373 MachineFunction &MF = B.getMF();
374 assert(
MI.getNumMemOperands() == 1);
375 MachineMemOperand &BaseMMO = **
MI.memoperands_begin();
377 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
379 LLT PtrTy = MRI.getType(
Base);
380 const RegisterBank *PtrRB = MRI.getRegBankOrNull(
Base);
384 unsigned ByteOffset = 0;
385 for (LLT PartTy : LLTBreakdown) {
387 if (ByteOffset == 0) {
388 BasePlusOffset =
Base;
390 auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset);
394 auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy);
395 auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);
396 LoadPartRegs.
push_back(LoadPart.getReg(0));
402 B.buildMergeLikeInstr(Dst, LoadPartRegs);
408 if (MRI.getType(
Reg) == MergeTy) {
411 auto Unmerge = B.buildUnmerge({DstRB, MergeTy},
Reg);
412 for (
unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i)
413 MergeTyParts.
push_back(Unmerge.getReg(i));
416 B.buildMergeLikeInstr(Dst, MergeTyParts);
418 MI.eraseFromParent();
422bool RegBankLegalizeHelper::widenLoad(MachineInstr &
MI, LLT WideTy,
424 MachineFunction &MF = B.getMF();
425 assert(
MI.getNumMemOperands() == 1);
426 MachineMemOperand &BaseMMO = **
MI.memoperands_begin();
428 const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst);
431 MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy);
432 auto WideLoad = B.buildLoad({DstRB, WideTy},
Base, *WideMMO);
435 B.buildTrunc(Dst, WideLoad);
438 auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad);
440 LLT DstTy = MRI.getType(Dst);
442 for (
unsigned i = 0; i < NumElts; ++i) {
443 MergeTyParts.
push_back(Unmerge.getReg(i));
445 B.buildMergeLikeInstr(Dst, MergeTyParts);
447 MI.eraseFromParent();
451bool RegBankLegalizeHelper::widenMMOToS32(GAnyLoad &
MI)
const {
454 MachineMemOperand &MMO =
MI.getMMO();
457 MachineMemOperand *WideMMO = B.getMF().getMachineMemOperand(&MMO, 0, S32);
459 if (
MI.getOpcode() == G_LOAD) {
460 B.buildLoad(Dst, Ptr, *WideMMO);
462 auto Load = B.buildLoad(SgprRB_S32, Ptr, *WideMMO);
464 if (
MI.getOpcode() == G_ZEXTLOAD) {
466 auto MaskCst = B.buildConstant(SgprRB_S32, Mask);
467 B.buildAnd(Dst, Load, MaskCst);
469 assert(
MI.getOpcode() == G_SEXTLOAD);
470 B.buildSExtInReg(Dst, Load, MemSize);
474 MI.eraseFromParent();
478bool RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &
MI) {
480 LLT Ty = MRI.getType(Dst);
482 unsigned Opc =
MI.getOpcode();
483 int TrueExtCst =
Opc == G_SEXT ? -1 : 1;
484 if (Ty == S32 || Ty == S16) {
485 auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
486 auto False = B.buildConstant({VgprRB, Ty}, 0);
487 B.buildSelect(Dst, Src, True, False);
488 }
else if (Ty == S64) {
489 auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
490 auto False = B.buildConstant({VgprRB_S32}, 0);
491 auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
492 MachineInstrBuilder
Hi;
501 Hi = B.buildUndef({VgprRB_S32});
505 MF, MORE,
"amdgpu-regbanklegalize",
506 "AMDGPU RegBankLegalize: lowerVccExtToSel, Opcode not supported",
MI);
510 B.buildMergeValues(Dst, {
Lo.getReg(0),
Hi.getReg(0)});
513 MF, MORE,
"amdgpu-regbanklegalize",
514 "AMDGPU RegBankLegalize: lowerVccExtToSel, Type not supported",
MI);
518 MI.eraseFromParent();
522std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(
Register Reg) {
523 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
524 auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
525 auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
526 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
527 return {
Lo.getReg(0),
Hi.getReg(0)};
530std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(
Register Reg) {
531 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
532 auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
533 auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
534 return {
Lo.getReg(0),
Hi.getReg(0)};
537std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(
Register Reg) {
538 auto PackedS32 = B.buildBitcast(SgprRB_S32,
Reg);
540 auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
541 return {
Lo.getReg(0),
Hi.getReg(0)};
544std::pair<Register, Register>
545RegBankLegalizeHelper::unpackAExtTruncS16(
Register Reg) {
546 auto [Lo32, Hi32] = unpackAExt(
Reg);
547 return {B.buildTrunc(SgprRB_S16, Lo32).getReg(0),
548 B.buildTrunc(SgprRB_S16, Hi32).getReg(0)};
551bool RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &
MI) {
553 switch (
MI.getOpcode()) {
554 case AMDGPU::G_SHL: {
555 auto [Val0, Val1] = unpackAExt(
MI.getOperand(1).getReg());
556 auto [Amt0, Amt1] = unpackAExt(
MI.getOperand(2).getReg());
557 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
558 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
561 case AMDGPU::G_LSHR: {
562 auto [Val0, Val1] = unpackZExt(
MI.getOperand(1).getReg());
563 auto [Amt0, Amt1] = unpackZExt(
MI.getOperand(2).getReg());
564 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
565 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
568 case AMDGPU::G_ASHR: {
569 auto [Val0, Val1] = unpackSExt(
MI.getOperand(1).getReg());
570 auto [Amt0, Amt1] = unpackSExt(
MI.getOperand(2).getReg());
571 Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
572 Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
577 MF, MORE,
"amdgpu-regbanklegalize",
578 "AMDGPU RegBankLegalize: lowerUnpackBitShift, case not implemented",
582 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(), {Lo, Hi});
583 MI.eraseFromParent();
587bool RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &
MI) {
589 switch (
MI.getOpcode()) {
591 case AMDGPU::G_SMAX: {
593 auto [Val0_Lo, Val0_Hi] = unpackSExt(
MI.getOperand(1).getReg());
594 auto [Val1_Lo, Val1_Hi] = unpackSExt(
MI.getOperand(2).getReg());
595 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
597 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
602 case AMDGPU::G_UMAX: {
604 auto [Val0_Lo, Val0_Hi] = unpackZExt(
MI.getOperand(1).getReg());
605 auto [Val1_Lo, Val1_Hi] = unpackZExt(
MI.getOperand(2).getReg());
606 Lo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
608 Hi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
614 MF, MORE,
"amdgpu-regbanklegalize",
615 "AMDGPU RegBankLegalize: lowerUnpackMinMax, case not implemented",
MI);
618 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(), {Lo, Hi});
619 MI.eraseFromParent();
623bool RegBankLegalizeHelper::lowerUnpackAExt(MachineInstr &
MI) {
624 auto [Op1Lo, Op1Hi] = unpackAExt(
MI.getOperand(1).getReg());
625 auto [Op2Lo, Op2Hi] = unpackAExt(
MI.getOperand(2).getReg());
626 auto ResLo = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
627 auto ResHi = B.buildInstr(
MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
628 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(),
629 {ResLo.getReg(0), ResHi.getReg(0)});
630 MI.eraseFromParent();
634bool RegBankLegalizeHelper::lowerSBufToBuf(MachineInstr &
MI,
637 LLT Ty = MRI.getType(Dst);
638 const RegisterBank *RSrcBank = MRI.getRegBank(
MI.getOperand(1).getReg());
642 if (LoadSize == 256 || LoadSize == 512) {
643 NumLoads = LoadSize / 128;
646 for (
int i = 0; i < NumLoads; ++i)
647 LoadParts.
emplace_back(MRI.createVirtualRegister({VgprRB, Ty}));
648 MachineMemOperand *OrigMMO = *
MI.memoperands_begin();
650 MachineFunction &MF = B.getMF();
653 int64_t ImmOffset = 0;
654 unsigned MMOOffset = setBufferOffsets(B,
MI.getOperand(2).getReg(), VOffset,
655 SOffset, ImmOffset, Alignment);
660 MachineMemOperand *BaseMMO = MF.getMachineMemOperand(OrigMMO, 0, MemSize);
662 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
666 Register VIndex = B.buildConstant(VgprRB_S32, 0).getReg(0);
667 unsigned Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
668 switch (
MI.getOpcode()) {
669 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
670 Opc = G_AMDGPU_BUFFER_LOAD_SBYTE;
672 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
673 Opc = G_AMDGPU_BUFFER_LOAD_UBYTE;
675 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT:
676 Opc = G_AMDGPU_BUFFER_LOAD_SSHORT;
678 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
679 Opc = G_AMDGPU_BUFFER_LOAD_USHORT;
684 for (
int i = 0; i < NumLoads; ++i) {
686 .addDef(LoadParts[i])
691 .addImm(ImmOffset + 16 * i)
694 .addMemOperand(MF.getMachineMemOperand(BaseMMO, 16 * i, MemSize));
697 B.buildCopy(Dst, LoadParts[0]);
699 B.buildMergeLikeInstr(Dst, LoadParts);
700 B.setInstr(*MRI.getVRegDef(LoadParts[0]));
701 if (RSrcBank != SgprRB) {
703 WFI.
Start = MRI.getVRegDef(LoadParts.
front());
704 WFI.
End = std::next(MRI.getVRegDef(LoadParts.
back())->getIterator());
706 MI.eraseFromParent();
712 return (GI->is(Intrinsic::amdgcn_sbfe));
714 return MI.getOpcode() == AMDGPU::G_SBFX;
717bool RegBankLegalizeHelper::lowerV_BFE(MachineInstr &
MI) {
724 Register Src =
MI.getOperand(FirstOpnd).getReg();
725 Register LSBit =
MI.getOperand(FirstOpnd + 1).getReg();
726 Register Width =
MI.getOperand(FirstOpnd + 2).getReg();
731 unsigned SHROpc =
Signed ? AMDGPU::G_ASHR : AMDGPU::G_LSHR;
732 auto SHRSrc = B.buildInstr(SHROpc, {{VgprRB, S64}}, {Src, LSBit});
740 auto Amt = B.buildSub(VgprRB_S32, B.buildConstant(SgprRB_S32, 64), Width);
741 auto SignBit = B.buildShl({VgprRB, S64}, SHRSrc, Amt);
742 B.buildInstr(SHROpc, {Dst}, {SignBit, Amt});
743 MI.eraseFromParent();
747 uint64_t WidthImm = ConstWidth->Value.getZExtValue();
748 auto UnmergeSHRSrc = B.buildUnmerge(VgprRB_S32, SHRSrc);
749 Register SHRSrcLo = UnmergeSHRSrc.getReg(0);
750 Register SHRSrcHi = UnmergeSHRSrc.getReg(1);
751 auto Zero = B.buildConstant({VgprRB, S32}, 0);
752 unsigned BFXOpc =
Signed ? AMDGPU::G_SBFX : AMDGPU::G_UBFX;
754 if (WidthImm <= 32) {
756 auto Lo = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcLo,
Zero, Width});
757 MachineInstrBuilder
Hi;
760 Hi = B.buildAShr(VgprRB_S32,
Lo, B.buildConstant(VgprRB_S32, 31));
765 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
767 auto Amt = B.buildConstant(VgprRB_S32, WidthImm - 32);
769 auto Hi = B.buildInstr(BFXOpc, {VgprRB_S32}, {SHRSrcHi,
Zero, Amt});
770 B.buildMergeLikeInstr(Dst, {SHRSrcLo,
Hi});
773 MI.eraseFromParent();
777bool RegBankLegalizeHelper::lowerS_BFE(MachineInstr &
MI) {
779 LLT Ty = MRI.getType(DstReg);
782 Register Src =
MI.getOperand(FirstOpnd).getReg();
783 Register LSBit =
MI.getOperand(FirstOpnd + 1).getReg();
784 Register Width =
MI.getOperand(FirstOpnd + 2).getReg();
791 auto FieldOffset = B.buildAnd(SgprRB_S32, LSBit, Mask);
792 auto Size = B.buildShl(SgprRB_S32, Width, B.buildConstant(SgprRB_S32, 16));
793 auto Src1 = B.buildOr(SgprRB_S32, FieldOffset,
Size);
794 unsigned Opc32 =
Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
795 unsigned Opc64 =
Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
796 unsigned Opc = Ty == S32 ? Opc32 : Opc64;
800 auto S_BFE = B.buildInstr(
Opc, {{SgprRB, Ty}},
801 {B.buildCopy(Ty, Src), B.buildCopy(S32, Src1)});
803 *ST.getRegisterInfo(), RBI);
805 B.buildCopy(DstReg,
S_BFE->getOperand(0).getReg());
806 MI.eraseFromParent();
810bool RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &
MI) {
812 LLT DstTy = MRI.getType(Dst);
813 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64);
814 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
815 auto Op1 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(1).
getReg());
816 auto Op2 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(2).
getReg());
817 unsigned Opc =
MI.getOpcode();
820 B.buildInstr(
Opc, {{VgprRB, Ty}}, {Op1.getReg(0), Op2.getReg(0)},
Flags);
822 B.buildInstr(
Opc, {{VgprRB, Ty}}, {Op1.getReg(1), Op2.getReg(1)},
Flags);
823 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
824 MI.eraseFromParent();
828bool RegBankLegalizeHelper::lowerSplitTo32Mul(MachineInstr &
MI) {
830 assert(MRI.getType(Dst) == S64);
831 auto Op1 = B.buildUnmerge({VgprRB_S32},
MI.getOperand(1).
getReg());
832 auto Op2 = B.buildUnmerge({VgprRB_S32},
MI.getOperand(2).
getReg());
836 auto Lo = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
837 auto Carry = B.buildUMulH(VgprRB_S32, Op1.getReg(0), Op2.getReg(0));
838 auto MulLo0Hi1 = B.buildMul(VgprRB_S32, Op1.getReg(0), Op2.getReg(1));
839 auto MulHi0Lo1 = B.buildMul(VgprRB_S32, Op1.getReg(1), Op2.getReg(0));
840 auto Sum = B.buildAdd(VgprRB_S32, MulLo0Hi1, MulHi0Lo1);
841 auto Hi = B.buildAdd(VgprRB_S32, Sum, Carry);
843 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
844 MI.eraseFromParent();
848bool RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &
MI) {
850 assert(MRI.getType(Dst) == V2S16);
851 unsigned Opc =
MI.getOpcode();
852 unsigned NumOps =
MI.getNumOperands();
855 auto [Op1Lo, Op1Hi] = unpackAExtTruncS16(
MI.getOperand(1).getReg());
858 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo},
Flags);
859 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi},
Flags);
860 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
861 MI.eraseFromParent();
865 auto [Op2Lo, Op2Hi] = unpackAExtTruncS16(
MI.getOperand(2).getReg());
868 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo, Op2Lo},
Flags);
869 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi, Op2Hi},
Flags);
870 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
871 MI.eraseFromParent();
876 auto [Op3Lo, Op3Hi] = unpackAExtTruncS16(
MI.getOperand(3).getReg());
877 auto Lo = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Lo, Op2Lo, Op3Lo},
Flags);
878 auto Hi = B.buildInstr(
Opc, {SgprRB_S16}, {Op1Hi, Op2Hi, Op3Hi},
Flags);
879 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
880 MI.eraseFromParent();
884bool RegBankLegalizeHelper::lowerUniMAD64(MachineInstr &
MI) {
891 const GCNSubtarget &ST = B.getMF().getSubtarget<GCNSubtarget>();
894 Register DstLo = B.buildMul(SgprRB_S32, Src0, Src1).getReg(0);
895 Register DstHi = MRI.createVirtualRegister(SgprRB_S32);
896 if (ST.hasScalarMulHiInsts()) {
897 B.buildInstr(AMDGPU::G_UMULH, {{DstHi}}, {Src0, Src1});
899 auto VSrc0 = B.buildCopy(VgprRB_S32, Src0);
900 auto VSrc1 = B.buildCopy(VgprRB_S32, Src1);
901 auto MulHi = B.buildInstr(AMDGPU::G_UMULH, {VgprRB_S32}, {VSrc0, VSrc1});
912 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
913 B.buildConstant(Dst1, 0);
916 Register Src2Lo = MRI.createVirtualRegister(SgprRB_S32);
917 Register Src2Hi = MRI.createVirtualRegister(SgprRB_S32);
918 B.buildUnmerge({Src2Lo, Src2Hi}, Src2);
920 auto AddLo = B.buildUAddo(SgprRB_S32, SgprRB_S32, DstLo, Src2Lo);
922 B.buildUAdde(SgprRB_S32, SgprRB_S32, DstHi, Src2Hi, AddLo.getReg(1));
923 B.buildMergeLikeInstr(Dst0, {AddLo.getReg(0), AddHi.getReg(0)});
924 B.buildCopy(Dst1, AddHi.getReg(1));
927 MI.eraseFromParent();
931bool RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &
MI) {
933 LLT DstTy = MRI.getType(Dst);
934 assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
936 LLT Ty = DstTy == V4S16 ? V2S16 : S32;
937 auto Op2 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(2).
getReg());
938 auto Op3 = B.buildUnmerge({VgprRB, Ty},
MI.getOperand(3).
getReg());
942 B.buildSelect({VgprRB, Ty},
Cond, Op2.getReg(0), Op3.getReg(0), Flags);
944 B.buildSelect({VgprRB, Ty},
Cond, Op2.getReg(1), Op3.getReg(1), Flags);
946 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
947 MI.eraseFromParent();
951bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &
MI) {
952 auto Op1 = B.buildUnmerge(VgprRB_S32,
MI.getOperand(1).getReg());
953 int Amt =
MI.getOperand(2).getImm();
957 auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
960 Lo = Freeze.getReg(0);
963 Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
966 auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
967 Hi = B.buildAShr(VgprRB_S32,
Lo, SignExtCst).getReg(0);
971 Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
974 B.buildMergeLikeInstr(
MI.getOperand(0).getReg(), {Lo, Hi});
975 MI.eraseFromParent();
979bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &
MI) {
985 unsigned Opc =
MI.getOpcode();
994 case AMDGPU::G_AMDGPU_FFBH_U32:
996 AddOpc = AMDGPU::G_UADDSAT;
997 SearchFromMSB =
true;
999 case AMDGPU::G_AMDGPU_FFBL_B32:
1001 AddOpc = AMDGPU::G_UADDSAT;
1002 SearchFromMSB =
false;
1004 case AMDGPU::G_CTLZ_ZERO_POISON:
1005 FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
1006 AddOpc = AMDGPU::G_ADD;
1007 SearchFromMSB =
true;
1009 case AMDGPU::G_CTTZ_ZERO_POISON:
1010 FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
1011 AddOpc = AMDGPU::G_ADD;
1012 SearchFromMSB =
false;
1018 auto Unmerge = B.buildUnmerge(VgprRB_S32,
MI.getOperand(1).getReg());
1025 auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ?
Hi :
Lo});
1027 B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ?
Lo :
Hi});
1029 auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
1030 {Secondary, B.buildConstant(VgprRB_S32, 32)});
1031 B.buildUMin(
MI.getOperand(0).getReg(), Primary, Adjusted);
1033 MI.eraseFromParent();
1037bool RegBankLegalizeHelper::lowerExtrVecEltToSel(MachineInstr &
MI) {
1049 LLT VecTy = MRI.getType(Src);
1052 MachineRegisterInfo::VRegAttrs VgprRB_EltTy = {VgprRB, ScalarTy};
1054 auto Unmerge = B.buildUnmerge(VgprRB_EltTy, Src);
1057 Register PrevSelect = Unmerge.getReg(0);
1058 for (
unsigned I = 1;
I < NumElts; ++
I) {
1059 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)},
I);
1062 B.buildSelect(VgprRB_EltTy, Cmp, Unmerge.getReg(
I), PrevSelect)
1065 B.buildCopy(Dst, PrevSelect);
1067 auto InitUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(0));
1068 Register PrevLo = InitUnmerge.getReg(0);
1069 Register PrevHi = InitUnmerge.getReg(1);
1070 for (
unsigned I = 1;
I < NumElts; ++
I) {
1071 auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)},
I);
1073 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Unmerge.getReg(
I));
1074 PrevLo = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(0), PrevLo)
1076 PrevHi = B.buildSelect(VgprRB_S32, Cmp, EltUnmerge.getReg(1), PrevHi)
1079 B.buildMergeLikeInstr(Dst, {PrevLo, PrevHi});
1082 MF, MORE,
"amdgpu-regbanklegalize",
1083 "AMDGPU RegBankLegalize: ExtrVecEltToSel unsupported element type",
MI);
1087 MI.eraseFromParent();
1091bool RegBankLegalizeHelper::lowerExtrVecEltTo32(MachineInstr &
MI) {
1104 LLT SrcTy = MRI.getType(Src);
1107 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1108 "expected VGPR src and SGPR idx");
1110 auto CastSrc = B.buildBitcast({VgprRB, Vec32Ty}, Src);
1113 auto One = B.buildConstant(SgprRB_S32, 1);
1114 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
1115 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
1117 auto ExtLo = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxLo);
1118 auto ExtHi = B.buildExtractVectorElement(VgprRB_S32, CastSrc, IdxHi);
1120 B.buildMergeLikeInstr(Dst, {ExtLo.getReg(0), ExtHi.getReg(0)});
1122 MI.eraseFromParent();
1126bool RegBankLegalizeHelper::lowerInsVecEltToSel(MachineInstr &
MI) {
1139 LLT VecTy = MRI.getType(Src);
1142 const RegisterBank *SrcRB = MRI.getRegBank(Src);
1143 bool IsSGPR = (SrcRB == SgprRB);
1144 SmallVector<Register, 16> Selects;
1148 auto Unmerge = B.buildUnmerge(VgprRB_S32, Src);
1149 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1150 Register EltLo = EltUnmerge.getReg(0);
1151 Register EltHi = EltUnmerge.getReg(1);
1152 for (
unsigned I = 0;
I < NumElts; ++
I) {
1153 auto IdxConst = B.buildConstant(VgprRB_S32,
I);
1156 B.buildSelect(VgprRB_S32, Cmp, EltLo, Unmerge.getReg(2 *
I))
1159 B.buildSelect(VgprRB_S32, Cmp, EltHi, Unmerge.getReg(2 *
I + 1))
1163 auto Vec32 = B.buildBuildVector({VgprRB, Vec32Ty}, Selects);
1164 B.buildBitcast(Dst, Vec32);
1167 MachineRegisterInfo::VRegAttrs SrcRB_EltTy = {SrcRB, ScalarTy};
1168 MachineRegisterInfo::VRegAttrs CmpTy = IsSGPR ? SgprRB_S32 : VccRB_S1;
1169 auto Unmerge = B.buildUnmerge(SrcRB_EltTy, Src);
1170 for (
unsigned I = 0;
I < NumElts; ++
I) {
1171 auto IdxConst = B.buildConstant(SgprRB_S32,
I);
1174 B.buildSelect(SrcRB_EltTy, Cmp, Elt, Unmerge.getReg(
I)).getReg(0));
1176 B.buildMergeLikeInstr(Dst, Selects);
1179 MF, MORE,
"amdgpu-regbanklegalize",
1180 "AMDGPU RegBankLegalize: InsVecEltToSel unsupported element type",
MI);
1184 MI.eraseFromParent();
1188bool RegBankLegalizeHelper::lowerInsVecEltTo32(MachineInstr &
MI) {
1203 LLT SrcTy = MRI.getType(Src);
1206 assert(MRI.getRegBank(Src) == VgprRB && MRI.getRegBank(Idx) == SgprRB &&
1207 "expected VGPR src and SGPR idx");
1209 MachineRegisterInfo::VRegAttrs VgprRB_Vec32Ty = {VgprRB, Vec32Ty};
1211 auto CastSrc = B.buildBitcast(VgprRB_Vec32Ty, Src);
1212 auto EltUnmerge = B.buildUnmerge(VgprRB_S32, Elt);
1215 auto One = B.buildConstant(SgprRB_S32, 1);
1216 auto IdxLo = B.buildShl(SgprRB_S32, Idx, One);
1217 auto IdxHi = B.buildAdd(SgprRB_S32, IdxLo, One);
1219 auto InsLo = B.buildInsertVectorElement(VgprRB_Vec32Ty, CastSrc,
1220 EltUnmerge.getReg(0), IdxLo);
1221 auto InsHi = B.buildInsertVectorElement(VgprRB_Vec32Ty, InsLo,
1222 EltUnmerge.getReg(1), IdxHi);
1224 B.buildBitcast(Dst, InsHi);
1226 MI.eraseFromParent();
1230bool RegBankLegalizeHelper::lowerAbsToNegMax(MachineInstr &
MI) {
1240 LLT Ty = MRI.getType(DstReg);
1246 Zero = B.buildBuildVector({VgprRB, Ty}, {Zero16, Zero16}).
getReg(0);
1248 assert((Ty == S32 || Ty == S16) &&
"unexpected type for AbsToNegMax");
1249 Zero = B.buildConstant({VgprRB, Ty}, 0).
getReg(0);
1252 auto Neg = B.buildSub({VgprRB, Ty},
Zero, SrcReg);
1253 B.buildSMax(DstReg, SrcReg, Neg);
1254 MI.eraseFromParent();
1258bool RegBankLegalizeHelper::lowerAbsToS32(MachineInstr &
MI) {
1268 auto Bitcast = B.buildBitcast({SgprRB_S32},
MI.getOperand(1).
getReg());
1269 auto SextInReg = B.buildSExtInReg({SgprRB_S32},
Bitcast, 16);
1271 B.buildAShr({SgprRB_S32},
Bitcast, B.buildConstant({SgprRB_S32}, 16));
1273 auto AbsLo = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {SextInReg});
1274 auto AbsHi = B.buildInstr(AMDGPU::G_ABS, {{SgprRB_S32}}, {ShiftHi});
1275 B.buildBuildVectorTrunc(
MI.getOperand(0).getReg(),
1276 {AbsLo.getReg(0), AbsHi.getReg(0)});
1278 MI.eraseFromParent();
1284bool RegBankLegalizeHelper::lowerSetRounding(MachineInstr &
MI) {
1285 Register NewMode =
MI.getOperand(0).getReg();
1290 uint32_t ClampedVal = std::min(
1291 static_cast<uint32_t
>(ConstMode->Value.getZExtValue()),
1294 NewMode = B.buildConstant(SgprRB_S32, DecodedVal).getReg(0);
1298 KnownBits Known = VT->getKnownBits(NewMode);
1303 if (UseReducedTable) {
1305 auto BitTable = B.buildConstant(
1308 auto Two = B.buildConstant(SgprRB_S32, 2);
1309 auto RoundModeTimesNumBits = B.buildShl(SgprRB_S32, NewMode, Two);
1312 B.buildLShr(SgprRB_S32, BitTable, RoundModeTimesNumBits).getReg(0);
1319 auto NegFour = B.buildConstant(SgprRB_S32, -4);
1320 auto OffsetEnum = B.buildAdd(SgprRB_S32, NewMode, NegFour);
1321 auto IndexVal = B.buildUMin(SgprRB_S32, NewMode, OffsetEnum);
1323 auto Two = B.buildConstant(SgprRB_S32, 2);
1324 auto RoundModeTimesNumBits = B.buildShl(SgprRB_S32, IndexVal, Two);
1329 B.buildLShr({SgprRB, S64}, BitTable, RoundModeTimesNumBits);
1332 NewMode = B.buildTrunc(SgprRB_S32, TableValue).getReg(0);
1338 uint32_t BothRoundHwReg =
1342 .addImm(
static_cast<int16_t
>(BothRoundHwReg))
1345 MI.eraseFromParent();
1351bool RegBankLegalizeHelper::lowerGetRounding(MachineInstr &
MI) {
1354 uint32_t BothRoundHwReg =
1357 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {SgprRB_S32},
1359 .addImm(BothRoundHwReg);
1391 auto Two = B.buildConstant(SgprRB_S32, 2);
1392 auto RoundModeTimesNumBits = B.buildShl(SgprRB_S32, GetReg, Two);
1396 auto TableValue = B.buildLShr({SgprRB, S64}, BitTable, RoundModeTimesNumBits);
1397 auto TruncTable = B.buildTrunc(SgprRB_S32, TableValue);
1399 auto EntryMask = B.buildConstant(SgprRB_S32, 0xf);
1400 auto TableEntry = B.buildAnd(SgprRB_S32, TruncTable, EntryMask);
1404 auto Four = B.buildConstant(SgprRB_S32, 4);
1405 auto EnumOffset = B.buildAdd(SgprRB_S32, TableEntry, Four);
1406 auto IsStandardMode =
1408 B.buildSelect(Dst, IsStandardMode, TableEntry, EnumOffset);
1410 MI.eraseFromParent();
1414bool RegBankLegalizeHelper::lower(MachineInstr &
MI,
1422 return lowerVccExtToSel(
MI);
1424 LLT Ty = MRI.getType(
MI.getOperand(0).getReg());
1425 auto True = B.buildConstant({SgprRB, Ty},
1426 MI.getOpcode() == AMDGPU::G_SEXT ? -1 : 1);
1427 auto False = B.buildConstant({SgprRB, Ty}, 0);
1431 B.buildSelect(
MI.getOperand(0).getReg(),
MI.getOperand(1).getReg(), True,
1433 MI.eraseFromParent();
1437 return lowerUnpackBitShift(
MI);
1439 return lowerUnpackMinMax(
MI);
1441 return lowerSplitTo16(
MI);
1443 const RegisterBank *RB = MRI.getRegBank(
MI.getOperand(0).getReg());
1444 MachineInstrBuilder
Hi;
1445 switch (
MI.getOpcode()) {
1446 case AMDGPU::G_ZEXT: {
1447 Hi = B.buildConstant({RB, S32}, 0);
1450 case AMDGPU::G_SEXT: {
1452 auto ShiftAmt = B.buildConstant({RB, S32}, 31);
1453 Hi = B.buildAShr({RB, S32},
MI.getOperand(1).
getReg(), ShiftAmt);
1456 case AMDGPU::G_ANYEXT: {
1457 Hi = B.buildUndef({RB, S32});
1462 "AMDGPU RegBankLegalize: Ext32To64, unsuported opcode",
1467 B.buildMergeLikeInstr(
MI.getOperand(0).getReg(),
1468 {MI.getOperand(1).getReg(), Hi});
1469 MI.eraseFromParent();
1473 uint64_t ConstVal =
MI.getOperand(1).getCImm()->getZExtValue();
1474 B.buildConstant(
MI.getOperand(0).getReg(), ConstVal);
1476 MI.eraseFromParent();
1481 LLT Ty = MRI.getType(Src);
1485 Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
1487 auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
1488 auto One = B.buildConstant(VgprRB_S32, 1);
1489 auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
1490 auto Zero = B.buildConstant(VgprRB_S32, 0);
1491 auto AndHi = B.buildAnd(VgprRB_S32, Src64.getReg(1), Zero);
1492 B.buildMergeLikeInstr(BoolSrc, {AndLo, AndHi});
1494 assert(Ty == S32 || Ty == S16);
1495 auto One = B.buildConstant({VgprRB, Ty}, 1);
1496 B.buildAnd(BoolSrc, Src, One);
1498 auto Zero = B.buildConstant({VgprRB, Ty}, 0);
1500 MI.eraseFromParent();
1504 return lowerV_BFE(
MI);
1506 return lowerS_BFE(
MI);
1508 return lowerUniMAD64(
MI);
1510 B.buildMul(
MI.getOperand(0),
MI.getOperand(1),
MI.getOperand(2));
1511 MI.eraseFromParent();
1515 auto Op1 = B.buildTrunc(VgprRB_S32,
MI.getOperand(1));
1516 auto Op2 = B.buildTrunc(VgprRB_S32,
MI.getOperand(2));
1517 auto Zero = B.buildConstant({VgprRB, S64}, 0);
1519 unsigned NewOpc =
MI.getOpcode() == AMDGPU::G_AMDGPU_S_MUL_U64_U32
1520 ? AMDGPU::G_AMDGPU_MAD_U64_U32
1521 : AMDGPU::G_AMDGPU_MAD_I64_I32;
1523 B.buildInstr(NewOpc, {
MI.getOperand(0).getReg(), {SgprRB, S32}},
1525 MI.eraseFromParent();
1529 return lowerSplitTo32(
MI);
1531 return lowerSplitTo32Mul(
MI);
1533 return lowerSplitTo32Select(
MI);
1535 return lowerSplitTo32SExtInReg(
MI);
1537 auto Unmerge = B.buildUnmerge({VgprRB, S32},
MI.getOperand(1).
getReg());
1538 auto LoPopCnt = B.buildCTPOP({VgprRB, S32}, Unmerge.getReg(0));
1539 auto HiPopCnt = B.buildCTPOP({VgprRB, S32}, Unmerge.getReg(1));
1541 B.buildAdd(
MI.getOperand(0).getReg(), LoPopCnt, HiPopCnt,
1544 MI.eraseFromParent();
1548 return lowerSBufToBuf(
MI, WFI);
1550 LLT DstTy = MRI.getType(
MI.getOperand(0).getReg());
1561 if (
Size / 128 == 2)
1563 else if (
Size / 128 == 4)
1567 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1573 else if (DstTy == S96)
1574 splitLoad(
MI, {S64, S32}, S32);
1575 else if (DstTy == V3S32)
1576 splitLoad(
MI, {V2S32, S32}, S32);
1577 else if (DstTy == V6S16)
1578 splitLoad(
MI, {V4S16, V2S16}, V2S16);
1581 "AMDGPU RegBankLegalize: SplitLoad, unsuported type",
1588 const auto &TFI = *ST.getFrameLowering();
1592 "Stack grows upwards for AMDGPU");
1595 Register AllocSize =
MI.getOperand(1).getReg();
1600 B.setInsertPt(*
MI.getParent(), std::next(
MI.getIterator()));
1601 MI.eraseFromParent();
1603 if (MRI.getRegBank(AllocSize) != SgprRB) {
1604 auto WaveReduction =
1605 B.buildIntrinsic(Intrinsic::amdgcn_wave_reduce_umax, {SgprRB_S32})
1608 AllocSize = WaveReduction.getReg(0);
1611 LLT PtrTy = MRI.getType(Dst);
1613 "Expected 32-bit pointer for stack allocation");
1614 const SIMachineFunctionInfo *
Info = MF.getInfo<SIMachineFunctionInfo>();
1618 const bool HasFlatScratch = ST.hasFlatScratchEnabled();
1619 const unsigned WavefrontSizeLog2 = ST.getWavefrontSizeLog2();
1622 if (!HasFlatScratch) {
1623 auto WaveSize = B.buildConstant(SgprRB_S32, WavefrontSizeLog2);
1624 AdjustedSize = B.buildShl(SgprRB_S32, AllocSize, WaveSize).getReg(0);
1626 if (Alignment > TFI.getStackAlign()) {
1627 const uint64_t EffectiveAlignment =
1628 Alignment.
value() << (HasFlatScratch ? 0 : WavefrontSizeLog2);
1629 auto OldSP = B.buildCopy({SgprRB, PtrTy},
SPReg);
1631 B.buildPtrAdd({SgprRB, PtrTy}, OldSP,
1632 B.buildConstant(SgprRB_S32, EffectiveAlignment - 1));
1634 B.buildPtrMask(Dst, Tmp1, B.buildConstant(SgprRB_S32, Mask));
1636 B.buildCopy(Dst,
SPReg);
1638 auto PtrAdd = B.buildPtrAdd({SgprRB, PtrTy}, Dst, AdjustedSize);
1639 B.buildCopy(
SPReg, PtrAdd);
1643 LLT DstTy = MRI.getType(
MI.getOperand(0).getReg());
1645 widenLoad(
MI, S128);
1646 else if (DstTy == V3S32)
1647 widenLoad(
MI, V4S32, S32);
1648 else if (DstTy == V6S16)
1649 widenLoad(
MI, V8S16, V2S16);
1652 "AMDGPU RegBankLegalize: WidenLoad, unsuported type",
1659 return lowerUnpackAExt(
MI);
1664 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1670 return MRI.getRegBankOrNull(Op.getReg()) == VgprRB;
1672 B.setInstrAndDebugLoc(
MI);
1673 for (
unsigned i =
MI.getNumDefs(); i <
MI.getNumOperands(); ++i) {
1674 MachineOperand &
Op =
MI.getOperand(i);
1678 if (MRI.getRegBank(
Reg) != VgprRB) {
1679 auto Copy = B.buildCopy({VgprRB, MRI.getType(
Reg)},
Reg);
1680 Op.setReg(
Copy.getReg(0));
1690 "AMDGPU RegBankLegalize: unmerge not multiple of 32",
1695 B.setInstrAndDebugLoc(
MI);
1698 B.buildUnmerge({SgprRB, V2S16}, Unmerge->
getSourceReg());
1699 for (
unsigned i = 0; i < UnmergeV2S16->getNumDefs(); ++i) {
1700 auto [Dst0S32, Dst1S32] =
1701 unpackAExt(UnmergeV2S16->getOperand(i).getReg());
1702 B.buildTrunc(
MI.getOperand(i * 2).getReg(), Dst0S32);
1703 B.buildTrunc(
MI.getOperand(i * 2 + 1).getReg(), Dst1S32);
1706 auto [Dst0S32, Dst1S32] = unpackAExt(
MI.getOperand(2).getReg());
1707 B.buildTrunc(
MI.getOperand(0).getReg(), Dst0S32);
1708 B.buildTrunc(
MI.getOperand(1).getReg(), Dst1S32);
1711 MI.eraseFromParent();
1716 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
1717 B.setInsertPt(*
MI.getParent(),
MI.getParent()->getFirstNonPHI());
1718 MI.getOperand(0).setReg(NewDst);
1719 B.buildTrunc(Dst, NewDst);
1721 for (
unsigned i = 1; i <
MI.getNumOperands(); i += 2) {
1729 auto NewUse = B.buildAnyExt(SgprRB_S32,
UseReg);
1730 MI.getOperand(i).setReg(NewUse.getReg(0));
1738 return MRI.getRegBankOrNull(Op.getReg()) == SgprRB;
1743 assert(MRI.getRegBankOrNull(
MI.getOperand(0).getReg()) == VgprRB);
1747 const RegisterBank *RB = MRI.getRegBankOrNull(Op.getReg());
1748 return RB == VgprRB || RB == SgprRB;
1753 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
1758 unsigned RsrcIdx = RSrcIntrin->
RsrcArg +
MI.getNumExplicitDefs() + 1;
1759 return applyRegisterBanksVgprWithSgprRsrc(
MI, RsrcIdx);
1765 unsigned RsrcIdx =
MI.getNumOperands();
1766 while (RsrcIdx-- >
MI.getNumExplicitDefs()) {
1767 const MachineOperand &
Op =
MI.getOperand(RsrcIdx);
1768 if (
Op.isReg() &&
Op.getReg().isVirtual())
1771 return applyRegisterBanksVgprWithSgprRsrc(
MI, RsrcIdx);
1774 return lowerSplitBitCount64To32(
MI);
1776 return lowerExtrVecEltToSel(
MI);
1778 return lowerExtrVecEltTo32(
MI);
1780 return lowerInsVecEltToSel(
MI);
1782 return lowerInsVecEltTo32(
MI);
1784 return lowerAbsToNegMax(
MI);
1786 return lowerAbsToS32(
MI);
1788 MI.eraseFromParent();
1791 return lowerSetRounding(
MI);
1793 return lowerGetRounding(
MI);
1916 return isAnyPtr(Ty, 32) ? Ty : LLT();
1919 return isAnyPtr(Ty, 64) ? Ty : LLT();
1922 return isAnyPtr(Ty, 128) ? Ty : LLT();
1962 const SIRegisterInfo *
TRI =
1963 static_cast<const SIRegisterInfo *
>(MRI.getTargetRegisterInfo());
1965 if (LLTSize >= 32 &&
TRI->getSGPRClassForBitWidth(LLTSize))
1970 const SIRegisterInfo *
TRI =
1971 static_cast<const SIRegisterInfo *
>(MRI.getTargetRegisterInfo());
2094bool RegBankLegalizeHelper::applyMappingDst(
2095 MachineInstr &
MI,
unsigned &
OpIdx,
2096 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs) {
2101 MachineOperand &
Op =
MI.getOperand(
OpIdx);
2103 LLT Ty = MRI.getType(
Reg);
2104 [[maybe_unused]]
const RegisterBank *RB = MRI.getRegBank(
Reg);
2106 switch (MethodIDs[
OpIdx]) {
2183 Register NewAgprDst = MRI.createVirtualRegister({AgprRB, Ty});
2184 Op.setReg(NewAgprDst);
2185 if (!MRI.use_nodbg_empty(
Reg))
2186 B.buildCopy(
Reg, NewAgprDst);
2191 const RegisterBank *DstRB =
2192 MFI->selectAGPRFormMFMA(NumRegs) ? AgprRB : VgprRB;
2195 Register NewDst = MRI.createVirtualRegister({DstRB, Ty});
2197 if (!MRI.use_nodbg_empty(
Reg))
2198 B.buildCopy(
Reg, NewDst);
2205 Register NewDst = MRI.createVirtualRegister(VccRB_S1);
2207 if (!MRI.use_empty(
Reg)) {
2209 B.buildInstr(AMDGPU::G_AMDGPU_COPY_SCC_VCC, {SgprRB_S32}, {NewDst});
2210 B.buildTrunc(
Reg, CopyS32_Vcc);
2217 Register NewVgprDstS16 = MRI.createVirtualRegister({VgprRB, S16});
2218 Register NewVgprDstS32 = MRI.createVirtualRegister({VgprRB, S32});
2219 Register NewSgprDstS32 = MRI.createVirtualRegister({SgprRB, S32});
2220 Op.setReg(NewVgprDstS16);
2221 B.buildAnyExt(NewVgprDstS32, NewVgprDstS16);
2223 B.buildTrunc(
Reg, NewSgprDstS32);
2242 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
2243 Op.setReg(NewVgprDst);
2256 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, Ty});
2257 Op.setReg(NewVgprDst);
2265 Register NewDst = MRI.createVirtualRegister(SgprRB_S32);
2267 if (!MRI.use_empty(
Reg))
2268 B.buildTrunc(
Reg, NewDst);
2275 Op.setReg(MRI.createVirtualRegister({SgprRB, Ty}));
2276 B.buildCopy(
Reg,
Op.getReg());
2281 MF, MORE,
"amdgpu-regbanklegalize",
2282 "AMDGPU RegBankLegalize: missing fast rule ('Div' or 'Uni') for",
MI);
2287 MF, MORE,
"amdgpu-regbanklegalize",
2288 "AMDGPU RegBankLegalize: applyMappingDst, ID not supported",
MI);
2296bool RegBankLegalizeHelper::applyMappingSrc(
2297 MachineInstr &
MI,
unsigned &
OpIdx,
2298 const SmallVectorImpl<RegBankLLTMappingApplyID> &MethodIDs,
2300 for (
unsigned i = 0; i < MethodIDs.
size(); ++
OpIdx, ++i) {
2301 if (MethodIDs[i] ==
None || MethodIDs[i] ==
IntrId || MethodIDs[i] ==
Imm)
2304 MachineOperand &
Op =
MI.getOperand(
OpIdx);
2306 LLT Ty = MRI.getType(
Reg);
2307 const RegisterBank *RB = MRI.getRegBank(
Reg);
2309 switch (MethodIDs[i]) {
2312 assert(RB == VccRB || RB == SgprRB);
2314 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
2316 B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {VccRB_S1}, {Aext});
2317 Op.setReg(CopyVcc_Scc.getReg(0));
2336 assert(Ty == getTyFromID(MethodIDs[i]));
2337 assert(RB == getRegBankFromID(MethodIDs[i]));
2351 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2352 assert(RB == getRegBankFromID(MethodIDs[i]));
2379 assert(Ty == getTyFromID(MethodIDs[i]));
2381 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
2382 Op.setReg(CopyToVgpr.getReg(0));
2398 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2400 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
2401 Op.setReg(CopyToVgpr.getReg(0));
2407 auto CopyToVgpr = B.buildCopy({VgprRB, Ty},
Reg);
2408 Op.setReg(CopyToVgpr.getReg(0));
2414 auto CopyToAgpr = B.buildCopy({AgprRB, Ty},
Reg);
2415 Op.setReg(CopyToAgpr.getReg(0));
2421 const RegisterBank *SrcRB =
2422 MFI->selectAGPRFormMFMA(NumRegs) ? AgprRB : VgprRB;
2424 Op.setReg(B.buildCopy({SrcRB, Ty},
Reg).getReg(0));
2430 assert(Ty == getTyFromID(MethodIDs[i]));
2435 WFI.
End = std::next(
MI.getIterator());
2442 assert(Ty == getTyFromID(MethodIDs[i]));
2448 while (
Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
2453 while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
2465 assert(Ty == getBTyFromID(MethodIDs[i], Ty));
2469 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2476 assert(Ty == getTyFromID(MethodIDs[i]));
2480 Register NewSGPR = MRI.createVirtualRegister({SgprRB, Ty});
2490 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
2491 Op.setReg(Aext.getReg(0));
2498 auto Aext = B.buildAnyExt(SgprRB_S32,
Reg);
2501 auto Cst1 = B.buildConstant(SgprRB_S32, 1);
2502 auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
2503 Op.setReg(BoolInReg.getReg(0));
2509 auto Sext = B.buildSExt(SgprRB_S32,
Reg);
2510 Op.setReg(Sext.getReg(0));
2516 auto Zext = B.buildZExt({SgprRB, S32},
Reg);
2517 Op.setReg(Zext.getReg(0));
2523 auto Aext = B.buildAnyExt({VgprRB, S32},
Reg);
2524 Op.setReg(Aext.getReg(0));
2531 auto Sext = B.buildSExt({VgprRB, S32},
Reg);
2532 Op.setReg(Sext.getReg(0));
2539 auto Zext = B.buildZExt({VgprRB, S32},
Reg);
2540 Op.setReg(Zext.getReg(0));
2545 MF, MORE,
"amdgpu-regbanklegalize",
2546 "AMDGPU RegBankLegalize: applyMappingSrc, ID not supported",
MI);
2556 unsigned StartOpIdx,
2557 unsigned EndOpIdx) {
2558 for (
unsigned i = StartOpIdx; i <= EndOpIdx; ++i) {
2565bool RegBankLegalizeHelper::applyRegisterBanksVgprWithSgprRsrc(
2566 MachineInstr &
MI,
unsigned RsrcIdx) {
2567 const unsigned NumDefs =
MI.getNumExplicitDefs();
2569 MachineBasicBlock *
MBB =
MI.getParent();
2573 for (
unsigned i = 0; i < NumDefs; ++i) {
2575 if (MRI.getRegBank(
Reg) == VgprRB)
2578 Register NewVgprDst = MRI.createVirtualRegister({VgprRB, MRI.getType(
Reg)});
2579 MI.getOperand(i).setReg(NewVgprDst);
2583 B.setInstrAndDebugLoc(
MI);
2586 for (
unsigned i = NumDefs; i < RsrcIdx; ++i) {
2587 MachineOperand &
Op =
MI.getOperand(i);
2595 if (MRI.getRegBank(
Reg) == VgprRB)
2598 auto Copy = B.buildCopy({VgprRB, MRI.getType(
Reg)},
Reg);
2599 Op.setReg(
Copy.getReg(0));
2602 SmallSet<Register, 4> OpsToWaterfall;
2605 for (
unsigned i = RsrcIdx; i <
MI.getNumOperands(); ++i) {
2606 MachineOperand &
Op =
MI.getOperand(i);
2611 if (MRI.getRegBank(
Reg) != SgprRB)
2615 if (!OpsToWaterfall.
empty()) {
2617 executeInWaterfallLoop(B, {OpsToWaterfall, MII, std::next(MII)});
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
Provides AMDGPU specific target descriptions.
static bool isSignedBFE(MachineInstr &MI)
static bool verifyRegBankOnOperands(MachineInstr &MI, const RegisterBank *RB, MachineRegisterInfo &MRI, unsigned StartOpIdx, unsigned EndOpIdx)
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
static Register UseReg(const MachineOperand &MO)
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
MachineInstr unsigned OpIdx
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned MovTermOpc
const unsigned AndSaveExecOpc
bool findRuleAndApplyMapping(MachineInstr &MI)
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI, GISelValueTracking *VT, const RegisterBankInfo &RBI, const RegBankLegalizeRules &RBLRules)
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
@ ICMP_ULT
unsigned less than
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
const SIRegisterInfo * getRegisterInfo() const override
Represents a call to an intrinsic.
Register getSourceReg() const
Get the unmerge source register.
constexpr bool isScalar() const
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr bool isValid() const
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
LLT divide(int Factor) const
Return a type that is Factor times smaller.
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
TypeSize getValue() const
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator SkipPHIsAndLabels(iterator I)
Return the first instruction in MBB after I that is not a PHI or a label.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
BasicBlockListType::iterator iterator
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Helper class to build MachineInstr.
bool isValid() const
Check for null.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const RegisterBank * getRegBank(Register Reg) const
Return the register bank of Reg.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const RegisterBank * getRegBankOrNull(Register Reg) const
Return the register bank of Reg, or null if Reg has not been assigned a register bank or has been ass...
Holds all the information related to register banks.
This class implements the register bank concept.
Wrapper class representing virtual and physical registers.
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
const uint64_t FltRoundToHWConversionTable
@ SgprV4S32_ReadFirstLane
@ SgprV8S32_ReadFirstLane
bool isAnyPtr(LLT Ty, unsigned Width)
@ TowardZeroF32_TowardNegativeF64
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
@ VerifyAllSgprOrVgprGPHI
@ AextToS32InIncomingBlockGPHI
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc, const RegisterBankInfo &RBI)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
@ Kill
The last use of a register.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI void constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT whose value fits in int64_t, returns that value.
LLVM_ABI void reportGISelFailure(MachineFunction &MF, MachineOptimizationRemarkEmitter &MORE, MachineOptimizationRemarkMissed &R)
Report an ISel error as a missed optimization remark to the LLVMContext's diagnostic stream.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
constexpr T maskTrailingZeros(unsigned N)
Create a bitmask with the N right-most bits set to 0, and all other bits set to 1.
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Align assumeAligned(uint64_t Value)
Treats the value 0 as a 1, so Align is always at least 1.
LLVM_ABI Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
static constexpr uint64_t encode(Fields... Values)
LoweringMethodID LoweringMethod
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
Holds waterfall loop information: the set of SGPR operand registers that need waterfalling,...
MachineBasicBlock::iterator Start
SmallSet< Register, 4 > SgprWaterfallOperandRegs
MachineBasicBlock::iterator End
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.