90 #define DEBUG_TYPE "si-load-store-opt"
135 unsigned NumAddresses;
138 for (unsigned i = 0; i < NumAddresses; i++) {
141 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
142 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
160 for (unsigned i = 0; i < NumAddresses; ++i) {
169 if (!AddrOp->isReg())
189 struct BaseRegisters {
193 unsigned LoSubReg = 0;
194 unsigned HiSubReg = 0;
212 static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII);
213 static bool offsetsCanBeCombined(CombineInfo &CI);
214 static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
215 static unsigned getNewOpcode(const CombineInfo &CI);
216 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
219 bool findMatchingInst(CombineInfo &CI);
221 unsigned read2Opcode(unsigned EltSize) const;
222 unsigned read2ST64Opcode(unsigned EltSize) const;
225 unsigned write2Opcode(unsigned EltSize) const;
226 unsigned write2ST64Opcode(unsigned EltSize) const;
234 int32_t NewOffset) const;
235 unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const;
245 void addInstToMergeableList(const CombineInfo &CI,
246 std::list<std::list<CombineInfo> > &MergeableInsts) const;
248 std::list<std::list<CombineInfo> > &MergeableInsts) const;
257 void removeCombinedInst(std::list<CombineInfo> &MergeList,
259 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
260 bool &OptimizeListAgain);
261 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
265 StringRef getPassName() const override { return "SI Load Store Optimizer"; }
289 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
291 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
293 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
308 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
309 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
310 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
311 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
313 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
314 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
315 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
316 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
325 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || TII.isGather4(Opc))
330 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
331 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
332 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
333 return S_BUFFER_LOAD_IMM;
334 case AMDGPU::DS_READ_B32:
335 case AMDGPU::DS_READ_B32_gfx9:
336 case AMDGPU::DS_READ_B64:
337 case AMDGPU::DS_READ_B64_gfx9:
339 case AMDGPU::DS_WRITE_B32:
340 case AMDGPU::DS_WRITE_B32_gfx9:
341 case AMDGPU::DS_WRITE_B64:
342 case AMDGPU::DS_WRITE_B64_gfx9:
349 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
357 return Info->BaseOpcode;
360 case AMDGPU::DS_READ_B32:
361 case AMDGPU::DS_READ_B32_gfx9:
362 case AMDGPU::DS_READ_B64:
363 case AMDGPU::DS_READ_B64_gfx9:
364 case AMDGPU::DS_WRITE_B32:
365 case AMDGPU::DS_WRITE_B32_gfx9:
366 case AMDGPU::DS_WRITE_B64:
367 case AMDGPU::DS_WRITE_B64_gfx9:
369 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
370 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
371 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
372 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
376 static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) {
396 unsigned result = VADDR | SRSRC;
406 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
407 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
408 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
410 case AMDGPU::DS_READ_B32:
411 case AMDGPU::DS_READ_B64:
412 case AMDGPU::DS_READ_B32_gfx9:
413 case AMDGPU::DS_READ_B64_gfx9:
414 case AMDGPU::DS_WRITE_B32:
415 case AMDGPU::DS_WRITE_B64:
416 case AMDGPU::DS_WRITE_B32_gfx9:
417 case AMDGPU::DS_WRITE_B64_gfx9:
427 unsigned Opc = MI->getOpcode();
428 InstClass = getInstClass(Opc, TII);
436 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
441 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
444 case S_BUFFER_LOAD_IMM:
452 if (InstClass == MIMG) {
456 Offset0 = I->getOperand(OffsetIdx).getImm();
459 Width0 = getOpcodeWidth(*I, TII);
461 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
463 } else if (InstClass != MIMG) {
465 if (InstClass != S_BUFFER_LOAD_IMM) {
471 unsigned AddrOpName[5] = {0};
473 const unsigned Regs = getRegs(I->getOpcode(), TII);
476 AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
480 AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
484 AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
487 if (Regs & SOFFSET) {
488 AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
492 AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
496 AddrOpName[NumAddresses++] = AMDGPU::OpName::ssamp;
499 for (unsigned i = 0; i < NumAddresses; i++) {
501 AddrReg[i] = &I->getOperand(AddrIdx[i]);
510 assert(InstClass == getInstClass(Paired->getOpcode(), TII));
512 if (InstClass == MIMG) {
517 Offset1 = Paired->getOperand(OffsetIdx).getImm();
520 Width1 = getOpcodeWidth(*Paired, TII);
521 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
523 } else if (InstClass != MIMG) {
525 if (InstClass != S_BUFFER_LOAD_IMM) {
536 "SI Load Store Optimizer",
false,
false)
541 char SILoadStoreOptimizer::
ID = 0;
546 return new SILoadStoreOptimizer();
554 MI->removeFromParent();
578 return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
596 ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
599 PhysRegUses.count(Use.getReg())))) {
615 if (!InstToMove->mayLoadOrStore())
638 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII) {
645 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
649 unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc,
650 AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
653 for (auto op : OperandsToMatch) {
658 CI.I->getOperand(Idx).getImm() != CI.Paired->getOperand(Idx).getImm())
663 unsigned MaxMask = std::max(CI.DMask0, CI.DMask1);
664 unsigned MinMask = std::min(CI.DMask0, CI.DMask1);
667 if ((1u << AllowedBitsForMin) <= MinMask)
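// Editor's sketch, not part of SILoadStoreOptimizer.cpp: a worked example of the
// dmask test above. Two image loads are only merged when the smaller dmask sits
// entirely below the lowest set bit of the larger one, so the merged result uses
// contiguous channels. Function and helper names here are hypothetical.
#include <cassert>
#include <cstdint>

static unsigned ctz32(uint32_t V) {
  unsigned N = 0;
  while (N < 32 && !(V & (1u << N)))
    ++N;
  return N;
}

bool dmasksCombinable(unsigned DMask0, unsigned DMask1) {
  unsigned MaxMask = DMask0 > DMask1 ? DMask0 : DMask1;
  unsigned MinMask = DMask0 > DMask1 ? DMask1 : DMask0;
  assert(MaxMask != 0 && "an image load always writes at least one channel");
  unsigned AllowedBitsForMin = ctz32(MaxMask);
  return (1u << AllowedBitsForMin) > MinMask; // mirrors the check at line 667
}

int main() {
  assert(dmasksCombinable(0x1, 0xc));  // 0b0001 sits below 0b1100: mergeable
  assert(!dmasksCombinable(0x3, 0x6)); // 0b0011 overlaps 0b0110: rejected
}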
673 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
678 if (CI.Offset0 == CI.Offset1)
682 if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
685 unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
686 unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
691 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
692 return (EltOffset0 + CI.Width0 == EltOffset1 ||
693 EltOffset1 + CI.Width1 == EltOffset0) &&
694 CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 &&
695 (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
700 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
702 CI.Offset0 = EltOffset0 / 64;
703 CI.Offset1 = EltOffset1 / 64;
710 CI.Offset0 = EltOffset0;
711 CI.Offset1 = EltOffset1;
716 unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
717 CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
719 if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
720 CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
721 CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
727 CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
728 CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
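// Editor's sketch, not part of SILoadStoreOptimizer.cpp: a self-contained model of
// the DS offset check above. Names (Read2Encoding, encodeDSPair, fitsUInt8) are
// hypothetical; only the arithmetic mirrors offsetsCanBeCombined.
#include <cstdint>
#include <optional>

struct Read2Encoding {
  bool UseST64;              // use the DS_READ2ST64/DS_WRITE2ST64 forms
  unsigned Offset0, Offset1; // values for the instruction's 8-bit offset fields
};

inline bool fitsUInt8(uint64_t V) { return V <= 0xff; }

// Two DS accesses of EltSize bytes at byte offsets Off0/Off1 can share one
// read2/write2 if both element offsets fit the 8-bit fields, either in units of
// 64 elements (the ST64 forms, tried first as in the pass) or directly.
std::optional<Read2Encoding> encodeDSPair(unsigned Off0, unsigned Off1,
                                          unsigned EltSize) {
  if (Off0 == Off1 || Off0 % EltSize || Off1 % EltSize)
    return std::nullopt;
  unsigned Elt0 = Off0 / EltSize, Elt1 = Off1 / EltSize;

  if (Elt0 % 64 == 0 && Elt1 % 64 == 0 &&
      fitsUInt8(Elt0 / 64) && fitsUInt8(Elt1 / 64))
    return Read2Encoding{true, Elt0 / 64, Elt1 / 64};

  if (fitsUInt8(Elt0) && fitsUInt8(Elt1))
    return Read2Encoding{false, Elt0, Elt1};

  // The real pass additionally retries after subtracting a common BaseOff.
  return std::nullopt;
}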
735 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
736 const CombineInfo &CI) {
737 const unsigned Width = (CI.Width0 + CI.Width1);
738 switch (CI.InstClass) {
741 case S_BUFFER_LOAD_IMM:
752 bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
757 const unsigned Opc = CI.I->getOpcode();
763 const unsigned InstSubclass = getInstSubclass(Opc, *TII);
768 if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
777 for (; MBBI != E; ++MBBI) {
779 if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
780 (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
787 if (MBBI->hasUnmodeledSideEffects()) {
793 if (MBBI->mayLoadOrStore() &&
799 CI.InstsToMove.push_back(&*MBBI);
813 if (MBBI->hasOrderedMemoryRef())
826 bool Match = CI.hasSameBaseAddress(*MBBI);
829 CI.setPaired(MBBI, *TII);
835 ? dmasksCanBeCombined(CI, *TII)
836 : widthsFit(*STM, CI) && offsetsCanBeCombined(CI);
857 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
859 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
860 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
863 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
865 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
867 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
868 : AMDGPU::DS_READ2ST64_B64_gfx9;
872 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
877 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
880 const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
882 unsigned NewOffset0 = CI.Offset0;
883 unsigned NewOffset1 = CI.Offset1;
885 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
887 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
888 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
890 if (NewOffset0 > NewOffset1) {
897 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
902 (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
903 Register DestReg = MRI->createVirtualRegister(SuperRC);
907 Register BaseReg = AddrReg->getReg();
908 unsigned BaseSubReg = AddrReg->getSubReg();
909 unsigned BaseRegFlags = 0;
911 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
912 BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
915 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
920 .addReg(AddrReg->getReg(), 0, BaseSubReg)
926 BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
927 .addReg(BaseReg, BaseRegFlags, BaseSubReg)
935 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
938 BuildMI(*MBB, CI.Paired, DL, CopyDesc)
940 .addReg(DestReg, 0, SubRegIdx0);
947 CI.I->eraseFromParent();
948 CI.Paired->eraseFromParent();
954 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
956 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
957 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
958 : AMDGPU::DS_WRITE2_B64_gfx9;
961 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
963 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
964 : AMDGPU::DS_WRITE2ST64_B64;
966 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
967 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
971 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
983 unsigned NewOffset0 = CI.Offset0;
984 unsigned NewOffset1 = CI.Offset1;
986 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
988 if (NewOffset0 > NewOffset1) {
995 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1001 unsigned BaseSubReg = AddrReg->getSubReg();
1002 unsigned BaseRegFlags = 0;
1004 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1005 BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1006 .addImm(CI.BaseOff);
1008 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1019 BuildMI(*MBB, CI.Paired, DL, Write2Desc)
1020 .addReg(BaseReg, BaseRegFlags, BaseSubReg)
1030 CI.I->eraseFromParent();
1031 CI.Paired->eraseFromParent();
1033 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1038 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI) {
1041 const unsigned Opcode = getNewOpcode(CI);
1045 Register DestReg = MRI->createVirtualRegister(SuperRC);
1046 unsigned MergedDMask = CI.DMask0 | CI.DMask1;
1050 auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
1051 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1055 MIB.add((*CI.I).getOperand(I));
1061 assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1068 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1069 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1070 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1073 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1074 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1075 const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1077 BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1079 .addReg(DestReg, 0, SubRegIdx0);
1086 CI.I->eraseFromParent();
1087 CI.Paired->eraseFromParent();
1092 SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
1095 const unsigned Opcode = getNewOpcode(CI);
1099 Register DestReg = MRI->createVirtualRegister(SuperRC);
1100 unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
1105 assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1111 BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
1113 .addImm(MergedOffset)
1118 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1119 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1120 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1123 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1124 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1125 const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
1127 BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1129 .addReg(DestReg, 0, SubRegIdx0);
1136 CI.I->eraseFromParent();
1137 CI.Paired->eraseFromParent();
1142 SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
1146 const unsigned Opcode = getNewOpcode(CI);
1151 Register DestReg = MRI->createVirtualRegister(SuperRC);
1152 unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
1154 auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
1156 const unsigned Regs = getRegs(Opcode, *TII);
1164 assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1172 .addImm(MergedOffset)
1180 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1181 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1182 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1185 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1186 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1187 const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1189 BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1191 .addReg(DestReg, 0, SubRegIdx0);
1198 CI.I->eraseFromParent();
1199 CI.Paired->eraseFromParent();
1203 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
1204 const unsigned Width = CI.Width0 + CI.Width1;
1206 switch (CI.InstClass) {
1214 case S_BUFFER_LOAD_IMM:
1219 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1221 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1229 std::pair<unsigned, unsigned>
1230 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
1232 if (CI.Width0 == 0 || CI.Width1 == 0 || CI.Width0 + CI.Width1 > 4)
1233 return std::make_pair(0, 0);
1236 if (CI.InstClass == MIMG) {
1239 ReverseOrder = CI.DMask0 > CI.DMask1;
1241 ReverseOrder = CI.Offset0 > CI.Offset1;
1243 static const unsigned Idxs[4][4] = {
1244 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1245 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
1246 {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
1247 {AMDGPU::sub3, 0, 0, 0},
1252 assert(CI.Width0 >= 1 && CI.Width0 <= 3);
1253 assert(CI.Width1 >= 1 && CI.Width1 <= 3);
1256 Idx1 = Idxs[0][CI.Width1 - 1];
1257 Idx0 = Idxs[CI.Width1][CI.Width0 - 1];
1259 Idx0 = Idxs[0][CI.Width0 - 1];
1260 Idx1 = Idxs[CI.Width0][CI.Width1 - 1];
1263 return std::make_pair(Idx0, Idx1);
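// Editor's sketch, not part of SILoadStoreOptimizer.cpp: a standalone model of the
// index table lookup above, returning printable names instead of AMDGPU::sub* enum
// values. subRegNames and IdxName are hypothetical and exist only for illustration.
#include <cassert>
#include <string>
#include <utility>

static const char *IdxName[4][4] = {
    {"sub0", "sub0_sub1", "sub0_sub1_sub2", "sub0_sub1_sub2_sub3"},
    {"sub1", "sub1_sub2", "sub1_sub2_sub3", nullptr},
    {"sub2", "sub2_sub3", nullptr, nullptr},
    {"sub3", nullptr, nullptr, nullptr},
};

// Returns the sub-register ranges the first and second access occupy in the merged
// register, given their dword widths and whether their order is reversed.
std::pair<std::string, std::string>
subRegNames(unsigned Width0, unsigned Width1, bool ReverseOrder) {
  assert(Width0 >= 1 && Width0 <= 3 && Width1 >= 1 && Width1 <= 3 &&
         Width0 + Width1 <= 4);
  if (ReverseOrder) // the second access takes the low dwords
    return {IdxName[Width1][Width0 - 1], IdxName[0][Width1 - 1]};
  return {IdxName[0][Width0 - 1], IdxName[Width0][Width1 - 1]};
}

int main() {
  // A 2-dword access at the higher offset merged with a 1-dword access below it:
  // the low access lands in sub0, the 2-dword access in sub1_sub2.
  auto P = subRegNames(/*Width0=*/2, /*Width1=*/1, /*ReverseOrder=*/true);
  assert(P.first == "sub1_sub2" && P.second == "sub0");
}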
1267 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
1268 if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1269 switch (CI.Width0 + CI.Width1) {
1273 return &AMDGPU::SReg_64_XEXECRegClass;
1275 return &AMDGPU::SGPR_128RegClass;
1277 return &AMDGPU::SReg_256RegClass;
1279 return &AMDGPU::SReg_512RegClass;
1282 switch (CI.Width0 + CI.Width1) {
1286 return &AMDGPU::VReg_64RegClass;
1288 return &AMDGPU::VReg_96RegClass;
1290 return &AMDGPU::VReg_128RegClass;
1296 SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
1300 const unsigned Opcode = getNewOpcode(CI);
1302 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1303 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1304 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1308 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1310 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1311 const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1313 BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1317 .addImm(SubRegIdx1);
1319 auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
1322 const unsigned Regs = getRegs(Opcode, *TII);
1331 assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1339 .addImm(std::min(CI.Offset0, CI.Offset1))
1349 CI.I->eraseFromParent();
1350 CI.Paired->eraseFromParent();
1355 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1356 APInt V(32, Val, true);
1360 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1363 TII->get(AMDGPU::S_MOV_B32), Reg)
1371 unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1372 const MemAddress &Addr) const {
1377 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1378 Addr.Base.LoSubReg) &&
1379 "Expected 32-bit Base-Register-Low!!");
1381 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1382 Addr.Base.HiSubReg) &&
1383 "Expected 32-bit Base-Register-Hi!!");
1386 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1388 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1390 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1391 Register CarryReg = MRI->createVirtualRegister(CarryRC);
1392 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1394 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1395 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1397 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
1399 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1406 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1408 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1415 Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
1417 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1429 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1431 int32_t NewOffset) const {
1445 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1463 MemAddress &Addr) const {
1468 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1480 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
1481 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1484 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1485 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1487 auto Offset0P = extractConstOffset(*Src0);
1491 if (!(Offset0P = extractConstOffset(*Src1)))
1505 uint64_t Offset1 = Src1->getImm();
1508 Addr.Base.LoReg = BaseLo.getReg();
1509 Addr.Base.HiReg = BaseHi.getReg();
1510 Addr.Base.LoSubReg = BaseLo.getSubReg();
1511 Addr.Base.HiSubReg = BaseHi.getSubReg();
1512 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
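// Editor's sketch, not part of SILoadStoreOptimizer.cpp: a scalar model of how the
// pass handles the 64-bit constant offset. processBaseWithConstOffset reassembles
// it from the lo/hi immediates (line 1512 above), and computeBase re-emits the add
// as V_ADD_I32_e64 (carry-out), V_ADDC_U32_e64 (carry-in) and a REG_SEQUENCE of the
// two halves. addBaseOffset64 is a hypothetical name used only for illustration.
#include <cassert>
#include <cstdint>

uint64_t addBaseOffset64(uint32_t BaseLo, uint32_t BaseHi, uint64_t Offset) {
  uint32_t OffLo = uint32_t(Offset);
  uint32_t OffHi = uint32_t(Offset >> 32);

  uint64_t LoSum = uint64_t(BaseLo) + OffLo;
  uint32_t DestSub0 = uint32_t(LoSum);    // V_ADD_I32_e64 result
  uint32_t Carry = uint32_t(LoSum >> 32); // carry-out (CarryReg)

  uint32_t DestSub1 = BaseHi + OffHi + Carry; // V_ADDC_U32_e64 result

  return (uint64_t(DestSub1) << 32) | DestSub0; // REG_SEQUENCE of both halves
}

int main() {
  uint64_t Base = 0x00000001fffffff0ull, Off = 0x20;
  assert(addBaseOffset64(uint32_t(Base), uint32_t(Base >> 32), Off) == Base + Off);
}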
1515 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1517 MemInfoMap &Visited,
1530 if (AnchorList.count(&MI))
1543 if (Visited.find(&MI) == Visited.end()) {
1544 processBaseWithConstOffset(Base, MAddr);
1545 Visited[&MI] = MAddr;
1547 MAddr = Visited[&MI];
1549 if (MAddr.Offset == 0) {
1550 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
1551 " constant offsets that can be promoted.\n";);
1556 << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1584 MemAddress AnchorAddr;
1585 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1595 for ( ; MBBI != E; ++MBBI) {
1605 MemAddress MAddrNext;
1606 if (Visited.find(&MINext) == Visited.end()) {
1607 processBaseWithConstOffset(BaseNext, MAddrNext);
1608 Visited[&MINext] = MAddrNext;
1610 MAddrNext = Visited[&MINext];
1612 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1613 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1614 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1615 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1618 InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1620 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1628 AnchorAddr = MAddrNext;
1629 AnchorInst = &MINext;
1634 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
1635 AnchorInst->dump());
1637 << AnchorAddr.Offset << "\n\n");
1640 unsigned Base = computeBase(MI, AnchorAddr);
1642 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1645 for (auto P : InstsWCommonBase) {
1648 AM.BaseOffs = P.second - AnchorAddr.Offset;
1652 dbgs() << ")"; P.first->dump());
1653 updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1657 AnchorList.insert(AnchorInst);
1664 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
1665 std::list<std::list<CombineInfo> > &MergeableInsts) const {
1666 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
1667 if (AddrList.front().hasSameBaseAddress(*CI.I) &&
1668 AddrList.front().InstClass == CI.InstClass) {
1669 AddrList.emplace_back(CI);
1675 MergeableInsts.emplace_back(1, CI);
1679 std::list<std::list<CombineInfo> > &MergeableInsts) const {
1691 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
1703 CI.setMI(MI, *TII, *STM);
1705 if (!CI.hasMergeableAddress(*MRI))
1708 addInstToMergeableList(CI, MergeableInsts);
1716 bool SILoadStoreOptimizer::optimizeBlock(
1717 std::list<std::list<CombineInfo> > &MergeableInsts) {
1720 for (std::list<CombineInfo> &MergeList : MergeableInsts) {
1721 if (MergeList.size() < 2)
1724 bool OptimizeListAgain = false;
1725 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
1735 if (!OptimizeListAgain)
1738 OptimizeAgain |= OptimizeListAgain;
1745 SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList,
1748 for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) {
1749 if (&*CI->I == &MI) {
1750 MergeList.erase(CI);
1757 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
1758 std::list<CombineInfo> &MergeList,
1759 bool &OptimizeListAgain) {
1761 for (auto I = MergeList.begin(); I != MergeList.end(); ++I) {
1762 CombineInfo &CI = *I;
1764 switch (CI.InstClass) {
1768 if (findMatchingInst(CI)) {
1770 removeCombinedInst(MergeList, *CI.Paired);
1772 CI.setMI(NewMI, *TII, *STM);
1776 if (findMatchingInst(CI)) {
1778 removeCombinedInst(MergeList, *CI.Paired);
1780 CI.setMI(NewMI, *TII, *STM);
1783 case S_BUFFER_LOAD_IMM:
1784 if (findMatchingInst(CI)) {
1786 removeCombinedInst(MergeList, *CI.Paired);
1788 CI.setMI(NewMI, *TII, *STM);
1789 OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16;
1793 if (findMatchingInst(CI)) {
1795 removeCombinedInst(MergeList, *CI.Paired);
1797 CI.setMI(NewMI, *TII, *STM);
1798 OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
1802 if (findMatchingInst(CI)) {
1804 removeCombinedInst(MergeList, *CI.Paired);
1806 CI.setMI(NewMI, *TII, *STM);
1807 OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
1811 if (findMatchingInst(CI)) {
1813 removeCombinedInst(MergeList, *CI.Paired);
1815 CI.setMI(NewMI, *TII, *STM);
1816 OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
1823 CI.InstsToMove.clear();
1829 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
1841 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1843 assert(MRI->isSSA() && "Must be run on SSA");
1851 std::list<std::list<CombineInfo> > MergeableInsts;
1853 Modified |= collectMergeableInsts(MBB, MergeableInsts);
1855 OptimizeAgain = false;
1856 Modified |= optimizeBlock(MergeableInsts);
1857 } while (OptimizeAgain);