#define DEBUG_TYPE "si-load-store-opt"

  S_BUFFER_LOAD_SGPR_IMM,

  unsigned char NumVAddrs = 0;

const unsigned MaxAddressRegs = 12 + 1 + 1;

  InstClassEnum InstClass;
  int AddrIdx[MaxAddressRegs];
  unsigned NumAddresses;
  bool hasSameBaseAddress(const CombineInfo &CI) {
    if (NumAddresses != CI.NumAddresses)

    for (unsigned i = 0; i < NumAddresses; i++) {
      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||

    for (unsigned i = 0; i < NumAddresses; ++i) {
      if (!AddrOp->isReg())

      if (MRI.hasOneNonDBGUse(AddrOp->getReg()))

    return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
  struct BaseRegisters {
    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
                                                     const CombineInfo &Paired);

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                           int32_t NewOffset) const;
                             std::list<std::list<CombineInfo>> &MergeableInsts) const;
                        std::list<std::list<CombineInfo>> &MergeableInsts) const;
                                              const CombineInfo &Paired);
  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

        .set(MachineFunctionProperties::Property::IsSSA);
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {

  if (TII.isMIMG(MI)) {
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();

  if (TII.isMTBUF(Opc)) {

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:

  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:

  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:

  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:

  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:

  case AMDGPU::DS_READ_B32: [[fallthrough]];
  case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
  case AMDGPU::DS_WRITE_B32: [[fallthrough]];
  case AMDGPU::DS_WRITE_B32_gfx9:

  case AMDGPU::DS_READ_B64: [[fallthrough]];
  case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
  case AMDGPU::DS_WRITE_B64: [[fallthrough]];
  case AMDGPU::DS_WRITE_B64_gfx9:
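// Illustrative note (the individual return values are elided from this listing):
// getOpcodeWidth() counts the access width in 32-bit elements, e.g.
// GLOBAL_LOAD_DWORD -> 1, S_LOAD_DWORDX2_IMM -> 2, GLOBAL_STORE_DWORDX3 -> 3,
// S_BUFFER_LOAD_DWORDX8_IMM -> 8, and DS_READ_B64/DS_WRITE_B64 -> 2 (8 bytes of LDS
// data). MUBUF/MTBUF widths come from getMUBUFElements()/getMTBUFElements(), and the
// MIMG width is derived from the dmask operand read above.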
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  if (TII.isMUBUF(Opc)) {
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:

    case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:

  if (TII.isMIMG(Opc)) {
    if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||

  if (TII.isMTBUF(Opc)) {
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:

    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
      return TBUFFER_STORE;

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:

  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:

  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:

  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  if (TII.isMUBUF(Opc))

  if (TII.isMIMG(Opc)) {
    return Info->BaseOpcode;

  if (TII.isMTBUF(Opc))

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
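// Illustrative note: the elided half of the condition above presumably checks
// SIInstrInfo::isFLATGlobal() on both instructions, i.e. a FLAT access that is known
// to touch only global memory is reclassified as GLOBAL here so that, for example,
// FLAT_LOAD_DWORD can later be paired and merged with GLOBAL_LOAD_DWORD.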
  if (TII.isMUBUF(Opc)) {
      Result.SOffset = true;

  if (TII.isMIMG(Opc)) {
    if (VAddr0Idx >= 0) {
      Result.NumVAddrs = SRsrcIdx - VAddr0Idx;

  if (TII.isMTBUF(Opc)) {
      Result.SOffset = true;

  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    Result.SOffset = true;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:

  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
                                            const SILoadStoreOptimizer &LSO) {
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8

  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();

  Offset = I->getOperand(OffsetIdx).getImm();

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();

  AddressRegs Regs = getRegs(Opc, *LSO.TII);

  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
  AddrIdx[NumAddresses++] =
  AddrIdx[NumAddresses++] =
  AddrIdx[NumAddresses++] =
  AddrIdx[NumAddresses++] =
  AddrIdx[NumAddresses++] =
  AddrIdx[NumAddresses++] =
  AddrIdx[NumAddresses++] =
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);

                      "SI Load Store Optimizer", false, false)

char SILoadStoreOptimizer::ID = 0;
  return new SILoadStoreOptimizer();

  for (const auto &Op : MI.operands()) {

bool SILoadStoreOptimizer::canSwapInstructions(
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))

  for (const auto &BOp : B.operands()) {
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))

    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))

  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())

  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
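// Illustrative sketch of the dmask-overlap test above (not part of the pass; the
// elided AllowedBitsForMin is presumably the trailing-zero count of MaxMask): the
// smaller dmask must fit entirely below the lowest set bit of the larger one, so the
// two image accesses cover disjoint channel ranges. For example, dmask 0b0011 plus
// dmask 0b0100 is accepted and merges to 0b0111, while 0b0110 plus 0b0100 overlaps
// the same channel window and is rejected.
static bool dmasksDisjointSketch(unsigned MaskA, unsigned MaskB) {
  unsigned MaxMask = MaskA > MaskB ? MaskA : MaskB;
  unsigned MinMask = MaskA > MaskB ? MaskB : MaskA;
  unsigned LowestBitOfMax = MaxMask & (0u - MaxMask); // == 1u << countr_zero(MaxMask)
  return MinMask < LowestBitOfMax; // same test as above, written as the accept case
}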
static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)

  return NewFormatInfo->Format;
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
  assert(CI.InstClass != MIMG);

  if (CI.Offset == Paired.Offset)

  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;

  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    if (EltOffset0 + CI.Width != EltOffset1 &&
        EltOffset1 + Paired.Width != EltOffset0)
    if (CI.CPol != Paired.CPol)

  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset = EltOffset0 / 64;
    Paired.Offset = EltOffset1 / 64;

  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset = EltOffset0;
    Paired.Offset = EltOffset1;

  uint32_t Min = std::min(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;

  if (isUInt<8>(Max - Min)) {
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
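// Illustrative sketch of the DS offset encoding attempted above (helper name and
// shape are hypothetical, not part of the pass): ds_read2/ds_write2 carry two 8-bit
// offsets in units of EltSize, and the *st64 variants carry them in units of
// 64 * EltSize. When neither form fits directly, the code above additionally tries to
// subtract a common BaseOff so that the remaining deltas fit.
static bool fitsRead2EncodingSketch(unsigned ByteOff0, unsigned ByteOff1,
                                    unsigned EltSize /* 4 or 8 */) {
  unsigned Elt0 = ByteOff0 / EltSize, Elt1 = ByteOff1 / EltSize;
  if (Elt0 % 64 == 0 && Elt1 % 64 == 0 && Elt0 / 64 < 256 && Elt1 / 64 < 256)
    return true;                   // st64 form: encoded offsets are Elt0/64, Elt1/64
  return Elt0 < 256 && Elt1 < 256; // plain form: encoded offsets are Elt0, Elt1
}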
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)

  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))

  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))

  if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))

  if (CI.I->mayLoad()) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))

      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))

  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  Register DestReg = MRI->createVirtualRegister(SuperRC);

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)

          .addReg(BaseReg, BaseRegFlags, BaseSubReg)

          .addReg(DestReg, 0, SubRegIdx0);

  Paired.I->eraseFromParent();
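// For example (illustrative, register numbers hypothetical): with EltSize == 4 the
// pair
//   ds_read_b32 v0, v2 offset:16
//   ds_read_b32 v1, v2 offset:20
// is rewritten into the single instruction
//   ds_read2_b32 v[0:1], v2 offset0:4 offset1:5
// followed by copies from sub0/sub1 of the new 64-bit destination into the original
// destination registers, matching the SubRegIdx0/SubRegIdx1 handling above.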
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
    CombineInfo &CI, CombineInfo &Paired,
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)

          .addReg(BaseReg, BaseRegFlags, BaseSubReg)

  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
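// For example (illustrative, register numbers hypothetical): two ds_write_b32 of v3
// and v4 to the same base register at byte offsets 8 and 12 fold into
//   ds_write2_b32 v2, v3, v4 offset0:2 offset1:3
// with the offsets again expressed in EltSize units.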
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
      MIB.addImm(MergedDMask);
      MIB.add((*CI.I).getOperand(I));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  unsigned SubRegIdx0, SubRegIdx1;
  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

          .addReg(DestReg, 0, SubRegIdx0);

  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);

          .addReg(DestReg, 0, SubRegIdx0);

  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset)
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

          .addReg(DestReg, 0, SubRegIdx0);

  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset)
      .addImm(JoinedFormat)
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

          .addReg(DestReg, 0, SubRegIdx0);

  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)

  AddressRegs Regs = getRegs(Opcode, *TII);
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset, Paired.Offset))
      .addImm(JoinedFormat)
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
      .addImm(std::min(CI.Offset, Paired.Offset))
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

          .addReg(DestReg, 0, SubRegIdx0);

  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)

          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

  MIB.addImm(std::min(CI.Offset, Paired.Offset))
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);

  case S_BUFFER_LOAD_IMM:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
      return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
  case S_BUFFER_LOAD_SGPR_IMM:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
      return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
      return AMDGPU::S_LOAD_DWORDX2_IMM;
      return AMDGPU::S_LOAD_DWORDX4_IMM;
      return AMDGPU::S_LOAD_DWORDX8_IMM;
      return AMDGPU::GLOBAL_LOAD_DWORDX2;
      return AMDGPU::GLOBAL_LOAD_DWORDX3;
      return AMDGPU::GLOBAL_LOAD_DWORDX4;
  case GLOBAL_LOAD_SADDR:
      return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
      return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
      return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
      return AMDGPU::GLOBAL_STORE_DWORDX2;
      return AMDGPU::GLOBAL_STORE_DWORDX3;
      return AMDGPU::GLOBAL_STORE_DWORDX4;
  case GLOBAL_STORE_SADDR:
      return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
      return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
      return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
      return AMDGPU::FLAT_LOAD_DWORDX2;
      return AMDGPU::FLAT_LOAD_DWORDX3;
      return AMDGPU::FLAT_LOAD_DWORDX4;
      return AMDGPU::FLAT_STORE_DWORDX2;
      return AMDGPU::FLAT_STORE_DWORDX3;
      return AMDGPU::FLAT_STORE_DWORDX4;
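// For example (illustrative): merging FLAT_LOAD_DWORD (width 1) with
// FLAT_LOAD_DWORDX2 (width 2) gives Width == 3 and selects FLAT_LOAD_DWORDX3, while
// two S_LOAD_DWORDX4_IMM (width 4 + 4) select S_LOAD_DWORDX8_IMM; the case labels on
// Width are elided from this listing.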
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  assert((CI.InstClass != MIMG ||
                        CI.Width + Paired.Width)) &&

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];

    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];

  return std::pair(Idx0, Idx1);
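// For example (illustrative): if CI is the lower-offset access with Width 2 and
// Paired has Width 1, the second pair of assignments above yields
// Idx0 = Idxs[0][1] = sub0_sub1 and Idx1 = Idxs[2][0] = sub2, i.e. the merged
// 3-dword result is split back into a 2-dword piece and a 1-dword piece.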
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) {
  if (CI.InstClass == S_BUFFER_LOAD_IMM ||
      CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {
      return &AMDGPU::SReg_64_XEXECRegClass;
      return &AMDGPU::SGPR_128RegClass;
      return &AMDGPU::SGPR_256RegClass;
      return &AMDGPU::SGPR_512RegClass;

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
  return TRI->isAGPRClass(getDataRegClass(*CI.I))
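// For example (illustrative): a merged 2-dword scalar load is assigned
// SReg_64_XEXEC and an 8-dword one SGPR_256; for vector memory operations the class
// is chosen from BitWidth = 32 * (CI.Width + Paired.Width), using an AGPR class when
// the original data register was an AGPR (the arms of the ternary are elided above).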
    CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)

  AddressRegs Regs = getRegs(Opcode, *TII);
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset, Paired.Offset))
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  if (TII->isInlineConstant(V))

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), Reg)

                                      const MemAddress &Addr) const {
                                 Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");
                                 Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
                                              int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);

std::optional<int32_t>
    return std::nullopt;

  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return std::nullopt;

  return Def->getOperand(1).getImm();
                                                      MemAddress &Addr) const {
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
      Def->getNumOperands() != 5)

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
    if (!(Offset0P = extractConstOffset(*Src1)))

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
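// Illustrative note: this recognizes a 64-bit address materialized as
//   REG_SEQUENCE (V_ADD_CO_U32_e64 base_lo, k_lo), sub0,
//                (V_ADDC_U32_e64 base_hi, k_hi), sub1
// and splits it back into Base = {base_lo, base_hi} plus the 64-bit constant
// Offset = k_lo | (k_hi << 32) extracted from the add operands (the names
// k_lo/k_hi are hypothetical).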
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MemInfoMap &Visited,
  if (!(MI.mayLoad() ^ MI.mayStore()))

      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {

  if (!Visited.contains(&MI)) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);

                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();

            TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (!Visited.contains(&MINext)) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)

    InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);
      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;

    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
                      << AnchorAddr.Offset << "\n\n");

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);

    for (auto P : InstsWCommonBase) {
      AM.BaseOffs = P.second - AnchorAddr.Offset;
                 dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);

    AnchorList.insert(AnchorInst);
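// Illustrative sketch of the rewrite driven above (register names hypothetical):
// given several global loads whose addresses are the same base plus large constants,
//   addr0 = base + 8192,  addr1 = base + 8196,  addr2 = base + 8200,
// the instruction whose constant is farthest from MI's is chosen as the anchor, its
// address register is reused, and the remaining deltas are folded into the immediate
// offset field when isLegalGlobalAddressingMode() accepts them:
//   global_load_dword v0, addr2, off              ; anchor keeps its own addressing
//   global_load_dword v1, addr2, off offset:-8    ; was base + 8192
//   global_load_dword v2, addr2, off offset:-4    ; was base + 8196
// This removes address arithmetic and gives the loads a common base register, which
// is what hasSameBaseAddress() later requires for merging.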
void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(CI)) {
      AddrList.emplace_back(CI);

  MergeableInsts.emplace_back(1, CI);
std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  for (; BlockI != End; ++BlockI) {

    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))

    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)

    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())

    CI.setMI(MI, *this);

    if (!CI.hasMergeableAddress(*MRI))

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      I = MergeableInsts.erase(I);

                   [](const CombineInfo &A, const CombineInfo &B) {
                     return A.Offset < B.Offset;

  return std::pair(BlockI, Modified);
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo>> &MergeableInsts) {
  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      I = MergeableInsts.erase(I);

    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
    OptimizeAgain = true;
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    if ((*First).Order > (*Second).Order)
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    switch (CI.InstClass) {
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;

    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;
    MergeList.erase(Second);
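// For example (illustrative): merging two single-dword buffer loads first produces a
// DWORDX2 (combined width 2 < 4), so OptimizeListAgain stays true and the list is
// rescanned, allowing a later iteration to pair two DWORDX2 results into a DWORDX4;
// the scalar-load classes keep iterating up to the 8-dword forms (width < 8 above).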
  TRI = &TII->getRegisterInfo();

  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      std::tie(SectionEnd, CollectModified) =

        OptimizeAgain = false;
      } while (OptimizeAgain);