#define DEBUG_TYPE "si-load-store-opt"

  S_BUFFER_LOAD_SGPR_IMM,

  unsigned char NumVAddrs = 0;

const unsigned MaxAddressRegs = 12 + 1 + 1;

    InstClassEnum InstClass;

    int AddrIdx[MaxAddressRegs];

    unsigned NumAddresses;
    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)

      for (unsigned i = 0; i < NumAddresses; i++) {

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||

      for (unsigned i = 0; i < NumAddresses; ++i) {

        if (!AddrOp->isReg())

        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))

      return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
  struct BaseRegisters {

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  static bool dmasksCanBeCombined(const CombineInfo &CI,

                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);

                                                     const CombineInfo &Paired);
  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;

  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;

  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,

  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,

  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,

  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,

  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,

  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,

  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,

  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,

  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,

                           int32_t NewOffset) const;

  std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;

  void addInstToMergeableList(const CombineInfo &CI,
      std::list<std::list<CombineInfo> > &MergeableInsts) const;

  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(

      std::list<std::list<CombineInfo>> &MergeableInsts) const;

      const CombineInfo &Paired);

  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
        .set(MachineFunctionProperties::Property::IsSSA);

  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {

    return AMDGPU::getMUBUFElements(Opc);

  if (TII.isMIMG(MI)) {

        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();

  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:

  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:

  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:

  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:

  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:

  case AMDGPU::DS_READ_B32:      [[fallthrough]];
  case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
  case AMDGPU::DS_WRITE_B32:     [[fallthrough]];
  case AMDGPU::DS_WRITE_B32_gfx9:

  case AMDGPU::DS_READ_B64:      [[fallthrough]];
  case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
  case AMDGPU::DS_WRITE_B64:     [[fallthrough]];
  case AMDGPU::DS_WRITE_B64_gfx9:
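
// The case groups above bucket opcodes by access width in dwords: the DWORD,
// DWORDX2, DWORDX3, DWORDX4 and DWORDX8 groups correspond to widths 1, 2, 3,
// 4 and 8, and the DS B32/B64 groups to 1 and 2. This width is what
// CombineInfo::Width records and what widthsFit() later sums for a candidate
// pair.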
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {

  if (TII.isMUBUF(Opc)) {
    switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {

    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:

    case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:

  if (TII.isMIMG(Opc)) {

    if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
        !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))

    if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)

    if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||

  if (TII.isMTBUF(Opc)) {
    switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {

    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:

    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
      return TBUFFER_STORE;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return S_BUFFER_LOAD_IMM;

  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:

  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:

  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:

  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
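
// getInstClass() buckets opcodes into merge classes; only instructions of the
// same class (and, per getInstSubclass() below, the same base opcode) are
// considered for pairing. Plain GLOBAL_* and FLAT_* loads/stores share a class
// here; getCommonInstClass() further down appears to narrow a FLAT class to
// the corresponding GLOBAL one when both instructions are known to be
// FLAT-global accesses, so the merged instruction can use the global form.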
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {

  if (TII.isMUBUF(Opc))
    return AMDGPU::getMUBUFBaseOpcode(Opc);
  if (TII.isMIMG(Opc)) {

    return Info->BaseOpcode;

  if (TII.isMTBUF(Opc))
    return AMDGPU::getMTBUFBaseOpcode(Opc);

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;

  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&

    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasVAddr(Opc))

    if (AMDGPU::getMUBUFHasSrsrc(Opc))

    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

  if (TII.isMIMG(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
      Result.NumVAddrs = SRsrcIdx - VAddr0Idx;

    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)

  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasVAddr(Opc))

    if (AMDGPU::getMTBUFHasSrsrc(Opc))

    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;

  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    Result.SOffset = true;

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:

  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
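
// getRegs() reports which address-forming operands an opcode carries (vaddr0..n,
// addr, sbase, srsrc, soffset, saddr, vaddr, ssamp). CombineInfo::setMI() below
// records the operand indices for exactly these, and hasSameBaseAddress()
// compares them pairwise, so two instructions are only treated as sharing a
// base address when every one of these operands matches.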
                                  const SILoadStoreOptimizer &LSO) {

  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8

        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8

  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:

    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();

    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = OffsetIdx == -1 ? 0 : I->getOperand(OffsetIdx).getImm();

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {

  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();

  AddressRegs Regs = getRegs(Opc, *LSO.TII);

  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;

    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);

    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);

    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);

    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);

    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);

    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);

    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
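
// setMI() captures everything the merge logic needs about one instruction:
// its class and subclass, element size, the dmask/format/offset/cpol
// immediates where applicable, its width in dwords, and the indices of all
// address operands reported by getRegs().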
793 "SI Load Store Optimizer",
false,
false)
798char SILoadStoreOptimizer::
ID = 0;
803 return new SILoadStoreOptimizer();
809 for (
const auto &Op :
MI.operands()) {
813 RegDefs.
insert(Op.getReg());
815 RegUses.
insert(Op.getReg());
bool SILoadStoreOptimizer::canSwapInstructions(

  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))

  for (const auto &BOp : B.operands()) {

    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))

    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
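
// canSwapInstructions() is the legality check for moving the second
// instruction of a candidate pair across the instructions between them: the
// move is rejected if A and B are memory operations that may alias while at
// least one of them writes memory, if B defines or reads a register that A
// defines, or if B defines a register that A reads.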
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,

                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))

  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)

        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())

  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if ((1u << AllowedBitsForMin) <= MinMask)
                                                unsigned ComponentCount,

  if (ComponentCount > 4)

  return NewFormatInfo->Format;
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,

  assert(CI.InstClass != MIMG);

  if (CI.Offset == Paired.Offset)

  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;

  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width == EltOffset1 ||
            EltOffset1 + Paired.Width == EltOffset0) &&
           CI.CPol == Paired.CPol;

  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {

      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;

  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {

      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;

  uint32_t Min = std::min(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {

      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;

  if (isUInt<8>(Max - Min)) {

      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
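
// Worked example of the DS offset encoding above (assuming EltSize == 4):
// byte offsets 256 and 260 are element offsets 64 and 65, which both fit the
// 8-bit offset0/offset1 fields and can be used directly. Offsets 0 and 4096
// (elements 0 and 1024) do not fit, but both are multiples of 64, so the ST64
// form encodes them as 0 and 16 (element offset divided by 64). Otherwise a
// common base (CI.BaseOff) is subtracted so the remaining deltas fit in 8
// bits, and that base is materialized into the address register by the merge
// functions below.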
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {

  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());

  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());

  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());

  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());

  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {

  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)

  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))

  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))

  if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))

  if (CI.I->mayLoad()) {

    if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))

    if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))

  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {

    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {

    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,

  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;

      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  Register DestReg = MRI->createVirtualRegister(SuperRC);

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;

    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)

        .addReg(AddrReg->getReg(), 0, BaseSubReg)

          .addReg(BaseReg, BaseRegFlags, BaseSubReg)

      .addReg(DestReg, 0, SubRegIdx0);

  Paired.I->eraseFromParent();
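
// mergeRead2Pair() rewrites two ds_read instructions with compatible offsets
// into a single ds_read2 (or ds_read2st64) that loads into one wider virtual
// register; the original destination registers are then copied out of that
// register through the sub-register indices chosen above. If a non-zero
// CI.BaseOff was factored out by offsetsCanBeCombined(), a new base address
// is first computed with a no-carry VALU add.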
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {

    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {

    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
    CombineInfo &CI, CombineInfo &Paired,

      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);

      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;

      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;

    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)

        .addReg(AddrReg->getReg(), 0, BaseSubReg)

          .addReg(BaseReg, BaseRegFlags, BaseSubReg)

  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;

      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {

      MIB.addImm(MergedDMask);

      MIB.add((*CI.I).getOperand(I));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  unsigned SubRegIdx0, SubRegIdx1;
  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

      .addReg(DestReg, 0, SubRegIdx0);

  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));

  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::offset))
    New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);

      .addReg(DestReg, 0, SubRegIdx0);

  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset)

      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

      .addReg(DestReg, 0, SubRegIdx0);

  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset)
      .addImm(JoinedFormat)

      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

      .addReg(DestReg, 0, SubRegIdx0);

  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)

  AddressRegs Regs = getRegs(Opcode, *TII);

    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset, Paired.Offset))
      .addImm(JoinedFormat)

      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
      .addImm(std::min(CI.Offset, Paired.Offset))

      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

      .addReg(DestReg, 0, SubRegIdx0);

  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)

          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

  MIB.addImm(std::min(CI.Offset, Paired.Offset))

      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {

    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);

    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),

    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),

  case S_BUFFER_LOAD_IMM:

      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;

      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;

      return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;

  case S_BUFFER_LOAD_SGPR_IMM:

      return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR
                            : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;

      return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR
                            : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;

      return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR
                            : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;

      return AMDGPU::S_LOAD_DWORDX2_IMM;

      return AMDGPU::S_LOAD_DWORDX4_IMM;

      return AMDGPU::S_LOAD_DWORDX8_IMM;

      return AMDGPU::GLOBAL_LOAD_DWORDX2;

      return AMDGPU::GLOBAL_LOAD_DWORDX3;

      return AMDGPU::GLOBAL_LOAD_DWORDX4;

  case GLOBAL_LOAD_SADDR:

      return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;

      return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;

      return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;

      return AMDGPU::GLOBAL_STORE_DWORDX2;

      return AMDGPU::GLOBAL_STORE_DWORDX3;

      return AMDGPU::GLOBAL_STORE_DWORDX4;

  case GLOBAL_STORE_SADDR:

      return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;

      return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;

      return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;

      return AMDGPU::FLAT_LOAD_DWORDX2;

      return AMDGPU::FLAT_LOAD_DWORDX3;

      return AMDGPU::FLAT_LOAD_DWORDX4;

      return AMDGPU::FLAT_STORE_DWORDX2;

      return AMDGPU::FLAT_STORE_DWORDX3;

      return AMDGPU::FLAT_STORE_DWORDX4;

    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  assert((CI.InstClass != MIMG ||

          CI.Width + Paired.Width)) &&

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2,
       AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3,
       AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4,
       AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5,
       AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6,
       AMDGPU::sub4_sub5_sub6_sub7},

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];

    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];

  return std::pair(Idx0, Idx1);
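
// The Idxs table is indexed by [starting dword][width - 1]: row 0 holds the
// sub-register covering dwords starting at 0, row N the one starting at dword
// N. In the second branch above (CI placed first), a one-dword CI merged with
// a two-dword Paired gets Idx0 = Idxs[0][0] = sub0 and
// Idx1 = Idxs[1][1] = sub1_sub2 out of the three-dword merged value.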
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) {
  if (CI.InstClass == S_BUFFER_LOAD_IMM ||
      CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {

      return &AMDGPU::SReg_64_XEXECRegClass;

      return &AMDGPU::SGPR_128RegClass;

      return &AMDGPU::SGPR_256RegClass;

      return &AMDGPU::SGPR_512RegClass;

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
  return TRI->isAGPRClass(getDataRegClass(*CI.I))
    CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)

  AddressRegs Regs = getRegs(Opcode, *TII);

    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset, Paired.Offset))

      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), Reg)
                                      const MemAddress &Addr) const {

                          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

                          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);

std::optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {

    return std::nullopt;

  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return std::nullopt;

  return Def->getOperand(1).getImm();
                                                    MemAddress &Addr) const {

  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);

    if (!(Offset0P = extractConstOffset(*Src1)))

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(

    MemInfoMap &Visited,

  if (!(MI.mayLoad() ^ MI.mayStore()))

  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)

      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {

  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;

    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);

                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();

        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())

        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;

      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)

    InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;

        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;

    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());

                      << AnchorAddr.Offset << "\n\n");

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);

    for (auto P : InstsWCommonBase) {

      AM.BaseOffs = P.second - AnchorAddr.Offset;

                 dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);

    AnchorList.insert(AnchorInst);
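
// promoteConstantOffsetToImm() rebases global accesses so that large constant
// parts of their addresses become immediate offsets: for a load/store whose
// base is a (register + constant) computed by a V_ADD_CO_U32/V_ADDC_U32 pair,
// it collects other accesses off the same 64-bit base, picks the one farthest
// away whose delta still passes the legal-global-addressing-mode check as an
// anchor, and then rewrites this instruction (and the others sharing the
// base) to use the anchor's address with the residual difference folded into
// the offset field.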
void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
    std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(CI)) {
      AddrList.emplace_back(CI);

  MergeableInsts.emplace_back(1, CI);
std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(

    std::list<std::list<CombineInfo>> &MergeableInsts) const {

  for (; BlockI != End; ++BlockI) {

    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))

    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)

        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())

    CI.setMI(MI, *this);

    if (!CI.hasMergeableAddress(*MRI))

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {

    addInstToMergeableList(CI, MergeableInsts);

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {

      I = MergeableInsts.erase(I);

            [](const CombineInfo &A, const CombineInfo &B) {
              return A.Offset < B.Offset;

  return std::pair(BlockI, Modified);
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo> > &MergeableInsts) {

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {

      I = MergeableInsts.erase(I);

    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);

    OptimizeAgain = true;
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    if ((*First).Order > (*Second).Order)

    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    switch (CI.InstClass) {

      NewMI = mergeRead2Pair(CI, Paired, Where->I);

      NewMI = mergeWrite2Pair(CI, Paired, Where->I);

    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:

      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;

      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;

      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;

      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;

      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;

      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;

    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;

    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;

    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;

    MergeList.erase(Second);
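
// After a successful merge the new instruction replaces CI in the list, and
// OptimizeListAgain is set while the combined width is still below the class
// maximum (8 dwords for the scalar loads, 4 for the vector forms), so
// optimizeBlock() re-runs the pairing over the same list until no further
// widening is possible.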
  TRI = &TII->getRegisterInfo();

  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

    bool CollectModified;
    std::list<std::list<CombineInfo>> MergeableInsts;

    std::tie(SectionEnd, CollectModified) =
        collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      OptimizeAgain = false;

    } while (OptimizeAgain);