#define DEBUG_TYPE "si-load-store-opt"

  S_BUFFER_LOAD_SGPR_IMM,

  unsigned char NumVAddrs = 0;
  bool SOffset = false;

const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer {
    InstClassEnum InstClass;
    int AddrIdx[MaxAddressRegs];
    unsigned NumAddresses;

    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)

      for (unsigned i = 0; i < NumAddresses; i++) {
        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||

      for (unsigned i = 0; i < NumAddresses; ++i) {
        if (!AddrOp->isReg())

            AddrOp->getReg() != AMDGPU::SGPR_NULL)

        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))

    struct BaseRegisters {
      unsigned LoSubReg = 0;
      unsigned HiSubReg = 0;
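  // Helper declarations for pairing candidate loads/stores: checks for
  // combinable dmasks, offsets and widths, plus one merge routine per
  // instruction class (DS read2/write2, SMEM, buffer, tbuffer, image, flat).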
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *
  getTargetRegisterClass(const CombineInfo &CI,
                         const CombineInfo &Paired) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,

  Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  unsigned getWrite2Opcode(const CombineInfo &CI) const;
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,

                           int32_t NewOffset) const;

                      std::list<std::list<CombineInfo>> &MergeableInsts) const;

                      std::list<std::list<CombineInfo>> &MergeableInsts) const;

                              const CombineInfo &Paired);
  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
  const unsigned Opc = MI.getOpcode();

  if (TII.isImage(MI)) {
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_LOAD_DWORD_SADDR:
  case AMDGPU::FLAT_STORE_DWORD_SADDR:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
    return 3;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
    return 4;
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return 8;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
    return 1;
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return 2;
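// Classification of opcodes into InstClassEnum buckets used by the merge
// logic below: MUBUF buffer loads/stores, MIMG, MTBUF, scalar loads,
// DS reads/writes, and FLAT/GLOBAL loads/stores (with and without SADDR).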
    case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
    case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
    case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
      return BUFFER_LOAD;
    case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
    case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
    case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
      return BUFFER_STORE;

    if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||

    case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
      return TBUFFER_LOAD;
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
      return TBUFFER_STORE;

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return S_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return FLAT_LOAD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return FLAT_STORE;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
  case AMDGPU::FLAT_LOAD_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
    return FLAT_LOAD_SADDR;
  case AMDGPU::FLAT_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
    return FLAT_STORE_SADDR;
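// Subclass mapping: each opcode family is collapsed to a single
// representative base opcode so that only instructions of the same kind
// (ignoring width) are considered for pairing.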
    return Info->BaseOpcode;

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  case AMDGPU::FLAT_LOAD_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
    return AMDGPU::FLAT_LOAD_DWORD_SADDR;
  case AMDGPU::FLAT_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
    return AMDGPU::FLAT_STORE_DWORD_SADDR;
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
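// Which address operands each opcode family carries (vaddr, srsrc, soffset,
// sbase, saddr, addr); the result drives which operands must match for two
// instructions to share a base address.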
      Result.SOffset = true;

    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      AMDGPU::OpName RsrcName =
          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
      Result.NumVAddrs = RsrcIdx - VAddr0Idx;

      Result.SOffset = true;

  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    Result.SOffset = true;

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
  case AMDGPU::FLAT_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX4_SADDR:

  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
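// CombineInfo::setMI: record everything the pass needs about one instruction:
// its class, element size, immediate offset, width in dwords, and the indices
// of its address operands.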
                                        const SILoadStoreOptimizer &LSO) {
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)

  DataRC = LSO.getDataRegClass(*MI);

        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8

        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8

  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:

  if (InstClass == MIMG) {

    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = I->getOperand(OffsetIdx).getImm();

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
    EltSize = Info->BitsPerComp / 8;

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
  } else if (InstClass != MIMG) {

    AddressRegs Regs = getRegs(Opc, *LSO.TII);

    for (unsigned J = 0; J < Regs.NumVAddrs; J++)
      AddrIdx[NumAddresses++] =
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;

      AddrIdx[NumAddresses++] =
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);

      AddrIdx[NumAddresses++] =
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);

      AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
          Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);

      AddrIdx[NumAddresses++] =
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);

      AddrIdx[NumAddresses++] =
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);

      AddrIdx[NumAddresses++] =
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);

      AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
          Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);

    assert(NumAddresses <= MaxAddressRegs);

    for (unsigned J = 0; J < NumAddresses; J++)
      AddrReg[J] = &I->getOperand(AddrIdx[J]);
937 "SI Load Store Optimizer",
false,
false)
942char SILoadStoreOptimizerLegacy::
ID = 0;
947 return new SILoadStoreOptimizerLegacy();
953 for (
const auto &
Op :
MI.operands()) {
bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))

  for (const auto &BOp : B.operands()) {
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineFunction *MF = CI.I->getMF();
bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))

  AMDGPU::OpName OperandsToMatch[] = {
      AMDGPU::OpName::cpol, AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
      AMDGPU::OpName::da, AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (AMDGPU::OpName op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())

  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if ((1u << AllowedBitsForMin) <= MinMask)
                                               unsigned ComponentCount,
  if (ComponentCount > 4)

  return NewFormatInfo->Format;
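// offsetsCanBeCombined: decide whether the two candidates' immediate offsets
// describe adjacent, properly aligned elements. For DS instructions the
// Modify flag also rewrites the offsets into read2/write2 form, either as
// stride-64 (ST64) offsets or relative to a common BaseOff.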
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  if (CI.Offset == Paired.Offset)

  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =

    unsigned NumCombinedComponents = CI.Width + Paired.Width;
    if (NumCombinedComponents == 3 && CI.EltSize <= 2)
      NumCombinedComponents = 4;

    unsigned ElemIndex0 = CI.Offset / CI.EltSize;
    unsigned ElemIndex1 = Paired.Offset / Paired.EltSize;
    if (ElemIndex0 + CI.Width != ElemIndex1 &&
        ElemIndex1 + Paired.Width != ElemIndex0)

    unsigned MergedBytes = CI.EltSize * NumCombinedComponents;
    unsigned RequiredAlign = std::min(MergedBytes, 4u);
    unsigned MinOff = std::min(CI.Offset, Paired.Offset);
    if (MinOff % RequiredAlign != 0)

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;

  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    if (EltOffset0 + CI.Width != EltOffset1 &&
        EltOffset1 + Paired.Width != EltOffset0)

  if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
      CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
    if (CI.Width != Paired.Width &&
        (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))

  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;

      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;

  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  if (((Max - Min) & ~Mask) == 0) {
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;

    CI.BaseOff = BaseOff * CI.EltSize;
    CI.Offset = EltOffset0 - BaseOff;
    Paired.Offset = EltOffset1 - BaseOff;
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
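// checkAndPrepareMerge: final legality check for a candidate pair - same
// instruction subclass, combinable dmasks/offsets/widths, no conflicting
// instruction between the two that would block reordering, and (for DS
// writes) data operands that can be constrained to the write2 register
// classes. Returns the CombineInfo at whose position the merge is inserted.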
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)

  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))

  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))

    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))

  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;

  if (CI.I->mayLoad()) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))

      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))

  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);

  if (CI.InstClass == DS_WRITE) {
    const MachineOperand *Data0 =
        TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
    const MachineOperand *Data1 =
        TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

    const MCInstrDesc &Write2Opc = TII->get(getWrite2Opcode(CI));
    int Data0Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
                                              AMDGPU::OpName::data0);
    int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
                                              AMDGPU::OpName::data1);

    const TargetRegisterClass *DataRC0 = TII->getRegClass(Write2Opc, Data0Idx);
    const TargetRegisterClass *DataRC1 = TII->getRegClass(Write2Opc, Data1Idx);

      DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()),

      DataRC1 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data1->getReg()),

    if (!MRI->constrainRegClass(Data0->getReg(), DataRC0) ||
        !MRI->constrainRegClass(Data1->getReg(), DataRC1))
void SILoadStoreOptimizer::copyToDestRegs(
    CombineInfo &CI, CombineInfo &Paired,
    AMDGPU::OpName OpName, Register DestReg) const {
  MachineBasicBlock *MBB = CI.I->getParent();

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
  auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);

  Dest0->setIsEarlyClobber(false);
  Dest1->setIsEarlyClobber(false);

      .addReg(DestReg, 0, SubRegIdx0);

SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                                      AMDGPU::OpName OpName) const {
  MachineBasicBlock *MBB = CI.I->getParent();

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
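// mergeRead2Pair: replace two DS reads from the same base with a single
// DS_READ2 / DS_READ2ST64, adding any factored-out BaseOff to the address
// first, then copy the two halves of the merged result back into the
// original destination registers.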
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
  MachineBasicBlock *MBB = CI.I->getParent();

  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
  unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)

  MachineInstrBuilder Read2 =
          .addReg(BaseReg, BaseRegFlags, BaseSubReg)

  copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;

unsigned SILoadStoreOptimizer::getWrite2Opcode(const CombineInfo &CI) const {
  return CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

    CombineInfo &CI, CombineInfo &Paired,
  MachineBasicBlock *MBB = CI.I->getParent();

  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc = getWrite2Opcode(CI);

  if (NewOffset0 > NewOffset1) {

         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);

  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)

  MachineInstrBuilder Write2 =
          .addReg(BaseReg, BaseRegFlags, BaseSubReg)

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
  MachineBasicBlock *MBB = CI.I->getParent();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
      MIB.addImm(MergedDMask);
      MIB.add((*CI.I).getOperand(I));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

    CombineInfo &CI, CombineInfo &Paired,
  MachineBasicBlock *MBB = CI.I->getParent();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstrBuilder New =
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::sdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,
  MachineBasicBlock *MBB = CI.I->getParent();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset)
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

    CombineInfo &CI, CombineInfo &Paired,
  MachineBasicBlock *MBB = CI.I->getParent();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned NumCombinedComponents = CI.Width + Paired.Width;
  if (NumCombinedComponents == 3 && CI.EltSize <= 2)
    NumCombinedComponents = 4;
  unsigned JoinedFormat =

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset)
      .addImm(JoinedFormat)
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

    CombineInfo &CI, CombineInfo &Paired,
  MachineBasicBlock *MBB = CI.I->getParent();

  const unsigned Opcode = getNewOpcode(CI, Paired);

      copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);

  AddressRegs Regs = getRegs(Opcode, *TII);

    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned NumCombinedComponents = CI.Width + Paired.Width;
  if (NumCombinedComponents == 3 && CI.EltSize <= 2)
    NumCombinedComponents = 4;
  unsigned JoinedFormat =

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset, Paired.Offset))
      .addImm(JoinedFormat)
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

    CombineInfo &CI, CombineInfo &Paired,
  MachineBasicBlock *MBB = CI.I->getParent();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
      .addImm(std::min(CI.Offset, Paired.Offset))
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

    CombineInfo &CI, CombineInfo &Paired,
  MachineBasicBlock *MBB = CI.I->getParent();

  const unsigned Opcode = getNewOpcode(CI, Paired);

      copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);

      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

  MIB.addImm(std::min(CI.Offset, Paired.Offset))
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
         (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);

unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);

  case S_BUFFER_LOAD_IMM: {
    bool NeedsConstrainedOpc =
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
  case S_BUFFER_LOAD_SGPR_IMM: {
    bool NeedsConstrainedOpc =
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;

    bool NeedsConstrainedOpc =
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX2_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX3_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX4_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX8_IMM;

      return AMDGPU::GLOBAL_LOAD_DWORDX2;
      return AMDGPU::GLOBAL_LOAD_DWORDX3;
      return AMDGPU::GLOBAL_LOAD_DWORDX4;
  case GLOBAL_LOAD_SADDR:
      return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
      return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
      return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;

      return AMDGPU::GLOBAL_STORE_DWORDX2;
      return AMDGPU::GLOBAL_STORE_DWORDX3;
      return AMDGPU::GLOBAL_STORE_DWORDX4;
  case GLOBAL_STORE_SADDR:
      return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
      return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
      return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;

      return AMDGPU::FLAT_LOAD_DWORDX2;
      return AMDGPU::FLAT_LOAD_DWORDX3;
      return AMDGPU::FLAT_LOAD_DWORDX4;

      return AMDGPU::FLAT_STORE_DWORDX2;
      return AMDGPU::FLAT_STORE_DWORDX3;
      return AMDGPU::FLAT_STORE_DWORDX4;
  case FLAT_LOAD_SADDR:
      return AMDGPU::FLAT_LOAD_DWORDX2_SADDR;
      return AMDGPU::FLAT_LOAD_DWORDX3_SADDR;
      return AMDGPU::FLAT_LOAD_DWORDX4_SADDR;
  case FLAT_STORE_SADDR:
      return AMDGPU::FLAT_STORE_DWORDX2_SADDR;
      return AMDGPU::FLAT_STORE_DWORDX3_SADDR;
      return AMDGPU::FLAT_STORE_DWORDX4_SADDR;
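// getSubRegIdxs: the Idxs table maps (starting dword, width) to the
// subregister index used to copy each original value into or out of the
// merged wide register; which row is used depends on which of the two
// instructions comes first in memory.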
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  assert((CI.InstClass != MIMG ||
                          CI.Width + Paired.Width)) &&

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];

    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];

  return {Idx0, Idx1};
const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) const {
  if (CI.InstClass == S_BUFFER_LOAD_IMM ||
      CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {
      return &AMDGPU::SReg_64_XEXECRegClass;
      return &AMDGPU::SGPR_96RegClass;
      return &AMDGPU::SGPR_128RegClass;
      return &AMDGPU::SGPR_256RegClass;
      return &AMDGPU::SGPR_512RegClass;

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
  return TRI->isAGPRClass(getDataRegClass(*CI.I))

    CombineInfo &CI, CombineInfo &Paired,
  MachineBasicBlock *MBB = CI.I->getParent();

  const unsigned Opcode = getNewOpcode(CI, Paired);

      copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);

  AddressRegs Regs = getRegs(Opcode, *TII);

    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset, Paired.Offset))
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)

Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  MachineOperand OffsetLo =
      createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getWaveMaskRegClass();
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)

  MachineInstr *HiHalf =
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               int32_t NewOffset) const {
  auto *Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);

std::optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
    return std::nullopt;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return std::nullopt;

  return Def->getOperand(1).getImm();
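// processBaseWithConstOffset: match a 64-bit address built as
// REG_SEQUENCE(V_ADD_CO_U32, V_ADDC_U32) and split it into its base
// registers plus the 64-bit constant that was added, stored in Addr.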
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
      Def->getNumOperands() != 5)

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
    if (!(Offset0P = extractConstOffset(*Src1)))

  if (!BaseLo.isReg())

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (!Src1->isImm() || Src0->isImm())

  uint64_t Offset1 = Src1->getImm();

  if (!BaseHi.isReg())

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
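// promoteConstantOffsetToImm: for a load/store whose address is a base plus
// a large constant, look for other instructions in the block using the same
// base, pick the one farthest away as an anchor, and re-express the other
// offsets relative to it so they fit in the instruction's immediate offset
// field.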
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {

  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  auto [It, Inserted] = Visited.try_emplace(&MI);

    processBaseWithConstOffset(Base, MAddr);

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);

             << "} Offset: " << MAddr.Offset << "\n\n";);

  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();

  MachineBasicBlock *MBB = MI.getParent();

    MachineInstr &MINext = *MBBI;
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    auto [It, Inserted] = Visited.try_emplace(&MINext);
      processBaseWithConstOffset(BaseNext, MAddrNext);
      It->second = MAddrNext;
      MAddrNext = It->second;

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)

    InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;

    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
               << AnchorAddr.Offset << "\n\n");

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);

    for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.BaseOffs = OtherOffset - AnchorAddr.Offset;

        updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
void SILoadStoreOptimizer::addInstToMergeableList(
    const CombineInfo &CI,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().hasSameBaseAddress(CI)) {
      AddrList.emplace_back(CI);

  MergeableInsts.emplace_back(1, CI);

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {

  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))

    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)

        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())

    if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
      const MachineOperand *Fmt =
          TII->getNamedOperand(MI, AMDGPU::OpName::format);

    CI.setMI(MI, *this);

    if (!CI.hasMergeableAddress(*MRI))

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      I = MergeableInsts.erase(I);

              [](const CombineInfo &A, const CombineInfo &B) {
                return A.Offset < B.Offset;
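// Pair-merging driver: for each list of instructions that share a base
// address, repeatedly try to merge adjacent pairs; a list is re-scanned
// (OptimizeListAgain / OptimizeAgain) as long as the merged result is still
// narrower than the widest available encoding.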
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo>> &MergeableInsts) {

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      I = MergeableInsts.erase(I);

    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);

    OptimizeAgain = true;

SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    if ((*First).Order > (*Second).Order)
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    switch (CI.InstClass) {
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
    case FLAT_LOAD_SADDR:
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
    case FLAT_STORE_SADDR:
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;

    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;

    MergeList.erase(Second);
bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
  return SILoadStoreOptimizer(
             &getAnalysis<AAResultsWrapperPass>().getAAResults())

bool SILoadStoreOptimizer::run(MachineFunction &MF) {

  for (MachineBasicBlock &MBB : MF) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      std::tie(SectionEnd, CollectModified) =

      OptimizeAgain = false;
    } while (OptimizeAgain);

  bool Changed = SILoadStoreOptimizer(&AA).run(MF);
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
reference emplace_back(ArgTypes &&... Args)
StringRef - Represent a constant reference to a string, i.e.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset)
Convert ByteOffset to dwords if the subtarget uses dword SMRD immediate offsets.
bool getMTBUFHasSrsrc(unsigned Opc)
int getMTBUFElements(unsigned Opc)
bool getMTBUFHasSoffset(unsigned Opc)
int getMUBUFOpcode(unsigned BaseOpc, unsigned Elements)
int getMUBUFBaseOpcode(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
int getMTBUFBaseOpcode(unsigned Opc)
bool getMUBUFHasVAddr(unsigned Opc)
int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements)
bool getMUBUFHasSoffset(unsigned Opc)
const MIMGBaseOpcodeInfo * getMIMGBaseOpcode(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
bool getMTBUFHasVAddr(unsigned Opc)
int getMUBUFElements(unsigned Opc)
const GcnBufferFormatInfo * getGcnBufferFormatInfo(uint8_t BitsPerComp, uint8_t NumComponents, uint8_t NumFormat, const MCSubtargetInfo &STI)
bool getMUBUFHasSrsrc(unsigned Opc)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ Define
Register definition.
@ Kill
The last use of a register.
NodeAddr< DefNode * > Def
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
This is an optimization pass for GlobalISel generic memory operations.
bool operator<(int64_t V1, const APSInt &V2)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr T maskLeadingOnes(unsigned N)
Create a bitmask with the N left-most bits set to 1, and all other bits set to 0.
FunctionPass * createSILoadStoreOptimizerLegacyPass()
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
char & SILoadStoreOptimizerLegacyID
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
std::vector< std::pair< LineLocation, FunctionId > > AnchorList
constexpr unsigned BitWidth
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.