#define DEBUG_TYPE "si-load-store-opt"

  S_BUFFER_LOAD_SGPR_IMM,

  unsigned char NumVAddrs = 0;
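// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.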
const unsigned MaxAddressRegs = 12 + 1 + 1;

  InstClassEnum InstClass;

  int AddrIdx[MaxAddressRegs];

  unsigned NumAddresses;
  bool hasSameBaseAddress(const CombineInfo &CI) {
    if (NumAddresses != CI.NumAddresses)
      return false;

    const MachineInstr &MI = *CI.I;
    for (unsigned i = 0; i < NumAddresses; i++) {
      const MachineOperand &AddrRegNext = MI.getOperand(CI.AddrIdx[i]);

      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
            AddrReg[i]->getImm() != AddrRegNext.getImm())
          return false;
        continue;
      }
  bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
    for (unsigned i = 0; i < NumAddresses; ++i) {
      const MachineOperand *AddrOp = AddrReg[i];
      // Immediates are always OK.
      if (AddrOp->isImm())
        continue;

      if (!AddrOp->isReg())
        return false;

      // A physical address is only mergeable if it is the null register.
      if (AddrOp->getReg().isPhysical() &&
          AddrOp->getReg() != AMDGPU::SGPR_NULL)
        return false;

      // If an address has only one use then there will be no other
      // instructions with the same address, so we can't merge this one.
      if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
        return false;
    }
    return true;
  }

  bool operator<(const CombineInfo &Other) const {
    return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
  }
  struct BaseRegisters {
    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
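  // Predicates deciding whether two candidate instructions may be merged:
  // dmask compatibility for image loads, offset adjacency for everything
  // else, and whether a legal opcode exists for the combined width.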
  static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *
  getTargetRegisterClass(const CombineInfo &CI, const CombineInfo &Paired) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
  void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore, int OpName,
                      Register DestReg) const;
  Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                           MachineBasicBlock::iterator InsertBefore,
                           int OpName) const;

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

  void updateBaseAndOffset(MachineInstr &MI, Register NewBase,
                           int32_t NewOffset) const;

  void addInstToMergeableList(const CombineInfo &CI,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;
  MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                              const CombineInfo &Paired);

  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::IsSSA);
  }
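// Returns the width of the given load/store in dwords (for image
// instructions, the number of enabled dmask components).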
static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isImage(MI)) {
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    return llvm::popcount(DMaskImm);
  }
  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  }

  switch (Opc) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:
    return 3;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return 4;
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return 8;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
    return 1;
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return 2;
  default:
    return 0;
  }
}
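// Classify an opcode into the InstClassEnum bucket used for merging.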
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  if (TII.isMUBUF(Opc)) {
    switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
    default:
      return UNKNOWN;
    case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
    case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
    case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
      return BUFFER_LOAD;
    case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
    case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
    case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
      return BUFFER_STORE;
    }
  }
  if (TII.isImage(Opc)) {
    // Ignore instructions encoded without vaddr.
    if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
        !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
      return UNKNOWN;
    // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
    if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
        TII.isGather4(Opc))
      return UNKNOWN;
    return MIMG;
  }
  if (TII.isMTBUF(Opc)) {
    switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
    default:
      return UNKNOWN;
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
      return TBUFFER_LOAD;
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
      return TBUFFER_STORE;
    }
  }
  switch (Opc) {
  default:
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return S_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return FLAT_LOAD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return FLAT_STORE;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
  }
}
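/// Determines instruction subclass from opcode. Only instructions
/// of the same subclass can be merged together.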
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isImage(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  }
}
InstClassEnum
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
      SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;

  return CI.InstClass;
}
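// Determine which address operands (addr, sbase, srsrc, soffset, saddr,
// vaddr, ssamp) an opcode carries; these are the operands compared when
// checking for a common base address.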
static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  AddressRegs Result;

  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMUBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  if (TII.isImage(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      int RsrcName =
          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
      Result.NumVAddrs = RsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      Result.SSamp = true;

    return Result;
  }
  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMTBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  switch (Opc) {
  default:
    return Result;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    Result.SOffset = true;
    [[fallthrough]];
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    Result.Addr = true;
    return Result;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    Result.SAddr = true;
    [[fallthrough]];
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    Result.VAddr = true;
    return Result;
  }
}
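// Fill this CombineInfo from MI: instruction class, element size, offset,
// width, cache policy, and the indices of all address operands.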
void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)
    return;

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
  }

  AddressRegs Regs = getRegs(Opc, *LSO.TII);
  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);

  NumAddresses = 0;
  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
  if (Regs.Addr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
  if (Regs.SBase)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
  if (Regs.SRsrc)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
  if (Regs.SOffset)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
  if (Regs.SAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (Regs.VAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (Regs.SSamp)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}
838 "SI Load Store Optimizer",
false,
false)
843char SILoadStoreOptimizer::
ID = 0;
848 return new SILoadStoreOptimizer();
static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) {
  for (const auto &Op : MI.operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
  }
}
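// Return true if instruction B can be moved past instruction A: no aliasing
// memory conflict with a store involved, no read or redefinition of a
// register A defines, and no clobber of a register A uses.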
bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
    return false;
  for (const auto &BOp : B.operands()) {
    if (!BOp.isReg())
      continue;
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
      return false;
  }
  return true;
}
// Given that CI and Paired are adjacent memory operations, produce a new MMO
// for the combined operation covering both accesses.
MachineMemOperand *
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
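// Check that the optional image modifiers (tfe/lwe, cpol, d16, etc.) match
// between the two instructions and that their dmasks do not overlap.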
bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if (!MaxMask)
    return false;

  unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}
static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  return NewFormatInfo->Format;
}
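// Decide whether the two offsets can be folded into one merged access. When
// Modify is set, DS offsets are rewritten in place, possibly switching to
// the stride-64 (st64) forms or introducing a shifted base address.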
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
    const AMDGPU::GcnBufferFormatInfo *Info0 =
        AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const AMDGPU::GcnBufferFormatInfo *Info1 =
        AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // There must be a wider format covering the merged component count.
    if (!getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI))
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    if (EltOffset0 + CI.Width != EltOffset1 &&
        EltOffset1 + Paired.Width != EltOffset0)
      return false;
    if (CI.CPol != Paired.CPol)
      return false;
    if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
        CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
      // Reject cases like dword + dwordx2 -> dwordx3, where SGPR alignment
      // would prevent extracting a subreg for the second result.
      if (CI.Width != Paired.Width &&
          (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
        return false;
    }
    return true;
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    if (Modify) {
      uint32_t BaseOff = mostAlignedValueInRange(Max - 256 * 64, Min);
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      uint32_t BaseOff = mostAlignedValueInRange(Max - 256, Min);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
    }
    return true;
  }

  return false;
}
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
    case 8:
      return true;
    case 3:
      return STM.hasScalarDwordx3Loads();
    }
  }
}
const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  return nullptr;
}
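// Decide whether CI and Paired can be merged. On success, return the
// CombineInfo at whose position the merged instruction should be inserted;
// on failure return nullptr.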
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;
  CombineInfo *Where;
  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
        return nullptr;
    }
    Where = &CI;
  } else {
    // Try to sink CI down to Paired.
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = std::next(CI.I); MBBI != Paired.I;
         ++MBBI) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
        return nullptr;
    }
    Where = &Paired;
  }

  // Re-run offsetsCanBeCombined with Modify = true so the DS offsets are
  // rewritten for the new instruction.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return Where;
}
void SILoadStoreOptimizer::copyToDestRegs(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore, int OpName,
    Register DestReg) const {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);
}

Register
SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                                      MachineBasicBlock::iterator InsertBefore,
                                      int OpName) const {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  return SrcReg;
}
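// DS read2 opcode selection: targets where LDS requires M0 initialization
// use the original opcodes; gfx9+ uses the _gfx9 forms that don't need M0.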
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
  unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opc), DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg)
          .addImm(NewOffset0) // offset0
          .addImm(NewOffset1) // offset1
          .addImm(0)          // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}
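// Merge two DS writes into one write2/write2st64, swapping the data operands
// if needed so the smaller offset comes first.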
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opc))
          .addReg(BaseReg, BaseRegFlags, BaseSubReg)
          .add(*Data0)
          .add(*Data1)
          .addImm(NewOffset0) // offset0
          .addImm(NewOffset1) // offset1
          .addImm(0)          // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
  unsigned DMaskIdx =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstrBuilder New =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(JoinedFormat) // format
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register SrcReg =
      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(JoinedFormat)                       // format
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
          .addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register SrcReg =
      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
                 .addReg(SrcReg, RegState::Kill);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
      MIB.addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
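// Pick the opcode for the combined width. Buffer classes use the
// table-driven AMDGPU::getMUBUFOpcode/getMTBUFOpcode lookups; the others
// are enumerated explicitly.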
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {
  default:
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case TBUFFER_LOAD:
  case TBUFFER_STORE:
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default: return 0;
    case 2:  return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 3:  return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
    case 4:  return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    case 8:  return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
    }
  case S_BUFFER_LOAD_SGPR_IMM:
    switch (Width) {
    default: return 0;
    case 2:  return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
    case 3:  return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
    case 4:  return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
    case 8:  return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
    }
  case S_LOAD_IMM:
    switch (Width) {
    default: return 0;
    case 2:  return AMDGPU::S_LOAD_DWORDX2_IMM;
    case 3:  return AMDGPU::S_LOAD_DWORDX3_IMM;
    case 4:  return AMDGPU::S_LOAD_DWORDX4_IMM;
    case 8:  return AMDGPU::S_LOAD_DWORDX8_IMM;
    }
  case GLOBAL_LOAD:
    switch (Width) {
    default: return 0;
    case 2:  return AMDGPU::GLOBAL_LOAD_DWORDX2;
    case 3:  return AMDGPU::GLOBAL_LOAD_DWORDX3;
    case 4:  return AMDGPU::GLOBAL_LOAD_DWORDX4;
    }
  case GLOBAL_LOAD_SADDR:
    switch (Width) {
    default: return 0;
    case 2:  return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
    case 3:  return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
    case 4:  return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
    }
  case GLOBAL_STORE:
    switch (Width) {
    default: return 0;
    case 2:  return AMDGPU::GLOBAL_STORE_DWORDX2;
    case 3:  return AMDGPU::GLOBAL_STORE_DWORDX3;
    case 4:  return AMDGPU::GLOBAL_STORE_DWORDX4;
    }
  case GLOBAL_STORE_SADDR:
    switch (Width) {
    default: return 0;
    case 2:  return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
    case 3:  return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
    case 4:  return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
    }
  case FLAT_LOAD:
    switch (Width) {
    default: return 0;
    case 2:  return AMDGPU::FLAT_LOAD_DWORDX2;
    case 3:  return AMDGPU::FLAT_LOAD_DWORDX3;
    case 4:  return AMDGPU::FLAT_LOAD_DWORDX4;
    }
  case FLAT_STORE:
    switch (Width) {
    default: return 0;
    case 2:  return AMDGPU::FLAT_STORE_DWORDX2;
    case 3:  return AMDGPU::FLAT_STORE_DWORDX3;
    case 4:  return AMDGPU::FLAT_STORE_DWORDX4;
    }
  case MIMG:
    assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
           "No overlaps");
    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
  }
}
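// Map the two original widths onto subregister indices within the merged
// register, ordered by which instruction covers the lower offset.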
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  assert((CI.InstClass != MIMG ||
          ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
           CI.Width + Paired.Width)) &&
         "No overlaps");

  unsigned Idx0;
  unsigned Idx1;

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2,
       AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3,
       AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4,
       AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5,
       AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6,
       AMDGPU::sub4_sub5_sub6_sub7},
  };

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

  if (Paired < CI) {
    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];
  } else {
    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];
  }

  return {Idx0, Idx1};
}
const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) const {
  if (CI.InstClass == S_BUFFER_LOAD_IMM ||
      CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 3:
      return &AMDGPU::SGPR_96RegClass;
    case 4:
      return &AMDGPU::SGPR_128RegClass;
    case 8:
      return &AMDGPU::SGPR_256RegClass;
    case 16:
      return &AMDGPU::SGPR_512RegClass;
    }
  }

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
  return TRI->isAGPRClass(getDataRegClass(*CI.I))
             ? TRI->getAGPRClassForBitWidth(BitWidth)
             : TRI->getVGPRClassForBitWidth(BitWidth);
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register SrcReg =
      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << "    "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}
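// Compute base address using Addr and return the final register: a 64-bit
// add built from V_ADD_CO_U32 / V_ADDC_U32 halves joined by a REG_SEQUENCE.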
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo =
      createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}
std::optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return std::nullopt;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return std::nullopt;

  return Def->getOperand(1).getImm();
}
void SILoadStoreOptimizer::processBaseWithConstOffset(
    const MachineOperand &Base, MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
      Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm() || Src0->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}
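// Try to fold a constant offset directly into the instruction's immediate
// offset field by re-anchoring a group of loads with a common base to the
// member whose offset is farthest away.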
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI, MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
  // Skip atomics; only plain loads and stores are handled.
  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (!Visited.contains(&MI)) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset
                    << "\n\n";);

  // Step 2: Find an anchor with the same base whose offset is farthest away.
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = std::next(MI.getIterator());
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for (; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (!Visited.contains(&MINext)) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);
      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: " << AnchorAddr.Offset
                      << "\n\n");

    // Instead of moving up, re-compute the anchor instruction's base address
    // and reuse it for the whole group.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);

    for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = OtherOffset - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM))
        updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}
void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(CI)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}
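// Scan [Begin, End) and group mergeable memory instructions into one list
// per base address; returns the end of the scanned section and whether
// offset promotion modified anything.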
std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potential mergeable instructions into lists, one per base address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // This can produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat ordered memory references and unmodeled side effects as barriers;
    // the search resumes after the barrier in a separate merge list.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with the "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // Skip ds_write with AGPR data so that later copy propagation cannot
      // create an illegal instruction with mixed VGPR and AGPR sources.
      continue;
    }

    addInstToMergeableList(CI, MergeableInsts);
  }

  // Discard singleton lists and sort the rest by offset so that mergeable
  // instructions end up adjacent.
  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      I = MergeableInsts.erase(I);
      continue;
    }

    MergeList.sort(
        [](const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return {BlockI, Modified};
}
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo>> &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // If we were unable to make any changes, the list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes; if no further opportunities remain, drop the list,
    // otherwise queue another round.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}
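// Walk a sorted merge list pairwise; each successful merge replaces CI's
// instruction with the wider one and removes Paired from the list.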
bool SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
    if (!Where) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);

    MachineBasicBlock::iterator NewMI;
    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ:
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      break;
    case DS_WRITE:
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
      break;
    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:
    case S_LOAD_IMM:
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_LOAD:
    case GLOBAL_LOAD:
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_STORE:
    case GLOBAL_STORE:
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}
  TRI = &TII->getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: collect the instructions we know how to merge in this
      // section of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);