#define DEBUG_TYPE "si-load-store-opt"

  S_BUFFER_LOAD_SGPR_IMM,

  unsigned char NumVAddrs = 0;

const unsigned MaxAddressRegs = 12 + 1 + 1;

  InstClassEnum InstClass;
  int AddrIdx[MaxAddressRegs];
  unsigned NumAddresses;
  bool hasSameBaseAddress(const CombineInfo &CI) {
    if (NumAddresses != CI.NumAddresses)
    for (unsigned i = 0; i < NumAddresses; i++) {
      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
    for (unsigned i = 0; i < NumAddresses; ++i) {
      if (!AddrOp->isReg())
          AddrOp->getReg() != AMDGPU::SGPR_NULL)
      if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
    return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
  struct BaseRegisters {
    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  getTargetRegisterClass(const CombineInfo &CI,
                         const CombineInfo &Paired) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
  Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                           int32_t NewOffset) const;
                             std::list<std::list<CombineInfo>> &MergeableInsts) const;
                             std::list<std::list<CombineInfo>> &MergeableInsts) const;
                             const CombineInfo &Paired);
  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

        .set(MachineFunctionProperties::Property::IsSSA);
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {

  if (TII.isImage(MI)) {
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();

  if (TII.isMTBUF(Opc)) {

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:

  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:

  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:

  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:

  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:

  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  if (TII.isMUBUF(Opc)) {
    case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
    case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
    case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:

    case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
    case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
    case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:

  if (TII.isImage(Opc)) {
    if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||

  if (TII.isMTBUF(Opc)) {
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:

    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
      return TBUFFER_STORE;

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:

  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:

  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:

  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  if (TII.isMUBUF(Opc))
  if (TII.isImage(Opc)) {
    return Info->BaseOpcode;
  if (TII.isMTBUF(Opc))

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
  if (TII.isMUBUF(Opc)) {
      Result.SOffset = true;

  if (TII.isImage(Opc)) {
    if (VAddr0Idx >= 0) {
          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      Result.NumVAddrs = RsrcIdx - VAddr0Idx;

  if (TII.isMTBUF(Opc)) {
      Result.SOffset = true;

  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    Result.SOffset = true;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:

  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
                               const SILoadStoreOptimizer &LSO) {
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    Offset = I->getOperand(OffsetIdx).getImm();

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();

  AddressRegs Regs = getRegs(Opc, *LSO.TII);
  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);

  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
    AddrIdx[NumAddresses++] =
    AddrIdx[NumAddresses++] =
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
    AddrIdx[NumAddresses++] =
    AddrIdx[NumAddresses++] =
    AddrIdx[NumAddresses++] =
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
886 "SI Load Store Optimizer",
false,
false)
891char SILoadStoreOptimizer::
ID = 0;
896 return new SILoadStoreOptimizer();
  for (const auto &Op : MI.operands()) {

bool SILoadStoreOptimizer::canSwapInstructions(
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
  for (const auto &BOp : B.operands()) {
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))

  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())

  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if ((1u << AllowedBitsForMin) <= MinMask)
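  // Illustrative example (added for clarity, not from the original source;
  // it assumes AllowedBitsForMin counts the trailing zero bits of MaxMask,
  // whose computation is not shown in this listing): with CI.DMask = 0xf and
  // Paired.DMask = 0x70, MaxMask = 0x70 has four trailing zeros, and
  // MinMask = 0xf < (1u << 4), so the two dmasks cover disjoint component
  // ranges and the image loads can be merged.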
                                                  unsigned ComponentCount,
  if (ComponentCount > 4)

  return NewFormatInfo->Format;
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  if (CI.Offset == Paired.Offset)

  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;

  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    if (EltOffset0 + CI.Width != EltOffset1 &&
        EltOffset1 + Paired.Width != EltOffset0)
    if (CI.CPol != Paired.CPol)
    if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
        CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
      if (CI.Width != Paired.Width &&
          (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))

  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;

  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;

  uint32_t Min = std::min(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;

  if (isUInt<8>(Max - Min)) {
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
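  // Worked example (illustrative, not from the original source): two
  // DS_READ_B32 at byte offsets 0 and 2048 give EltOffset0 = 0 and
  // EltOffset1 = 512 with EltSize = 4. Both are multiples of 64 and fit in
  // 8 bits after dividing by 64, so they can be encoded as a read2st64 pair
  // with offset0 = 0 and offset1 = 8 (8 * 64 elements * 4 bytes = 2048).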
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))

  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
  if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))

  if (CI.I->mayLoad()) {
    if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
    if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))

  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
void SILoadStoreOptimizer::copyToDestRegs(
    CombineInfo &CI, CombineInfo &Paired,

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
  auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
  Dest0->setIsEarlyClobber(false);
  Dest1->setIsEarlyClobber(false);

      .addReg(DestReg, 0, SubRegIdx0);

SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,

  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
  unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  Register DestReg = MRI->createVirtualRegister(SuperRC);

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)

          .addReg(BaseReg, BaseRegFlags, BaseSubReg)

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
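  // Illustrative before/after (hypothetical MIR, 4-byte elements at byte
  // offsets 0 and 4 from the same base; register names are made up):
  //   %v0:vgpr_32 = DS_READ_B32 %base, 0, 0
  //   %v1:vgpr_32 = DS_READ_B32 %base, 4, 0
  // is rewritten into one paired load plus subregister copies:
  //   %p:vreg_64 = DS_READ2_B32 %base, 0, 1, 0
  //   %v0 = COPY %p.sub0
  //   %v1 = COPY %p.sub1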
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
    CombineInfo &CI, CombineInfo &Paired,

      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)

          .addReg(BaseReg, BaseRegFlags, BaseSubReg)

  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
      MIB.addImm(MergedDMask);
      MIB.add((*CI.I).getOperand(I));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset)
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset)
      .addImm(JoinedFormat)
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  AddressRegs Regs = getRegs(Opcode, *TII);

    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset, Paired.Offset))
      .addImm(JoinedFormat)
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
      .addImm(std::min(CI.Offset, Paired.Offset))
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

  MIB.addImm(std::min(CI.Offset, Paired.Offset))
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
         (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
  case S_BUFFER_LOAD_IMM: {
    bool NeedsConstrainedOpc =
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
  case S_BUFFER_LOAD_SGPR_IMM: {
    bool NeedsConstrainedOpc =
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
    bool NeedsConstrainedOpc =
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX2_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX3_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX4_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX8_IMM;
      return AMDGPU::GLOBAL_LOAD_DWORDX2;
      return AMDGPU::GLOBAL_LOAD_DWORDX3;
      return AMDGPU::GLOBAL_LOAD_DWORDX4;
  case GLOBAL_LOAD_SADDR:
      return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
      return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
      return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
      return AMDGPU::GLOBAL_STORE_DWORDX2;
      return AMDGPU::GLOBAL_STORE_DWORDX3;
      return AMDGPU::GLOBAL_STORE_DWORDX4;
  case GLOBAL_STORE_SADDR:
      return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
      return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
      return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
      return AMDGPU::FLAT_LOAD_DWORDX2;
      return AMDGPU::FLAT_LOAD_DWORDX3;
      return AMDGPU::FLAT_LOAD_DWORDX4;
      return AMDGPU::FLAT_STORE_DWORDX2;
      return AMDGPU::FLAT_STORE_DWORDX3;
      return AMDGPU::FLAT_STORE_DWORDX4;
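  // Note (added for clarity): the merged opcode is picked from the combined
  // dword Width of the two instructions (2, 3, 4 or 8), and for the SMEM
  // classes the constrained "_ec" variant appears to be chosen when
  // NeedsConstrainedOpc is set, i.e. (per the needsConstrainedOpcode fragment
  // above) when the pair does not have a single memory operand whose
  // alignment covers Width * 4 bytes.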
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  assert((CI.InstClass != MIMG ||
                          CI.Width + Paired.Width)) &&

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
  };

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];
    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];

  return {Idx0, Idx1};
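  // Illustrative example (not from the original source): for CI.Width = 2 and
  // Paired.Width = 1 with CI at the lower offset, Idx0 = Idxs[0][1] =
  // sub0_sub1 and Idx1 = Idxs[2][0] = sub2, i.e. CI's value occupies dwords
  // 0-1 of the merged register and Paired's value occupies dword 2.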
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) const {
  if (CI.InstClass == S_BUFFER_LOAD_IMM ||
      CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {
      return &AMDGPU::SReg_64_XEXECRegClass;
      return &AMDGPU::SGPR_96RegClass;
      return &AMDGPU::SGPR_128RegClass;
      return &AMDGPU::SGPR_256RegClass;
      return &AMDGPU::SGPR_512RegClass;

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
  return TRI->isAGPRClass(getDataRegClass(*CI.I))
    CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  AddressRegs Regs = getRegs(Opcode, *TII);

    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset, Paired.Offset))
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  if (TII->isInlineConstant(V))

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
                                        const MemAddress &Addr) const {
             Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");
             Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
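  // Sketch of the rewritten base (illustrative, assuming the usual 64-bit add
  // expansion this code sets up): the low and high halves of Addr.Offset are
  // added to the base with a V_ADD_CO_U32 / V_ADDC_U32 pair through CarryReg,
  // and the two 32-bit results DestSub0 and DestSub1 are recombined into
  // FullDestReg (sub0/sub1) with a REG_SEQUENCE.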
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
std::optional<int32_t>
    return std::nullopt;

  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return std::nullopt;

  return Def->getOperand(1).getImm();
                                                      MemAddress &Addr) const {
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
      Def->getNumOperands() != 5)

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
    if (!(Offset0P = extractConstOffset(*Src1)))

  if (!BaseLo.isReg())

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (!Src1->isImm() || Src0->isImm())

  if (!BaseHi.isReg())

  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MemInfoMap &Visited,

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {

  if (!Visited.contains(&MI)) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);

             << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();

        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())

        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (!Visited.contains(&MINext)) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)

    InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);
      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;

  LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
             AnchorInst->dump());
             << AnchorAddr.Offset << "\n\n");

  updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);

  for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
    AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
      updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
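  // Worked example (illustrative, not from the original source): suppose
  // three global loads address (B + 8192), (B + 8200) and (B + 8208), each
  // materialized with its own 64-bit add and a zero instruction offset.
  // Choosing the access farthest from the current one as the anchor, the
  // other loads can reuse the anchor's base register and encode the
  // remaining difference (e.g. 8 or 16 bytes) in the immediate offset field,
  // letting the redundant address arithmetic be deleted.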
void SILoadStoreOptimizer::addInstToMergeableList(
    const CombineInfo &CI,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(CI)) {
      AddrList.emplace_back(CI);

  MergeableInsts.emplace_back(1, CI);
std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    std::list<std::list<CombineInfo>> &MergeableInsts) const {

  for (; BlockI != End; ++BlockI) {

    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))

    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)

    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())

    CI.setMI(MI, *this);

    if (!CI.hasMergeableAddress(*MRI))

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      I = MergeableInsts.erase(I);

        [](const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo>> &MergeableInsts) {

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      I = MergeableInsts.erase(I);

    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
    OptimizeAgain = true;
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList, bool &OptimizeListAgain) {
  if (MergeList.empty())

  for (auto I = MergeList.begin(), Next = std::next(I);
       Next != MergeList.end(); Next = std::next(I)) {

    if ((*First).Order > (*Second).Order)
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    switch (CI.InstClass) {
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;

    MergeList.erase(Second);
  TRI = &TII->getRegisterInfo();

  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

    bool CollectModified;
    std::list<std::list<CombineInfo>> MergeableInsts;

    std::tie(SectionEnd, CollectModified) =

      OptimizeAgain = false;
    } while (OptimizeAgain);