#define DEBUG_TYPE "si-load-store-opt"

  S_BUFFER_LOAD_SGPR_IMM,

  unsigned char NumVAddrs = 0;

const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer {
    InstClassEnum InstClass;
    int AddrIdx[MaxAddressRegs];
    unsigned NumAddresses;
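
    // Two instructions are only candidates for pairing when every address
    // operand compared below matches: same registers/subregisters, or equal
    // immediates.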
    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
      for (unsigned i = 0; i < NumAddresses; i++) {
        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||

      for (unsigned i = 0; i < NumAddresses; ++i) {
        if (!AddrOp->isReg())
            AddrOp->getReg() != AMDGPU::SGPR_NULL)
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))

      return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
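
    // Holds the 32-bit low/high halves (and subregister indices) of a 64-bit
    // address base; used when re-basing flat/global addresses further below.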
    struct BaseRegisters {
      unsigned LoSubReg = 0;
      unsigned HiSubReg = 0;

  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  getTargetRegisterClass(const CombineInfo &CI,
                         const CombineInfo &Paired) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
  Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,

                           int32_t NewOffset) const;

      std::list<std::list<CombineInfo> > &MergeableInsts) const;
      std::list<std::list<CombineInfo>> &MergeableInsts) const;
                                  const CombineInfo &Paired);
  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);

        .set(MachineFunctionProperties::Property::IsSSA);
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {

  if (TII.isImage(MI)) {
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();

  if (TII.isMTBUF(Opc)) {

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:

  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:

  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:

  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:

  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:

  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
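
// Classify an opcode into one of the InstClassEnum buckets (DS_READ, DS_WRITE,
// S_BUFFER_LOAD_IMM, BUFFER_LOAD, TBUFFER_STORE, ...). Opcodes that map to
// UNKNOWN are skipped by the optimizer.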
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  if (TII.isMUBUF(Opc)) {
    case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
    case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
    case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:

    case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
    case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
    case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:

  if (TII.isImage(Opc)) {
    if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||

  if (TII.isMTBUF(Opc)) {
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:

    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
      return TBUFFER_STORE;

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:

  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:

  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:

  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
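
// Map an opcode to a canonical "subclass" opcode (for example, every
// S_LOAD_DWORDxN_IMM variant maps to S_LOAD_DWORD_IMM) so that only
// instructions of the same flavor are considered for pairing.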
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  if (TII.isMUBUF(Opc))
  if (TII.isImage(Opc)) {
    return Info->BaseOpcode;
  if (TII.isMTBUF(Opc))

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;

SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
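
// getRegs() records which address-forming operands (vaddr, addr, sbase, srsrc,
// soffset, saddr, ssamp, ...) a given opcode carries; CombineInfo::setMI()
// later uses this to collect the operand indices that must match for two
// instructions to share a base address.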
  if (TII.isMUBUF(Opc)) {
      Result.SOffset = true;

  if (TII.isImage(Opc)) {
    if (VAddr0Idx >= 0) {
          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      Result.NumVAddrs = RsrcIdx - VAddr0Idx;

  if (TII.isMTBUF(Opc)) {
      Result.SOffset = true;

  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    Result.SOffset = true;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:

  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
                                             const SILoadStoreOptimizer &LSO) {
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();

  Offset = I->getOperand(OffsetIdx).getImm();

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();

  AddressRegs Regs = getRegs(Opc, *LSO.TII);
  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);

  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
    AddrIdx[NumAddresses++] =
    AddrIdx[NumAddresses++] =
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
    AddrIdx[NumAddresses++] =
    AddrIdx[NumAddresses++] =
    AddrIdx[NumAddresses++] =
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
                    "SI Load Store Optimizer", false, false)

char SILoadStoreOptimizerLegacy::ID = 0;

  return new SILoadStoreOptimizerLegacy();

  for (const auto &Op : MI.operands()) {

bool SILoadStoreOptimizer::canSwapInstructions(
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
  for (const auto &BOp : B.operands()) {
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))

  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())

  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if ((1u << AllowedBitsForMin) <= MinMask)

                                               unsigned ComponentCount,
  if (ComponentCount > 4)
  return NewFormatInfo->Format;
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  if (CI.Offset == Paired.Offset)
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;

  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    if (EltOffset0 + CI.Width != EltOffset1 &&
        EltOffset1 + Paired.Width != EltOffset0)
    if (CI.CPol != Paired.CPol)
    if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
        CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
      if (CI.Width != Paired.Width &&
          (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))

  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;

  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;

  uint32_t Min = std::min(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;

  if (isUInt<8>(Max - Min)) {
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
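
// Illustrative sketch (not from the source): two ds_read_b32 at byte offsets 4
// and 8 have element offsets 1 and 2, which fit the 8-bit offset0/offset1
// fields of ds_read2_b32 directly. If both element offsets are multiples of 64
// (say bytes 0 and 4096 -> elements 0 and 1024), the ST64 form is used and the
// stored offsets are divided by 64 (0 and 16). Otherwise a common base
// (CI.BaseOff) is subtracted so that both remainders fit in 8 bits.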
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))

  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
  if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))

  if (CI.I->mayLoad()) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))

  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
void SILoadStoreOptimizer::copyToDestRegs(
    CombineInfo &CI, CombineInfo &Paired,
  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
  auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
  Dest0->setIsEarlyClobber(false);
  Dest1->setIsEarlyClobber(false);

      .addReg(DestReg, 0, SubRegIdx0);

SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
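
// mergeRead2Pair() replaces two ds_read instructions with a single
// ds_read2/ds_read2st64. If a common BaseOff was factored out of the offsets,
// a new base register is materialized with an add; the two halves of the
// merged result are then copied back into the original destination registers.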
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
  unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  Register DestReg = MRI->createVirtualRegister(SuperRC);

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)

          .addReg(BaseReg, BaseRegFlags, BaseSubReg)

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;

    CombineInfo &CI, CombineInfo &Paired,
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)

          .addReg(BaseReg, BaseRegFlags, BaseSubReg)

  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
      MIB.addImm(MergedDMask);
      MIB.add((*CI.I).getOperand(I));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset)
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset)
      .addImm(JoinedFormat)
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  AddressRegs Regs = getRegs(Opcode, *TII);
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset, Paired.Offset))
      .addImm(JoinedFormat)
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
      .addImm(std::min(CI.Offset, Paired.Offset))
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

  MIB.addImm(std::min(CI.Offset, Paired.Offset))
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

         (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
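
// The constrained "_ec" S_LOAD/S_BUFFER_LOAD variants appear to be selected
// below when needsConstrainedOpcode() reports that the merged access is not
// known to be sufficiently aligned (more than one memory operand, or an
// alignment below Width * 4 bytes).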
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
  case S_BUFFER_LOAD_IMM: {
    bool NeedsConstrainedOpc =
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
  case S_BUFFER_LOAD_SGPR_IMM: {
    bool NeedsConstrainedOpc =
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
    bool NeedsConstrainedOpc =
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX2_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX3_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX4_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX8_IMM;
      return AMDGPU::GLOBAL_LOAD_DWORDX2;
      return AMDGPU::GLOBAL_LOAD_DWORDX3;
      return AMDGPU::GLOBAL_LOAD_DWORDX4;
  case GLOBAL_LOAD_SADDR:
      return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
      return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
      return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
      return AMDGPU::GLOBAL_STORE_DWORDX2;
      return AMDGPU::GLOBAL_STORE_DWORDX3;
      return AMDGPU::GLOBAL_STORE_DWORDX4;
  case GLOBAL_STORE_SADDR:
      return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
      return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
      return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
      return AMDGPU::FLAT_LOAD_DWORDX2;
      return AMDGPU::FLAT_LOAD_DWORDX3;
      return AMDGPU::FLAT_LOAD_DWORDX4;
      return AMDGPU::FLAT_STORE_DWORDX2;
      return AMDGPU::FLAT_STORE_DWORDX3;
      return AMDGPU::FLAT_STORE_DWORDX4;
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  assert((CI.InstClass != MIMG ||
           CI.Width + Paired.Width)) &&

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];
    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];

  return {Idx0, Idx1};
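
// Example (a sketch, not from the source): merging a 1-dword access at the
// lower offset with a 2-dword access above it yields {sub0, sub1_sub2}; when
// the pair is seen in the opposite order, the table is indexed the other way
// around.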
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) const {
  if (CI.InstClass == S_BUFFER_LOAD_IMM ||
      CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {
      return &AMDGPU::SReg_64_XEXECRegClass;
      return &AMDGPU::SGPR_96RegClass;
      return &AMDGPU::SGPR_128RegClass;
      return &AMDGPU::SGPR_256RegClass;
      return &AMDGPU::SGPR_512RegClass;

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
  return TRI->isAGPRClass(getDataRegClass(*CI.I))
    CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  AddressRegs Regs = getRegs(Opcode, *TII);
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset, Paired.Offset))
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  if (TII->isInlineConstant(V))

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), Reg)

                                        const MemAddress &Addr) const {
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getWaveMaskRegClass();
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
                                              int32_t NewOffset) const {
  auto *Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);

std::optional<int32_t>
    return std::nullopt;

  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return std::nullopt;

  return Def->getOperand(1).getImm();
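
// processBaseWithConstOffset() pattern-matches a 64-bit address built as
// REG_SEQUENCE(V_ADD_CO_U32_e64 lo, V_ADDC_U32_e64 hi) and splits it into a
// register base (Addr.Base) plus the constant offset contributed by the
// immediate operands (Addr.Offset).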
                                                    MemAddress &Addr) const {
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
    if (!(Offset0P = extractConstOffset(*Src1)))

  if (!BaseLo.isReg())

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (!Src1->isImm() || Src0->isImm())

  if (!BaseHi.isReg())

  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
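
// promoteConstantOffsetToImm() picks the instruction whose constant offset is
// farthest from the current one as an "anchor", rewrites the current
// instruction relative to the anchor's base, and then re-bases every other
// instruction with the same 64-bit base whose resulting offset is still legal
// for the target's flat addressing mode.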
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI, MemInfoMap &Visited,
  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {

  if (!Visited.contains(&MI)) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);

             << "} Offset: " << MAddr.Offset << "\n\n";);

  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();

        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (!Visited.contains(&MINext)) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)

    InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);
      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;

    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
                      << AnchorAddr.Offset << "\n\n");

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);

    for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
      AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
        updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
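
// Candidate instructions are grouped into per-base-address lists: an
// instruction joins an existing list only when its instruction class, the
// AGPR-ness of its data operand, and every address operand match the list's
// first entry.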
void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
    std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(CI)) {
      AddrList.emplace_back(CI);

  MergeableInsts.emplace_back(1, CI);
std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  for (; BlockI != End; ++BlockI) {

    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))

    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)

    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())

    CI.setMI(MI, *this);

    if (!CI.hasMergeableAddress(*MRI))

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      I = MergeableInsts.erase(I);

              [](const CombineInfo &A, const CombineInfo &B) {
                return A.Offset < B.Offset;
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo> > &MergeableInsts) {
  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      I = MergeableInsts.erase(I);

    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
    OptimizeAgain = true;
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    if ((*First).Order > (*Second).Order)
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    switch (CI.InstClass) {
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;

    MergeList.erase(Second);
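
// Merging is repeated (OptimizeListAgain / OptimizeAgain) until no pair in a
// list can be widened further; e.g. two dword loads become an x2, which may
// then merge with a neighboring x2 into an x4 on a later pass.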
bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
  return SILoadStoreOptimizer(
             &getAnalysis<AAResultsWrapperPass>().getAAResults())

  TRI = &TII->getRegisterInfo();

    bool CollectModified;
    std::list<std::list<CombineInfo>> MergeableInsts;

    std::tie(SectionEnd, CollectModified) =

      OptimizeAgain = false;
    } while (OptimizeAgain);

  bool Changed = SILoadStoreOptimizer(&AA).run(MF);