#define DEBUG_TYPE "aarch64-ldst-opt"
STATISTIC(NumPairCreated,
          "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store from unscaled generated");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");

// The LdStLimit limits how far we search for load/store pairs.
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

// The UpdateLimit limits how far we search for update instructions when we
// form pre-/post-index instructions.
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit",
                                     cl::init(100), cl::Hidden);

// Enable register renaming to find additional store pairing opportunities.
static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
                                    cl::init(true), cl::Hidden);

DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
              "Controls which pairs are considered for renaming");
#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
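
// Illustrative sketches (not from the pass itself) of the three rewrites the
// statistics above count, assuming adjacent accesses off the same base:
//
//   ldr x0, [x2]          ldr x1, [x2, #8]      =>  ldp x0, x1, [x2]
//   strb wzr, [sp]        strb wzr, [sp, #1]    =>  strh wzr, [sp]
//   add x3, x3, #16       ldr x0, [x3]          =>  ldr x0, [x3, #16]!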
using LdStPairFlags = struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with a
  // pair-wise insert of the two instructions.
  bool MergeForward = false;

  // Index of the result that must be sign-extended after pairing, or -1 if
  // no extension is needed (inferred from the accessors below).
  int SExtIdx = -1;

  // If set, the physical register the paired store's source register can be
  // renamed to, enabling a forward merge that would otherwise clobber it.
  std::optional<MCPhysReg> RenameReg;

  LdStPairFlags() = default;

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }

  void setRenameReg(MCPhysReg R) { RenameReg = R; }
  void clearRenameReg() { RenameReg = std::nullopt; }
  std::optional<MCPhysReg> getRenameReg() const { return RenameReg; }
};
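
// A hypothetical use of LdStPairFlags, sketching how the driver threads it
// between the scan and the merge (names mirror the declarations below):
//
//   LdStPairFlags Flags;
//   MachineBasicBlock::iterator Paired =
//       findMatchingInsn(MBBI, Flags, LdStLimit, /*FindNarrowMerge=*/false);
//   if (Paired != E)
//     MBBI = mergePairedInsns(MBBI, Paired, Flags);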
struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;

  // ... (pass state elided: AA, TII, TRI, Subtarget, and the LiveRegUnits
  //      trackers ModifiedRegUnits, UsedRegUnits and DefinedInBB)

  // Scan for a load/store that can be combined with the current instruction.
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit,
                                               bool FindNarrowMerge);

  // Merge two adjacent zero stores into a single wider store.
  MachineBasicBlock::iterator
  mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                        MachineBasicBlock::iterator MergeMI,
                        const LdStPairFlags &Flags);

  // Merge the two instructions indicated into a single pair-wise instruction.
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Scan forward for a base-register update that can be folded into the
  // memory instruction as a pre-/post-indexed form.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                int UnscaledOffset, unsigned Limit);

  // Check whether MI updates BaseReg by Offset in a way that can be merged
  // with the memory instruction MemMI.
  bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                            unsigned BaseReg, int Offset);

  // ... (remaining members elided)

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }
};
char AArch64LoadStoreOpt::ID = 0;
static bool isNarrowStore(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return true;
  }
}
// These instruction set memory tag and either keep memory contents unchanged
// or set it to zero, ignoring the address part of the source register.
static bool isTagStore(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::STGi:
  case AArch64::STZGi:
  case AArch64::ST2Gi:
  case AArch64::STZ2Gi:
    return true;
  }
}
static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return std::numeric_limits<unsigned>::max();
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRDpre:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRQpre:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
  case AArch64::STRWui:
  case AArch64::STRWpre:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STRXpre:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRDpre:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRQpre:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRWpre:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::LDRXpre:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRSpre:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSpre:
    return Opc;
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  }
}
static unsigned getMatchingWideOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no wide equivalent!");
  case AArch64::STRBBui:
    return AArch64::STRHHui;
  case AArch64::STRHHui:
    return AArch64::STRWui;
  case AArch64::STURBBi:
    return AArch64::STURHHi;
  case AArch64::STURHHi:
    return AArch64::STURWi;
  case AArch64::STURWi:
    return AArch64::STURXi;
  case AArch64::STRWui:
    return AArch64::STRXui;
  }
}
static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRSpre:
    return AArch64::STPSpre;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRDpre:
    return AArch64::STPDpre;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  case AArch64::STRQpre:
    return AArch64::STPQpre;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRWpre:
    return AArch64::STPWpre;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::STRXpre:
    return AArch64::STPXpre;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRSpre:
    return AArch64::LDPSpre;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRDpre:
    return AArch64::LDPDpre;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRQpre:
    return AArch64::LDPQpre;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRWpre:
    return AArch64::LDPWpre;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRXpre:
    return AArch64::LDPXpre;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  }
}
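
// For example (a sketch of the net effect, assuming adjacent scaled offsets):
//
//   ldr x0, [x3]          ; LDRXui, element offset 0
//   ldr x1, [x3, #8]      ; LDRXui, element offset 1
//     =>
//   ldp x0, x1, [x3]      ; LDPXi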
static unsigned isMatchingStore(MachineInstr &LoadInst,
                                MachineInstr &StoreInst) {
  unsigned LdOpc = LoadInst.getOpcode();
  unsigned StOpc = StoreInst.getOpcode();
  switch (LdOpc) {
  default:
    llvm_unreachable("Unsupported load instruction!");
  case AArch64::LDRBBui:
    return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
           StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  case AArch64::LDURBBi:
    return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
           StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  case AArch64::LDRHHui:
    return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
           StOpc == AArch64::STRXui;
  case AArch64::LDURHHi:
    return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
           StOpc == AArch64::STURXi;
  case AArch64::LDRWui:
    return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  case AArch64::LDURWi:
    return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  case AArch64::LDRXui:
    return StOpc == AArch64::STRXui;
  case AArch64::LDURXi:
    return StOpc == AArch64::STURXi;
  }
}
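
// The shapes this accepts pair a load with an earlier store whose width
// covers it, e.g.
//
//   str x0, [sp, #8]      ; 8-byte store
//   ldr w1, [sp, #8]      ; 4-byte load of the stored bytes
//
// which promoteLoadFromStore() below replaces with a register operation
// instead of a memory access.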
static unsigned getPreIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRBBui:
    return AArch64::STRBBpre;
  case AArch64::STRHHui:
    return AArch64::STRHHpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpre;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  case AArch64::LDPSi:
    return AArch64::LDPSpre;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpre;
  case AArch64::LDPDi:
    return AArch64::LDPDpre;
  case AArch64::LDPQi:
    return AArch64::LDPQpre;
  case AArch64::LDPWi:
    return AArch64::LDPWpre;
  case AArch64::LDPXi:
    return AArch64::LDPXpre;
  case AArch64::STPSi:
    return AArch64::STPSpre;
  case AArch64::STPDi:
    return AArch64::STPDpre;
  case AArch64::STPQi:
    return AArch64::STPQpre;
  case AArch64::STPWi:
    return AArch64::STPWpre;
  case AArch64::STPXi:
    return AArch64::STPXpre;
  case AArch64::STGi:
    return AArch64::STGPreIndex;
  case AArch64::STZGi:
    return AArch64::STZGPreIndex;
  case AArch64::ST2Gi:
    return AArch64::ST2GPreIndex;
  case AArch64::STZ2Gi:
    return AArch64::STZ2GPreIndex;
  case AArch64::STGPi:
    return AArch64::STGPpre;
  }
}
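
// Sketch of the pre-index rewrite this table serves, assuming the update and
// the access use the same base register and a matching immediate:
//
//   add x2, x2, #16
//   ldr x0, [x2]          ; LDRXui
//     =>
//   ldr x0, [x2, #16]!    ; LDRXpre: access and base update in one insn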
static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed wise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STRSpost;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STRDpost;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STRQpost;
  case AArch64::STRBBui:
    return AArch64::STRBBpost;
  case AArch64::STRHHui:
    return AArch64::STRHHpost;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STRWpost;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDRQpost;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpost;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpost;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  case AArch64::LDPSi:
    return AArch64::LDPSpost;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpost;
  case AArch64::LDPDi:
    return AArch64::LDPDpost;
  case AArch64::LDPQi:
    return AArch64::LDPQpost;
  case AArch64::LDPWi:
    return AArch64::LDPWpost;
  case AArch64::LDPXi:
    return AArch64::LDPXpost;
  case AArch64::STPSi:
    return AArch64::STPSpost;
  case AArch64::STPDi:
    return AArch64::STPDpost;
  case AArch64::STPQi:
    return AArch64::STPQpost;
  case AArch64::STPWi:
    return AArch64::STPWpost;
  case AArch64::STPXi:
    return AArch64::STPXpost;
  case AArch64::STGi:
    return AArch64::STGPostIndex;
  case AArch64::STZGi:
    return AArch64::STZGPostIndex;
  case AArch64::ST2Gi:
    return AArch64::ST2GPostIndex;
  case AArch64::STZ2Gi:
    return AArch64::STZ2GPostIndex;
  case AArch64::STGPi:
    return AArch64::STGPpost;
  }
}
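
// And the post-index counterpart, where the base update follows the access:
//
//   ldr x0, [x2]          ; LDRXui
//   add x2, x2, #16
//     =>
//   ldr x0, [x2], #16     ; LDRXpost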
static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
  unsigned OpcA = FirstMI.getOpcode();
  unsigned OpcB = MI.getOpcode();

  switch (OpcA) {
  default:
    return false;
  case AArch64::STRSpre:
    return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
  case AArch64::STRDpre:
    return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
  case AArch64::STRQpre:
    return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
  case AArch64::STRWpre:
    return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
  case AArch64::STRXpre:
    return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
  case AArch64::LDRSpre:
    return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
  case AArch64::LDRDpre:
    return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
  case AArch64::LDRQpre:
    return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
  case AArch64::LDRWpre:
    return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
  case AArch64::LDRXpre:
    return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
  }
}
// Returns the scale and offset range of pre/post indexed variants of MI.
static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
                                       int &MinOffset, int &MaxOffset) {
  // ... (derive Scale and the signed offset range from MI's pre/post-indexed
  //      form; paired and tag-store instructions have their own ranges)
}
static MachineOperand &getLdStRegOp(MachineInstr &MI,
                                    unsigned PairedRegOp = 0) {
  assert(PairedRegOp < 2 && "Unexpected register operand idx.");
  bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
  if (IsPreLdSt)
    PairedRegOp += 1;
  unsigned Idx =
      AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
  return MI.getOperand(Idx);
}
static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
                                  MachineInstr &StoreInst,
                                  const AArch64InstrInfo *TII) {
  int LoadSize = TII->getMemScale(LoadInst);
  int StoreSize = TII->getMemScale(StoreInst);
  int UnscaledStOffset =
      TII->hasUnscaledLdStOffset(StoreInst)
          ? AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm()
          : AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() * StoreSize;
  int UnscaledLdOffset =
      TII->hasUnscaledLdStOffset(LoadInst)
          ? AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm()
          : AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() * LoadSize;
  return (UnscaledStOffset <= UnscaledLdOffset) &&
         (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
}
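
// Worked example of the containment check: an 8-byte store at unscaled
// offset 0 covers a 2-byte load at unscaled offset 6 (0 <= 6 and
// 6 + 2 <= 0 + 8), while a load at offset 7 is rejected (7 + 2 > 8).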
static bool isPromotableZeroStoreInst(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
          isNarrowStore(Opc)) &&
         getLdStRegOp(MI).getReg() == AArch64::WZR;
}
static bool isPromotableLoadFromStore(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  // Scaled instructions.
  case AArch64::LDRBBui:
  case AArch64::LDRHHui:
  case AArch64::LDRWui:
  case AArch64::LDRXui:
  // Unscaled instructions.
  case AArch64::LDURBBi:
  case AArch64::LDURHHi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
    return true;
  }
}
static bool isMergeableLdStUpdate(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  // Scaled instructions.
  case AArch64::STRSui:
  case AArch64::STRDui:
  case AArch64::STRQui:
  case AArch64::STRXui:
  case AArch64::STRWui:
  case AArch64::STRHHui:
  case AArch64::STRBBui:
  case AArch64::LDRSui:
  case AArch64::LDRDui:
  case AArch64::LDRQui:
  case AArch64::LDRXui:
  case AArch64::LDRWui:
  case AArch64::LDRHHui:
  case AArch64::LDRBBui:
  // Tag stores.
  case AArch64::STGi:
  case AArch64::STZGi:
  case AArch64::ST2Gi:
  case AArch64::STZ2Gi:
  case AArch64::STGPi:
  // Unscaled instructions.
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  // Paired instructions.
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    // Make sure this is a reg+imm (as opposed to an address reloc).
    if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
      return false;
    return true;
  }
}
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                                           MachineBasicBlock::iterator MergeMI,
                                           const LdStPairFlags &Flags) {
  assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
         "Expected promotable zero stores.");
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further.
  if (NextI == MergeMI)
    NextI = next_nodbg(NextI, E);

  unsigned Opc = I->getOpcode();
  bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
  int OffsetStride = IsScaled ? 1 : TII->getMemScale(*I);

  bool MergeForward = Flags.getMergeForward();
  // ... (select the insertion point and base-register operand, and compute
  //      the merged offset OffsetImm from the lower of the two offsets)
  assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge");

  // Construct the new instruction: one store of the wider zero register.
  MachineInstrBuilder MIB;
  MIB = BuildMI(*InsertionPoint->getParent(), InsertionPoint,
                InsertionPoint->getDebugLoc(),
                TII->get(getMatchingWideOpcode(Opc)))
            .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
            // ... (base register and scaled offset operands elided)
            .setMIFlags(I->mergeFlagsWith(*MergeMI));
  (void)MIB;

  LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
  // ...
  // Erase the old instructions.
  I->eraseFromParent();
  MergeMI->eraseFromParent();
  return NextI;
}
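
// Sketch of the rewrite above for two adjacent byte stores of zero:
//
//   strb wzr, [sp, #4]
//   strb wzr, [sp, #5]
//     =>
//   strh wzr, [sp, #4]    ; via getMatchingWideOpcode(STRBBui)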
// Apply Fn to all instructions from MI up to (and including) the next
// definition of DefReg, bailing out after Limit instructions.
static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
                              const TargetRegisterInfo *TRI, unsigned Limit,
                              std::function<bool(MachineInstr &, bool)> &Fn) {
  auto MBB = MI.getParent();
  for (MachineInstr &I :
       instructionsWithoutDebug(MI.getIterator(), MBB->instr_end())) {
    if (!Limit)
      return false;
    --Limit;

    bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) {
      return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
             TRI->regsOverlap(MOP.getReg(), DefReg);
    });
    if (!Fn(I, isDef))
      return false;
    if (isDef)
      break;
  }
  return true;
}

static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
                                   const TargetRegisterInfo *TRI) {
  // First drop the units of killed uses, then add the units defined by MI.
  for (const MachineOperand &MOP : phys_regs_and_masks(MI))
    if (MOP.isReg() && MOP.isKill())
      Units.removeReg(MOP.getReg());

  for (const MachineOperand &MOP : phys_regs_and_masks(MI))
    if (MOP.isReg() && !MOP.isKill())
      Units.addReg(MOP.getReg());
}
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further.
  if (NextI == Paired)
    NextI = next_nodbg(NextI, E);

  int SExtIdx = Flags.getSExtIdx();
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
  int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1;

  bool MergeForward = Flags.getMergeForward();

  std::optional<MCPhysReg> RenameReg = Flags.getRenameReg();
  if (MergeForward && RenameReg) {
    MCPhysReg RegToRename = getLdStRegOp(*I).getReg();
    DefinedInBB.addReg(*RenameReg);

    // Return the sub/super register of *RenameReg that matches the register
    // class of OriginalReg.
    auto GetMatchingSubReg = [this, RenameReg](MCPhysReg OriginalReg) {
      for (MCPhysReg SubOrSuper : TRI->sub_and_superregs_inclusive(*RenameReg))
        if (TRI->getMinimalPhysRegClass(OriginalReg) ==
            TRI->getMinimalPhysRegClass(SubOrSuper))
          return SubOrSuper;
      llvm_unreachable("Should have found matching sub or super register!");
    };

    std::function<bool(MachineInstr &, bool)> UpdateMIs =
        [this, RegToRename, GetMatchingSubReg](MachineInstr &MI, bool IsDef) {
          if (IsDef) {
            bool SeenDef = false;
            for (auto &MOP : MI.operands()) {
              // Rename the first explicit definition and all implicit
              // defs/uses that overlap RegToRename.
              if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                  (!SeenDef || (MOP.isDef() && MOP.isImplicit())) &&
                  TRI->regsOverlap(MOP.getReg(), RegToRename)) {
                assert((MOP.isImplicit() ||
                        (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
                       "Need renamable operands");
                MOP.setReg(GetMatchingSubReg(MOP.getReg()));
                SeenDef = true;
              }
            }
          } else {
            for (auto &MOP : MI.operands()) {
              if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                  TRI->regsOverlap(MOP.getReg(), RegToRename)) {
                assert((MOP.isImplicit() ||
                        (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
                       "Need renamable operands");
                MOP.setReg(GetMatchingSubReg(MOP.getReg()));
              }
            }
          }
          return true;
        };
    forAllMIsUntilDef(*I, RegToRename, TRI, LdStLimit, UpdateMIs);

#if !defined(NDEBUG)
    // Make sure the register used for renaming is not used between the
    // paired instructions; that would trash the content before the new
    // paired instruction.
    for (auto &MI : make_range(std::next(I), std::next(Paired)))
      assert(all_of(MI.operands(),
                    [this, &RenameReg](const MachineOperand &MOP) {
                      return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
                             !TRI->regsOverlap(MOP.getReg(), *RenameReg);
                    }) &&
             "Rename register used between paired instruction, trashing the "
             "content");
#endif
  }

  // Scale the paired offset into the units of this instruction's offset if
  // the two disagree on scaling.
  int Offset = AArch64InstrInfo::getLdStOffsetOp(*I).getImm();
  int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(*Paired).getImm();
  bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode());
  if (IsUnscaled != PairedIsUnscaled) {
    int MemSize = TII->getMemScale(*Paired);
    if (PairedIsUnscaled) {
      assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
             "Offset should be a multiple of the stride!");
      PairedOffset /= MemSize;
    } else {
      PairedOffset *= MemSize;
    }
  }

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI, *Rt2MI;
  if (Offset == PairedOffset + OffsetStride &&
      !AArch64InstrInfo::isPreLdSt(*I)) {
    RtMI = &*Paired;
    Rt2MI = &*I;
    // We swapped the two instructions, so the index of the sign-extended
    // result swaps as well.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = &*I;
    Rt2MI = &*Paired;
  }

  // Scale the immediate offset for the paired opcode, which is always scaled.
  int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm();
  if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) {
    assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
           "Unscaled offset cannot be scaled.");
    OffsetImm /= TII->getMemScale(*RtMI);
  }

  // Kill flags may become invalid when moving stores for pairing.
  MachineOperand RegOp0 = getLdStRegOp(*RtMI);
  MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
  if (RegOp0.isUse()) {
    if (MergeForward) {
      // Clear kill flags of the first store's register between the store and
      // the paired position.
      Register Reg = getLdStRegOp(*I).getReg();
      for (MachineInstr &MI : make_range(std::next(I), std::next(Paired)))
        MI.clearRegisterKills(Reg, TRI);
    }
    // ... (otherwise fix up kill flags when moving the store upwards)
  }

  // Construct the new paired instruction.
  MachineBasicBlock *MBB = I->getParent();
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  DebugLoc DL = I->getDebugLoc();
  MachineInstrBuilder MIB =
      BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
          .add(RegOp0)
          .add(RegOp1)
          // ... (base register and OffsetImm operands elided)
          .setMIFlags(I->mergeFlagsWith(*Paired));

  LLVM_DEBUG(
      dbgs() << "Creating pair load/store. Replacing instructions:\n ");

  if (SExtIdx != -1) {
    // The paired load produces a 32-bit result that must be sign-extended to
    // 64 bits: retarget the LDP result to the W sub-register, then insert a
    // KILL for the verifier and an SBFMXri (equivalent to sxtw).
    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
    Register DstRegX = DstMO.getReg();
    Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
    DstMO.setReg(DstRegW);
    BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
        .addReg(DstRegW)
        .addReg(DstRegX, RegState::Define);
    BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
        .addReg(DstRegX)
        .addImm(0)
        .addImm(31);
  }

  if (MergeForward)
    for (const MachineOperand &MOP : phys_regs_and_masks(*I))
      if (MOP.isReg() && MOP.isKill())
        DefinedInBB.addReg(MOP.getReg());

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();
  return NextI;
}
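
// Sketch of the sign-extend repair above: pairing a sign-extending load with
// a plain 32-bit load loses the extension, so it is re-materialized:
//
//   ldrsw x0, [x2]            ; LDRSWui
//   ldr   w1, [x2, #4]        ; LDRWui
//     =>
//   ldp   w0, w1, [x2]        ; LDPWi (via getMatchingNonSExtOpcode)
//   sbfm  x0, x0, #0, #31     ; re-apply the sign extension (sxtw x0, w0)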
MachineBasicBlock::iterator
AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                                          MachineBasicBlock::iterator StoreI) {
  MachineBasicBlock::iterator NextI =
      next_nodbg(LoadI, LoadI->getParent()->end());

  int LoadSize = TII->getMemScale(*LoadI);
  int StoreSize = TII->getMemScale(*StoreI);
  Register LdRt = getLdStRegOp(*LoadI).getReg();
  Register StRt = getLdStRegOp(*StoreI).getReg();
  bool IsStoreXReg =
      TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
  assert((IsStoreXReg ||
          TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
         "Unexpected RegClass");

  if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
    // The load reads exactly the stored value: if it already targets the
    // stored register, just delete it (clearing stale kill flags first).
    if (StRt == LdRt && LoadSize == 8) {
      for (MachineInstr &MI :
           make_range(StoreI->getIterator(), LoadI->getIterator())) {
        if (MI.killsRegister(StRt, TRI)) {
          MI.clearRegisterKills(StRt, TRI);
          break;
        }
      }
      // ...
      LoadI->eraseFromParent();
      return NextI;
    }
    // Otherwise replace the load with a register move (ORR from the zero
    // register).
    BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
            TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
        .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
        // ... (stored register and shift operands elided)
        ;
  } else {
    // The load reads a strict subset of the stored bytes; the byte offsets
    // below assume a little-endian layout.
    if (!Subtarget->isLittleEndian())
      return NextI;
    bool IsUnscaled = TII->hasUnscaledLdStOffset(*LoadI);
    assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
           "Unsupported ld/st match");
    assert(LoadSize <= StoreSize && "Invalid load size");
    int UnscaledLdOffset =
        IsUnscaled
            ? AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm()
            : AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() * LoadSize;
    int UnscaledStOffset =
        IsUnscaled
            ? AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm()
            : AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() * StoreSize;
    int Width = LoadSize * 8;
    Register DestReg =
        IsStoreXReg ? Register(TRI->getMatchingSuperReg(
                          LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))
                    : LdRt;

    assert((UnscaledLdOffset >= UnscaledStOffset &&
            (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
           "Invalid offset");

    int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
    int Imms = Immr + Width - 1;
    if (UnscaledLdOffset == UnscaledStOffset) {
      // Load of the low bytes: a masking AND suffices.
      uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
                                | (Immr << 6)                 // immr
                                | (Imms << 0);                // imms
      BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
              TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
              DestReg)
          // ... (source register and AndMaskEncoded operands elided)
          ;
    } else {
      // Load of interior bytes: extract them with an unsigned bitfield move.
      BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
              TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
              DestReg)
          // ... (source register, Immr and Imms operands elided)
          ;
    }
  }

  // Clear kill flags between store and load.
  for (MachineInstr &MI :
       make_range(StoreI->getIterator(), LoadI->getIterator())) {
    if (MI.killsRegister(StRt, TRI)) {
      MI.clearRegisterKills(StRt, TRI);
      break;
    }
  }
  // ...
  LoadI->eraseFromParent();
  return NextI;
}
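
// Worked example of the bitfield extract above: for
//
//   str  w0, [sp]             ; 4-byte store at unscaled offset 0
//   ldrh w1, [sp, #2]         ; 2-byte load at unscaled offset 2
//
// Immr = 8 * (2 - 0) = 16 and Imms = 16 + 16 - 1 = 31, so the load becomes
// "ubfm w1, w0, #16, #31", i.e., w1 = bits [31:16] of the stored value.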
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair instructions.
  if (IsUnscaled) {
    if (Offset % OffsetStride)
      return false;
    Offset /= OffsetStride;
  }
  // The paired instructions take a signed 7-bit scaled immediate.
  return Offset <= 63 && Offset >= -64;
}

// Alignment, specialized to powers of 2 and signed ints.
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}
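
// For instance, alignTo(5, 4) = (5 + 3) & ~3 = 8, while an already-aligned
// value is unchanged: alignTo(8, 4) = 8. The imm7 bound in inBoundsForPair
// likewise means a scaled pair offset of 64 elements is just out of range.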
bool AArch64LoadStoreOpt::findMatchingStore(
    MachineBasicBlock::iterator I, unsigned Limit,
    MachineBasicBlock::iterator &StoreI) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator MBBI = I;
  // ...
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  unsigned Count = 0;
  do {
    MBBI = prev_nodbg(MBBI, B);
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since
    // there may be different numbers of them if e.g. debug information is
    // present.
    if (!MI.isTransient())
      ++Count;

    // ... (accept a store that covers the load; otherwise track register
    //      units modified/used by MI)

    // If the base register is modified, this can't be a candidate.
    if (!ModifiedRegUnits.available(BaseReg))
      return false;
    // ...
  } while (MBBI != B && Count < Limit);
  return false;
}
static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
                                       LdStPairFlags &Flags,
                                       const AArch64InstrInfo *TII) {
  // If this is volatile or if pairing is suppressed, not a candidate.
  if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
    return false;

  // We should have already checked FirstMI for pair suppression and
  // volatility.
  assert(!FirstMI.hasOrderedMemoryRef() &&
         !TII->isLdStPairSuppressed(FirstMI) &&
         "FirstMI shouldn't get here if either of these checks are true.");
  // ...
  unsigned OpcA = FirstMI.getOpcode();
  unsigned OpcB = MI.getOpcode();

  // Opcodes match: nothing more to check.
  if (OpcA == OpcB)
    return !AArch64InstrInfo::isPreLdSt(FirstMI);

  // Try to match a sign-extended load/store with a zero-extended load/store.
  bool IsValidLdStrOpc, PairIsValidLdStrOpc;
  unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
  assert(IsValidLdStrOpc &&
         "Given Opc should be a Load or Store with an immediate");
  // OpcA will be the first instruction in the pair.
  if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
    Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
    return true;
  }

  // If the second instruction isn't even a mergable/pairable load/store,
  // bail out.
  if (!PairIsValidLdStrOpc)
    return false;

  // Narrow stores do not mix with the remaining cases.
  if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
    return false;
  // ...
  // Try to match an unscaled load/store with a scaled load/store.
  return TII->hasUnscaledLdStOffset(OpcA) != TII->hasUnscaledLdStOffset(OpcB) &&
         getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
}
static bool canRenameUpToDef(MachineInstr &FirstMI,
                             LiveRegUnits &UsedInBetween,
                             SmallPtrSetImpl<const TargetRegisterClass *>
                                 &RequiredClasses,
                             const TargetRegisterInfo *TRI) {
  // ...
  auto RegToRename = getLdStRegOp(FirstMI).getReg();
  // For now, we only rename if the store operand gets killed at the store.
  if (!getLdStRegOp(FirstMI).isKill() &&
      !any_of(FirstMI.operands(),
              [TRI, RegToRename](const MachineOperand &MOP) {
                return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                       MOP.isImplicit() && MOP.isKill() &&
                       TRI->regsOverlap(RegToRename, MOP.getReg());
              })) {
    LLVM_DEBUG(dbgs() << "  Operand not killed at " << FirstMI << "\n");
    return false;
  }

  auto canRenameMOP = [TRI](const MachineOperand &MOP) {
    if (MOP.isReg()) {
      auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg());
      // Renaming registers with multiple disjunct sub-registers (e.g. the
      // result of a LD3) means that all sub-registers are renamed,
      // potentially impacting instructions we did not check. Bail out.
      if (RegClass->HasDisjunctSubRegs) {
        LLVM_DEBUG(
            dbgs()
            << "  Cannot rename operands with multiple disjunct subregisters ("
            << MOP << ")\n");
        return false;
      }
    }
    return MOP.isImplicit() ||
           (MOP.isRenamable() && !MOP.isEarlyClobber());
  };

  bool FoundDef = false;

  // Walk forward from FirstMI to the next definition of the register,
  // checking that every touched operand can be renamed.
  std::function<bool(MachineInstr &, bool)> CheckMIs =
      [&](MachineInstr &MI, bool IsDef) {
        // Frame-setup instructions carry unwind info tied to the register.
        if (MI.getFlag(MachineInstr::FrameSetup)) {
          LLVM_DEBUG(dbgs()
                     << "  Cannot rename framesetup instructions currently ("
                     << MI << ")\n");
          return false;
        }
        // Pseudo instructions may expand to uses we cannot see here.
        if (MI.isPseudo()) {
          // ...
        }
        if (IsDef) {
          FoundDef = true;
          for (auto &MOP : MI.operands()) {
            if (!MOP.isReg() || !MOP.getReg() || MOP.isDebug() ||
                !TRI->regsOverlap(MOP.getReg(), RegToRename))
              continue;
            if (!canRenameMOP(MOP)) {
              LLVM_DEBUG(dbgs()
                         << "  Cannot rename " << MOP << " in " << MI << "\n");
              return false;
            }
            RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
          }
        } else {
          for (auto &MOP : MI.operands()) {
            if (!MOP.isReg() || !MOP.getReg() || MOP.isDebug() ||
                !TRI->regsOverlap(MOP.getReg(), RegToRename))
              continue;
            if (!canRenameMOP(MOP)) {
              LLVM_DEBUG(dbgs()
                         << "  Cannot rename " << MOP << " in " << MI << "\n");
              return false;
            }
            RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
          }
        }
        return true;
      };

  if (!forAllMIsUntilDef(FirstMI, RegToRename, TRI, LdStLimit, CheckMIs))
    return false;

  if (!FoundDef) {
    LLVM_DEBUG(dbgs() << "  Did not find definition for register in BB\n");
    return false;
  }
  return true;
}

// Try to find a register to rename RegToRename to: it must not be live or
// reserved, no sub- or super-register may be callee-saved, and it must be
// usable for all register classes required at the rename points.
static std::optional<MCPhysReg> tryToFindRegisterToRename(
    const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
    LiveRegUnits &UsedInBetween,
    SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
    const TargetRegisterInfo *TRI) {
  const MachineRegisterInfo &RegInfo = MF.getRegInfo();

  // Checks if any sub- or super-register of PR is callee saved.
  auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
    return any_of(TRI->sub_and_superregs_inclusive(PR),
                  [&MF, TRI](MCPhysReg SubOrSuper) {
                    return TRI->isCalleeSavedPhysReg(SubOrSuper, MF);
                  });
  };

  // Check if PR or one of its sub- or super-registers can be used for all
  // required register classes.
  auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
    return all_of(RequiredClasses, [PR, TRI](const TargetRegisterClass *C) {
      return any_of(TRI->sub_and_superregs_inclusive(PR),
                    [C, TRI](MCPhysReg SubOrSuper) {
                      return C == TRI->getMinimalPhysRegClass(SubOrSuper);
                    });
    });
  };

  auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
  for (const MCPhysReg &PR : *RegClass) {
    if (DefinedInBB.available(PR) && UsedInBetween.available(PR) &&
        !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
        CanBeUsedForAllClasses(PR)) {
      DefinedInBB.addReg(PR);
      LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
                        << "\n");
      return {PR};
    }
  }
  LLVM_DEBUG(dbgs() << "No rename register found from "
                    << TRI->getRegClassName(RegClass) << "\n");
  return std::nullopt;
}
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit,
                                      bool FindNarrowMerge) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineBasicBlock::iterator MBBIWithRenameReg;
  MachineInstr &FirstMI = *I;
  MBBI = next_nodbg(MBBI, E);

  bool MayLoad = FirstMI.mayLoad();
  unsigned Opc = FirstMI.getOpcode();
  bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI);
  Register Reg = getLdStRegOp(FirstMI).getReg();
  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg();
  int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm();
  int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
  bool IsNarrowStore = isNarrowStore(Opc);

  std::optional<bool> MaybeCanRename;
  if (!EnableRenaming)
    MaybeCanRename = {false};

  SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
  LiveRegUnits UsedInBetween;
  UsedInBetween.init(*TRI);

  Flags.clearRenameReg();

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  for (unsigned Count = 0; MBBI != E && Count < Limit;
       MBBI = next_nodbg(MBBI, E)) {
    MachineInstr &MI = *MBBI;
    UsedInBetween.accumulate(MI);

    // Don't count transient instructions towards the search limit since
    // there may be different numbers of them if e.g. debug information is
    // present.
    if (!MI.isTransient())
      ++Count;

    Flags.setSExtIdx(-1);
    if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
        AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
      assert(MI.mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to
      // see if the base and offset are compatible with our starting
      // instruction. These instructions all have scaled immediate operands,
      // so we just convert the unscaled offsets here.
      Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
      int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
      bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
      if (IsUnscaled != MIIsUnscaled) {
        // We're trying to pair instructions that differ in how they are
        // scaled.
        int MemSize = TII->getMemScale(MI);
        if (MIIsUnscaled) {
          // If the unscaled offset isn't a multiple of the MemSize, we
          // can't pair the operations together.
          if (MIOffset % MemSize) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
          MIOffset /= MemSize;
        } else {
          MIOffset *= MemSize;
        }
      }

      bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);

      if (BaseReg == MIBaseReg) {
        // A pre-ld/st pair candidate must have the access exactly one
        // base-update away, with an untouched base register, and a transfer
        // register distinct from the base.
        if (IsPreLdSt) {
          bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
          bool IsBaseRegUsed = !UsedRegUnits.available(
              AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          bool IsBaseRegModified = !ModifiedRegUnits.available(
              AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          bool IsMIRegTheSame =
              TRI->regsOverlap(getLdStRegOp(MI).getReg(),
                               AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
              IsMIRegTheSame) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        } else {
          // If the resultant immediate offset of merging these instructions
          // is out of range for a pairwise instruction, bail and keep
          // looking.
          if ((Offset != MIOffset + OffsetStride) &&
              (Offset + OffsetStride != MIOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        }

        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        if (FindNarrowMerge) {
          // If the alignment requirements of the scaled wide load/store
          // instruction can't express the offset for the scaled narrow
          // load/store, bail and keep looking.
          if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
              (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) {
            // ... (track registers and continue)
            continue;
          }
        } else {
          // Pairwise instructions have a 7-bit signed scaled offset; bail
          // if the unscaled pair cannot be expressed once scaled.
          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
            // ... (track registers and continue)
            continue;
          }
        }
        // ...
        // If the Rt of the second instruction was not modified or used
        // between the two and none of the instructions in between alias
        // with the second, combine the second into the first.
        if (ModifiedRegUnits.available(getLdStRegOp(MI).getReg()) &&
            !(MI.mayLoad() &&
              !UsedRegUnits.available(getLdStRegOp(MI).getReg())) &&
            !mayAlias(MI, MemInsns, AA)) {
          Flags.setMergeForward(false);
          Flags.clearRenameReg();
          return MBBI;
        }
        // Likewise, combine the first into the second when the symmetric
        // conditions hold.
        if (!(MayLoad &&
              !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) &&
            !mayAlias(FirstMI, MemInsns, AA)) {
          if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) {
            Flags.setMergeForward(true);
            Flags.clearRenameReg();
            return MBBI;
          }
          // The forward merge is blocked on the register: see if renaming
          // would unblock it, and remember the candidate.
          if (DebugCounter::shouldExecute(RegRenamingCounter)) {
            if (!MaybeCanRename)
              MaybeCanRename = {canRenameUpToDef(FirstMI, UsedInBetween,
                                                 RequiredClasses, TRI)};
            if (*MaybeCanRename) {
              std::optional<MCPhysReg> MaybeRenameReg =
                  tryToFindRegisterToRename(*FirstMI.getMF(), Reg,
                                            DefinedInBB, UsedInBetween,
                                            RequiredClasses, TRI);
              if (MaybeRenameReg) {
                Flags.setRenameReg(*MaybeRenameReg);
                Flags.setMergeForward(true);
                MBBIWithRenameReg = MBBI;
              }
            }
          }
        }
        // Unable to combine these instructions due to interference in
        // between. Keep looking.
      }
    }

    if (Flags.getRenameReg())
      return MBBIWithRenameReg;

    // If the instruction wasn't a matching load or store, update the
    // tracked register units and bail if the base register was modified.
    // ...
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
    if (!ModifiedRegUnits.available(BaseReg))
      return E;

    // Update list of instructions that read/write memory.
    if (MI.mayLoadOrStore())
      MemInsns.push_back(&MI);
  }
  return E;
}
// If the store in a frame setup/destroy sequence has a CFI instruction
// describing it, that CFI must move together with the store.
static MachineBasicBlock::iterator
maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
  auto End = MI.getParent()->end();
  if (MaybeCFI == End ||
      MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
      !(MI.getFlag(MachineInstr::FrameSetup) ||
        MI.getFlag(MachineInstr::FrameDestroy)) ||
      AArch64InstrInfo::getLdStBaseOp(MI).getReg() != AArch64::SP)
    return End;

  // Only CFA-related directives move with the store.
  unsigned CFIIndex = MaybeCFI->getOperand(0).getCFIIndex();
  switch (MI.getMF()->getFrameInstructions()[CFIIndex].getOperation()) {
  case MCCFIInstruction::OpDefCfa:
  case MCCFIInstruction::OpDefCfaOffset:
    return MaybeCFI;
  default:
    return End;
  }
}
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
                                     MachineBasicBlock::iterator Update,
                                     bool IsPreIdx) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further.
  if (NextI == Update)
    NextI = next_nodbg(NextI, E);

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                             : getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB;
  int Scale, MinOffset, MaxOffset;
  getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
  if (!AArch64InstrInfo::isPairedLdSt(*I)) {
    // Non-paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              /* ... (base-register def, transfer register, base register and
                      Value / Scale offset operands elided) */;
  } else {
    // Paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              /* ... (as above, with both transfer registers) */;
  }
  (void)MIB;
  // ...

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}
bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
                                               MachineInstr &MI,
                                               unsigned BaseReg, int Offset) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI.getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
      break;

    // The update instruction source and destination register must be the
    // same as the load/store base register.
    if (MI.getOperand(0).getReg() != BaseReg ||
        MI.getOperand(1).getReg() != BaseReg)
      break;

    int UpdateOffset = MI.getOperand(2).getImm();
    if (MI.getOpcode() == AArch64::SUBXri)
      UpdateOffset = -UpdateOffset;

    // The immediate must be a multiple of the scale of the pre/post-indexed
    // form of this instruction, and fit its signed offset range.
    int Scale, MinOffset, MaxOffset;
    getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);
    if (UpdateOffset % Scale != 0)
      break;

    int ScaledOffset = UpdateOffset / Scale;
    if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
      break;

    // If we have a non-zero Offset, check that it matches the amount we're
    // adding to the register.
    if (!Offset || Offset == UpdateOffset)
      return true;
    break;
  }
  return false;
}
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;

  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
  int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() *
                         TII->getMemScale(MemMI);

  // The update form can only be folded if the memory access has exactly the
  // offset we are looking for.
  if (MIUnscaledOffset != UnscaledOffset)
    return E;

  // If the base register overlaps a destination register, we can't merge the
  // update (tag stores are exempt, as ZR acts as the base there).
  if (!isTagStore(MemMI)) {
    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
      Register DestReg = getLdStRegOp(MemMI, i).getReg();
      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
        return E;
    }
  }

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  MBBI = next_nodbg(MBBI, E);

  // We can't post-increment the stack pointer across an access to the region
  // between the access and the increment.
  const bool BaseRegSP = BaseReg == AArch64::SP;
  // ...

  for (unsigned Count = 0; MBBI != E && Count < Limit;
       MBBI = next_nodbg(MBBI, E)) {
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // If the base register is used or modified (or SP is touched by a memory
    // access), there is no match, so return early.
    if (!ModifiedRegUnits.available(BaseReg) ||
        !UsedRegUnits.available(BaseReg) ||
        (BaseRegSP && MBBI->mayLoadOrStore()))
      return E;
  }
  return E;
}
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;

  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
  int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();

  // A destination register overlapping the base register rules out a merge
  // (tag stores are exempt, as ZR acts as the base there).
  if (!isTagStore(MemMI)) {
    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
      Register DestReg = getLdStRegOp(MemMI, i).getReg();
      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
        return E;
    }
  }

  const bool BaseRegSP = BaseReg == AArch64::SP;
  // ...
  // If the SP pre-increment is larger than the red zone, a load below SP
  // that precedes the increment must not be folded across it.
  unsigned RedZoneSize =
      Subtarget->getTargetLowering()->getRedZoneSize(
          MemMI.getMF()->getFunction());

  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  unsigned Count = 0;
  bool MemAcessBeforeSPPreInc = false;
  do {
    MBBI = prev_nodbg(MBBI, B);
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, check the red-zone constraint before returning.
    if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) {
      if (MemAcessBeforeSPPreInc && MBBI->getOperand(2).getImm() > RedZoneSize)
        return E;
      return MBBI;
    }

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    if (!ModifiedRegUnits.available(BaseReg) ||
        !UsedRegUnits.available(BaseReg))
      return E;

    // Remember memory accesses that precede an SP pre-increment.
    if (BaseRegSP && MBBI->mayLoadOrStore())
      MemAcessBeforeSPPreInc = true;
  } while (MBBI != B && Count < Limit);
  return E;
}
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  // If this is a volatile load, don't mess with it.
  if (MI.hasOrderedMemoryRef())
    return false;
  // ...
  // Look backward up to LdStLimit instructions for a covering store.
  MachineBasicBlock::iterator StoreI;
  if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
    ++NumLoadsFromStoresPromoted;
    // Promote the load; the merge routine reports the next instruction.
    MBBI = promoteLoadFromStore(MBBI, StoreI);
    return true;
  }
  return false;
}
bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
    MachineBasicBlock::iterator &MBBI) {
  assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow zero store.");
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // Look ahead for a mergeable zero store.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator MergeMI =
      findMatchingInsn(MBBI, Flags, LdStLimit, /*FindNarrowMerge=*/true);
  if (MergeMI != E) {
    ++NumZeroStoresPromoted;
    // Merge the stores; the merge routine reports the next instruction.
    MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
    return true;
  }
  return false;
}
// Find loads and stores that can be merged into a single load or store pair
// instruction.
bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // Early exit if the offset cannot be expressed by a paired instruction.
  bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
  int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
  // Allow one more for offset.
  if (Offset > 0)
    Offset -= OffsetStride;
  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return false;

  // Look ahead for a pairable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired =
      findMatchingInsn(MBBI, Flags, LdStLimit, /*FindNarrowMerge=*/false);
  if (Paired != E) {
    ++NumPairCreated;
    if (TII->hasUnscaledLdStOffset(MI))
      ++NumUnscaledPairCreated;
    // Keeping the iterator straight is a pain, so we let the merge routine
    // tell us what the next instruction is after it's done.
    auto Prev = std::prev(MBBI);
    MBBI = mergePairedInsns(MBBI, Paired, Flags);
    // Collect liveness info for instructions between Prev and the new
    // position MBBI.
    for (auto I = std::next(Prev); I != MBBI; I++)
      updateDefinedRegisters(*I, DefinedInBB, TRI);
    return true;
  }
  return false;
}
bool AArch64LoadStoreOpt::tryToMergeLdStUpdate(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();
  MachineBasicBlock::iterator Update;

  // Look forward to try to form a post-index instruction, e.g.
  //   ldr x0, [x20]; add x20, x20, #32  =>  ldr x0, [x20], #32
  Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
  if (Update != E) {
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
    return true;
  }

  // Don't know how to handle unscaled pre/post-index versions below, so bail.
  if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
    return false;

  // Look back to try to find a pre-index instruction, e.g.
  //   add x0, x0, #8; ldr x1, [x0]  =>  ldr x1, [x0, #8]!
  Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
  if (Update != E) {
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  // The immediate in the load/store is scaled by the size of the memory
  // operation; the immediate in the add we're looking for is not, so adjust.
  int UnscaledOffset =
      AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);

  // Look forward to try to find a pre-index instruction, e.g.
  //   ldr x1, [x0, #64]; add x0, x0, #64  =>  ldr x1, [x0, #64]!
  Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
  if (Update != E) {
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  return false;
}
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool EnableNarrowZeroStOpt) {
  bool Modified = false;
  // 1) Promote loads that directly read from earlier stores.
  // ...
  // 2) Merge adjacent zero stores into a wider store.
  if (EnableNarrowZeroStOpt) {
    // ...
  }
  // 3) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  if (MBB.getParent()->getRegInfo().tracksLiveness()) {
    DefinedInBB.clear();
    DefinedInBB.addLiveIns(MBB);
  }
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    // Track currently live registers up to this point, to help with
    // searching for a rename register on demand.
    updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
    if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 4) Fold base-register updates into pre/post-indexed forms.
  // ...
  return Modified;
}
bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  if (skipFunction(Fn.getFunction()))
    return false;

  Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
  TII = Subtarget->getInstrInfo();
  TRI = Subtarget->getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  // Resize the register unit trackers once per function and clear them each
  // time we optimize a load or store.
  ModifiedRegUnits.init(*TRI);
  UsedRegUnits.init(*TRI);
  DefinedInBB.init(*TRI);

  bool Modified = false;
  bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
  for (auto &MBB : Fn) {
    // ...
    Modified |= optimizeBlock(MBB, enableNarrowZeroStOpt);
  }
  return Modified;
}
// createAArch64LoadStoreOptimizationPass - returns an instance of the
// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}