#include <unordered_map>
#define DEBUG_TYPE "aarch64-simdinstr-opt"
STATISTIC(NumModifiedInstr, "Number of SIMD instructions modified");
#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
  "AArch64 SIMD instructions optimization pass"
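  // Caches of replacement decisions: SIMDInstrTable remembers, per
  // (opcode, CPU) pair, whether a rewrite was judged profitable, and
  // InterlEarlyExit remembers, per CPU, whether the interleaved-store
  // subpass can be skipped entirely.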
  std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;
  std::unordered_map<std::string, bool> InterlEarlyExit;
  std::vector<unsigned> ReplOpc;
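// Helper macros for building InstReplInfo entries: an original ST2/ST4
// opcode, the list of replacement opcodes, and the register class of the
// intermediate results.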
#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
  {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
                OpcR7, OpcR8, OpcR9, RC) \
  {OpcOrg, \
   {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}
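  // Instruction replacement table: maps each ST2/ST4 interleaved store to the
  // ZIP1/ZIP2 and STP instructions that can replace it.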
  std::vector<InstReplInfo> IRT = {
    RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::STPDi, AArch64::FPR64RegClass),
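    // ST4 entries: each ST4 maps to four ZIP1/ZIP2 pairs and two STP stores.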
    RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
            AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
            AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
            AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
            AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
            AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
            AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
            AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
  };
  static const unsigned MaxNumRepl = 10;
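  // Check whether an equivalent DUP of the same source register and lane has
  // already been created, so it can be reused instead of emitting a new one.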
  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
                unsigned LaneNumber, unsigned *DestReg) const;
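  // Walk the REG_SEQUENCE that feeds an ST2/ST4 and collect the individual
  // source registers and their kill states into StReg/StRegKill.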
  bool processSeqRegInst(MachineInstr *DefiningMI, unsigned* StReg,
                         unsigned* StRegKill, unsigned NumArg) const;
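// Compare the latency of the original instruction against the combined latency
// of the proposed replacement instructions, using the subtarget scheduling
// model, and cache the decision per (opcode, CPU).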
bool AArch64SIMDInstrOpt::
shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                  SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) {
  // If the decision for this (opcode, CPU) pair is already cached, return it.
  std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
  auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
  auto It = SIMDInstrTable.find(InstID);
  if (It != SIMDInstrTable.end())
    return It->second;
  unsigned SCIdx = InstDesc->getSchedClass();
  const MCSchedClassDesc *SCDesc =
      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
  // If the target does not fully describe the scheduling of the original or
  // the replacement instructions, conservatively decline the replacement.
  const MCSchedClassDesc *SCDescRepl;
  if (!SCDesc->isValid() || SCDesc->isVariant()) {
    SIMDInstrTable[InstID] = false;
    return false;
  }
  for (const auto *IDesc : InstDescRepl) {
    SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc(
        IDesc->getSchedClass());
    if (!SCDescRepl->isValid() || SCDescRepl->isVariant()) {
      SIMDInstrTable[InstID] = false;
      return false;
    }
  }
  unsigned ReplCost = 0;
  for (const auto *IDesc : InstDescRepl)
    ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());
  // Replace only if the original instruction is strictly slower than the
  // replacement sequence, and cache the decision.
  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost) {
    SIMDInstrTable[InstID] = true;
    return true;
  } else {
    SIMDInstrTable[InstID] = false;
    return false;
  }
}
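// Decide up front, per subtarget, whether a whole subpass can be skipped: if
// the representative replacements are never profitable on this CPU, scanning
// the function is not worth doing.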
bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
  const MCInstrDesc *OriginalMCID;
  SmallVector<const MCInstrDesc *, MaxNumRepl> ReplInstrMCID;

  switch (SP) {
  // Check a representative by-element instruction against its replacement.
  case VectorElem:
    OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
    ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
    ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
    if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
      return false;
    break;
  // For the interleaved-store subpass, the decision is cached per CPU.
  case Interleave:
    std::string Subtarget =
        std::string(SchedModel.getSubtargetInfo()->getCPU());
    auto It = InterlEarlyExit.find(Subtarget);
    if (It != InterlEarlyExit.end())
      return It->second;
    // Check every entry of the replacement table; if any replacement is
    // profitable on this subtarget, the subpass must run.
    for (auto &I : IRT) {
      OriginalMCID = &TII->get(I.OrigOpc);
      for (auto &Repl : I.ReplOpc)
        ReplInstrMCID.push_back(&TII->get(Repl));
      if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
        InterlEarlyExit[Subtarget] = false;
        return false;
      }
      ReplInstrMCID.clear();
    }
    InterlEarlyExit[Subtarget] = true;
    break;
  }

  return true;
}
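// Look for an already-emitted DUP of the same source register and lane; if one
// is found, hand back its destination register so it can be reused.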
bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
                                   unsigned SrcReg, unsigned LaneNumber,
                                   unsigned *DestReg) const {
    if (CurrentMI->getOpcode() == DupOpcode &&
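// optimizeVectElement: rewrite an indexed (by-element) FMLA/FMLS/FMUL/FMULX
// into a DUP of the selected lane followed by the plain vector form, when the
// scheduling model says the two-instruction sequence is cheaper.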
  switch (MI.getOpcode()) {
  default:
    return false;
  // 4x32-bit elements.
  case AArch64::FMLAv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLAv4f32);
    break;
  case AArch64::FMLSv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLSv4f32);
    break;
  case AArch64::FMULXv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULXv4f32);
    break;
  case AArch64::FMULv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULv4f32);
    break;

  // 2x64-bit elements.
  case AArch64::FMLAv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLAv2f64);
    break;
  case AArch64::FMLSv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLSv2f64);
    break;
  case AArch64::FMULXv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULXv2f64);
    break;
  case AArch64::FMULv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULv2f64);
    break;

  // 2x32-bit elements use 64-bit registers.
  case AArch64::FMLAv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLAv2f32);
    break;
  case AArch64::FMLSv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLSv2f32);
    break;
  case AArch64::FMULXv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULXv2f32);
    break;
  case AArch64::FMULv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULv2f32);
    break;
  }
  SmallVector<const MCInstrDesc *, 2> ReplInstrMCID;
  ReplInstrMCID.push_back(DupMCID);
  ReplInstrMCID.push_back(MulMCID);
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;
  // Instructions of interest have either 4 or 5 operands.
  if (MI.getNumOperands() == 5) {
    Register SrcReg2 = MI.getOperand(3).getReg();
    unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
    unsigned LaneNumber = MI.getOperand(4).getImm();
    // Create a new DUP instruction unless an equivalent DUP already exists.
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
      DupDest = MRI->createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg2, Src2IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(SrcReg1, Src1IsKill)
        .addReg(DupDest, Src2IsKill);
  } else if (MI.getNumOperands() == 4) {
    unsigned LaneNumber = MI.getOperand(3).getImm();
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
      DupDest = MRI->createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg1, Src1IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(DupDest, Src1IsKill);
  }
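// optimizeLdStInterleave: replace an ST2/ST4 interleaved store whose operand
// comes from a REG_SEQUENCE with the ZIP1/ZIP2 permutes and STP stores listed
// in the IRT table, when the scheduling model says that is profitable.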
bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {
  unsigned SeqReg, AddrReg;
  unsigned StReg[4], StRegKill[4];
  for (auto &I : IRT) {
    if (MI.getOpcode() == I.OrigOpc) {
      SeqReg = MI.getOperand(0).getReg();
      AddrReg = MI.getOperand(1).getReg();
      DefiningMI = MRI->getUniqueVRegDef(SeqReg);
      unsigned NumReg = determineSrcReg(MI);
      if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
        return false;
      for (auto &Repl : I.ReplOpc) {
        ReplInstrMCID.push_back(&TII->get(Repl));
        // Create a destination register for each ZIP, but not for the stores.
        if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
          ZipDest.push_back(MRI->createVirtualRegister(&I.RC));
      }
      if (!shouldReplaceInst(MI.getParent()->getParent(),
                             &TII->get(MI.getOpcode()), ReplInstrMCID))
        return false;
  switch (MI.getOpcode()) {
  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[1], StRegKill[1]);
  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[2], StRegKill[2]);
        .addReg(StReg[1], StRegKill[1])
        .addReg(StReg[3], StRegKill[3]);
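// Collect the source registers of the REG_SEQUENCE feeding the store, along
// with their kill states, into the StReg/StRegKill arrays.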
bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
     unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const {
  assert(DefiningMI != nullptr);
  if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
    return false;
  for (unsigned i=0; i<NumArg; i++) {
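// Return how many source registers the interleaved store reads: 2 for the ST2
// variants and 4 for the ST4 variants.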
unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    return 2;
  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    return 4;
  }
}
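// Pass body: initialize the scheduling model, run the vector-element and
// interleaved-store subpasses unless shouldExitEarly says they cannot pay off
// on this subtarget, and erase the instructions that were rewritten.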
  SchedModel.init(&ST);
  if (!SchedModel.hasInstrSchedModel())
    return false;
  bool Changed = false;
  for (auto OptimizationKind : {VectorElem, Interleave}) {
    if (!shouldExitEarly(&MF, OptimizationKind)) {
        if (OptimizationKind == VectorElem)
          InstRewrite = optimizeVectElement(MI);
        else
          InstRewrite = optimizeLdStInterleave(MI);
          RemoveMIs.push_back(&MI);
  for (MachineInstr *MI : RemoveMIs)
    MI->eraseFromParent();
FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
  return new AArch64SIMDInstrOpt();
}