#include <unordered_map>

#define DEBUG_TYPE "aarch64-simdinstr-opt"
STATISTIC(NumModifiedInstr,
          "Number of SIMD instructions modified");
#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
  "AArch64 SIMD instructions optimization pass"
  // Cache of replacement decisions, keyed by (opcode, subtarget CPU name).
  std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;
  // Per-subtarget cache of whether the interleaved-store subpass should be
  // skipped entirely.
  std::unordered_map<std::string, bool> InterlEarlyExit;

  // Opcodes of the replacement instruction sequence (a field of the
  // InstReplInfo rewriting-rule record).
  std::vector<unsigned> ReplOpc;
#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
  {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
                OpcR7, OpcR8, OpcR9, RC) \
  {OpcOrg, \
   {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}
  // The Instruction Replacement Table:
  std::vector<InstReplInfo> IRT = {
    // ST2 instructions
    RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
          AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
          AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
          AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
          AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
          AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
          AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
          AArch64::STPDi, AArch64::FPR64RegClass),
    // ST4 instructions
    RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
          AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
          AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
          AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
          AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
          AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
          AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
          AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
          AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
          AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
          AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
          AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
          AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
          AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
          AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
          AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
          AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
          AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
  };
  // A costly instruction is replaced by up to MaxNumRepl cheaper instructions.
  static const unsigned MaxNumRepl = 10;
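
  // Illustration (not LLVM code): the RuleST2 entries above rely on the fact
  // that an ST2 interleaved store of two vectors is equivalent to a ZIP1/ZIP2
  // pair followed by a store-pair. A minimal element-level sketch for the
  // v2i64 case, with illustrative helper names only:
  //
  //   #include <array>
  //   #include <cstdint>
  //
  //   // st2 {v0.2d, v1.2d}, [addr]: elements of the two sources interleave.
  //   void st2_v2i64(const std::array<uint64_t, 2> &V0,
  //                  const std::array<uint64_t, 2> &V1, uint64_t *Addr) {
  //     Addr[0] = V0[0]; Addr[1] = V1[0];
  //     Addr[2] = V0[1]; Addr[3] = V1[1];
  //   }
  //
  //   // The sequence encoded by RuleST2(ST2Twov2d, ZIP1v2i64, ZIP2v2i64, STPQi, ...):
  //   void st2_v2i64_as_zip_stp(const std::array<uint64_t, 2> &V0,
  //                             const std::array<uint64_t, 2> &V1,
  //                             uint64_t *Addr) {
  //     std::array<uint64_t, 2> Lo = {V0[0], V1[0]}; // ZIP1v2i64
  //     std::array<uint64_t, 2> Hi = {V0[1], V1[1]}; // ZIP2v2i64
  //     Addr[0] = Lo[0]; Addr[1] = Lo[1];            // STPQi stores Lo then Hi
  //     Addr[2] = Hi[0]; Addr[3] = Hi[1];            //   at consecutive offsets
  //   }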
  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
                unsigned LaneNumber, unsigned *DestReg) const;
  bool processSeqRegInst(MachineInstr *DefiningMI, unsigned *StReg,
                         unsigned *StRegKill, unsigned NumArg) const;
char AArch64SIMDInstrOpt::ID = 0;
// Return true if replacing InstDesc by the sequence InstDescRepl is cheaper
// on the current subtarget according to the scheduling model.
bool AArch64SIMDInstrOpt::
shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                  SmallVectorImpl<const MCInstrDesc *> &InstDescRepl) {
  // Check whether the replacement decision is already available in the cached
  // table; if so, reuse it.
  std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
  auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
  auto It = SIMDInstrTable.find(InstID);
  if (It != SIMDInstrTable.end())
    return It->second;

  unsigned SCIdx = InstDesc->getSchedClass();
  const MCSchedClassDesc *SCDesc =
      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);

  // If the target does not define scheduling resources for the original or the
  // replacement instructions, record a negative decision and bail out.
  if (!SCDesc->isValid() || SCDesc->isVariant()) {
    SIMDInstrTable[InstID] = false;
    return false;
  }
  for (const auto *IDesc : InstDescRepl) {
    const MCSchedClassDesc *SCDescRepl =
        SchedModel.getMCSchedModel()->getSchedClassDesc(IDesc->getSchedClass());
    if (!SCDescRepl->isValid() || SCDescRepl->isVariant()) {
      SIMDInstrTable[InstID] = false;
      return false;
    }
  }

  // Replacement cost: summed latency of the replacement sequence.
  unsigned ReplCost = 0;
  for (const auto *IDesc : InstDescRepl)
    ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());

  // Replace only if the original instruction is strictly more expensive.
  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost) {
    SIMDInstrTable[InstID] = true;
    return true;
  }
  SIMDInstrTable[InstID] = false;
  return false;
}
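
// Worked example with made-up latencies (purely illustrative): if on some CPU
// the scheduling model gives FMLAv4i32_indexed a latency of 10 cycles while
// DUPv4i32lane and FMLAv4f32 cost 3 and 5 cycles, the replacement cost is
// 3 + 5 = 8 < 10, so shouldReplaceInst() returns true and caches a positive
// decision for that (opcode, CPU) pair; on a CPU where the by-element form is
// already as fast, the cached decision is negative and nothing is rewritten.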
// Determine whether a subpass can be skipped for this function: it exits early
// when even its representative instructions are not worth replacing on the
// current subtarget.
bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
  const MCInstrDesc *OriginalMCID;
  SmallVector<const MCInstrDesc *, MaxNumRepl> ReplInstrMCID;

  switch (SP) {
  // Compare a representative FMLA-by-element instruction against its
  // DUP + FMLA replacement.
  case VectorElem:
    OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
    ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
    ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
    if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
      return false;
    break;

  // For the interleaved-store optimization, check every entry of the
  // replacement table and cache the per-subtarget result.
  case Interleave:
    std::string Subtarget =
        std::string(SchedModel.getSubtargetInfo()->getCPU());
    auto It = InterlEarlyExit.find(Subtarget);
    if (It != InterlEarlyExit.end())
      return It->second;

    for (auto &I : IRT) {
      OriginalMCID = &TII->get(I.OrigOpc);
      for (auto &Repl : I.ReplOpc)
        ReplInstrMCID.push_back(&TII->get(Repl));
      if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
        InterlEarlyExit[Subtarget] = false;
        return false;
      }
      ReplInstrMCID.clear();
    }
    InterlEarlyExit[Subtarget] = true;
    break;
  }

  return true;
}
// Reuse an existing equivalent DUP earlier in the block, if any, returning its
// destination register in *DestReg.
bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
                                   unsigned SrcReg, unsigned LaneNumber,
                                   unsigned *DestReg) const {
  // Scan backwards from MI for a DUP of the same source register and lane.
  for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
       MII != MIE;) {
    MII--;
    MachineInstr *CurrentMI = &*MII;
    if (CurrentMI->getOpcode() == DupOpcode &&
        CurrentMI->getNumOperands() == 3 &&
        CurrentMI->getOperand(1).getReg() == SrcReg &&
        CurrentMI->getOperand(2).getImm() == LaneNumber) {
      *DestReg = CurrentMI->getOperand(0).getReg();
      return true;
    }
  }
  return false;
}
// Rewrite a floating-point multiply/accumulate-by-element instruction into an
// explicit DUP of the lane followed by the plain vector form of the operation.
bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
  const MCInstrDesc *MulMCID, *DupMCID;
  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;

  switch (MI.getOpcode()) {
  default:
    return false;

  // 4X32 instructions
  case AArch64::FMLAv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLAv4f32);
    break;
  case AArch64::FMLSv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLSv4f32);
    break;
  case AArch64::FMULXv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULXv4f32);
    break;
  case AArch64::FMULv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULv4f32);
    break;

  // 2X64 instructions
  case AArch64::FMLAv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLAv2f64);
    break;
  case AArch64::FMLSv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLSv2f64);
    break;
  case AArch64::FMULXv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULXv2f64);
    break;
  case AArch64::FMULv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULv2f64);
    break;

  // 2X32 instructions
  case AArch64::FMLAv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLAv2f32);
    break;
  case AArch64::FMLSv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLSv2f32);
    break;
  case AArch64::FMULXv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULXv2f32);
    break;
  case AArch64::FMULv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULv2f32);
    break;
  }
  // Check profitability of the DUP + multiply replacement on this subtarget.
  SmallVector<const MCInstrDesc *, 2> ReplInstrMCID;
  ReplInstrMCID.push_back(DupMCID);
  ReplInstrMCID.push_back(MulMCID);
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Operands of the original by-element instruction.
  Register MulDest = MI.getOperand(0).getReg();
  Register SrcReg0 = MI.getOperand(1).getReg();
  unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
  Register SrcReg1 = MI.getOperand(2).getReg();
  unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
  unsigned DupDest;

  // Instructions of interest have either 4 or 5 operands.
  if (MI.getNumOperands() == 5) {
    Register SrcReg2 = MI.getOperand(3).getReg();
    unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
    unsigned LaneNumber = MI.getOperand(4).getImm();
    // Create a new DUP instruction, unless an equivalent one already exists
    // and can be reused.
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg2, Src2IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(SrcReg1, Src1IsKill)
        .addReg(DupDest, Src2IsKill);
  } else if (MI.getNumOperands() == 4) {
    unsigned LaneNumber = MI.getOperand(3).getImm();
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg1, Src1IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(DupDest, Src1IsKill);
  } else {
    return false;
  }

  ++NumModifiedInstr;
  return true;
}
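
// Illustration (not LLVM code): element-level view of the rewrite performed by
// optimizeVectElement() for the 4x32 case. An FMLA by element multiplies every
// lane of one source by a single lane of the other; broadcasting that lane
// first with DUP turns the operation into a plain vector FMLA. Helper names
// are illustrative only:
//
//   #include <array>
//   using Vec4f = std::array<float, 4>;
//
//   // fmla v0.4s, v1.4s, v2.s[lane]  (FMLAv4i32_indexed semantics)
//   Vec4f fmla_by_element(Vec4f Acc, const Vec4f &A, const Vec4f &B, int Lane) {
//     for (int i = 0; i < 4; ++i)
//       Acc[i] += A[i] * B[Lane];
//     return Acc;
//   }
//
//   // dup vtmp.4s, v2.s[lane]  followed by  fmla v0.4s, v1.4s, vtmp.4s
//   Vec4f fmla_after_dup(Vec4f Acc, const Vec4f &A, const Vec4f &B, int Lane) {
//     Vec4f Dup;
//     Dup.fill(B[Lane]);               // DUPv4i32lane broadcasts the lane
//     for (int i = 0; i < 4; ++i)
//       Acc[i] += A[i] * Dup[i];       // plain FMLAv4f32 on the broadcast value
//     return Acc;
//   }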
// Rewrite an interleaving store (ST2 or ST4) into ZIP instructions followed by
// store-pair instructions, when the replacement table marks it profitable.
bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {
  unsigned SeqReg, AddrReg;
  unsigned StReg[4], StRegKill[4];
  MachineInstr *DefiningMI;
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  SmallVector<unsigned, MaxNumRepl> ZipDest;
  SmallVector<const MCInstrDesc *, MaxNumRepl> ReplInstrMCID;

  // If the instruction matches one of the rewriting rules, gather the source
  // registers fed to it through its REG_SEQUENCE operand.
  for (auto &I : IRT) {
    if (MI.getOpcode() == I.OrigOpc) {
      SeqReg  = MI.getOperand(0).getReg();
      AddrReg = MI.getOperand(1).getReg();
      DefiningMI = MRI->getUniqueVRegDef(SeqReg);
      unsigned NumReg = determineSrcReg(MI);
      if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
        return false;
      // Build the replacement descriptor list; only the ZIP instructions need
      // new destination registers, the STPs store to memory.
      for (auto &Repl : I.ReplOpc) {
        ReplInstrMCID.push_back(&TII->get(Repl));
        if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
          ZipDest.push_back(MRI->createVirtualRegister(&I.RC));
      }
      break;
    }
  }

  // No rewriting rule matched this instruction.
  if (ReplInstrMCID.empty())
    return false;

  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;
  // Generate the replacement sequence in place of MI.
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected instruction for this optimization.");

  // ST2 case: two ZIPs followed by one store-pair.
  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[1], StRegKill[1]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[1], StRegKill[1]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
        .addReg(ZipDest[0])
        .addReg(ZipDest[1])
        .addReg(AddrReg)
        .addImm(0);
    break;
  // ST4 case: eight ZIPs followed by two store-pairs.
  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    // First, interleave source 0 with source 2 and source 1 with source 3.
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[2], StRegKill[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
        .addReg(StReg[1], StRegKill[1])
        .addReg(StReg[3], StRegKill[3]);
    // The remaining ZIPs and the two STPs of the sequence are built the same
    // way from ReplInstrMCID and ZipDest.
    break;
  }

  ++NumModifiedInstr;
  return true;
}
// Collect the registers (and their kill flags) that feed the interleaving
// store through its REG_SEQUENCE operand.
bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
     unsigned *StReg, unsigned *StRegKill, unsigned NumArg) const {
  assert(DefiningMI != nullptr);
  if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
    return false;

  // REG_SEQUENCE operands come in (register, sub-register index) pairs after
  // the destination; pick out each source register and its kill state.
  // (The pass also validates the accompanying sub-register index operands.)
  for (unsigned i = 0; i < NumArg; i++) {
    StReg[i]     = DefiningMI->getOperand(2 * i + 1).getReg();
    StRegKill[i] = getKillRegState(DefiningMI->getOperand(2 * i + 1).isKill());
  }
  return true;
}
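
// For illustration: with an ST2 whose tuple operand is defined by
//   %seq:qq = REG_SEQUENCE %v0:fpr128, %subreg.qsub0, %v1:fpr128, %subreg.qsub1
// the loop above yields StReg[0] = %v0 and StReg[1] = %v1 together with their
// kill flags, which optimizeLdStInterleave() then feeds to the ZIP instructions.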
// Return the number of source registers interleaved by the store: two for ST2
// and four for ST4.
unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unsupported instruction for this optimization.");

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    return 2;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    return 4;
  }
}
// Entry point: set up the scheduling model, then run each subpass over the
// function wherever it is profitable on the current subtarget.
bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
  TII = MF.getSubtarget().getInstrInfo();
  MRI = &MF.getRegInfo();
  const TargetSubtargetInfo &ST = MF.getSubtarget();
  SchedModel.init(&ST);
  if (!SchedModel.hasInstrSchedModel())
    return false;

  bool Changed = false;
  for (auto OptimizationKind : {VectorElem, Interleave}) {
    if (!shouldExitEarly(&MF, OptimizationKind)) {
      SmallVector<MachineInstr *, 8> RemoveMIs;
      for (MachineBasicBlock &MBB : MF)
        for (MachineInstr &MI : MBB) {
          bool InstRewrite;
          if (OptimizationKind == VectorElem)
            InstRewrite = optimizeVectElement(MI);
          else
            InstRewrite = optimizeLdStInterleave(MI);
          if (InstRewrite) {
            RemoveMIs.push_back(&MI);
            Changed = true;
          }
        }
      // The replaced instructions are erased after the traversal.
      for (MachineInstr *MI : RemoveMIs)
        MI->eraseFromParent();
    }
  }
  return Changed;
}

FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
  return new AArch64SIMDInstrOpt();
}