#include <unordered_map>

#define DEBUG_TYPE "aarch64-simdinstr-opt"

STATISTIC(NumModifiedInstr,
          "Number of SIMD instructions modified");

#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \
  "AArch64 SIMD instructions optimization pass"
  // Cache of replacement decisions, keyed by (opcode, subtarget CPU name).
  std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;

  // Per-CPU cache of the early-exit decision for the interleaved-store subpass.
  std::unordered_map<std::string, bool> InterlEarlyExit;

  // Part of InstReplInfo: the replacement opcodes for one rewrite rule of the
  // instruction replacement table IRT below.
  std::vector<unsigned> ReplOpc;
#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
  {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
                OpcR7, OpcR8, OpcR9, RC) \
  {OpcOrg, \
   {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}
  // The Instruction Replacement Table:
  std::vector<InstReplInfo> IRT = {
    // ST2 instructions
    RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::STPDi, AArch64::FPR64RegClass),
    // ST4 instructions
    RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
            AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
            AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
            AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
            AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
            AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
            AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
            AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
  };
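  // Illustrative reading of the first RuleST2 entry (register names are only
  // for illustration): the interleaving store
  //    st2 {v0.2d, v1.2d}, [addr]
  // is rewritten as
  //    zip1 v2.2d, v0.2d, v1.2d
  //    zip2 v3.2d, v0.2d, v1.2d
  //    stp  q2, q3, [addr]
  // Each RuleST4 entry encodes the analogous rewrite with eight ZIPs and two
  // STPs (see optimizeLdStInterleave below).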
  // A costly instruction is replaced by at most MaxNumRepl cheaper ones; the
  // maximum (10) is needed for the ST4 rules above.
  static const unsigned MaxNumRepl = 10;
  /// Check whether an equivalent DUP instruction has already been created in
  /// the block. Return true if it has; in that case DestReg points at the
  /// destination register of the existing DUP.
  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
                unsigned LaneNumber, unsigned *DestReg) const;
  /// Process the REG_SEQUENCE instruction that feeds an ST2/ST4 and extract
  /// the store's source registers (and their kill flags) from it.
  bool processSeqRegInst(MachineInstr *DefiningMI, unsigned *StReg,
                         unsigned *StRegKill, unsigned NumArg) const;
char AArch64SIMDInstrOpt::ID = 0;
/// Based only on the latency of the instructions, determine whether it is
/// cost-efficient to replace the instruction InstDesc by the instructions
/// in InstDescRepl. Return true if the replacement is expected to be faster.
bool AArch64SIMDInstrOpt::
shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                  SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) {
  // Check whether the replacement decision is already cached; if so, reuse it.
  std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
  auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
  auto It = SIMDInstrTable.find(InstID);
  if (It != SIMDInstrTable.end())
    return It->second;

  unsigned SCIdx = InstDesc->getSchedClass();
  const MCSchedClassDesc *SCDesc =
      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);

  // If the target does not define scheduling resources for the instructions
  // of interest, record "no replacement" and bail out.
  const MCSchedClassDesc *SCDescRepl;
  if (!SCDesc->isValid() || SCDesc->isVariant()) {
    SIMDInstrTable[InstID] = false;
    return false;
  }
  for (const auto *IDesc : InstDescRepl) {
    SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc(
        IDesc->getSchedClass());
    if (!SCDescRepl->isValid() || SCDescRepl->isVariant()) {
      SIMDInstrTable[InstID] = false;
      return false;
    }
  }

  // Replacement cost: the sum of the latencies of the replacement
  // instructions, compared against the latency of the original.
  unsigned ReplCost = 0;
  for (const auto *IDesc : InstDescRepl)
    ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());

  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost) {
    SIMDInstrTable[InstID] = true;
    return true;
  } else {
    SIMDInstrTable[InstID] = false;
    return false;
  }
}
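// Worked example with purely hypothetical latencies (not taken from any real
// scheduling model): if FMLAv4i32_indexed has latency 10 on the current CPU
// while DUPv4i32lane (3) plus FMLAv4f32 (5) sum to 8, then 10 > 8 holds, the
// (opcode, CPU) pair is cached as true, and the rewrite is performed. If the
// replacement sum were 10 or more, the original instruction would be kept.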
/// Determine whether this pass should exit early for the given subpass, so
/// that no compile time is spent on targets for which none of these rewrites
/// is ever profitable.
bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
  const MCInstrDesc *OriginalMCID;
  SmallVector<const MCInstrDesc *, MaxNumRepl> ReplInstrMCID;

  switch (SP) {
  // For the vector-element subpass, check one representative instruction.
  case VectorElem:
    OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
    ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
    ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
    if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
      return false;
    break;

  // For the interleaved-store subpass, check every rule in the table and
  // cache the per-CPU answer.
  case Interleave:
    std::string Subtarget =
        std::string(SchedModel.getSubtargetInfo()->getCPU());
    auto It = InterlEarlyExit.find(Subtarget);
    if (It != InterlEarlyExit.end())
      return It->second;

    for (auto &I : IRT) {
      OriginalMCID = &TII->get(I.OrigOpc);
      for (auto &Repl : I.ReplOpc)
        ReplInstrMCID.push_back(&TII->get(Repl));
      if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
        InterlEarlyExit[Subtarget] = false;
        return false;
      }
      ReplInstrMCID.clear();
    }
    InterlEarlyExit[Subtarget] = true;
    break;
  }

  return true;
}
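// shouldExitEarly is queried once per subpass from runOnMachineFunction (see
// below), so on targets where no rule is ever profitable the per-function cost
// of the pass reduces to this check, and the scan over IRT for the Interleave
// subpass is performed only once per CPU thanks to the InterlEarlyExit cache.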
bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
                                   unsigned SrcReg, unsigned LaneNumber,
                                   unsigned *DestReg) const {
  // Walk backwards over the preceding instructions in the block, looking for
  // an equivalent DUP of the same source register and lane.
  for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
       MII != MIE;) {
    MII--;
    MachineInstr *CurrentMI = &*MII;
    if (CurrentMI->getOpcode() == DupOpcode &&
        CurrentMI->getNumOperands() == 3 &&
        CurrentMI->getOperand(1).getReg() == SrcReg &&
        CurrentMI->getOperand(2).getImm() == LaneNumber) {
      *DestReg = CurrentMI->getOperand(0).getReg();
      return true;
    }
  }
  return false;
}
/// Certain SIMD instructions with a vector-element operand are not efficient
/// on all targets. Rewrite them into equivalent DUP + vector-operand forms
/// when the scheduling model says that is cheaper.
bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
  const MCInstrDesc *MulMCID, *DupMCID;
  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;

  switch (MI.getOpcode()) {
  default:
    return false;

  // 4X32 instructions
  case AArch64::FMLAv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLAv4f32);
    break;
  case AArch64::FMLSv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLSv4f32);
    break;
  case AArch64::FMULXv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULXv4f32);
    break;
  case AArch64::FMULv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULv4f32);
    break;

  // 2X64 instructions
  case AArch64::FMLAv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLAv2f64);
    break;
  case AArch64::FMLSv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLSv2f64);
    break;
  case AArch64::FMULXv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULXv2f64);
    break;
  case AArch64::FMULv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULv2f64);
    break;

  // 2X32 instructions (64-bit vectors, hence the FPR64 register class)
  case AArch64::FMLAv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLAv2f32);
    break;
  case AArch64::FMLSv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLSv2f32);
    break;
  case AArch64::FMULXv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULXv2f32);
    break;
  case AArch64::FMULv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULv2f32);
    break;
  }
  SmallVector<const MCInstrDesc *, 2> ReplInstrMCID;
  ReplInstrMCID.push_back(DupMCID);
  ReplInstrMCID.push_back(MulMCID);
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  // At this point the operands of MI have been read: MulDest (the destination
  // register), SrcReg0/SrcReg1 with their kill flags, and, for the 5-operand
  // FMLA/FMLS forms, SrcReg2 (the by-element source); MBB, DL and MRI refer to
  // MI's block, debug location and register info.
  if (MI.getNumOperands() == 5) {
    unsigned LaneNumber = MI.getOperand(4).getImm();
    // Create a new DUP instruction unless an equivalent one already exists.
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg2, Src2IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(SrcReg1, Src1IsKill)
        .addReg(DupDest, Src2IsKill);
  } else if (MI.getNumOperands() == 4) {
    unsigned LaneNumber = MI.getOperand(3).getImm();
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg1, Src1IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(DupDest, Src1IsKill);
  }

  ++NumModifiedInstr;
  return true;
}
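// The net effect of optimizeVectElement, with illustrative register names:
//    fmla v0.4s, v1.4s, v2.s[1]
// becomes
//    dup  v3.4s, v2.s[1]      // omitted when reuseDUP finds an existing DUP
//    fmla v0.4s, v1.4s, v3.4s
// which trades the by-element multiply for a full-vector multiply plus an
// explicit lane broadcast.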
/// Load/store interleaving instructions (ST2/ST4) are not always beneficial.
/// Replace them by ZIP instructions and ordinary pair stores when the
/// scheduling model says that is cheaper.
bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {
  unsigned SeqReg, AddrReg;
  unsigned StReg[4], StRegKill[4];
  MachineInstr *DefiningMI;
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  SmallVector<unsigned, MaxNumRepl> ZipDest;
  SmallVector<const MCInstrDesc *, MaxNumRepl> ReplInstrMCID;

  // If the current instruction matches one of the rewrite rules, gather the
  // parameters of the replacement instructions.
  bool Match = false;
  for (auto &I : IRT) {
    if (MI.getOpcode() == I.OrigOpc) {
      SeqReg  = MI.getOperand(0).getReg();
      AddrReg = MI.getOperand(1).getReg();
      DefiningMI = MRI->getUniqueVRegDef(SeqReg);
      unsigned NumReg = determineSrcReg(MI);
      if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
        return false;

      for (auto &Repl : I.ReplOpc) {
        ReplInstrMCID.push_back(&TII->get(Repl));
        // Generate destination registers, but only for the ZIP instructions
        // and not for the stores.
        if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
          ZipDest.push_back(MRI->createVirtualRegister(&I.RC));
      }
      Match = true;
      break;
    }
  }
  if (!Match)
    return false;

  // Determine whether it is profitable to replace MI by the instruction
  // sequence gathered in ReplInstrMCID.
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  switch (MI.getOpcode()) {
  // ST2: two ZIPs feeding one STP.
  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    // ZIP instructions: each reads both sources of the original store.
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[1], StRegKill[1]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[1], StRegKill[1]);
    // A single STP then stores both ZIP results to AddrReg.
    break;

  // ST4: eight ZIPs feeding two STPs.
  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    // First-level ZIPs interleave sources 0/2 and 1/3; a second level of ZIPs
    // combines their results, and two STPs store the final values to AddrReg.
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[2], StRegKill[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
        .addReg(StReg[1], StRegKill[1])
        .addReg(StReg[3], StRegKill[3]);
    break;
  }

  ++NumModifiedInstr;
  return true;
}
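// For ST4 the overall effect is roughly (illustrative register names):
//    st4 {v0.4s, v1.4s, v2.4s, v3.4s}, [addr]
// becomes
//    zip1 v4.4s, v0.4s, v2.4s
//    zip2 v5.4s, v0.4s, v2.4s
//    zip1 v6.4s, v1.4s, v3.4s
//    zip2 v7.4s, v1.4s, v3.4s
//    zip1 v8.4s, v4.4s, v6.4s
//    zip2 v9.4s, v4.4s, v6.4s
//    zip1 v10.4s, v5.4s, v7.4s
//    zip2 v11.4s, v5.4s, v7.4s
//    stp  q8, q9, [addr]
//    stp  q10, q11, [addr, #32]
// i.e. two levels of ZIPs reproduce the 4-way interleaving that ST4 performs
// internally, after which plain pair stores suffice.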
/// Process the REG_SEQUENCE instruction that defines the source register of
/// the ST2/ST4 and extract the store's source operands from it, e.g.
///    %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1
/// Return true when the instruction is processed successfully.
bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
     unsigned *StReg, unsigned *StRegKill, unsigned NumArg) const {
  assert(DefiningMI != nullptr);
  if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
    return false;

  for (unsigned i = 0; i < NumArg; i++) {
    StReg[i]     = DefiningMI->getOperand(2 * i + 1).getReg();
    StRegKill[i] = getKillRegState(DefiningMI->getOperand(2 * i + 1).isKill());
    // The accompanying subregister-index operands (dsub0..3 / qsub0..3) are
    // also checked before the registers are accepted.
  }
  return true;
}
/// Return the number of useful source registers for this instruction:
/// 2 for ST2 and 4 for ST4.
unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unsupported instruction for this pass");

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    return 2;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    return 4;
  }
}
bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
  // TII, MRI and the subtarget ST are initialized from MF before this point.
  SchedModel.init(&ST);
  if (!SchedModel.hasInstrSchedModel())
    return false;

  bool Changed = false;
  for (auto OptimizationKind : {VectorElem, Interleave}) {
    if (!shouldExitEarly(&MF, OptimizationKind)) {
      SmallVector<MachineInstr *, 8> RemoveMIs;
      for (MachineBasicBlock &MBB : MF)
        for (MachineInstr &MI : MBB) {
          bool InstRewrite;
          if (OptimizationKind == VectorElem)
            InstRewrite = optimizeVectElement(MI);
          else
            InstRewrite = optimizeLdStInterleave(MI);
          if (InstRewrite) {
            // The original instruction has been replaced; erase it afterwards.
            RemoveMIs.push_back(&MI);
            Changed = true;
          }
        }
      for (MachineInstr *MI : RemoveMIs)
        MI->eraseFromParent();
    }
  }
  return Changed;
}
FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
  return new AArch64SIMDInstrOpt();
}
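// Note: createAArch64SIMDInstrOptPass() is the factory the AArch64 backend
// uses to add this pass to its codegen pipeline; the pass works on virtual
// registers (it creates new ones and queries getUniqueVRegDef), so it runs
// before register allocation. The exact insertion point in
// AArch64TargetMachine is not shown in this excerpt.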