#include <unordered_map>

using namespace llvm;

#define DEBUG_TYPE "aarch64-simd-instr-opt"
STATISTIC(NumModifiedInstr,
          "Number of SIMD instructions modified");
#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
  "AArch64 SIMD instructions optimization pass"
constexpr unsigned MaxNumRepl = 10;

class AArch64SIMDInstrOptImpl {
public:
  const TargetInstrInfo *TII;
  MachineRegisterInfo *MRI;
  TargetSchedModel SchedModel;
  // Decisions are cached in the two maps below so that the scheduling model
  // is queried only once per (opcode, subtarget) pair.
  using SIMDInstrTableMap =
      std::map<std::pair<unsigned, std::string>, bool>;
  using InterlEarlyExitMap = std::unordered_map<std::string, bool>;

  // Cached rewrite decisions, keyed on (opcode, subtarget CPU name).
  SIMDInstrTableMap &SIMDInstrTable;

  // Cached early-exit decisions for the interleaved-store subpass, keyed on
  // the subtarget CPU name.
  InterlEarlyExitMap &InterlEarlyExit;

  // The two subpasses of this pass.
  typedef enum { VectorElem, Interleave } Subpass;
  // A rewrite rule: the instruction with opcode OrigOpc is replaced by the
  // NumRepl instructions in ReplOpc; new virtual registers for the
  // replacement sequence are created in register class RC.
  struct InstReplInfo {
    unsigned OrigOpc;
    unsigned ReplOpc[MaxNumRepl];
    unsigned NumRepl;
    const TargetRegisterClass *RC;
  };
#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC)                               \
  {OpcOrg, {OpcR0, OpcR1, OpcR2}, 3, &RC}
#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6,       \
                OpcR7, OpcR8, OpcR9, RC)                                       \
  {OpcOrg,                                                                     \
   {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9},     \
   10, &RC}
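// For illustration, the first RuleST2 entry in the rewrite table (IRT) below,
//   RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
//           AArch64::STPQi, AArch64::FPR128RegClass)
// expands to the initializer
//   {AArch64::ST2Twov2d,
//    {AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::STPQi}, 3,
//    &AArch64::FPR128RegClass}
// i.e. "replace ST2 of two 2x64-bit vectors by ZIP1 + ZIP2 + one STP".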
  AArch64SIMDInstrOptImpl(SIMDInstrTableMap &SIMDInstrTable,
                          InterlEarlyExitMap &InterlEarlyExit)
      : SIMDInstrTable(SIMDInstrTable), InterlEarlyExit(InterlEarlyExit) {}

  // Return true if replacing the instruction described by InstDesc with the
  // sequence in ReplInstrMCID is expected to be profitable on this subtarget.
  bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                         SmallVectorImpl<const MCInstrDesc *> &ReplInstrMCID);
  // Return true if the given subpass cannot be profitable for this function,
  // so that scanning the instructions can be skipped entirely.
  bool shouldExitEarly(MachineFunction *MF, Subpass SP);

  // Look backwards for an equivalent DUP of the same source register and
  // lane; if found, return its destination register through DestReg.
  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
                unsigned LaneNumber, unsigned *DestReg) const;

  // Rewrite a multiply/accumulate-by-element instruction as DUP plus the
  // vector-by-vector form.
  bool optimizeVectElement(MachineInstr &MI);

  // Collect the source registers and kill states of the REG_SEQUENCE feeding
  // an interleaving store.
  bool processSeqRegInst(MachineInstr *DefiningMI, unsigned *StReg,
                         RegState *StRegKill, unsigned NumArg) const;

  // Rewrite an ST2/ST4 interleaving store as ZIP instructions plus STP.
  bool optimizeLdStInterleave(MachineInstr &MI);

  // Return the number of source registers of the store (2 for ST2, 4 for
  // ST4).
  unsigned determineSrcReg(MachineInstr &MI) const;

  bool run(MachineFunction &MF);
};
class AArch64SIMDInstrOptLegacy : public MachineFunctionPass {
public:
  static char ID;

  AArch64SIMDInstrOptImpl::SIMDInstrTableMap SIMDInstrTable;
  AArch64SIMDInstrOptImpl::InterlEarlyExitMap InterlEarlyExit;

  AArch64SIMDInstrOptLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &Fn) override;

  StringRef getPassName() const override {
    return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
  }
};
char AArch64SIMDInstrOptLegacy::ID = 0;

INITIALIZE_PASS(AArch64SIMDInstrOptLegacy, DEBUG_TYPE,
                AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
// The rewrite table: each entry maps an ST2/ST4 opcode to the ZIP/STP
// sequence that replaces it.
constexpr AArch64SIMDInstrOptImpl::InstReplInfo IRT[] = {
  // ST2 rules
  RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
          AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
          AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
          AArch64::STPDi, AArch64::FPR64RegClass),
  RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
          AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
          AArch64::STPDi, AArch64::FPR64RegClass),
  RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
          AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
          AArch64::STPDi, AArch64::FPR64RegClass),

  // ST4 rules
  RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
          AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
          AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
          AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
          AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
          AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
          AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
          AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
  RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
          AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
          AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
          AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
          AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
          AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
  RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
          AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
          AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
          AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
          AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
          AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)};
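// Illustrative sketch of one ST2 rule (ST2Twov4s) at the assembly level, not
// a literal dump from the pass:
//   st2 {v0.4s, v1.4s}, [x0]
// becomes
//   zip1 v2.4s, v0.4s, v1.4s   // low-half element interleave of v0 and v1
//   zip2 v3.4s, v0.4s, v1.4s   // high-half element interleave of v0 and v1
//   stp  q2, q3, [x0]          // store the interleaved halves contiguously
// The STP writes the same byte pattern the ST2 would have produced.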
bool AArch64SIMDInstrOptImpl::shouldReplaceInst(
    MachineFunction *MF, const MCInstrDesc *InstDesc,
    SmallVectorImpl<const MCInstrDesc *> &InstDescRepl) {
  // Check if the replacement decision is already cached; if so, return it.
  std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
  auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
  auto It = SIMDInstrTable.find(InstID);
  if (It != SIMDInstrTable.end())
    return It->second;

  unsigned SCIdx = InstDesc->getSchedClass();
  const MCSchedClassDesc *SCDesc =
      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);

  // If the target does not define scheduling resources for the instructions
  // of interest, return false for no replacement.
  if (!SCDesc->isValid() || SCDesc->isVariant()) {
    SIMDInstrTable[InstID] = false;
    return false;
  }
  for (const auto *IDesc : InstDescRepl) {
    const MCSchedClassDesc *SCDescRepl =
        SchedModel.getMCSchedModel()->getSchedClassDesc(
            IDesc->getSchedClass());
    if (!SCDescRepl->isValid() || SCDescRepl->isVariant()) {
      SIMDInstrTable[InstID] = false;
      return false;
    }
  }

  // Replace the instruction iff the summed latency of the replacement
  // sequence is lower than the latency of the original instruction.
  unsigned ReplCost = 0;
  for (const auto *IDesc : InstDescRepl)
    ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());

  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost) {
    SIMDInstrTable[InstID] = true;
    return true;
  }
  SIMDInstrTable[InstID] = false;
  return false;
}
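// Worked example with hypothetical latencies (for illustration only): if the
// scheduling model gives FMLAv4i32_indexed a latency of 10 cycles while
// DUPv4i32lane costs 3 and FMLAv4f32 costs 6, then ReplCost = 3 + 6 = 9 < 10
// and the rewrite is considered profitable for that subtarget.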
bool AArch64SIMDInstrOptImpl::shouldExitEarly(MachineFunction *MF, Subpass SP) {
  const MCInstrDesc *OriginalMCID;
  SmallVector<const MCInstrDesc *, MaxNumRepl> ReplInstrMCID;

  // For the VectorElem subpass, check by comparing the latency of a
  // representative instruction to that of its replacement instructions.
  // TODO: check for all concerned instructions.
  if (SP == VectorElem) {
    OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
    ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
    ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
    if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
      return false;
    return true;
  }

  // For the Interleave subpass, check all concerned instructions, caching the
  // decision per subtarget.
  std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
  auto It = InterlEarlyExit.find(Subtarget);
  if (It != InterlEarlyExit.end())
    return It->second;

  for (const auto &I : IRT) {
    OriginalMCID = &TII->get(I.OrigOpc);
    for (unsigned J = 0; J < I.NumRepl; ++J)
      ReplInstrMCID.push_back(&TII->get(I.ReplOpc[J]));
    if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
      InterlEarlyExit[Subtarget] = false;
      return false;
    }
    ReplInstrMCID.clear();
  }
  InterlEarlyExit[Subtarget] = true;
  return true;
}
// Look backwards through the basic block for an equivalent DUP that can be
// reused instead of creating a new one.
bool AArch64SIMDInstrOptImpl::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
                                       unsigned SrcReg, unsigned LaneNumber,
                                       unsigned *DestReg) const {
  for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
       MII != MIE;) {
    --MII;
    MachineInstr *CurrentMI = &*MII;

    if (CurrentMI->getOpcode() == DupOpcode &&
        CurrentMI->getNumOperands() == 3 &&
        CurrentMI->getOperand(1).getReg() == SrcReg &&
        CurrentMI->getOperand(2).getImm() == LaneNumber) {
      *DestReg = CurrentMI->getOperand(0).getReg();
      return true;
    }
  }

  return false;
}
bool AArch64SIMDInstrOptImpl::optimizeVectElement(MachineInstr &MI) {
  const MCInstrDesc *MulMCID, *DupMCID;
  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;

  switch (MI.getOpcode()) {
  default:
    return false;

  // 4X32 instructions
  case AArch64::FMLAv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLAv4f32);
    break;
  case AArch64::FMLSv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLSv4f32);
    break;
  case AArch64::FMULXv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULXv4f32);
    break;
  case AArch64::FMULv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULv4f32);
    break;

  // 2X64 instructions
  case AArch64::FMLAv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLAv2f64);
    break;
  case AArch64::FMLSv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLSv2f64);
    break;
  case AArch64::FMULXv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULXv2f64);
    break;
  case AArch64::FMULv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULv2f64);
    break;

  // 2X32 instructions
  case AArch64::FMLAv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLAv2f32);
    break;
  case AArch64::FMLSv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLSv2f32);
    break;
  case AArch64::FMULXv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULXv2f32);
    break;
  case AArch64::FMULv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULv2f32);
    break;
  }
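  // In assembly terms (an illustrative sketch): the indexed form
  //   fmla v0.4s, v1.4s, v2.s[1]
  // becomes
  //   dup  v3.4s, v2.s[1]
  //   fmla v0.4s, v1.4s, v3.4s
  // which can be faster on subtargets where the by-element multiply
  // accumulate is slower than a DUP plus the vector-by-vector form.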
  SmallVector<const MCInstrDesc *, 2> ReplInstrMCID;
  ReplInstrMCID.push_back(DupMCID);
  ReplInstrMCID.push_back(MulMCID);
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  MachineBasicBlock &MBB = *MI.getParent();

  // Get the operands of the current SIMD arithmetic instruction.
  Register MulDest = MI.getOperand(0).getReg();
  Register SrcReg0 = MI.getOperand(1).getReg();
  RegState Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
  Register SrcReg1 = MI.getOperand(2).getReg();
  RegState Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
  unsigned DupDest;

  // Instructions of interest have either 4 or 5 operands.
  if (MI.getNumOperands() == 5) {
    Register SrcReg2 = MI.getOperand(3).getReg();
    RegState Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
    unsigned LaneNumber = MI.getOperand(4).getImm();
    // Create a new DUP instruction, unless an equivalent one earlier in the
    // block can be reused.
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
      DupDest = MRI->createVirtualRegister(RC);
      BuildMI(MBB, MI, MI.getDebugLoc(), *DupMCID, DupDest)
          .addReg(SrcReg2, Src2IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, MI.getDebugLoc(), *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(SrcReg1, Src1IsKill)
        .addReg(DupDest, Src2IsKill);
  } else if (MI.getNumOperands() == 4) {
    unsigned LaneNumber = MI.getOperand(3).getImm();
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
      DupDest = MRI->createVirtualRegister(RC);
      BuildMI(MBB, MI, MI.getDebugLoc(), *DupMCID, DupDest)
          .addReg(SrcReg1, Src1IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, MI.getDebugLoc(), *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(DupDest, Src1IsKill);
  } else {
    return false;
  }

  ++NumModifiedInstr;
  return true;
}
bool AArch64SIMDInstrOptImpl::optimizeLdStInterleave(MachineInstr &MI) {
  unsigned SeqReg, AddrReg;
  unsigned StReg[4];
  RegState StRegKill[4];
  MachineInstr *DefiningMI;
  SmallVector<const MCInstrDesc *, MaxNumRepl> ReplInstrMCID;
  SmallVector<unsigned, MaxNumRepl> ZipDest;
  MachineBasicBlock &MBB = *MI.getParent();

  // If the current instruction matches one of the rewriting rules, gather
  // information about the parameters of the new instructions.
  bool Match = false;
  for (const auto &I : IRT) {
    if (MI.getOpcode() == I.OrigOpc) {
      SeqReg = MI.getOperand(0).getReg();
      AddrReg = MI.getOperand(1).getReg();
      DefiningMI = MRI->getUniqueVRegDef(SeqReg);
      unsigned NumReg = determineSrcReg(MI);
      if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
        return false;

      // Collect the replacement opcodes; the stores write to memory, so only
      // the ZIP instructions need fresh destination registers.
      for (unsigned J = 0; J < I.NumRepl; ++J) {
        unsigned Repl = I.ReplOpc[J];
        ReplInstrMCID.push_back(&TII->get(Repl));
        if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
          ZipDest.push_back(MRI->createVirtualRegister(I.RC));
      }
      Match = true;
      break;
    }
  }

  if (!Match)
    return false;
  // Determine if it is profitable to replace MI by the series of instructions
  // represented in ReplInstrMCID.
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  // Generate the replacement sequence of ZIP and STP instructions.
  DebugLoc DL = MI.getDebugLoc();
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    // ZIP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[1], StRegKill[1]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[1], StRegKill[1]);
    // STP instruction
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
        .addReg(ZipDest[0])
        .addReg(ZipDest[1])
        .addReg(AddrReg)
        .addImm(0);
    break;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    // ZIP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[2], StRegKill[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[2], StRegKill[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
        .addReg(StReg[1], StRegKill[1])
        .addReg(StReg[3], StRegKill[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3])
        .addReg(StReg[1], StRegKill[1])
        .addReg(StReg[3], StRegKill[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4])
        .addReg(ZipDest[0])
        .addReg(ZipDest[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5])
        .addReg(ZipDest[0])
        .addReg(ZipDest[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6])
        .addReg(ZipDest[1])
        .addReg(ZipDest[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7])
        .addReg(ZipDest[1])
        .addReg(ZipDest[3]);
    // STP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[8])
        .addReg(ZipDest[4])
        .addReg(ZipDest[5])
        .addReg(AddrReg)
        .addImm(0);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[9])
        .addReg(ZipDest[6])
        .addReg(ZipDest[7])
        .addReg(AddrReg)
        .addImm(2);
    break;
  }

  ++NumModifiedInstr;
  return true;
}
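// Sketch of the ST4 rewrite above: for sources A, B, C, D, the first four
// ZIPs interleave A with C and B with D; the next four ZIPs interleave those
// partial results, yielding the full A[i], B[i], C[i], D[i] element ordering,
// which the two STPs then store as four consecutive registers.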
// Process the REG_SEQUENCE that defines the source register of the store,
// collecting the NumArg source registers and their kill states.
bool AArch64SIMDInstrOptImpl::processSeqRegInst(MachineInstr *DefiningMI,
                                                unsigned *StReg,
                                                RegState *StRegKill,
                                                unsigned NumArg) const {
  assert(DefiningMI != nullptr);
  if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
    return false;

  for (unsigned I = 0; I < NumArg; ++I) {
    StReg[I] = DefiningMI->getOperand(2 * I + 1).getReg();
    StRegKill[I] = getKillRegState(DefiningMI->getOperand(2 * I + 1).isKill());

    // The even operands must be immediate subregister indices.
    if (!DefiningMI->getOperand(2 * I + 2).isImm())
      return false;
  }
  return true;
}
// Return the number of source registers of the interleaving store: 2 for ST2
// and 4 for ST4.
unsigned AArch64SIMDInstrOptImpl::determineSrcReg(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unsupported instruction for this pass");

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    return 2;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    return 4;
  }
}
bool AArch64SIMDInstrOptImpl::run(MachineFunction &MF) {
  const TargetSubtargetInfo &ST = MF.getSubtarget();
  TII = ST.getInstrInfo();
  MRI = &MF.getRegInfo();
  SchedModel.init(&ST);
  if (!SchedModel.hasInstrSchedModel())
    return false;

  bool Changed = false;
  for (auto OptimizationKind : {VectorElem, Interleave}) {
    if (!shouldExitEarly(&MF, OptimizationKind)) {
      SmallVector<MachineInstr *, 8> RemoveMIs;
      for (MachineBasicBlock &MBB : MF) {
        for (MachineInstr &MI : MBB) {
          bool InstRewrite;
          if (OptimizationKind == VectorElem)
            InstRewrite = optimizeVectElement(MI);
          else
            InstRewrite = optimizeLdStInterleave(MI);
          if (InstRewrite) {
            // Add MI to the list of instructions to be removed given that it
            // has been replaced.
            RemoveMIs.push_back(&MI);
            Changed = true;
          }
        }
      }
      for (MachineInstr *MI : RemoveMIs)
        MI->eraseFromParent();
    }
  }

  return Changed;
}
bool AArch64SIMDInstrOptLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return AArch64SIMDInstrOptImpl(SIMDInstrTable, InterlEarlyExit).run(MF);
}

PreservedAnalyses
AArch64SIMDInstrOptPass::run(MachineFunction &MF,
                             MachineFunctionAnalysisManager &MFAM) {
  bool Changed =
      AArch64SIMDInstrOptImpl(SIMDInstrTable, InterlEarlyExit).run(MF);
  if (!Changed)
    return PreservedAnalyses::all();

  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

/// Returns an instance of the high cost ASIMD instruction replacement
/// optimization pass.
FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
  return new AArch64SIMDInstrOptLegacy();
}