Line data Source code
1 : //=- AArch64VectorByElementOpt.cpp - AArch64 vector by element inst opt pass =//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : // This file contains a pass that performs optimization for vector by element
11 : // SIMD instructions.
12 : //
13 : // Certain SIMD instructions with vector element operand are not efficient.
14 : // Rewrite them into SIMD instructions with vector operands. This rewrite
15 : // is driven by the latency of the instructions.
16 : //
17 : // Example:
18 : // fmla v0.4s, v1.4s, v2.s[1]
19 : // is rewritten into
20 : // dup v3.4s, v2.s[1]
21 : // fmla v0.4s, v1.4s, v3.4s
22 : //
23 : //===----------------------------------------------------------------------===//
24 :
25 : #include "AArch64InstrInfo.h"
26 : #include "llvm/ADT/SmallVector.h"
27 : #include "llvm/ADT/Statistic.h"
28 : #include "llvm/ADT/StringRef.h"
29 : #include "llvm/CodeGen/MachineBasicBlock.h"
30 : #include "llvm/CodeGen/MachineFunction.h"
31 : #include "llvm/CodeGen/MachineFunctionPass.h"
32 : #include "llvm/CodeGen/MachineInstr.h"
33 : #include "llvm/CodeGen/MachineInstrBuilder.h"
34 : #include "llvm/CodeGen/MachineOperand.h"
35 : #include "llvm/CodeGen/MachineRegisterInfo.h"
36 : #include "llvm/CodeGen/TargetSchedule.h"
37 : #include "llvm/MC/MCInstrDesc.h"
38 : #include "llvm/MC/MCSchedule.h"
39 : #include "llvm/Pass.h"
40 : #include "llvm/Target/TargetInstrInfo.h"
41 : #include "llvm/Target/TargetSubtargetInfo.h"
42 : #include <map>
43 :
44 : using namespace llvm;
45 :
46 : #define DEBUG_TYPE "aarch64-vectorbyelement-opt"
47 :
48 : STATISTIC(NumModifiedInstr,
49 : "Number of vector by element instructions modified");
50 :
51 : #define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \
52 : "AArch64 vector by element instruction optimization pass"
53 :
54 : namespace {
55 :
56 2721 : struct AArch64VectorByElementOpt : public MachineFunctionPass {
57 : static char ID;
58 :
59 : const TargetInstrInfo *TII;
60 : MachineRegisterInfo *MRI;
61 : TargetSchedModel SchedModel;
62 :
63 914 : AArch64VectorByElementOpt() : MachineFunctionPass(ID) {
64 914 : initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry());
65 914 : }
66 :
67 : /// Based only on latency of instructions, determine if it is cost efficient
68 : /// to replace the instruction InstDesc by the two instructions InstDescRep1
69 : /// and InstDescRep2.
70 : /// Return true if replacement is recommended.
71 : bool
72 : shouldReplaceInstruction(MachineFunction *MF, const MCInstrDesc *InstDesc,
73 : const MCInstrDesc *InstDescRep1,
74 : const MCInstrDesc *InstDescRep2,
75 : std::map<unsigned, bool> &VecInstElemTable) const;
76 :
77 : /// Determine if we need to exit the vector by element instruction
78 : /// optimization pass early. This makes sure that Targets with no need
79 : /// for this optimization do not spent any compile time on this pass.
80 : /// This check is done by comparing the latency of an indexed FMLA
81 : /// instruction to the latency of the DUP + the latency of a vector
82 : /// FMLA instruction. We do not check on other related instructions such
83 : /// as FMLS as we assume that if the situation shows up for one
84 : /// instruction, then it is likely to show up for the related ones.
85 : /// Return true if early exit of the pass is recommended.
86 : bool earlyExitVectElement(MachineFunction *MF);
87 :
88 : /// Check whether an equivalent DUP instruction has already been
89 : /// created or not.
90 : /// Return true when the dup instruction already exists. In this case,
91 : /// DestReg will point to the destination of the already created DUP.
92 : bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
93 : unsigned LaneNumber, unsigned *DestReg) const;
94 :
95 : /// Certain SIMD instructions with vector element operand are not efficient.
96 : /// Rewrite them into SIMD instructions with vector operands. This rewrite
97 : /// is driven by the latency of the instructions.
98 : /// Return true if the SIMD instruction is modified.
99 : bool optimizeVectElement(MachineInstr &MI,
100 : std::map<unsigned, bool> *VecInstElemTable) const;
101 :
102 : bool runOnMachineFunction(MachineFunction &Fn) override;
103 :
104 913 : StringRef getPassName() const override {
105 913 : return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
106 : }
107 : };
108 :
109 : char AArch64VectorByElementOpt::ID = 0;
110 :
111 : } // end anonymous namespace
112 :
113 315295 : INITIALIZE_PASS(AArch64VectorByElementOpt, "aarch64-vectorbyelement-opt",
114 : AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
115 :
116 : /// Based only on latency of instructions, determine if it is cost efficient
117 : /// to replace the instruction InstDesc by the two instructions InstDescRep1
118 : /// and InstDescRep2. Note that it is assumed in this fuction that an
119 : /// instruction of type InstDesc is always replaced by the same two
120 : /// instructions as results are cached here.
121 : /// Return true if replacement is recommended.
122 1059 : bool AArch64VectorByElementOpt::shouldReplaceInstruction(
123 : MachineFunction *MF, const MCInstrDesc *InstDesc,
124 : const MCInstrDesc *InstDescRep1, const MCInstrDesc *InstDescRep2,
125 : std::map<unsigned, bool> &VecInstElemTable) const {
126 : // Check if replacment decision is alredy available in the cached table.
127 : // if so, return it.
128 1061 : if (!VecInstElemTable.empty() &&
129 6 : VecInstElemTable.find(InstDesc->getOpcode()) != VecInstElemTable.end())
130 0 : return VecInstElemTable[InstDesc->getOpcode()];
131 :
132 1059 : unsigned SCIdx = InstDesc->getSchedClass();
133 1059 : unsigned SCIdxRep1 = InstDescRep1->getSchedClass();
134 1059 : unsigned SCIdxRep2 = InstDescRep2->getSchedClass();
135 : const MCSchedClassDesc *SCDesc =
136 2118 : SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
137 : const MCSchedClassDesc *SCDescRep1 =
138 2118 : SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep1);
139 : const MCSchedClassDesc *SCDescRep2 =
140 2118 : SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep2);
141 :
142 : // If a subtarget does not define resources for any of the instructions
143 : // of interest, then return false for no replacement.
144 1059 : if (!SCDesc->isValid() || SCDesc->isVariant() || !SCDescRep1->isValid() ||
145 2118 : SCDescRep1->isVariant() || !SCDescRep2->isValid() ||
146 : SCDescRep2->isVariant()) {
147 0 : VecInstElemTable[InstDesc->getOpcode()] = false;
148 : return false;
149 : }
150 :
151 3177 : if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) >
152 3177 : SchedModel.computeInstrLatency(InstDescRep1->getOpcode()) +
153 2118 : SchedModel.computeInstrLatency(InstDescRep2->getOpcode())) {
154 992 : VecInstElemTable[InstDesc->getOpcode()] = true;
155 : return true;
156 : }
157 1126 : VecInstElemTable[InstDesc->getOpcode()] = false;
158 : return false;
159 : }
160 :
161 : /// Determine if we need to exit the vector by element instruction
162 : /// optimization pass early. This makes sure that Targets with no need
163 : /// for this optimization do not spent any compile time on this pass.
164 : /// This check is done by comparing the latency of an indexed FMLA
165 : /// instruction to the latency of the DUP + the latency of a vector
166 : /// FMLA instruction. We do not check on other related instructions such
167 : /// as FMLS as we assume that if the situation shows up for one
168 : /// instruction, then it is likely to show up for the related ones.
169 : /// Return true if early exit of the pass is recommended.
170 1010 : bool AArch64VectorByElementOpt::earlyExitVectElement(MachineFunction *MF) {
171 2020 : std::map<unsigned, bool> VecInstElemTable;
172 2020 : const MCInstrDesc *IndexMulMCID = &TII->get(AArch64::FMLAv4i32_indexed);
173 2020 : const MCInstrDesc *DupMCID = &TII->get(AArch64::DUPv4i32lane);
174 2020 : const MCInstrDesc *MulMCID = &TII->get(AArch64::FMULv4f32);
175 :
176 1010 : if (!shouldReplaceInstruction(MF, IndexMulMCID, DupMCID, MulMCID,
177 : VecInstElemTable))
178 : return true;
179 : return false;
180 : }
181 :
182 : /// Check whether an equivalent DUP instruction has already been
183 : /// created or not.
184 : /// Return true when the dup instruction already exists. In this case,
185 : /// DestReg will point to the destination of the already created DUP.
186 49 : bool AArch64VectorByElementOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
187 : unsigned SrcReg, unsigned LaneNumber,
188 : unsigned *DestReg) const {
189 98 : for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
190 215 : MII != MIE;) {
191 334 : MII--;
192 167 : MachineInstr *CurrentMI = &*MII;
193 :
194 2 : if (CurrentMI->getOpcode() == DupOpcode &&
195 2 : CurrentMI->getNumOperands() == 3 &&
196 171 : CurrentMI->getOperand(1).getReg() == SrcReg &&
197 2 : CurrentMI->getOperand(2).getImm() == LaneNumber) {
198 1 : *DestReg = CurrentMI->getOperand(0).getReg();
199 1 : return true;
200 : }
201 : }
202 :
203 : return false;
204 : }
205 :
206 : /// Certain SIMD instructions with vector element operand are not efficient.
207 : /// Rewrite them into SIMD instructions with vector operands. This rewrite
208 : /// is driven by the latency of the instructions.
209 : /// The instruction of concerns are for the time being fmla, fmls, fmul,
210 : /// and fmulx and hence they are hardcoded.
211 : ///
212 : /// Example:
213 : /// fmla v0.4s, v1.4s, v2.s[1]
214 : /// is rewritten into
215 : /// dup v3.4s, v2.s[1] // dup not necessary if redundant
216 : /// fmla v0.4s, v1.4s, v3.4s
217 : /// Return true if the SIMD instruction is modified.
218 4110 : bool AArch64VectorByElementOpt::optimizeVectElement(
219 : MachineInstr &MI, std::map<unsigned, bool> *VecInstElemTable) const {
220 : const MCInstrDesc *MulMCID, *DupMCID;
221 4110 : const TargetRegisterClass *RC = &AArch64::FPR128RegClass;
222 :
223 8220 : switch (MI.getOpcode()) {
224 : default:
225 : return false;
226 :
227 : // 4X32 instructions
228 6 : case AArch64::FMLAv4i32_indexed:
229 12 : DupMCID = &TII->get(AArch64::DUPv4i32lane);
230 12 : MulMCID = &TII->get(AArch64::FMLAv4f32);
231 6 : break;
232 6 : case AArch64::FMLSv4i32_indexed:
233 12 : DupMCID = &TII->get(AArch64::DUPv4i32lane);
234 12 : MulMCID = &TII->get(AArch64::FMLSv4f32);
235 6 : break;
236 4 : case AArch64::FMULXv4i32_indexed:
237 8 : DupMCID = &TII->get(AArch64::DUPv4i32lane);
238 8 : MulMCID = &TII->get(AArch64::FMULXv4f32);
239 4 : break;
240 4 : case AArch64::FMULv4i32_indexed:
241 8 : DupMCID = &TII->get(AArch64::DUPv4i32lane);
242 8 : MulMCID = &TII->get(AArch64::FMULv4f32);
243 4 : break;
244 :
245 : // 2X64 instructions
246 3 : case AArch64::FMLAv2i64_indexed:
247 6 : DupMCID = &TII->get(AArch64::DUPv2i64lane);
248 6 : MulMCID = &TII->get(AArch64::FMLAv2f64);
249 3 : break;
250 3 : case AArch64::FMLSv2i64_indexed:
251 6 : DupMCID = &TII->get(AArch64::DUPv2i64lane);
252 6 : MulMCID = &TII->get(AArch64::FMLSv2f64);
253 3 : break;
254 4 : case AArch64::FMULXv2i64_indexed:
255 8 : DupMCID = &TII->get(AArch64::DUPv2i64lane);
256 8 : MulMCID = &TII->get(AArch64::FMULXv2f64);
257 4 : break;
258 3 : case AArch64::FMULv2i64_indexed:
259 6 : DupMCID = &TII->get(AArch64::DUPv2i64lane);
260 6 : MulMCID = &TII->get(AArch64::FMULv2f64);
261 3 : break;
262 :
263 : // 2X32 instructions
264 4 : case AArch64::FMLAv2i32_indexed:
265 4 : RC = &AArch64::FPR64RegClass;
266 8 : DupMCID = &TII->get(AArch64::DUPv2i32lane);
267 8 : MulMCID = &TII->get(AArch64::FMLAv2f32);
268 4 : break;
269 4 : case AArch64::FMLSv2i32_indexed:
270 4 : RC = &AArch64::FPR64RegClass;
271 8 : DupMCID = &TII->get(AArch64::DUPv2i32lane);
272 8 : MulMCID = &TII->get(AArch64::FMLSv2f32);
273 4 : break;
274 4 : case AArch64::FMULXv2i32_indexed:
275 4 : RC = &AArch64::FPR64RegClass;
276 8 : DupMCID = &TII->get(AArch64::DUPv2i32lane);
277 8 : MulMCID = &TII->get(AArch64::FMULXv2f32);
278 4 : break;
279 4 : case AArch64::FMULv2i32_indexed:
280 4 : RC = &AArch64::FPR64RegClass;
281 8 : DupMCID = &TII->get(AArch64::DUPv2i32lane);
282 8 : MulMCID = &TII->get(AArch64::FMULv2f32);
283 4 : break;
284 : }
285 :
286 49 : if (!shouldReplaceInstruction(MI.getParent()->getParent(),
287 98 : &TII->get(MI.getOpcode()), DupMCID, MulMCID,
288 : *VecInstElemTable))
289 : return false;
290 :
291 49 : const DebugLoc &DL = MI.getDebugLoc();
292 49 : MachineBasicBlock &MBB = *MI.getParent();
293 49 : MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
294 :
295 : // get the operands of the current SIMD arithmetic instruction.
296 49 : unsigned MulDest = MI.getOperand(0).getReg();
297 49 : unsigned SrcReg0 = MI.getOperand(1).getReg();
298 147 : unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
299 49 : unsigned SrcReg1 = MI.getOperand(2).getReg();
300 147 : unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
301 : unsigned DupDest;
302 :
303 : // Instructions of interest have either 4 or 5 operands.
304 49 : if (MI.getNumOperands() == 5) {
305 26 : unsigned SrcReg2 = MI.getOperand(3).getReg();
306 78 : unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
307 26 : unsigned LaneNumber = MI.getOperand(4).getImm();
308 :
309 : // Create a new DUP instruction. Note that if an equivalent DUP instruction
310 : // has already been created before, then use that one instread of creating
311 : // a new one.
312 52 : if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
313 25 : DupDest = MRI.createVirtualRegister(RC);
314 50 : BuildMI(MBB, MI, DL, *DupMCID, DupDest)
315 25 : .addReg(SrcReg2, Src2IsKill)
316 50 : .addImm(LaneNumber);
317 : }
318 52 : BuildMI(MBB, MI, DL, *MulMCID, MulDest)
319 26 : .addReg(SrcReg0, Src0IsKill)
320 26 : .addReg(SrcReg1, Src1IsKill)
321 26 : .addReg(DupDest, Src2IsKill);
322 23 : } else if (MI.getNumOperands() == 4) {
323 23 : unsigned LaneNumber = MI.getOperand(3).getImm();
324 46 : if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
325 23 : DupDest = MRI.createVirtualRegister(RC);
326 46 : BuildMI(MBB, MI, DL, *DupMCID, DupDest)
327 23 : .addReg(SrcReg1, Src1IsKill)
328 46 : .addImm(LaneNumber);
329 : }
330 46 : BuildMI(MBB, MI, DL, *MulMCID, MulDest)
331 23 : .addReg(SrcReg0, Src0IsKill)
332 23 : .addReg(DupDest, Src1IsKill);
333 : } else {
334 : return false;
335 : }
336 :
337 : ++NumModifiedInstr;
338 : return true;
339 : }
340 :
341 11003 : bool AArch64VectorByElementOpt::runOnMachineFunction(MachineFunction &MF) {
342 11003 : if (skipFunction(*MF.getFunction()))
343 : return false;
344 :
345 11002 : TII = MF.getSubtarget().getInstrInfo();
346 11002 : MRI = &MF.getRegInfo();
347 11002 : const TargetSubtargetInfo &ST = MF.getSubtarget();
348 : const AArch64InstrInfo *AAII =
349 11002 : static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
350 11002 : if (!AAII)
351 : return false;
352 11002 : SchedModel.init(ST.getSchedModel(), &ST, AAII);
353 11002 : if (!SchedModel.hasInstrSchedModel())
354 : return false;
355 :
356 : // A simple check to exit this pass early for targets that do not need it.
357 1010 : if (earlyExitVectElement(&MF))
358 : return false;
359 :
360 447 : bool Changed = false;
361 447 : std::map<unsigned, bool> VecInstElemTable;
362 894 : SmallVector<MachineInstr *, 8> RemoveMIs;
363 :
364 1855 : for (MachineBasicBlock &MBB : MF) {
365 1028 : for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
366 4624 : MII != MIE;) {
367 4110 : MachineInstr &MI = *MII;
368 4110 : if (optimizeVectElement(MI, &VecInstElemTable)) {
369 : // Add MI to the list of instructions to be removed given that it has
370 : // been replaced.
371 49 : RemoveMIs.push_back(&MI);
372 49 : Changed = true;
373 : }
374 : ++MII;
375 : }
376 : }
377 :
378 1390 : for (MachineInstr *MI : RemoveMIs)
379 49 : MI->eraseFromParent();
380 :
381 447 : return Changed;
382 : }
383 :
384 : /// createAArch64VectorByElementOptPass - returns an instance of the
385 : /// vector by element optimization pass.
386 914 : FunctionPass *llvm::createAArch64VectorByElementOptPass() {
387 914 : return new AArch64VectorByElementOpt();
388 : }
|