LLVM 23.0.0git
AArch64SIMDInstrOpt.cpp
Go to the documentation of this file.
1//
2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
3// See https://llvm.org/LICENSE.txt for license information.
4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5//
6//===----------------------------------------------------------------------===//
7//
8// This file contains a pass that performs optimization on SIMD instructions
9// with high latency by splitting them into more efficient series of
10// instructions.
11//
12// 1. Rewrite certain SIMD instructions with vector element due to their
13// inefficiency on some targets.
14//
15// For example:
16// fmla v0.4s, v1.4s, v2.s[1]
17//
18// Is rewritten into:
19// dup v3.4s, v2.s[1]
20// fmla v0.4s, v1.4s, v3.4s
21//
22// 2. Rewrite interleaved memory access instructions due to their
23// inefficiency on some targets.
24//
25// For example:
26// st2 {v0.4s, v1.4s}, addr
27//
28// Is rewritten into:
29// zip1 v2.4s, v0.4s, v1.4s
30// zip2 v3.4s, v0.4s, v1.4s
31// stp q2, q3, addr
32//
33//===----------------------------------------------------------------------===//
34
35#include "AArch64InstrInfo.h"
36#include "AArch64Subtarget.h"
38#include "llvm/ADT/Statistic.h"
39#include "llvm/ADT/StringRef.h"
51#include "llvm/MC/MCInstrDesc.h"
52#include "llvm/MC/MCSchedule.h"
53#include "llvm/Pass.h"
54#include <map>
55#include <unordered_map>
56
using namespace llvm;

// Debug type string used by the LLVM_DEBUG/STATISTIC machinery for this pass.
#define DEBUG_TYPE "aarch64-simd-instr-opt"

// Counts the SIMD instructions this pass rewrote, across both the
// vector-element and the interleaved-store sub-passes.
STATISTIC(NumModifiedInstr,
          "Number of SIMD instructions modified");

// Human-readable pass name used for pass registration and getPassName().
#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \
  "AArch64 SIMD instructions optimization pass"
66
67namespace {
68
// A costly instruction is replaced in this work by N efficient instructions.
// The maximum value of N is currently 10, which is needed for the ST4
// replacement (8 ZIP instructions + 2 STP instructions).
constexpr unsigned MaxNumRepl = 10;
72
/// Implementation of the AArch64 SIMD instruction optimization: rewrites
/// costly indexed SIMD arithmetic into DUP + vector-form arithmetic, and
/// costly interleaved stores (ST2/ST4) into ZIP + STP sequences, driven by
/// the target's scheduling model.
///
/// NOTE(review): run() and optimizeLdStInterleave() reference an `MRI`
/// member (presumably `MachineRegisterInfo *MRI;`) that is not visible in
/// this view — likely lost in extraction; confirm against the original file.
class AArch64SIMDInstrOptImpl {
public:
  const AArch64InstrInfo *TII;
  TargetSchedModel SchedModel;

  // Keyed by (opcode, subtarget CPU name) so decisions are per-CPU.
  using SIMDInstrTableMap = std::map<std::pair<unsigned, std::string>, bool>;

  // Keyed by subtarget CPU name.
  using InterlEarlyExitMap = std::unordered_map<std::string, bool>;

  // The two maps below are used to cache decisions instead of recomputing. Note
  // that we're only storing references, the data is scoped at the Pass level to
  // enable the caching.
  //
  // This is used to cache instruction replacement decisions within function
  // units and across function units.
  SIMDInstrTableMap &SIMDInstrTable;

  // This is used to cache the decision of whether to leave the interleaved
  // store instructions replacement pass early or not for a particular target.
  InterlEarlyExitMap &InterlEarlyExit;

  // The two kinds of rewrites this pass performs.
  typedef enum {
    VectorElem,
    Interleave
  } Subpass;

  // Instruction represented by OrigOpc is replaced by instructions in ReplOpc.
  struct InstReplInfo {
    unsigned OrigOpc;
    unsigned ReplOpc[MaxNumRepl];
    unsigned NumRepl;
    const TargetRegisterClass *RC;
  };

// Helpers to build InstReplInfo entries for ST2 (3 replacements) and
// ST4 (10 replacements) rewrite rules in the IRT table below.
#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
  {OpcOrg, {OpcR0, OpcR1, OpcR2}, 3, &RC}
#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
                OpcR7, OpcR8, OpcR9, RC) \
  {OpcOrg, \
   {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, \
   10, \
   &RC}

  AArch64SIMDInstrOptImpl(SIMDInstrTableMap &SIMDInstrTable,
                          InterlEarlyExitMap &InterlEarlyExit)
      : SIMDInstrTable(SIMDInstrTable), InterlEarlyExit(InterlEarlyExit) {}

  /// Based only on latency of instructions, determine if it is cost efficient
  /// to replace the instruction InstDesc by the instructions stored in the
  /// array InstDescRepl.
  /// Return true if replacement is expected to be faster.
  bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                         SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID);

  /// Determine if we need to exit the instruction replacement optimization
  /// passes early. This makes sure that no compile time is spent in this pass
  /// for targets with no need for any of these optimizations.
  /// Return true if early exit of the pass is recommended.
  bool shouldExitEarly(MachineFunction *MF, Subpass SP);

  /// Check whether an equivalent DUP instruction has already been
  /// created or not.
  /// Return true when the DUP instruction already exists. In this case,
  /// DestReg will point to the destination of the already created DUP.
  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
                unsigned LaneNumber, unsigned *DestReg) const;

  /// Certain SIMD instructions with vector element operand are not efficient.
  /// Rewrite them into SIMD instructions with vector operands. This rewrite
  /// is driven by the latency of the instructions.
  /// Return true if the SIMD instruction is modified.
  bool optimizeVectElement(MachineInstr &MI);

  /// Process The REG_SEQUENCE instruction, and extract the source
  /// operands of the ST2/4 instruction from it.
  /// Example of such instructions.
  ///   %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
  /// Return true when the instruction is processed successfully.
  bool processSeqRegInst(MachineInstr *DefiningMI, unsigned *StReg,
                         RegState *StRegKill, unsigned NumArg) const;

  /// Load/Store Interleaving instructions are not always beneficial.
  /// Replace them by ZIP instructionand classical load/store.
  /// Return true if the SIMD instruction is modified.
  bool optimizeLdStInterleave(MachineInstr &MI);

  /// Return the number of useful source registers for this
  /// instruction (2 for ST2 and 4 for ST4).
  unsigned determineSrcReg(MachineInstr &MI) const;

  /// Entry point: run both sub-passes over the machine function.
  bool run(MachineFunction &MF);
};
166
167struct AArch64SIMDInstrOptLegacy : public MachineFunctionPass {
168 static char ID;
169
170 AArch64SIMDInstrOptImpl::SIMDInstrTableMap SIMDInstrTable;
171 AArch64SIMDInstrOptImpl::InterlEarlyExitMap InterlEarlyExit;
172
173 AArch64SIMDInstrOptLegacy() : MachineFunctionPass(ID) {}
174
175 bool runOnMachineFunction(MachineFunction &Fn) override;
176
177 StringRef getPassName() const override {
179 }
180};
181
// Pass identification; the address of ID is the pass's unique identity token.
char AArch64SIMDInstrOptLegacy::ID = 0;
183
// The Instruction Replacement Table: maps each interleaved-store opcode to
// the ZIP1/ZIP2 + STP sequence that replaces it (see RuleST2/RuleST4 above).
// 128-bit stores pair with STPQi/FPR128, 64-bit stores with STPDi/FPR64.
constexpr AArch64SIMDInstrOptImpl::InstReplInfo IRT[] = {
  // ST2 instructions
  RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
          AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
          AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
          AArch64::STPDi, AArch64::FPR64RegClass),
  RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
          AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
          AArch64::STPDi, AArch64::FPR64RegClass),
  RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
          AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
          AArch64::STPDi, AArch64::FPR64RegClass),
  // ST4 instructions
  RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
          AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
          AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
          AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
          AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
          AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
          AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
          AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
  RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
          AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
          AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
          AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
          AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
          AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
  RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
          AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
          AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
  RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
          AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
          AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
          AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)};
230
231} // end anonymous namespace
232
233INITIALIZE_PASS(AArch64SIMDInstrOptLegacy, "aarch64-simd-instr-opt",
235
236/// Based only on latency of instructions, determine if it is cost efficient
237/// to replace the instruction InstDesc by the instructions stored in the
238/// array InstDescRepl.
239/// Return true if replacement is expected to be faster.
240bool AArch64SIMDInstrOptImpl::shouldReplaceInst(
241 MachineFunction *MF, const MCInstrDesc *InstDesc,
242 SmallVectorImpl<const MCInstrDesc *> &InstDescRepl) {
243 // Check if replacement decision is already available in the cached table.
244 // if so, return it.
245 std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
246 auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
247 auto It = SIMDInstrTable.find(InstID);
248 if (It != SIMDInstrTable.end())
249 return It->second;
250
251 unsigned SCIdx = InstDesc->getSchedClass();
252 const MCSchedClassDesc *SCDesc =
253 SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
254
255 // If a target does not define resources for the instructions
256 // of interest, then return false for no replacement.
257 const MCSchedClassDesc *SCDescRepl;
258 if (!SCDesc->isValid() || SCDesc->isVariant())
259 {
260 SIMDInstrTable[InstID] = false;
261 return false;
262 }
263 for (const auto *IDesc : InstDescRepl)
264 {
265 SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc(
266 IDesc->getSchedClass());
267 if (!SCDescRepl->isValid() || SCDescRepl->isVariant())
268 {
269 SIMDInstrTable[InstID] = false;
270 return false;
271 }
272 }
273
274 // Replacement cost.
275 unsigned ReplCost = 0;
276 for (const auto *IDesc :InstDescRepl)
277 ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());
278
279 if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost)
280 {
281 SIMDInstrTable[InstID] = true;
282 return true;
283 }
284 else
285 {
286 SIMDInstrTable[InstID] = false;
287 return false;
288 }
289}
290
291/// Determine if we need to exit this pass for a kind of instruction replacement
292/// early. This makes sure that no compile time is spent in this pass for
293/// targets with no need for any of these optimizations beyond performing this
294/// check.
295/// Return true if early exit of this pass for a kind of instruction
296/// replacement is recommended for a target.
297bool AArch64SIMDInstrOptImpl::shouldExitEarly(MachineFunction *MF, Subpass SP) {
298 const MCInstrDesc *OriginalMCID;
300
301 switch (SP) {
302 // For this optimization, check by comparing the latency of a representative
303 // instruction to that of the replacement instructions.
304 // TODO: check for all concerned instructions.
305 case VectorElem:
306 OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
307 ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
308 ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
309 if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
310 return false;
311 break;
312
313 // For this optimization, check for all concerned instructions.
314 case Interleave:
315 std::string Subtarget =
316 std::string(SchedModel.getSubtargetInfo()->getCPU());
317 auto It = InterlEarlyExit.find(Subtarget);
318 if (It != InterlEarlyExit.end())
319 return It->second;
320
321 for (const auto &I : IRT) {
322 OriginalMCID = &TII->get(I.OrigOpc);
323 for (unsigned J = 0; J < I.NumRepl; ++J)
324 ReplInstrMCID.push_back(&TII->get(I.ReplOpc[J]));
325 if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
326 InterlEarlyExit[Subtarget] = false;
327 return false;
328 }
329 ReplInstrMCID.clear();
330 }
331 InterlEarlyExit[Subtarget] = true;
332 break;
333 }
334
335 return true;
336}
337
338/// Check whether an equivalent DUP instruction has already been
339/// created or not.
340/// Return true when the DUP instruction already exists. In this case,
341/// DestReg will point to the destination of the already created DUP.
342bool AArch64SIMDInstrOptImpl::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
343 unsigned SrcReg, unsigned LaneNumber,
344 unsigned *DestReg) const {
345 for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
346 MII != MIE;) {
347 MII--;
348 MachineInstr *CurrentMI = &*MII;
349
350 if (CurrentMI->getOpcode() == DupOpcode &&
351 CurrentMI->getNumOperands() == 3 &&
352 CurrentMI->getOperand(1).getReg() == SrcReg &&
353 CurrentMI->getOperand(2).getImm() == LaneNumber) {
354 *DestReg = CurrentMI->getOperand(0).getReg();
355 return true;
356 }
357 }
358
359 return false;
360}
361
362/// Certain SIMD instructions with vector element operand are not efficient.
363/// Rewrite them into SIMD instructions with vector operands. This rewrite
364/// is driven by the latency of the instructions.
365/// The instruction of concerns are for the time being FMLA, FMLS, FMUL,
366/// and FMULX and hence they are hardcoded.
367///
368/// For example:
369/// fmla v0.4s, v1.4s, v2.s[1]
370///
371/// Is rewritten into
372/// dup v3.4s, v2.s[1] // DUP not necessary if redundant
373/// fmla v0.4s, v1.4s, v3.4s
374///
375/// Return true if the SIMD instruction is modified.
376bool AArch64SIMDInstrOptImpl::optimizeVectElement(MachineInstr &MI) {
377 const MCInstrDesc *MulMCID, *DupMCID;
378 const TargetRegisterClass *RC = &AArch64::FPR128RegClass;
379
380 switch (MI.getOpcode()) {
381 default:
382 return false;
383
384 // 4X32 instructions
385 case AArch64::FMLAv4i32_indexed:
386 DupMCID = &TII->get(AArch64::DUPv4i32lane);
387 MulMCID = &TII->get(AArch64::FMLAv4f32);
388 break;
389 case AArch64::FMLSv4i32_indexed:
390 DupMCID = &TII->get(AArch64::DUPv4i32lane);
391 MulMCID = &TII->get(AArch64::FMLSv4f32);
392 break;
393 case AArch64::FMULXv4i32_indexed:
394 DupMCID = &TII->get(AArch64::DUPv4i32lane);
395 MulMCID = &TII->get(AArch64::FMULXv4f32);
396 break;
397 case AArch64::FMULv4i32_indexed:
398 DupMCID = &TII->get(AArch64::DUPv4i32lane);
399 MulMCID = &TII->get(AArch64::FMULv4f32);
400 break;
401
402 // 2X64 instructions
403 case AArch64::FMLAv2i64_indexed:
404 DupMCID = &TII->get(AArch64::DUPv2i64lane);
405 MulMCID = &TII->get(AArch64::FMLAv2f64);
406 break;
407 case AArch64::FMLSv2i64_indexed:
408 DupMCID = &TII->get(AArch64::DUPv2i64lane);
409 MulMCID = &TII->get(AArch64::FMLSv2f64);
410 break;
411 case AArch64::FMULXv2i64_indexed:
412 DupMCID = &TII->get(AArch64::DUPv2i64lane);
413 MulMCID = &TII->get(AArch64::FMULXv2f64);
414 break;
415 case AArch64::FMULv2i64_indexed:
416 DupMCID = &TII->get(AArch64::DUPv2i64lane);
417 MulMCID = &TII->get(AArch64::FMULv2f64);
418 break;
419
420 // 2X32 instructions
421 case AArch64::FMLAv2i32_indexed:
422 RC = &AArch64::FPR64RegClass;
423 DupMCID = &TII->get(AArch64::DUPv2i32lane);
424 MulMCID = &TII->get(AArch64::FMLAv2f32);
425 break;
426 case AArch64::FMLSv2i32_indexed:
427 RC = &AArch64::FPR64RegClass;
428 DupMCID = &TII->get(AArch64::DUPv2i32lane);
429 MulMCID = &TII->get(AArch64::FMLSv2f32);
430 break;
431 case AArch64::FMULXv2i32_indexed:
432 RC = &AArch64::FPR64RegClass;
433 DupMCID = &TII->get(AArch64::DUPv2i32lane);
434 MulMCID = &TII->get(AArch64::FMULXv2f32);
435 break;
436 case AArch64::FMULv2i32_indexed:
437 RC = &AArch64::FPR64RegClass;
438 DupMCID = &TII->get(AArch64::DUPv2i32lane);
439 MulMCID = &TII->get(AArch64::FMULv2f32);
440 break;
441 }
442
444 ReplInstrMCID.push_back(DupMCID);
445 ReplInstrMCID.push_back(MulMCID);
446 if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
447 ReplInstrMCID))
448 return false;
449
450 const DebugLoc &DL = MI.getDebugLoc();
451 MachineBasicBlock &MBB = *MI.getParent();
452 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
453
454 // Get the operands of the current SIMD arithmetic instruction.
455 Register MulDest = MI.getOperand(0).getReg();
456 Register SrcReg0 = MI.getOperand(1).getReg();
457 RegState Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
458 Register SrcReg1 = MI.getOperand(2).getReg();
459 RegState Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
460 unsigned DupDest;
461
462 // Instructions of interest have either 4 or 5 operands.
463 if (MI.getNumOperands() == 5) {
464 Register SrcReg2 = MI.getOperand(3).getReg();
465 RegState Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
466 unsigned LaneNumber = MI.getOperand(4).getImm();
467 // Create a new DUP instruction. Note that if an equivalent DUP instruction
468 // has already been created before, then use that one instead of creating
469 // a new one.
470 if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
471 DupDest = MRI.createVirtualRegister(RC);
472 BuildMI(MBB, MI, DL, *DupMCID, DupDest)
473 .addReg(SrcReg2, Src2IsKill)
474 .addImm(LaneNumber);
475 }
476 BuildMI(MBB, MI, DL, *MulMCID, MulDest)
477 .addReg(SrcReg0, Src0IsKill)
478 .addReg(SrcReg1, Src1IsKill)
479 .addReg(DupDest, Src2IsKill);
480 } else if (MI.getNumOperands() == 4) {
481 unsigned LaneNumber = MI.getOperand(3).getImm();
482 if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
483 DupDest = MRI.createVirtualRegister(RC);
484 BuildMI(MBB, MI, DL, *DupMCID, DupDest)
485 .addReg(SrcReg1, Src1IsKill)
486 .addImm(LaneNumber);
487 }
488 BuildMI(MBB, MI, DL, *MulMCID, MulDest)
489 .addReg(SrcReg0, Src0IsKill)
490 .addReg(DupDest, Src1IsKill);
491 } else {
492 return false;
493 }
494
495 ++NumModifiedInstr;
496 return true;
497}
498
499/// Load/Store Interleaving instructions are not always beneficial.
500/// Replace them by ZIP instructions and classical load/store.
501///
502/// For example:
503/// st2 {v0.4s, v1.4s}, addr
504///
505/// Is rewritten into:
506/// zip1 v2.4s, v0.4s, v1.4s
507/// zip2 v3.4s, v0.4s, v1.4s
508/// stp q2, q3, addr
509//
510/// For example:
511/// st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
512///
513/// Is rewritten into:
514/// zip1 v4.4s, v0.4s, v2.4s
515/// zip2 v5.4s, v0.4s, v2.4s
516/// zip1 v6.4s, v1.4s, v3.4s
517/// zip2 v7.4s, v1.4s, v3.4s
518/// zip1 v8.4s, v4.4s, v6.4s
519/// zip2 v9.4s, v4.4s, v6.4s
520/// zip1 v10.4s, v5.4s, v7.4s
521/// zip2 v11.4s, v5.4s, v7.4s
522/// stp q8, q9, addr
523/// stp q10, q11, addr+32
524///
525/// Currently only instructions related to ST2 and ST4 are considered.
526/// Other may be added later.
527/// Return true if the SIMD instruction is modified.
528bool AArch64SIMDInstrOptImpl::optimizeLdStInterleave(MachineInstr &MI) {
529
530 unsigned SeqReg, AddrReg;
531 unsigned StReg[4];
532 RegState StRegKill[4];
533 MachineInstr *DefiningMI;
534 const DebugLoc &DL = MI.getDebugLoc();
535 MachineBasicBlock &MBB = *MI.getParent();
538
539 // If current instruction matches any of the rewriting rules, then
540 // gather information about parameters of the new instructions.
541 bool Match = false;
542 for (const auto &I : IRT) {
543 if (MI.getOpcode() == I.OrigOpc) {
544 SeqReg = MI.getOperand(0).getReg();
545 AddrReg = MI.getOperand(1).getReg();
546 DefiningMI = MRI->getUniqueVRegDef(SeqReg);
547 unsigned NumReg = determineSrcReg(MI);
548 if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
549 return false;
550
551 for (unsigned J = 0; J < I.NumRepl; ++J) {
552 unsigned Repl = I.ReplOpc[J];
553 ReplInstrMCID.push_back(&TII->get(Repl));
554 // Generate destination registers but only for non-store instruction.
555 if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
556 ZipDest.push_back(MRI->createVirtualRegister(I.RC));
557 }
558 Match = true;
559 break;
560 }
561 }
562
563 if (!Match)
564 return false;
565
566 // Determine if it is profitable to replace MI by the series of instructions
567 // represented in ReplInstrMCID.
568 if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
569 ReplInstrMCID))
570 return false;
571
572 // Generate the replacement instructions composed of ZIP1, ZIP2, and STP (at
573 // this point, the code generation is hardcoded and does not rely on the IRT
574 // table used above given that code generation for ST2 replacement is somewhat
575 // different than for ST4 replacement. We could have added more info into the
576 // table related to how we build new instructions but we may be adding more
577 // complexity with that).
578 switch (MI.getOpcode()) {
579 default:
580 return false;
581
582 case AArch64::ST2Twov16b:
583 case AArch64::ST2Twov8b:
584 case AArch64::ST2Twov8h:
585 case AArch64::ST2Twov4h:
586 case AArch64::ST2Twov4s:
587 case AArch64::ST2Twov2s:
588 case AArch64::ST2Twov2d:
589 // ZIP instructions
590 BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
591 .addReg(StReg[0])
592 .addReg(StReg[1]);
593 BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
594 .addReg(StReg[0], StRegKill[0])
595 .addReg(StReg[1], StRegKill[1]);
596 // STP instructions
597 BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
598 .addReg(ZipDest[0])
599 .addReg(ZipDest[1])
600 .addReg(AddrReg)
601 .addImm(0);
602 break;
603
604 case AArch64::ST4Fourv16b:
605 case AArch64::ST4Fourv8b:
606 case AArch64::ST4Fourv8h:
607 case AArch64::ST4Fourv4h:
608 case AArch64::ST4Fourv4s:
609 case AArch64::ST4Fourv2s:
610 case AArch64::ST4Fourv2d:
611 // ZIP instructions
612 BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
613 .addReg(StReg[0])
614 .addReg(StReg[2]);
615 BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
616 .addReg(StReg[0], StRegKill[0])
617 .addReg(StReg[2], StRegKill[2]);
618 BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
619 .addReg(StReg[1])
620 .addReg(StReg[3]);
621 BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3])
622 .addReg(StReg[1], StRegKill[1])
623 .addReg(StReg[3], StRegKill[3]);
624 BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4])
625 .addReg(ZipDest[0])
626 .addReg(ZipDest[2]);
627 BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5])
628 .addReg(ZipDest[0])
629 .addReg(ZipDest[2]);
630 BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6])
631 .addReg(ZipDest[1])
632 .addReg(ZipDest[3]);
633 BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7])
634 .addReg(ZipDest[1])
635 .addReg(ZipDest[3]);
636 // stp instructions
637 BuildMI(MBB, MI, DL, *ReplInstrMCID[8])
638 .addReg(ZipDest[4])
639 .addReg(ZipDest[5])
640 .addReg(AddrReg)
641 .addImm(0);
642 BuildMI(MBB, MI, DL, *ReplInstrMCID[9])
643 .addReg(ZipDest[6])
644 .addReg(ZipDest[7])
645 .addReg(AddrReg)
646 .addImm(2);
647 break;
648 }
649
650 ++NumModifiedInstr;
651 return true;
652}
653
654/// Process The REG_SEQUENCE instruction, and extract the source
655/// operands of the ST2/4 instruction from it.
656/// Example of such instruction.
657/// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
658/// Return true when the instruction is processed successfully.
659bool AArch64SIMDInstrOptImpl::processSeqRegInst(MachineInstr *DefiningMI,
660 unsigned *StReg,
661 RegState *StRegKill,
662 unsigned NumArg) const {
663 assert(DefiningMI != nullptr);
664 if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
665 return false;
666
667 for (unsigned i=0; i<NumArg; i++) {
668 StReg[i] = DefiningMI->getOperand(2*i+1).getReg();
669 StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill());
670
671 // Validation check for the other arguments.
672 if (DefiningMI->getOperand(2*i+2).isImm()) {
673 switch (DefiningMI->getOperand(2*i+2).getImm()) {
674 default:
675 return false;
676
677 case AArch64::dsub0:
678 case AArch64::dsub1:
679 case AArch64::dsub2:
680 case AArch64::dsub3:
681 case AArch64::qsub0:
682 case AArch64::qsub1:
683 case AArch64::qsub2:
684 case AArch64::qsub3:
685 break;
686 }
687 }
688 else
689 return false;
690 }
691 return true;
692}
693
694/// Return the number of useful source registers for this instruction
695/// (2 for ST2 and 4 for ST4).
696unsigned AArch64SIMDInstrOptImpl::determineSrcReg(MachineInstr &MI) const {
697 switch (MI.getOpcode()) {
698 default:
699 llvm_unreachable("Unsupported instruction for this pass");
700
701 case AArch64::ST2Twov16b:
702 case AArch64::ST2Twov8b:
703 case AArch64::ST2Twov8h:
704 case AArch64::ST2Twov4h:
705 case AArch64::ST2Twov4s:
706 case AArch64::ST2Twov2s:
707 case AArch64::ST2Twov2d:
708 return 2;
709
710 case AArch64::ST4Fourv16b:
711 case AArch64::ST4Fourv8b:
712 case AArch64::ST4Fourv8h:
713 case AArch64::ST4Fourv4h:
714 case AArch64::ST4Fourv4s:
715 case AArch64::ST4Fourv2s:
716 case AArch64::ST4Fourv2d:
717 return 4;
718 }
719}
720
721bool AArch64SIMDInstrOptImpl::run(MachineFunction &MF) {
722 MRI = &MF.getRegInfo();
723 const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
724 TII = ST.getInstrInfo();
725 SchedModel.init(&ST);
726 if (!SchedModel.hasInstrSchedModel())
727 return false;
728
729 bool Changed = false;
730 for (auto OptimizationKind : {VectorElem, Interleave}) {
731 if (!shouldExitEarly(&MF, OptimizationKind)) {
732 SmallVector<MachineInstr *, 8> RemoveMIs;
733 for (MachineBasicBlock &MBB : MF) {
734 for (MachineInstr &MI : MBB) {
735 bool InstRewrite;
736 if (OptimizationKind == VectorElem)
737 InstRewrite = optimizeVectElement(MI) ;
738 else
739 InstRewrite = optimizeLdStInterleave(MI);
740 if (InstRewrite) {
741 // Add MI to the list of instructions to be removed given that it
742 // has been replaced.
743 RemoveMIs.push_back(&MI);
744 Changed = true;
745 }
746 }
747 }
748 for (MachineInstr *MI : RemoveMIs)
749 MI->eraseFromParent();
750 }
751 }
752
753 return Changed;
754}
755
756bool AArch64SIMDInstrOptLegacy::runOnMachineFunction(MachineFunction &MF) {
757 if (skipFunction(MF.getFunction()))
758 return false;
759
760 return AArch64SIMDInstrOptImpl(SIMDInstrTable, InterlEarlyExit).run(MF);
761}
762
763PreservedAnalyses
766 const bool Changed =
767 AArch64SIMDInstrOptImpl(SIMDInstrTable, InterlEarlyExit).run(MF);
768 if (!Changed)
769 return PreservedAnalyses::all();
770
773 return PA;
774}
775
776/// Returns an instance of the high cost ASIMD instruction replacement
777/// optimization pass.
779 return new AArch64SIMDInstrOptLegacy();
780}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9, RC)
#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC)
#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition MD5.cpp:57
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
Describe properties that are true of each instruction in the target description file.
unsigned getOpcode() const
Return the opcode number for this descriptor.
StringRef getCPU() const
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
unsigned getNumOperands() const
Retuns the total number of operands.
const MachineOperand & getOperand(unsigned i) const
int64_t getImm() const
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
Provide an instruction scheduling machine model to CodeGen passes.
LLVM_ABI bool hasInstrSchedModel() const
Return true if this machine model includes an instruction-level scheduling model.
LLVM_ABI void init(const TargetSubtargetInfo *TSInfo, bool EnableSModel=true, bool EnableSItins=true)
Initialize the machine model for instruction scheduling.
const TargetSubtargetInfo * getSubtargetInfo() const
TargetSubtargetInfo getter.
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
DXILDebugInfoMap run(Module &M)
This is an optimization pass for GlobalISel generic memory operations.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
RegState
Flags to represent properties of register accesses.
constexpr RegState getKillRegState(bool B)
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
FunctionPass * createAArch64SIMDInstrOptPass()
Returns an instance of the high cost ASIMD instruction replacement optimization pass.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
bool isVariant() const
Definition MCSchedule.h:144