LLVM 4.0.0
AArch64AdvSIMDScalarPass.cpp
//===-- AArch64AdvSIMDScalarPass.cpp - Use AdvSIMD scalar instructions ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// When profitable, replace GPR-targeting i64 instructions with their
// AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined
// as minimizing the number of cross-class register copies.
//===----------------------------------------------------------------------===//

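// As a rough illustration of the transformation described above (register
// names below are made up for the example), a 64-bit integer add whose
// operands already live in FPR64 registers:
//
//   fmov x0, d8          // cross-class copy out of the FPR
//   fmov x1, d9          // cross-class copy out of the FPR
//   add  x2, x0, x1      // GPR form of the add
//   fmov d10, x2         // copy the result back into an FPR
//
// can instead use the AdvSIMD scalar form
//
//   add  d10, d8, d9
//
// eliminating all three cross-class copies. The heuristics below attempt the
// rewrite only when it reduces (or at least does not increase) the number of
// such copies.
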
//===----------------------------------------------------------------------===//
// TODO: Graph based predicate heuristics.
// Walking the instruction list linearly will get many, perhaps most, of
// the cases, but to do a truly thorough job of this, we need a more
// holistic approach.
//
// This optimization is very similar in spirit to the register allocator's
// spill placement, only here we're determining where to place cross-class
// register copies rather than spills. As such, a similar approach is
// called for.
//
// We want to build up a set of graphs of all instructions which are candidates
// for transformation along with instructions which generate their inputs and
// consume their outputs. For each edge in the graph, we assign a weight
// based on whether there is a copy required there (weight zero if not) and
// the block frequency of the block containing the defining or using
// instruction, whichever is less. Our optimization is then a graph problem
// to minimize the total weight of all the graphs, then transform instructions
// and add or remove copy instructions as called for to implement the
// solution.
//===----------------------------------------------------------------------===//

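// Restating the edge-weight rule above as a formula (a sketch of the intended
// cost model, not code that exists in this pass): for an edge between a
// defining instruction D and a using instruction U,
//
//   weight(D, U) = 0                                    if no copy is needed
//   weight(D, U) = min(freq(block(D)), freq(block(U)))  if a copy is needed
//
// and the goal would be to choose transformations (and copy placements) that
// minimize the sum of these weights over all edges.
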
#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-simd-scalar"

// Allow forcing all i64 operations with equivalent SIMD instructions to use
// them. For stress-testing the transformation function.
static cl::opt<bool>
TransformAll("aarch64-simd-scalar-force-all",
             cl::desc("Force use of AdvSIMD scalar instructions everywhere"),
             cl::init(false), cl::Hidden);
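// (Illustrative usage, assuming the pass has been added to the codegen
// pipeline: the hidden flag above can be passed on the llc command line,
// e.g. "llc -mtriple=aarch64 -aarch64-simd-scalar-force-all ...", to stress
// the transformation on every eligible instruction.)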

STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used");
STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted");
STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted");

#define AARCH64_ADVSIMD_NAME "AdvSIMD Scalar Operation Optimization"

namespace {
class AArch64AdvSIMDScalar : public MachineFunctionPass {
  MachineRegisterInfo *MRI;
  const TargetInstrInfo *TII;

private:
  // isProfitableToTransform - Predicate function to determine whether an
  // instruction should be transformed to its equivalent AdvSIMD scalar
  // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
  bool isProfitableToTransform(const MachineInstr &MI) const;

  // transformInstruction - Perform the transformation of an instruction
  // to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
  // to be the correct register class, minimizing cross-class copies.
  void transformInstruction(MachineInstr &MI);

  // processMachineBasicBlock - Main optimization loop.
  bool processMachineBasicBlock(MachineBasicBlock *MBB);

public:
  static char ID; // Pass identification, replacement for typeid.
  explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {
    initializeAArch64AdvSIMDScalarPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &F) override;

  StringRef getPassName() const override { return AARCH64_ADVSIMD_NAME; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
char AArch64AdvSIMDScalar::ID = 0;
} // end anonymous namespace

INITIALIZE_PASS(AArch64AdvSIMDScalar, "aarch64-simd-scalar",
                AARCH64_ADVSIMD_NAME, false, false)

static bool isGPR64(unsigned Reg, unsigned SubReg,
                    const MachineRegisterInfo *MRI) {
  if (SubReg)
    return false;
  if (TargetRegisterInfo::isVirtualRegister(Reg))
    return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass);
  return AArch64::GPR64RegClass.contains(Reg);
}

static bool isFPR64(unsigned Reg, unsigned SubReg,
                    const MachineRegisterInfo *MRI) {
  if (TargetRegisterInfo::isVirtualRegister(Reg))
    return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) &&
            SubReg == 0) ||
           (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) &&
            SubReg == AArch64::dsub);
  // Physical register references just check the register class directly.
  return (AArch64::FPR64RegClass.contains(Reg) && SubReg == 0) ||
         (AArch64::FPR128RegClass.contains(Reg) && SubReg == AArch64::dsub);
}

// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64
// copy instruction. Return nullptr if the instruction is not a copy.
static MachineOperand *getSrcFromCopy(MachineInstr *MI,
                                      const MachineRegisterInfo *MRI,
                                      unsigned &SubReg) {
  SubReg = 0;
  // The "FMOV Xd, Dn" instruction is the typical form.
  if (MI->getOpcode() == AArch64::FMOVDXr ||
      MI->getOpcode() == AArch64::FMOVXDr)
    return &MI->getOperand(1);
  // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see
  // these at this stage, but it's easy to check for.
  if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) {
    SubReg = AArch64::dsub;
    return &MI->getOperand(1);
  }
  // Or just a plain COPY instruction. This can be directly to/from FPR64,
  // or it can be a dsub subreg reference to an FPR128.
  if (MI->getOpcode() == AArch64::COPY) {
    if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
                MRI) &&
        isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI))
      return &MI->getOperand(1);
    if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
                MRI) &&
        isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(),
                MRI)) {
      SubReg = MI->getOperand(1).getSubReg();
      return &MI->getOperand(1);
    }
  }

  // Otherwise, this is some other kind of instruction.
  return nullptr;
}

// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent
// that we're considering transforming to, return that AdvSIMD opcode. For all
// others, return the original opcode.
static unsigned getTransformOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    break;
  // FIXME: Lots more possibilities.
  case AArch64::ADDXrr:
    return AArch64::ADDv1i64;
  case AArch64::SUBXrr:
    return AArch64::SUBv1i64;
  case AArch64::ANDXrr:
    return AArch64::ANDv8i8;
  case AArch64::EORXrr:
    return AArch64::EORv8i8;
  case AArch64::ORRXrr:
    return AArch64::ORRv8i8;
  }
  // No AdvSIMD equivalent, so just return the original opcode.
  return Opc;
}

static bool isTransformable(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return Opc != getTransformOpcode(Opc);
}

// isProfitableToTransform - Predicate function to determine whether an
// instruction should be transformed to its equivalent AdvSIMD scalar
// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
bool AArch64AdvSIMDScalar::isProfitableToTransform(
    const MachineInstr &MI) const {
  // If this instruction isn't eligible to be transformed (no SIMD equivalent),
  // early exit since that's the common case.
  if (!isTransformable(MI))
    return false;

  // Count the number of copies we'll need to add and approximate the number
  // of copies that a transform will enable us to remove.
  unsigned NumNewCopies = 3;
  unsigned NumRemovableCopies = 0;

  unsigned OrigSrc0 = MI.getOperand(1).getReg();
  unsigned OrigSrc1 = MI.getOperand(2).getReg();
  unsigned SubReg0;
  unsigned SubReg1;
  if (!MRI->def_empty(OrigSrc0)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc0);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0);
    // If the source was from a copy, we don't need to insert a new copy.
    if (MOSrc0)
      --NumNewCopies;
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (MOSrc0 && MRI->hasOneNonDBGUse(OrigSrc0))
      ++NumRemovableCopies;
  }
  if (!MRI->def_empty(OrigSrc1)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc1);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1);
    if (MOSrc1)
      --NumNewCopies;
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (MOSrc1 && MRI->hasOneNonDBGUse(OrigSrc1))
      ++NumRemovableCopies;
  }

  // If any of the uses of the original instruction is a cross-class copy,
  // that's a copy that will be removable if we transform. Likewise, if
  // any of the uses is a transformable instruction, it's likely the transforms
  // will chain, enabling us to save a copy there, too. This is an aggressive
  // heuristic that approximates the graph-based cost analysis described above.
  unsigned Dst = MI.getOperand(0).getReg();
  bool AllUsesAreCopies = true;
  for (MachineRegisterInfo::use_instr_nodbg_iterator
           Use = MRI->use_instr_nodbg_begin(Dst),
           E = MRI->use_instr_nodbg_end();
       Use != E; ++Use) {
    unsigned SubReg;
    if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(*Use))
      ++NumRemovableCopies;
    // If the use is an INSERT_SUBREG, that's still something that can
    // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's
    // preferable to have it use the FPR64 in most cases, as if the source
    // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely.
    // Ditto for a lane insert.
    else if (Use->getOpcode() == AArch64::INSERT_SUBREG ||
             Use->getOpcode() == AArch64::INSvi64gpr)
      ;
    else
      AllUsesAreCopies = false;
  }
  // If all of the uses of the original destination register are copies to
  // FPR64, then we won't end up having a new copy back to GPR64 either.
  if (AllUsesAreCopies)
    --NumNewCopies;

  // If a transform will not increase the number of cross-class copies
  // required, return true.
  if (NumNewCopies <= NumRemovableCopies)
    return true;

  // Finally, even if we otherwise wouldn't transform, check if we're forcing
  // transformation of everything.
  return TransformAll;
}

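// To make the counting above concrete (a worked example with made-up virtual
// registers): for "%2 = ADDXrr %0, %1" where %0 is defined by a single-use
// FPR64-to-GPR64 copy, %1 is not, and the only use of %2 is another
// transformable ADDXrr, we start with NumNewCopies = 3 and
// NumRemovableCopies = 0; the copy-defined %0 gives NumNewCopies = 2 and
// NumRemovableCopies = 1; the transformable user bumps NumRemovableCopies
// to 2; and since every use tolerates an FPR64 result, NumNewCopies drops
// to 1. Because 1 <= 2, the transform is considered profitable.
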
static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr &MI,
                                unsigned Dst, unsigned Src, bool IsKill) {
  MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                                    TII->get(AArch64::COPY), Dst)
                                .addReg(Src, getKillRegState(IsKill));
  DEBUG(dbgs() << "    adding copy: " << *MIB);
  ++NumCopiesInserted;
  return MIB;
}

// transformInstruction - Perform the transformation of an instruction
// to its equivalent AdvSIMD scalar instruction. Update inputs and outputs
// to be the correct register class, minimizing cross-class copies.
void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) {
  DEBUG(dbgs() << "Scalar transform: " << MI);

  MachineBasicBlock *MBB = MI.getParent();
  unsigned OldOpc = MI.getOpcode();
  unsigned NewOpc = getTransformOpcode(OldOpc);
  assert(OldOpc != NewOpc && "transform an instruction to itself?!");

  // Check if we need a copy for the source registers.
  unsigned OrigSrc0 = MI.getOperand(1).getReg();
  unsigned OrigSrc1 = MI.getOperand(2).getReg();
  unsigned Src0 = 0, SubReg0;
  unsigned Src1 = 0, SubReg1;
  bool KillSrc0 = false, KillSrc1 = false;
  if (!MRI->def_empty(OrigSrc0)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc0);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0);
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (MOSrc0) {
      Src0 = MOSrc0->getReg();
      KillSrc0 = MOSrc0->isKill();
      // Src0 is going to be reused, thus, it cannot be killed anymore.
      MOSrc0->setIsKill(false);
      if (MRI->hasOneNonDBGUse(OrigSrc0)) {
        assert(MOSrc0 && "Can't delete copy w/o a valid original source!");
        Def->eraseFromParent();
        ++NumCopiesDeleted;
      }
    }
  }
  if (!MRI->def_empty(OrigSrc1)) {
    MachineRegisterInfo::def_instr_iterator Def =
        MRI->def_instr_begin(OrigSrc1);
    assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!");
    MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1);
    // If there are no other users of the original source, we can delete
    // that instruction.
    if (MOSrc1) {
      Src1 = MOSrc1->getReg();
      KillSrc1 = MOSrc1->isKill();
      // Src1 is going to be reused, thus, it cannot be killed anymore.
      MOSrc1->setIsKill(false);
      if (MRI->hasOneNonDBGUse(OrigSrc1)) {
        assert(MOSrc1 && "Can't delete copy w/o a valid original source!");
        Def->eraseFromParent();
        ++NumCopiesDeleted;
      }
    }
  }
  // If we weren't able to reference the original source directly, create a
  // copy.
  if (!Src0) {
    SubReg0 = 0;
    Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
    insertCopy(TII, MI, Src0, OrigSrc0, KillSrc0);
    KillSrc0 = true;
  }
  if (!Src1) {
    SubReg1 = 0;
    Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
    insertCopy(TII, MI, Src1, OrigSrc1, KillSrc1);
    KillSrc1 = true;
  }

  // Create a vreg for the destination.
  // FIXME: No need to do this if the ultimate user expects an FPR64.
  // Check for that and avoid the copy if possible.
  unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);

  // For now, all of the new instructions have the same simple three-register
  // form, so no need to special case based on what instruction we're
  // building.
  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), Dst)
      .addReg(Src0, getKillRegState(KillSrc0), SubReg0)
      .addReg(Src1, getKillRegState(KillSrc1), SubReg1);

  // Now copy the result back out to a GPR.
  // FIXME: Try to avoid this if all uses could actually just use the FPR64
  // directly.
  insertCopy(TII, MI, MI.getOperand(0).getReg(), Dst, true);

  // Erase the old instruction.
  MI.eraseFromParent();

  ++NumScalarInsnsUsed;
}

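// Sketch of the rewrite above on a small (made-up) example, in MIR-style
// pseudo syntax: given
//
//   %0 = FMOVXDr %5        ; %5 is FPR64, %0 is GPR64
//   %2 = ADDXrr %0, %1     ; %1 has no FPR64-resident definition
//
// the transform reuses %5 directly, inserts one new cross-class copy for %1,
// builds the AdvSIMD scalar add into a fresh FPR64 vreg, and copies the
// result back to the original GPR64 destination:
//
//   %3 = COPY %1           ; inserted GPR64 -> FPR64 copy
//   %4 = ADDv1i64 %5, %3   ; new AdvSIMD scalar instruction
//   %2 = COPY %4           ; result copied back out to the GPR
//
// The FMOVXDr is deleted when %0 has no other non-debug users.
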
// processMachineBasicBlock - Main optimization loop.
bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
  bool Changed = false;
  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
    MachineInstr &MI = *I++;
    if (isProfitableToTransform(MI)) {
      transformInstruction(MI);
      Changed = true;
    }
  }
  return Changed;
}

// runOnMachineFunction - Pass entry point from PassManager.
bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
  bool Changed = false;
  DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");

  if (skipFunction(*mf.getFunction()))
    return false;

  MRI = &mf.getRegInfo();
  TII = mf.getSubtarget().getInstrInfo();

  // Just check things on a one-block-at-a-time basis.
  for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I)
    if (processMachineBasicBlock(&*I))
      Changed = true;
  return Changed;
}

// createAArch64AdvSIMDScalar - Factory function used by AArch64TargetMachine
// to add the pass to the PassManager.
FunctionPass *llvm::createAArch64AdvSIMDScalar() {
  return new AArch64AdvSIMDScalar();
}
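
// A hedged sketch of how this factory is typically wired into the AArch64
// codegen pipeline (the exact hook and enabling option live in
// AArch64TargetMachine.cpp and are assumed here for illustration):
//
//   void AArch64PassConfig::addPreRegAlloc() {
//     // Use AdvSIMD scalar instructions whenever profitable.
//     if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar)
//       addPass(createAArch64AdvSIMDScalar());
//   }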