LLVM  10.0.0svn
PPCVSXSwapRemoval.cpp
Go to the documentation of this file.
1 //===----------- PPCVSXSwapRemoval.cpp - Remove VSX LE Swaps -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===---------------------------------------------------------------------===//
8 //
9 // This pass analyzes vector computations and removes unnecessary
10 // doubleword swaps (xxswapd instructions). This pass is performed
11 // only for little-endian VSX code generation.
12 //
13 // For this specific case, loads and stores of v4i32, v4f32, v2i64,
14 // and v2f64 vectors are inefficient. These are implemented using
15 // the lxvd2x and stxvd2x instructions, which invert the order of
16 // doublewords in a vector register. Thus code generation inserts
17 // an xxswapd after each such load, and prior to each such store.
18 //
19 // The extra xxswapd instructions reduce performance. The purpose
20 // of this pass is to reduce the number of xxswapd instructions
21 // required for correctness.
22 //
23 // The primary insight is that much code that operates on vectors
24 // does not care about the relative order of elements in a register,
25 // so long as the correct memory order is preserved. If we have a
26 // computation where all input values are provided by lxvd2x/xxswapd,
27 // all outputs are stored using xxswapd/lxvd2x, and all intermediate
28 // computations are lane-insensitive (independent of element order),
29 // then all the xxswapd instructions associated with the loads and
30 // stores may be removed without changing observable semantics.
31 //
32 // This pass uses standard equivalence class infrastructure to create
33 // maximal webs of computations fitting the above description. Each
34 // such web is then optimized by removing its unnecessary xxswapd
35 // instructions.
36 //
37 // There are some lane-sensitive operations for which we can still
38 // permit the optimization, provided we modify those operations
39 // accordingly. Such operations are identified as using "special
40 // handling" within this module.
41 //
42 //===---------------------------------------------------------------------===//
43 
44 #include "PPC.h"
45 #include "PPCInstrBuilder.h"
46 #include "PPCInstrInfo.h"
47 #include "PPCTargetMachine.h"
48 #include "llvm/ADT/DenseMap.h"
53 #include "llvm/Config/llvm-config.h"
54 #include "llvm/Support/Debug.h"
55 #include "llvm/Support/Format.h"
57 
58 using namespace llvm;
59 
60 #define DEBUG_TYPE "ppc-vsx-swaps"
61 
62 namespace {
63 
64 // A PPCVSXSwapEntry is created for each machine instruction that
65 // is relevant to a vector computation.
66 struct PPCVSXSwapEntry {
67  // Pointer to the instruction.
68  MachineInstr *VSEMI;
69 
70  // Unique ID (position in the swap vector).
71  int VSEId;
72 
73  // Attributes of this node.
74  unsigned int IsLoad : 1;
75  unsigned int IsStore : 1;
76  unsigned int IsSwap : 1;
77  unsigned int MentionsPhysVR : 1;
78  unsigned int IsSwappable : 1;
79  unsigned int MentionsPartialVR : 1;
80  unsigned int SpecialHandling : 3;
81  unsigned int WebRejected : 1;
82  unsigned int WillRemove : 1;
83 };
84 
85 enum SHValues {
86  SH_NONE = 0,
87  SH_EXTRACT,
88  SH_INSERT,
89  SH_NOSWAP_LD,
90  SH_NOSWAP_ST,
91  SH_SPLAT,
92  SH_XXPERMDI,
93  SH_COPYWIDEN
94 };
95 
96 struct PPCVSXSwapRemoval : public MachineFunctionPass {
97 
98  static char ID;
99  const PPCInstrInfo *TII;
100  MachineFunction *MF;
102 
103  // Swap entries are allocated in a vector for better performance.
104  std::vector<PPCVSXSwapEntry> SwapVector;
105 
106  // A mapping is maintained between machine instructions and
107  // their swap entries. The key is the address of the MI.
109 
110  // Equivalence classes are used to gather webs of related computation.
111  // Swap entries are represented by their VSEId fields.
113 
114  PPCVSXSwapRemoval() : MachineFunctionPass(ID) {
116  }
117 
118 private:
119  // Initialize data structures.
120  void initialize(MachineFunction &MFParm);
121 
122  // Walk the machine instructions to gather vector usage information.
123  // Return true iff vector mentions are present.
124  bool gatherVectorInstructions();
125 
126  // Add an entry to the swap vector and swap map.
127  int addSwapEntry(MachineInstr *MI, PPCVSXSwapEntry &SwapEntry);
128 
129  // Hunt backwards through COPY and SUBREG_TO_REG chains for a
130  // source register. VecIdx indicates the swap vector entry to
131  // mark as mentioning a physical register if the search leads
132  // to one.
133  unsigned lookThruCopyLike(unsigned SrcReg, unsigned VecIdx);
134 
135  // Generate equivalence classes for related computations (webs).
136  void formWebs();
137 
138  // Analyze webs and determine those that cannot be optimized.
139  void recordUnoptimizableWebs();
140 
141  // Record which swap instructions can be safely removed.
142  void markSwapsForRemoval();
143 
144  // Remove swaps and update other instructions requiring special
145  // handling. Return true iff any changes are made.
146  bool removeSwaps();
147 
148  // Insert a swap instruction from SrcReg to DstReg at the given
149  // InsertPoint.
150  void insertSwap(MachineInstr *MI, MachineBasicBlock::iterator InsertPoint,
151  unsigned DstReg, unsigned SrcReg);
152 
153  // Update instructions requiring special handling.
154  void handleSpecialSwappables(int EntryIdx);
155 
156  // Dump a description of the entries in the swap vector.
157  void dumpSwapVector();
158 
159  // Return true iff the given register is in the given class.
160  bool isRegInClass(unsigned Reg, const TargetRegisterClass *RC) {
162  return RC->hasSubClassEq(MRI->getRegClass(Reg));
163  return RC->contains(Reg);
164  }
165 
166  // Return true iff the given register is a full vector register.
167  bool isVecReg(unsigned Reg) {
168  return (isRegInClass(Reg, &PPC::VSRCRegClass) ||
169  isRegInClass(Reg, &PPC::VRRCRegClass));
170  }
171 
172  // Return true iff the given register is a partial vector register.
173  bool isScalarVecReg(unsigned Reg) {
174  return (isRegInClass(Reg, &PPC::VSFRCRegClass) ||
175  isRegInClass(Reg, &PPC::VSSRCRegClass));
176  }
177 
178  // Return true iff the given register mentions all or part of a
179  // vector register. Also sets Partial to true if the mention
180  // is for just the floating-point register overlap of the register.
181  bool isAnyVecReg(unsigned Reg, bool &Partial) {
182  if (isScalarVecReg(Reg))
183  Partial = true;
184  return isScalarVecReg(Reg) || isVecReg(Reg);
185  }
186 
187 public:
188  // Main entry point for this pass.
189  bool runOnMachineFunction(MachineFunction &MF) override {
190  if (skipFunction(MF.getFunction()))
191  return false;
192 
193  // If we don't have VSX on the subtarget, don't do anything.
194  // Also, on Power 9 the load and store ops preserve element order and so
195  // the swaps are not required.
196  const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
197  if (!STI.hasVSX() || !STI.needsSwapsForVSXMemOps())
198  return false;
199 
200  bool Changed = false;
201  initialize(MF);
202 
203  if (gatherVectorInstructions()) {
204  formWebs();
205  recordUnoptimizableWebs();
206  markSwapsForRemoval();
207  Changed = removeSwaps();
208  }
209 
210  // FIXME: See the allocation of EC in initialize().
211  delete EC;
212  return Changed;
213  }
214 };
215 
216 // Initialize data structures for this pass. In particular, clear the
217 // swap vector and allocate the equivalence class mapping before
218 // processing each function.
220  MF = &MFParm;
221  MRI = &MF->getRegInfo();
222  TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo();
223 
224  // An initial vector size of 256 appears to work well in practice.
225  // Small/medium functions with vector content tend not to incur a
226  // reallocation at this size. Three of the vector tests in
227  // projects/test-suite reallocate, which seems like a reasonable rate.
228  const int InitialVectorSize(256);
229  SwapVector.clear();
230  SwapVector.reserve(InitialVectorSize);
231 
232  // FIXME: Currently we allocate EC each time because we don't have
233  // access to the set representation on which to call clear(). Should
234  // consider adding a clear() method to the EquivalenceClasses class.
235  EC = new EquivalenceClasses<int>;
236 }
237 
238 // Create an entry in the swap vector for each instruction that mentions
239 // a full vector register, recording various characteristics of the
240 // instructions there.
241 bool PPCVSXSwapRemoval::gatherVectorInstructions() {
242  bool RelevantFunction = false;
243 
244  for (MachineBasicBlock &MBB : *MF) {
245  for (MachineInstr &MI : MBB) {
246 
247  if (MI.isDebugInstr())
248  continue;
249 
250  bool RelevantInstr = false;
251  bool Partial = false;
252 
253  for (const MachineOperand &MO : MI.operands()) {
254  if (!MO.isReg())
255  continue;
256  Register Reg = MO.getReg();
257  if (isAnyVecReg(Reg, Partial)) {
258  RelevantInstr = true;
259  break;
260  }
261  }
262 
263  if (!RelevantInstr)
264  continue;
265 
266  RelevantFunction = true;
267 
268  // Create a SwapEntry initialized to zeros, then fill in the
269  // instruction and ID fields before pushing it to the back
270  // of the swap vector.
271  PPCVSXSwapEntry SwapEntry{};
272  int VecIdx = addSwapEntry(&MI, SwapEntry);
273 
274  switch(MI.getOpcode()) {
275  default:
276  // Unless noted otherwise, an instruction is considered
277  // safe for the optimization. There are a large number of
278  // such true-SIMD instructions (all vector math, logical,
279  // select, compare, etc.). However, if the instruction
280  // mentions a partial vector register and does not have
281  // special handling defined, it is not swappable.
282  if (Partial)
283  SwapVector[VecIdx].MentionsPartialVR = 1;
284  else
285  SwapVector[VecIdx].IsSwappable = 1;
286  break;
287  case PPC::XXPERMDI: {
288  // This is a swap if it is of the form XXPERMDI t, s, s, 2.
289  // Unfortunately, MachineCSE ignores COPY and SUBREG_TO_REG, so we
290  // can also see XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), 2,
291  // for example. We have to look through chains of COPY and
292  // SUBREG_TO_REG to find the real source value for comparison.
293  // If the real source value is a physical register, then mark the
294  // XXPERMDI as mentioning a physical register.
295  int immed = MI.getOperand(3).getImm();
296  if (immed == 2) {
297  unsigned trueReg1 = lookThruCopyLike(MI.getOperand(1).getReg(),
298  VecIdx);
299  unsigned trueReg2 = lookThruCopyLike(MI.getOperand(2).getReg(),
300  VecIdx);
301  if (trueReg1 == trueReg2)
302  SwapVector[VecIdx].IsSwap = 1;
303  else {
304  // We can still handle these if the two registers are not
305  // identical, by adjusting the form of the XXPERMDI.
306  SwapVector[VecIdx].IsSwappable = 1;
307  SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
308  }
309  // This is a doubleword splat if it is of the form
310  // XXPERMDI t, s, s, 0 or XXPERMDI t, s, s, 3. As above we
311  // must look through chains of copy-likes to find the source
312  // register. We turn off the marking for mention of a physical
313  // register, because splatting it is safe; the optimization
314  // will not swap the value in the physical register. Whether
315  // or not the two input registers are identical, we can handle
316  // these by adjusting the form of the XXPERMDI.
317  } else if (immed == 0 || immed == 3) {
318 
319  SwapVector[VecIdx].IsSwappable = 1;
320  SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
321 
322  unsigned trueReg1 = lookThruCopyLike(MI.getOperand(1).getReg(),
323  VecIdx);
324  unsigned trueReg2 = lookThruCopyLike(MI.getOperand(2).getReg(),
325  VecIdx);
326  if (trueReg1 == trueReg2)
327  SwapVector[VecIdx].MentionsPhysVR = 0;
328 
329  } else {
330  // We can still handle these by adjusting the form of the XXPERMDI.
331  SwapVector[VecIdx].IsSwappable = 1;
332  SwapVector[VecIdx].SpecialHandling = SHValues::SH_XXPERMDI;
333  }
334  break;
335  }
336  case PPC::LVX:
337  // Non-permuting loads are currently unsafe. We can use special
338  // handling for this in the future. By not marking these as
339  // IsSwap, we ensure computations containing them will be rejected
340  // for now.
341  SwapVector[VecIdx].IsLoad = 1;
342  break;
343  case PPC::LXVD2X:
344  case PPC::LXVW4X:
345  // Permuting loads are marked as both load and swap, and are
346  // safe for optimization.
347  SwapVector[VecIdx].IsLoad = 1;
348  SwapVector[VecIdx].IsSwap = 1;
349  break;
350  case PPC::LXSDX:
351  case PPC::LXSSPX:
352  case PPC::XFLOADf64:
353  case PPC::XFLOADf32:
354  // A load of a floating-point value into the high-order half of
355  // a vector register is safe, provided that we introduce a swap
356  // following the load, which will be done by the SUBREG_TO_REG
357  // support. So just mark these as safe.
358  SwapVector[VecIdx].IsLoad = 1;
359  SwapVector[VecIdx].IsSwappable = 1;
360  break;
361  case PPC::STVX:
362  // Non-permuting stores are currently unsafe. We can use special
363  // handling for this in the future. By not marking these as
364  // IsSwap, we ensure computations containing them will be rejected
365  // for now.
366  SwapVector[VecIdx].IsStore = 1;
367  break;
368  case PPC::STXVD2X:
369  case PPC::STXVW4X:
370  // Permuting stores are marked as both store and swap, and are
371  // safe for optimization.
372  SwapVector[VecIdx].IsStore = 1;
373  SwapVector[VecIdx].IsSwap = 1;
374  break;
375  case PPC::COPY:
376  // These are fine provided they are moving between full vector
377  // register classes.
378  if (isVecReg(MI.getOperand(0).getReg()) &&
379  isVecReg(MI.getOperand(1).getReg()))
380  SwapVector[VecIdx].IsSwappable = 1;
381  // If we have a copy from one scalar floating-point register
382  // to another, we can accept this even if it is a physical
383  // register. The only way this gets involved is if it feeds
384  // a SUBREG_TO_REG, which is handled by introducing a swap.
385  else if (isScalarVecReg(MI.getOperand(0).getReg()) &&
386  isScalarVecReg(MI.getOperand(1).getReg()))
387  SwapVector[VecIdx].IsSwappable = 1;
388  break;
389  case PPC::SUBREG_TO_REG: {
390  // These are fine provided they are moving between full vector
391  // register classes. If they are moving from a scalar
392  // floating-point class to a vector class, we can handle those
393  // as well, provided we introduce a swap. It is generally the
394  // case that we will introduce fewer swaps than we remove, but
395  // (FIXME) a cost model could be used. However, introduced
396  // swaps could potentially be CSEd, so this is not trivial.
397  if (isVecReg(MI.getOperand(0).getReg()) &&
398  isVecReg(MI.getOperand(2).getReg()))
399  SwapVector[VecIdx].IsSwappable = 1;
400  else if (isVecReg(MI.getOperand(0).getReg()) &&
401  isScalarVecReg(MI.getOperand(2).getReg())) {
402  SwapVector[VecIdx].IsSwappable = 1;
403  SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYWIDEN;
404  }
405  break;
406  }
407  case PPC::VSPLTB:
408  case PPC::VSPLTH:
409  case PPC::VSPLTW:
410  case PPC::XXSPLTW:
411  // Splats are lane-sensitive, but we can use special handling
412  // to adjust the source lane for the splat.
413  SwapVector[VecIdx].IsSwappable = 1;
414  SwapVector[VecIdx].SpecialHandling = SHValues::SH_SPLAT;
415  break;
416  // The presence of the following lane-sensitive operations in a
417  // web will kill the optimization, at least for now. For these
418  // we do nothing, causing the optimization to fail.
419  // FIXME: Some of these could be permitted with special handling,
420  // and will be phased in as time permits.
421  // FIXME: There is no simple and maintainable way to express a set
422  // of opcodes having a common attribute in TableGen. Should this
423  // change, this is a prime candidate to use such a mechanism.
424  case PPC::INLINEASM:
425  case PPC::INLINEASM_BR:
426  case PPC::EXTRACT_SUBREG:
427  case PPC::INSERT_SUBREG:
428  case PPC::COPY_TO_REGCLASS:
429  case PPC::LVEBX:
430  case PPC::LVEHX:
431  case PPC::LVEWX:
432  case PPC::LVSL:
433  case PPC::LVSR:
434  case PPC::LVXL:
435  case PPC::STVEBX:
436  case PPC::STVEHX:
437  case PPC::STVEWX:
438  case PPC::STVXL:
439  // We can handle STXSDX and STXSSPX similarly to LXSDX and LXSSPX,
440  // by adding special handling for narrowing copies as well as
441  // widening ones. However, I've experimented with this, and in
442  // practice we currently do not appear to use STXSDX fed by
443  // a narrowing copy from a full vector register. Since I can't
444  // generate any useful test cases, I've left this alone for now.
445  case PPC::STXSDX:
446  case PPC::STXSSPX:
447  case PPC::VCIPHER:
448  case PPC::VCIPHERLAST:
449  case PPC::VMRGHB:
450  case PPC::VMRGHH:
451  case PPC::VMRGHW:
452  case PPC::VMRGLB:
453  case PPC::VMRGLH:
454  case PPC::VMRGLW:
455  case PPC::VMULESB:
456  case PPC::VMULESH:
457  case PPC::VMULESW:
458  case PPC::VMULEUB:
459  case PPC::VMULEUH:
460  case PPC::VMULEUW:
461  case PPC::VMULOSB:
462  case PPC::VMULOSH:
463  case PPC::VMULOSW:
464  case PPC::VMULOUB:
465  case PPC::VMULOUH:
466  case PPC::VMULOUW:
467  case PPC::VNCIPHER:
468  case PPC::VNCIPHERLAST:
469  case PPC::VPERM:
470  case PPC::VPERMXOR:
471  case PPC::VPKPX:
472  case PPC::VPKSHSS:
473  case PPC::VPKSHUS:
474  case PPC::VPKSDSS:
475  case PPC::VPKSDUS:
476  case PPC::VPKSWSS:
477  case PPC::VPKSWUS:
478  case PPC::VPKUDUM:
479  case PPC::VPKUDUS:
480  case PPC::VPKUHUM:
481  case PPC::VPKUHUS:
482  case PPC::VPKUWUM:
483  case PPC::VPKUWUS:
484  case PPC::VPMSUMB:
485  case PPC::VPMSUMD:
486  case PPC::VPMSUMH:
487  case PPC::VPMSUMW:
488  case PPC::VRLB:
489  case PPC::VRLD:
490  case PPC::VRLH:
491  case PPC::VRLW:
492  case PPC::VSBOX:
493  case PPC::VSHASIGMAD:
494  case PPC::VSHASIGMAW:
495  case PPC::VSL:
496  case PPC::VSLDOI:
497  case PPC::VSLO:
498  case PPC::VSR:
499  case PPC::VSRO:
500  case PPC::VSUM2SWS:
501  case PPC::VSUM4SBS:
502  case PPC::VSUM4SHS:
503  case PPC::VSUM4UBS:
504  case PPC::VSUMSWS:
505  case PPC::VUPKHPX:
506  case PPC::VUPKHSB:
507  case PPC::VUPKHSH:
508  case PPC::VUPKHSW:
509  case PPC::VUPKLPX:
510  case PPC::VUPKLSB:
511  case PPC::VUPKLSH:
512  case PPC::VUPKLSW:
513  case PPC::XXMRGHW:
514  case PPC::XXMRGLW:
515  // XXSLDWI could be replaced by a general permute with one of three
516  // permute control vectors (for shift values 1, 2, 3). However,
517  // VPERM has a more restrictive register class.
518  case PPC::XXSLDWI:
519  case PPC::XSCVDPSPN:
520  case PPC::XSCVSPDPN:
521  break;
522  }
523  }
524  }
525 
526  if (RelevantFunction) {
527  LLVM_DEBUG(dbgs() << "Swap vector when first built\n\n");
528  LLVM_DEBUG(dumpSwapVector());
529  }
530 
531  return RelevantFunction;
532 }
533 
534 // Add an entry to the swap vector and swap map, and make a
535 // singleton equivalence class for the entry.
536 int PPCVSXSwapRemoval::addSwapEntry(MachineInstr *MI,
537  PPCVSXSwapEntry& SwapEntry) {
538  SwapEntry.VSEMI = MI;
539  SwapEntry.VSEId = SwapVector.size();
540  SwapVector.push_back(SwapEntry);
541  EC->insert(SwapEntry.VSEId);
542  SwapMap[MI] = SwapEntry.VSEId;
543  return SwapEntry.VSEId;
544 }
545 
546 // This is used to find the "true" source register for an
547 // XXPERMDI instruction, since MachineCSE does not handle the
548 // "copy-like" operations (Copy and SubregToReg). Returns
549 // the original SrcReg unless it is the target of a copy-like
550 // operation, in which case we chain backwards through all
551 // such operations to the ultimate source register. If a
552 // physical register is encountered, we stop the search and
553 // flag the swap entry indicated by VecIdx (the original
554 // XXPERMDI) as mentioning a physical register.
555 unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg,
556  unsigned VecIdx) {
557  MachineInstr *MI = MRI->getVRegDef(SrcReg);
558  if (!MI->isCopyLike())
559  return SrcReg;
560 
561  unsigned CopySrcReg;
562  if (MI->isCopy())
563  CopySrcReg = MI->getOperand(1).getReg();
564  else {
565  assert(MI->isSubregToReg() && "bad opcode for lookThruCopyLike");
566  CopySrcReg = MI->getOperand(2).getReg();
567  }
568 
569  if (!Register::isVirtualRegister(CopySrcReg)) {
570  if (!isScalarVecReg(CopySrcReg))
571  SwapVector[VecIdx].MentionsPhysVR = 1;
572  return CopySrcReg;
573  }
574 
575  return lookThruCopyLike(CopySrcReg, VecIdx);
576 }
577 
578 // Generate equivalence classes for related computations (webs) by
579 // def-use relationships of virtual registers. Mention of a physical
580 // register terminates the generation of equivalence classes as this
581 // indicates a use of a parameter, definition of a return value, use
582 // of a value returned from a call, or definition of a parameter to a
583 // call. Computations with physical register mentions are flagged
584 // as such so their containing webs will not be optimized.
585 void PPCVSXSwapRemoval::formWebs() {
586 
587  LLVM_DEBUG(dbgs() << "\n*** Forming webs for swap removal ***\n\n");
588 
589  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
590 
591  MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
592 
593  LLVM_DEBUG(dbgs() << "\n" << SwapVector[EntryIdx].VSEId << " ");
594  LLVM_DEBUG(MI->dump());
595 
596  // It's sufficient to walk vector uses and join them to their unique
597  // definitions. In addition, check full vector register operands
598  // for physical regs. We exclude partial-vector register operands
599  // because we can handle them if copied to a full vector.
600  for (const MachineOperand &MO : MI->operands()) {
601  if (!MO.isReg())
602  continue;
603 
604  Register Reg = MO.getReg();
605  if (!isVecReg(Reg) && !isScalarVecReg(Reg))
606  continue;
607 
608  if (!Register::isVirtualRegister(Reg)) {
609  if (!(MI->isCopy() && isScalarVecReg(Reg)))
610  SwapVector[EntryIdx].MentionsPhysVR = 1;
611  continue;
612  }
613 
614  if (!MO.isUse())
615  continue;
616 
617  MachineInstr* DefMI = MRI->getVRegDef(Reg);
618  assert(SwapMap.find(DefMI) != SwapMap.end() &&
619  "Inconsistency: def of vector reg not found in swap map!");
620  int DefIdx = SwapMap[DefMI];
621  (void)EC->unionSets(SwapVector[DefIdx].VSEId,
622  SwapVector[EntryIdx].VSEId);
623 
624  LLVM_DEBUG(dbgs() << format("Unioning %d with %d\n",
625  SwapVector[DefIdx].VSEId,
626  SwapVector[EntryIdx].VSEId));
627  LLVM_DEBUG(dbgs() << " Def: ");
628  LLVM_DEBUG(DefMI->dump());
629  }
630  }
631 }
632 
633 // Walk the swap vector entries looking for conditions that prevent their
634 // containing computations from being optimized. When such conditions are
635 // found, mark the representative of the computation's equivalence class
636 // as rejected.
637 void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
638 
639  LLVM_DEBUG(dbgs() << "\n*** Rejecting webs for swap removal ***\n\n");
640 
641  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
642  int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);
643 
644  // If representative is already rejected, don't waste further time.
645  if (SwapVector[Repr].WebRejected)
646  continue;
647 
648  // Reject webs containing mentions of physical or partial registers, or
649  // containing operations that we don't know how to handle in a lane-
650  // permuted region.
651  if (SwapVector[EntryIdx].MentionsPhysVR ||
652  SwapVector[EntryIdx].MentionsPartialVR ||
653  !(SwapVector[EntryIdx].IsSwappable || SwapVector[EntryIdx].IsSwap)) {
654 
655  SwapVector[Repr].WebRejected = 1;
656 
657  LLVM_DEBUG(
658  dbgs() << format("Web %d rejected for physreg, partial reg, or not "
659  "swap[pable]\n",
660  Repr));
661  LLVM_DEBUG(dbgs() << " in " << EntryIdx << ": ");
662  LLVM_DEBUG(SwapVector[EntryIdx].VSEMI->dump());
663  LLVM_DEBUG(dbgs() << "\n");
664  }
665 
666  // Reject webs than contain swapping loads that feed something other
667  // than a swap instruction.
668  else if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) {
669  MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
670  Register DefReg = MI->getOperand(0).getReg();
671 
672  // We skip debug instructions in the analysis. (Note that debug
673  // location information is still maintained by this optimization
674  // because it remains on the LXVD2X and STXVD2X instructions after
675  // the XXPERMDIs are removed.)
676  for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
677  int UseIdx = SwapMap[&UseMI];
678 
679  if (!SwapVector[UseIdx].IsSwap || SwapVector[UseIdx].IsLoad ||
680  SwapVector[UseIdx].IsStore) {
681 
682  SwapVector[Repr].WebRejected = 1;
683 
684  LLVM_DEBUG(dbgs() << format(
685  "Web %d rejected for load not feeding swap\n", Repr));
686  LLVM_DEBUG(dbgs() << " def " << EntryIdx << ": ");
687  LLVM_DEBUG(MI->dump());
688  LLVM_DEBUG(dbgs() << " use " << UseIdx << ": ");
689  LLVM_DEBUG(UseMI.dump());
690  LLVM_DEBUG(dbgs() << "\n");
691  }
692  }
693 
694  // Reject webs that contain swapping stores that are fed by something
695  // other than a swap instruction.
696  } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) {
697  MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
698  Register UseReg = MI->getOperand(0).getReg();
699  MachineInstr *DefMI = MRI->getVRegDef(UseReg);
700  Register DefReg = DefMI->getOperand(0).getReg();
701  int DefIdx = SwapMap[DefMI];
702 
703  if (!SwapVector[DefIdx].IsSwap || SwapVector[DefIdx].IsLoad ||
704  SwapVector[DefIdx].IsStore) {
705 
706  SwapVector[Repr].WebRejected = 1;
707 
708  LLVM_DEBUG(dbgs() << format(
709  "Web %d rejected for store not fed by swap\n", Repr));
710  LLVM_DEBUG(dbgs() << " def " << DefIdx << ": ");
711  LLVM_DEBUG(DefMI->dump());
712  LLVM_DEBUG(dbgs() << " use " << EntryIdx << ": ");
713  LLVM_DEBUG(MI->dump());
714  LLVM_DEBUG(dbgs() << "\n");
715  }
716 
717  // Ensure all uses of the register defined by DefMI feed store
718  // instructions
719  for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
720  int UseIdx = SwapMap[&UseMI];
721 
722  if (SwapVector[UseIdx].VSEMI->getOpcode() != MI->getOpcode()) {
723  SwapVector[Repr].WebRejected = 1;
724 
725  LLVM_DEBUG(
726  dbgs() << format(
727  "Web %d rejected for swap not feeding only stores\n", Repr));
728  LLVM_DEBUG(dbgs() << " def "
729  << " : ");
730  LLVM_DEBUG(DefMI->dump());
731  LLVM_DEBUG(dbgs() << " use " << UseIdx << ": ");
732  LLVM_DEBUG(SwapVector[UseIdx].VSEMI->dump());
733  LLVM_DEBUG(dbgs() << "\n");
734  }
735  }
736  }
737  }
738 
739  LLVM_DEBUG(dbgs() << "Swap vector after web analysis:\n\n");
740  LLVM_DEBUG(dumpSwapVector());
741 }
742 
743 // Walk the swap vector entries looking for swaps fed by permuting loads
744 // and swaps that feed permuting stores. If the containing computation
745 // has not been marked rejected, mark each such swap for removal.
746 // (Removal is delayed in case optimization has disturbed the pattern,
747 // such that multiple loads feed the same swap, etc.)
748 void PPCVSXSwapRemoval::markSwapsForRemoval() {
749 
750  LLVM_DEBUG(dbgs() << "\n*** Marking swaps for removal ***\n\n");
751 
752  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
753 
754  if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) {
755  int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);
756 
757  if (!SwapVector[Repr].WebRejected) {
758  MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
759  Register DefReg = MI->getOperand(0).getReg();
760 
761  for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
762  int UseIdx = SwapMap[&UseMI];
763  SwapVector[UseIdx].WillRemove = 1;
764 
765  LLVM_DEBUG(dbgs() << "Marking swap fed by load for removal: ");
766  LLVM_DEBUG(UseMI.dump());
767  }
768  }
769 
770  } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) {
771  int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);
772 
773  if (!SwapVector[Repr].WebRejected) {
774  MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
775  Register UseReg = MI->getOperand(0).getReg();
776  MachineInstr *DefMI = MRI->getVRegDef(UseReg);
777  int DefIdx = SwapMap[DefMI];
778  SwapVector[DefIdx].WillRemove = 1;
779 
780  LLVM_DEBUG(dbgs() << "Marking swap feeding store for removal: ");
781  LLVM_DEBUG(DefMI->dump());
782  }
783 
784  } else if (SwapVector[EntryIdx].IsSwappable &&
785  SwapVector[EntryIdx].SpecialHandling != 0) {
786  int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);
787 
788  if (!SwapVector[Repr].WebRejected)
789  handleSpecialSwappables(EntryIdx);
790  }
791  }
792 }
793 
794 // Create an xxswapd instruction and insert it prior to the given point.
795 // MI is used to determine basic block and debug loc information.
796 // FIXME: When inserting a swap, we should check whether SrcReg is
797 // defined by another swap: SrcReg = XXPERMDI Reg, Reg, 2; If so,
798 // then instead we should generate a copy from Reg to DstReg.
799 void PPCVSXSwapRemoval::insertSwap(MachineInstr *MI,
800  MachineBasicBlock::iterator InsertPoint,
801  unsigned DstReg, unsigned SrcReg) {
802  BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
803  TII->get(PPC::XXPERMDI), DstReg)
804  .addReg(SrcReg)
805  .addReg(SrcReg)
806  .addImm(2);
807 }
808 
809 // The identified swap entry requires special handling to allow its
810 // containing computation to be optimized. Perform that handling
811 // here.
812 // FIXME: Additional opportunities will be phased in with subsequent
813 // patches.
814 void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
815  switch (SwapVector[EntryIdx].SpecialHandling) {
816 
817  default:
818  llvm_unreachable("Unexpected special handling type");
819 
820  // For splats based on an index into a vector, add N/2 modulo N
821  // to the index, where N is the number of vector elements.
822  case SHValues::SH_SPLAT: {
823  MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
824  unsigned NElts;
825 
826  LLVM_DEBUG(dbgs() << "Changing splat: ");
827  LLVM_DEBUG(MI->dump());
828 
829  switch (MI->getOpcode()) {
830  default:
831  llvm_unreachable("Unexpected splat opcode");
832  case PPC::VSPLTB: NElts = 16; break;
833  case PPC::VSPLTH: NElts = 8; break;
834  case PPC::VSPLTW:
835  case PPC::XXSPLTW: NElts = 4; break;
836  }
837 
838  unsigned EltNo;
839  if (MI->getOpcode() == PPC::XXSPLTW)
840  EltNo = MI->getOperand(2).getImm();
841  else
842  EltNo = MI->getOperand(1).getImm();
843 
844  EltNo = (EltNo + NElts / 2) % NElts;
845  if (MI->getOpcode() == PPC::XXSPLTW)
846  MI->getOperand(2).setImm(EltNo);
847  else
848  MI->getOperand(1).setImm(EltNo);
849 
850  LLVM_DEBUG(dbgs() << " Into: ");
851  LLVM_DEBUG(MI->dump());
852  break;
853  }
854 
855  // For an XXPERMDI that isn't handled otherwise, we need to
856  // reverse the order of the operands. If the selector operand
857  // has a value of 0 or 3, we need to change it to 3 or 0,
858  // respectively. Otherwise we should leave it alone. (This
859  // is equivalent to reversing the two bits of the selector
860  // operand and complementing the result.)
861  case SHValues::SH_XXPERMDI: {
862  MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
863 
864  LLVM_DEBUG(dbgs() << "Changing XXPERMDI: ");
865  LLVM_DEBUG(MI->dump());
866 
867  unsigned Selector = MI->getOperand(3).getImm();
868  if (Selector == 0 || Selector == 3)
869  Selector = 3 - Selector;
870  MI->getOperand(3).setImm(Selector);
871 
872  Register Reg1 = MI->getOperand(1).getReg();
873  Register Reg2 = MI->getOperand(2).getReg();
874  MI->getOperand(1).setReg(Reg2);
875  MI->getOperand(2).setReg(Reg1);
876 
877  // We also need to swap kill flag associated with the register.
878  bool IsKill1 = MI->getOperand(1).isKill();
879  bool IsKill2 = MI->getOperand(2).isKill();
880  MI->getOperand(1).setIsKill(IsKill2);
881  MI->getOperand(2).setIsKill(IsKill1);
882 
883  LLVM_DEBUG(dbgs() << " Into: ");
884  LLVM_DEBUG(MI->dump());
885  break;
886  }
887 
888  // For a copy from a scalar floating-point register to a vector
889  // register, removing swaps will leave the copied value in the
890  // wrong lane. Insert a swap following the copy to fix this.
891  case SHValues::SH_COPYWIDEN: {
892  MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
893 
894  LLVM_DEBUG(dbgs() << "Changing SUBREG_TO_REG: ");
895  LLVM_DEBUG(MI->dump());
896 
897  Register DstReg = MI->getOperand(0).getReg();
898  const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
899  Register NewVReg = MRI->createVirtualRegister(DstRC);
900 
901  MI->getOperand(0).setReg(NewVReg);
902  LLVM_DEBUG(dbgs() << " Into: ");
903  LLVM_DEBUG(MI->dump());
904 
905  auto InsertPoint = ++MachineBasicBlock::iterator(MI);
906 
907  // Note that an XXPERMDI requires a VSRC, so if the SUBREG_TO_REG
908  // is copying to a VRRC, we need to be careful to avoid a register
909  // assignment problem. In this case we must copy from VRRC to VSRC
910  // prior to the swap, and from VSRC to VRRC following the swap.
911  // Coalescing will usually remove all this mess.
912  if (DstRC == &PPC::VRRCRegClass) {
913  Register VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
914  Register VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass);
915 
916  BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
917  TII->get(PPC::COPY), VSRCTmp1)
918  .addReg(NewVReg);
919  LLVM_DEBUG(std::prev(InsertPoint)->dump());
920 
921  insertSwap(MI, InsertPoint, VSRCTmp2, VSRCTmp1);
922  LLVM_DEBUG(std::prev(InsertPoint)->dump());
923 
924  BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(),
925  TII->get(PPC::COPY), DstReg)
926  .addReg(VSRCTmp2);
927  LLVM_DEBUG(std::prev(InsertPoint)->dump());
928 
929  } else {
930  insertSwap(MI, InsertPoint, DstReg, NewVReg);
931  LLVM_DEBUG(std::prev(InsertPoint)->dump());
932  }
933  break;
934  }
935  }
936 }
937 
938 // Walk the swap vector and replace each entry marked for removal with
939 // a copy operation.
940 bool PPCVSXSwapRemoval::removeSwaps() {
941 
942  LLVM_DEBUG(dbgs() << "\n*** Removing swaps ***\n\n");
943 
944  bool Changed = false;
945 
946  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
947  if (SwapVector[EntryIdx].WillRemove) {
948  Changed = true;
949  MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
950  MachineBasicBlock *MBB = MI->getParent();
951  BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(TargetOpcode::COPY),
952  MI->getOperand(0).getReg())
953  .add(MI->getOperand(1));
954 
955  LLVM_DEBUG(dbgs() << format("Replaced %d with copy: ",
956  SwapVector[EntryIdx].VSEId));
957  LLVM_DEBUG(MI->dump());
958 
959  MI->eraseFromParent();
960  }
961  }
962 
963  return Changed;
964 }
965 
966 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
967 // For debug purposes, dump the contents of the swap vector.
968 LLVM_DUMP_METHOD void PPCVSXSwapRemoval::dumpSwapVector() {
969 
970  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
971 
972  MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
973  int ID = SwapVector[EntryIdx].VSEId;
974 
975  dbgs() << format("%6d", ID);
976  dbgs() << format("%6d", EC->getLeaderValue(ID));
977  dbgs() << format(" %bb.%3d", MI->getParent()->getNumber());
978  dbgs() << format(" %14s ", TII->getName(MI->getOpcode()).str().c_str());
979 
980  if (SwapVector[EntryIdx].IsLoad)
981  dbgs() << "load ";
982  if (SwapVector[EntryIdx].IsStore)
983  dbgs() << "store ";
984  if (SwapVector[EntryIdx].IsSwap)
985  dbgs() << "swap ";
986  if (SwapVector[EntryIdx].MentionsPhysVR)
987  dbgs() << "physreg ";
988  if (SwapVector[EntryIdx].MentionsPartialVR)
989  dbgs() << "partialreg ";
990 
991  if (SwapVector[EntryIdx].IsSwappable) {
992  dbgs() << "swappable ";
993  switch(SwapVector[EntryIdx].SpecialHandling) {
994  default:
995  dbgs() << "special:**unknown**";
996  break;
997  case SH_NONE:
998  break;
999  case SH_EXTRACT:
1000  dbgs() << "special:extract ";
1001  break;
1002  case SH_INSERT:
1003  dbgs() << "special:insert ";
1004  break;
1005  case SH_NOSWAP_LD:
1006  dbgs() << "special:load ";
1007  break;
1008  case SH_NOSWAP_ST:
1009  dbgs() << "special:store ";
1010  break;
1011  case SH_SPLAT:
1012  dbgs() << "special:splat ";
1013  break;
1014  case SH_XXPERMDI:
1015  dbgs() << "special:xxpermdi ";
1016  break;
1017  case SH_COPYWIDEN:
1018  dbgs() << "special:copywiden ";
1019  break;
1020  }
1021  }
1022 
1023  if (SwapVector[EntryIdx].WebRejected)
1024  dbgs() << "rejected ";
1025  if (SwapVector[EntryIdx].WillRemove)
1026  dbgs() << "remove ";
1027 
1028  dbgs() << "\n";
1029 
1030  // For no-asserts builds.
1031  (void)MI;
1032  (void)ID;
1033  }
1034 
1035  dbgs() << "\n";
1036 }
1037 #endif
1038 
1039 } // end default namespace
1040 
1041 INITIALIZE_PASS_BEGIN(PPCVSXSwapRemoval, DEBUG_TYPE,
1042  "PowerPC VSX Swap Removal", false, false)
1043 INITIALIZE_PASS_END(PPCVSXSwapRemoval, DEBUG_TYPE,
1044  "PowerPC VSX Swap Removal", false, false)
1045 
1046 char PPCVSXSwapRemoval::ID = 0;
1047 FunctionPass*
1048 llvm::createPPCVSXSwapRemovalPass() { return new PPCVSXSwapRemoval(); }
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
bool contains(unsigned Reg) const
Return true if the specified register is included in this register class.
This class represents lattice values for constants.
Definition: AllocatorList.h:23
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds...
Definition: Compiler.h:473
FunctionPass * createPPCVSXSwapRemovalPass()
bool isSubregToReg() const
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:385
unsigned Reg
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:124
bool hasVSX() const
Definition: PPCSubtarget.h:252
iterator_range< mop_iterator > operands()
Definition: MachineInstr.h:461
bool isCopyLike() const
Return true if the instruction behaves like a copy.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
PowerPC VSX Swap Removal
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
void eraseFromParent()
Unlink &#39;this&#39; from the containing basic block and delete it.
INITIALIZE_PASS_BEGIN(PPCVSXSwapRemoval, DEBUG_TYPE, "PowerPC VSX Swap Removal", false, false) INITIALIZE_PASS_END(PPCVSXSwapRemoval
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:411
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:695
CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
void setReg(Register Reg)
Change the register this operand corresponds to.
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they&#39;re not in a MachineFuncti...
void initializePPCVSXSwapRemovalPass(PassRegistry &)
VSRC, CHAIN = LXVD2X_LE CHAIN, Ptr - Occurs only for little endian.
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
MachineInstrBundleIterator< MachineInstr > iterator
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
unsigned const MachineRegisterInfo * MRI
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineInstrBuilder & UseMI
static Register UseReg(const MachineOperand &MO)
EquivalenceClasses - This represents a collection of equivalence classes and supports three efficient...
void setImm(int64_t immVal)
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:284
static bool isVecReg(unsigned Reg)
bool isCopy() const
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
void setIsKill(bool Val=true)
static uint64_t add(uint64_t LeftOp, uint64_t RightOp)
Definition: FileCheck.cpp:243
MachineOperand class - Representation of each machine instruction operand.
MachineInstrBuilder MachineInstrBuilder & DefMI
VPERM - The PPC VPERM Instruction.
int64_t getImm() const
const Function & getFunction() const
Return the LLVM function that this machine code represents.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, ArrayRef< StringRef > StandardNames)
Initialize the set of available library functions based on the specified target triple.
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:256
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
Representation of each machine instruction.
Definition: MachineInstr.h:64
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
INLINEASM_BR - Terminator version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:698
bool needsSwapsForVSXMemOps() const
Definition: PPCSubtarget.h:301
static bool isVirtualRegister(unsigned Reg)
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:69
#define DEBUG_TYPE
IRTranslator LLVM IR MI
Register getReg() const
getReg - Returns the register number.
#define LLVM_DEBUG(X)
Definition: Debug.h:122
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:416
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
XXPERMDI - The PPC XXPERMDI instruction.