//===- InterleavedLoadCombine.cpp - Combine Interleaved Loads ---*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
//
// This file defines the interleaved-load-combine pass. The pass searches for
// ShuffleVectorInst instructions that execute interleaving loads. If a
// matching pattern is found, it adds a combined load and further instructions
// in a pattern that is detectable by InterleavedAccessPass. The old
// instructions are left dead to be removed later. The pass is specifically
// designed to be executed just before InterleavedAccessPass to find any
// left-over instances that are not detected within former passes.
//
//===----------------------------------------------------------------------===//
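
// For illustration, a hypothetical left-over pattern this pass is meant to
// catch (a factor-2 de-interleave written as strided shuffles over two
// adjacent loads):
//
//   %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr0, i64 1
//   %v0 = load <4 x float>, <4 x float>* %ptr0
//   %v1 = load <4 x float>, <4 x float>* %ptr1
//   %even = shufflevector <4 x float> %v0, <4 x float> %v1,
//                         <4 x i32> <i32 0, i32 2, i32 4, i32 6>
//   %odd = shufflevector <4 x float> %v0, <4 x float> %v1,
//                        <4 x i32> <i32 1, i32 3, i32 5, i32 7>
//
// After combining, both shuffles read from a single wide <8 x float> load,
// a shape InterleavedAccessPass can lower to a target interleaved-load
// operation (e.g. ld2 on AArch64).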

#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#include <algorithm>
#include <cassert>
#include <list>

using namespace llvm;

#define DEBUG_TYPE "interleaved-load-combine"

namespace {

/// Statistic counter
STATISTIC(NumInterleavedLoadCombine, "Number of combined loads");

/// Option to disable the pass
static cl::opt<bool> DisableInterleavedLoadCombine(
    "disable-" DEBUG_TYPE, cl::init(false), cl::Hidden,
    cl::desc("Disable combining of interleaved loads"));

struct VectorInfo;

struct InterleavedLoadCombineImpl {
public:
  InterleavedLoadCombineImpl(Function &F, DominatorTree &DT, MemorySSA &MSSA,
                             const TargetMachine &TM)
      : F(F), DT(DT), MSSA(MSSA),
        TLI(*TM.getSubtargetImpl(F)->getTargetLowering()),
        TTI(TM.getTargetTransformInfo(F)) {}

  /// Scan the function for interleaved load candidates and execute the
  /// replacement if applicable.
  bool run();

private:
  /// Function this pass is working on
  Function &F;

  /// Dominator Tree Analysis
  DominatorTree &DT;

  /// Memory Alias Analyses
  MemorySSA &MSSA;

  /// Target Lowering Information
  const TargetLowering &TLI;

  /// Target Transform Information
  TargetTransformInfo TTI;

  /// Find the instruction in set LIs that dominates all others, return nullptr
  /// if there is none.
  LoadInst *findFirstLoad(const std::set<LoadInst *> &LIs);

  /// Replace interleaved load candidates. It does additional
  /// analyses if this makes sense. Returns true on success and false
  /// if nothing has been changed.
  bool combine(std::list<VectorInfo> &InterleavedLoad,
               OptimizationRemarkEmitter &ORE);

  /// Given a set of VectorInfo containing candidates for a given interleave
  /// factor, find a set that represents a 'factor' interleaved load.
  bool findPattern(std::list<VectorInfo> &Candidates,
                   std::list<VectorInfo> &InterleavedLoad, unsigned Factor,
                   const DataLayout &DL);
}; // InterleavedLoadCombine

/// First Order Polynomial on an n-Bit Integer Value
///
/// Polynomial(Value) = Value * B + A + E*2^(n-e)
///
/// A and B are the coefficients. E*2^(n-e) is an error within 'e' most
/// significant bits. It is introduced if an exact computation cannot be proven
/// (e.g. division by 2).
///
/// As part of this optimization multiple loads will be combined. It is
/// necessary to prove that loads are within some relative offset to each
/// other. This class is used to prove relative offsets of values loaded from
/// memory.
///
/// Representing an integer in this form is sound since addition in two's
/// complement is associative (trivial) and multiplication distributes over the
/// addition (see Proof(1) in Polynomial::mul). Further, both operations
/// commute.
//
// Example:
//  declare @fn(i64 %IDX, <4 x float>* %PTR) {
//    %Pa1 = add i64 %IDX, 2
//    %Pa2 = lshr i64 %Pa1, 1
//    %Pa3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pa2
//    %Va = load <4 x float>, <4 x float>* %Pa3
//
//    %Pb1 = add i64 %IDX, 4
//    %Pb2 = lshr i64 %Pb1, 1
//    %Pb3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pb2
//    %Vb = load <4 x float>, <4 x float>* %Pb3
//  ... }
//
// The goal is to prove that two loads load consecutive addresses.
//
// In this case the polynomials are constructed by the following
// steps.
//
// The number tag #e specifies the error bits.
//
// Pa_0 = %IDX               #0
// Pa_1 = %IDX + 2           #0 | add 2
// Pa_2 = %IDX/2 + 1         #1 | lshr 1
// Pa_3 = %IDX/2 + 1         #1 | GEP, step signext to i64
// Pa_4 = (%IDX/2)*16 + 16   #0 | GEP, multiply index by sizeof(4) for floats
// Pa_5 = (%IDX/2)*16 + 16   #0 | GEP, add offset of leading components
//
// Pb_0 = %IDX               #0
// Pb_1 = %IDX + 4           #0 | add 4
// Pb_2 = %IDX/2 + 2         #1 | lshr 1
// Pb_3 = %IDX/2 + 2         #1 | GEP, step signext to i64
// Pb_4 = (%IDX/2)*16 + 32   #0 | GEP, multiply index by sizeof(4) for floats
// Pb_5 = (%IDX/2)*16 + 32   #0 | GEP, add offset of leading components
//
// Pb_5 - Pa_5 = 16          #0 | subtract to get the offset
//
// Remark: %PTR is not maintained within this class. So in this instance the
// offset of 16 can only be assumed if the pointers are equal.
//
class Polynomial {
  /// Operations on B
  enum BOps {
    LShr,
    Mul,
    SExt,
    Trunc,
  };

  /// Number of Error Bits e
  unsigned ErrorMSBs;

  /// Value
  Value *V;

  /// Coefficient B
  SmallVector<std::pair<BOps, APInt>, 4> B;

  /// Coefficient A
  APInt A;

public:
  Polynomial(Value *V) : ErrorMSBs((unsigned)-1), V(V), B(), A() {
    IntegerType *Ty = dyn_cast<IntegerType>(V->getType());
    if (Ty) {
      ErrorMSBs = 0;
      this->V = V;
      A = APInt(Ty->getBitWidth(), 0);
    }
  }

  Polynomial(const APInt &A, unsigned ErrorMSBs = 0)
      : ErrorMSBs(ErrorMSBs), V(nullptr), B(), A(A) {}

  Polynomial(unsigned BitWidth, uint64_t A, unsigned ErrorMSBs = 0)
      : ErrorMSBs(ErrorMSBs), V(nullptr), B(), A(BitWidth, A) {}

  Polynomial() : ErrorMSBs((unsigned)-1), V(nullptr), B(), A() {}

  /// Increment and clamp the number of undefined bits.
  void incErrorMSBs(unsigned amt) {
    if (ErrorMSBs == (unsigned)-1)
      return;

    ErrorMSBs += amt;
    if (ErrorMSBs > A.getBitWidth())
      ErrorMSBs = A.getBitWidth();
  }

  /// Decrement and clamp the number of undefined bits.
  void decErrorMSBs(unsigned amt) {
    if (ErrorMSBs == (unsigned)-1)
      return;

    if (ErrorMSBs > amt)
      ErrorMSBs -= amt;
    else
      ErrorMSBs = 0;
  }

  /// Apply an add on the polynomial
  Polynomial &add(const APInt &C) {
    // Note: Addition is associative in two's complement even in case of
    // signed overflow.
    //
    // Error bits can only propagate into higher significant bits. As these are
    // already regarded as undefined, there is no change.
    //
    // Theorem: Adding a constant to a polynomial does not change the error
    // term.
    //
    // Proof:
    //
    // Since the addition is associative and commutes:
    //
    //   (B + A + E*2^(n-e)) + C = B + (A + C) + E*2^(n-e)
    // [qed]

    if (C.getBitWidth() != A.getBitWidth()) {
      ErrorMSBs = (unsigned)-1;
      return *this;
    }

    A += C;
    return *this;
  }

  /// Apply a multiplication onto the polynomial.
  Polynomial &mul(const APInt &C) {
    // Note: Multiplication distributes over the addition
    //
    // Theorem: Multiplication distributes over the addition
    //
    // Proof(1):
    //
    //   (B+A)*C =
    //   = (B + A) + (B + A) + .. {C Times}
    //     addition is associative and commutes, hence
    //   = B + B + .. {C Times} .. + A + A + .. {C times}
    //   = B*C + A*C
    //   (see (function add) for signed values and overflows)
    // [qed]
    //
    // Theorem: If C has c trailing zeros, error bits in A or B are shifted out
    // to the left.
    //
    // Proof(2):
    //
    // Let B' and A' be the n-Bit inputs with some unknown errors EA,
    // EB at e leading bits. B' and A' can be written down as:
    //
    //   B' = B + 2^(n-e)*EB
    //   A' = A + 2^(n-e)*EA
    //
    // Let C' be an input with c trailing zero bits. C' can be written as
    //
    //   C' = C*2^c
    //
    // Therefore we can compute the result by using distributivity and
    // commutativity.
    //
    //   (B'*C' + A'*C') = [B + 2^(n-e)*EB] * C' + [A + 2^(n-e)*EA] * C' =
    //   = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' =
    //   = (B'+A') * C' =
    //   = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' =
    //   = [B + A + 2^(n-e)*EB + 2^(n-e)*EA] * C' =
    //   = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA] * C' =
    //   = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA] * C*2^c =
    //   = (B + A) * C' + C*(EB + EA)*2^(n-e)*2^c =
    //
    // Let EC be the final error with EC = C*(EB + EA)
    //
    //   = (B + A)*C' + EC*2^(n-e)*2^c =
    //   = (B + A)*C' + EC*2^(n-(e-c))
    //
    // Since EC is multiplied by 2^(n-(e-c)) the resulting error contains c
    // less error bits than the input. c bits are shifted out to the left.
    // [qed]

    if (C.getBitWidth() != A.getBitWidth()) {
      ErrorMSBs = (unsigned)-1;
      return *this;
    }

    // Multiplying by one is a no-op.
    if (C.isOne()) {
      return *this;
    }

    // Multiplying by zero removes the coefficient B and defines all bits.
    if (C.isZero()) {
      ErrorMSBs = 0;
      deleteB();
    }

    // See Proof(2): Trailing zero bits indicate a left shift. This removes
    // leading bits from the result even if they are undefined.
    decErrorMSBs(C.countTrailingZeros());

    A *= C;
    pushBOperation(Mul, C);
    return *this;
  }
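
  // A worked example of the error-bit bookkeeping above (illustrative, not
  // part of the original source): take n = 8 and a polynomial A = 3 with
  // e = 1 error MSB, i.e. the value is 3 + E*2^7. Multiplying by C = 4 (two
  // trailing zeros, c = 2) gives 12 + E*2^9, and modulo 2^8 the error term
  // vanishes; decErrorMSBs(2) accordingly clamps e from 1 down to 0.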

  /// Apply a logical shift right on the polynomial
  Polynomial &lshr(const APInt &C) {
    // Theorem(1): (B + A + E*2^(n-e)) >> 1 => (B >> 1) + (A >> 1) + E'*2^(n-e')
    // where
    //   e' = e + 1,
    //   E is an e-bit number,
    //   E' is an e'-bit number,
    // holds under the following precondition:
    //   pre(1): A % 2 = 0
    //   pre(2): e < n, (see Theorem(2) for the trivial case with e=n)
    // where >> expresses a logical shift to the right, with adding zeros.
    //
    // We need to show that for every E there is an E'
    //
    //   B = b_h * 2^(n-1) + b_m * 2 + b_l
    //   A = a_h * 2^(n-1) + a_m * 2          (pre(1))
    //
    // where a_h, b_h, b_l are single bits, and a_m, b_m are (n-2) bit numbers
    //
    // Let X = (B + A + E*2^(n-e)) >> 1
    // Let Y = (B >> 1) + (A >> 1) + E*2^(n-e) >> 1
    //
    //   X = [B + A + E*2^(n-e)] >> 1 =
    //     = [ b_h * 2^(n-1) + b_m * 2 + b_l +
    //       + a_h * 2^(n-1) + a_m * 2 +
    //       + E * 2^(n-e) ] >> 1 =
    //
    // The sum is built by putting the overflow of [a_m + b_m] into the term
    // 2^(n-1). As there are no more bits beyond 2^(n-1) the overflow within
    // this bit is discarded. This is expressed by % 2.
    //
    // The bit in position 0 cannot overflow into the term (b_m + a_m).
    //
    //     = [ ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-1) +
    //       + ((b_m + a_m) % 2^(n-2)) * 2 +
    //       + b_l + E * 2^(n-e) ] >> 1 =
    //
    // The shift is computed by dividing the terms by 2 and by cutting off
    // b_l.
    //
    //     = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
    //       + ((b_m + a_m) % 2^(n-2)) +
    //       + E * 2^(n-(e+1)) =
    //
    // by the definition in the Theorem e+1 = e'
    //
    //     = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
    //       + ((b_m + a_m) % 2^(n-2)) +
    //       + E * 2^(n-e') =
    //
    // Compute Y by applying distributivity first
    //
    //   Y = (B >> 1) + (A >> 1) + E*2^(n-e') =
    //     = (b_h * 2^(n-1) + b_m * 2 + b_l) >> 1 +
    //       + (a_h * 2^(n-1) + a_m * 2) >> 1 +
    //       + E * 2^(n-e) >> 1 =
    //
    // Again, the shift is computed by dividing the terms by 2 and by cutting
    // off b_l.
    //
    //     = b_h * 2^(n-2) + b_m +
    //       + a_h * 2^(n-2) + a_m +
    //       + E * 2^(n-(e+1)) =
    //
    // Again, the sum is built by putting the overflow of [a_m + b_m] into
    // the term 2^(n-1). But this time there is room for a second bit in the
    // term 2^(n-2), so we add this bit to a new term and denote it o_h in a
    // second step.
    //
    //     = ([b_h + a_h + (b_m + a_m) >> (n-2)] >> 1) * 2^(n-1) +
    //       + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
    //       + ((b_m + a_m) % 2^(n-2)) +
    //       + E * 2^(n-(e+1)) =
    //
    // Let o_h = [b_h + a_h + (b_m + a_m) >> (n-2)] >> 1
    // Further replace e+1 by e'.
    //
    //     = o_h * 2^(n-1) +
    //       + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
    //       + ((b_m + a_m) % 2^(n-2)) +
    //       + E * 2^(n-e') =
    //
    // Move o_h into the error term and construct E'. To ensure that there is
    // no 2^x with negative x, this step requires pre(2) (e < n).
    //
    //     = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
    //       + ((b_m + a_m) % 2^(n-2)) +
    //       + o_h * 2^(e'-1) * 2^(n-e') +       | pre(2), move 2^(e'-1)
    //                                           | out of the old exponent
    //       + E * 2^(n-e') =
    //     = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
    //       + ((b_m + a_m) % 2^(n-2)) +
    //       + [o_h * 2^(e'-1) + E] * 2^(n-e') + | move 2^(e'-1) out of
    //                                           | the old exponent
    //
    // Let E' = o_h * 2^(e'-1) + E
    //
    //     = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
    //       + ((b_m + a_m) % 2^(n-2)) +
    //       + E' * 2^(n-e')
    //
    // Because X and Y differ only in their error terms and E' can be
    // constructed as shown, the theorem holds.
    // [qed]
    //
    // For completeness, in the case e=n it is also required to show that
    // distributivity can be applied.
    //
    // In this case Theorem(1) transforms to (the precondition on A can also be
    // dropped)
    //
    // Theorem(2): (B + A + E) >> 1 => (B >> 1) + (A >> 1) + E'
    // where
    //   A, B, E, E' are two's complement numbers with the same bit
    //   width
    //
    // Let A + B + E = X
    // Let (B >> 1) + (A >> 1) = Y
    //
    // Therefore we need to show that for every X and Y there is an E' which
    // makes the equation
    //
    //   X = Y + E'
    //
    // hold. This is trivially the case for E' = X - Y.
    //
    // [qed]
    //
    // Remark: Distributing lshr with an arbitrary number n can be expressed as
    //   ((((B + A) lshr 1) lshr 1) ... ) {n times}.
    // This construction induces n additional error bits at the left.

    if (C.getBitWidth() != A.getBitWidth()) {
      ErrorMSBs = (unsigned)-1;
      return *this;
    }

    if (C.isZero())
      return *this;

    // Test if the result will be zero
    unsigned shiftAmt = C.getZExtValue();
    if (shiftAmt >= C.getBitWidth())
      return mul(APInt(C.getBitWidth(), 0));

    // The proof that shiftAmt LSBs are zero for at least one summand is only
    // possible for the constant number.
    //
    // If this can be proven add shiftAmt to the error counter
    // `ErrorMSBs`. Otherwise set all bits as undefined.
    if (A.countTrailingZeros() < shiftAmt)
      ErrorMSBs = A.getBitWidth();
    else
      incErrorMSBs(shiftAmt);

    // Apply the operation.
    pushBOperation(LShr, C);
    A = A.lshr(shiftAmt);

    return *this;
  }
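
  // A small numeric illustration (not from the original source): with n = 8,
  // A = 2, and e = 0, lshr by 1 yields A = 1 with e = 1; per Theorem(1) one
  // new error MSB pays for the unknown interaction with the symbolic term B.
  // With A = 1 instead, A.countTrailingZeros() = 0 < shiftAmt, so pre(1)
  // fails and the result is fully undefined (ErrorMSBs = 8).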

  /// Apply a sign-extend or truncate operation on the polynomial.
  Polynomial &sextOrTrunc(unsigned n) {
    if (n < A.getBitWidth()) {
      // Truncate: Clearly undefined Bits on the MSB side are removed
      // if there are any.
      decErrorMSBs(A.getBitWidth() - n);
      A = A.trunc(n);
      pushBOperation(Trunc, APInt(sizeof(n) * 8, n));
    }
    if (n > A.getBitWidth()) {
      // Extend: Clearly extending first and adding later is different
      // to adding first and extending later in all extended bits.
      incErrorMSBs(n - A.getBitWidth());
      A = A.sext(n);
      pushBOperation(SExt, APInt(sizeof(n) * 8, n));
    }

    return *this;
  }

  /// Test if there is a coefficient B.
  bool isFirstOrder() const { return V != nullptr; }

  /// Test if the coefficients B of two Polynomials are equal.
  bool isCompatibleTo(const Polynomial &o) const {
    // The polynomials use different bit widths.
    if (A.getBitWidth() != o.A.getBitWidth())
      return false;

    // If neither Polynomial has the Coefficient B.
    if (!isFirstOrder() && !o.isFirstOrder())
      return true;

    // The index variable is different.
    if (V != o.V)
      return false;

    // Check the operations.
    if (B.size() != o.B.size())
      return false;

    auto ob = o.B.begin();
    for (auto &b : B) {
      if (b != *ob)
        return false;
      ob++;
    }

    return true;
  }

  /// Subtract two polynomials, return an undefined polynomial if
  /// subtraction is not possible.
  Polynomial operator-(const Polynomial &o) const {
    // Return an undefined polynomial if incompatible.
    if (!isCompatibleTo(o))
      return Polynomial();

    // If the polynomials are compatible (meaning they have the same
    // coefficient on B), B is eliminated. Thus a polynomial solely
    // containing A is returned
    return Polynomial(A - o.A, std::max(ErrorMSBs, o.ErrorMSBs));
  }

  /// Subtract a constant from a polynomial.
  Polynomial operator-(uint64_t C) const {
    Polynomial Result(*this);
    Result.A -= C;
    return Result;
  }

  /// Add a constant to a polynomial.
  Polynomial operator+(uint64_t C) const {
    Polynomial Result(*this);
    Result.A += C;
    return Result;
  }

  /// Returns true if it can be proven that two Polynomials are equal.
  bool isProvenEqualTo(const Polynomial &o) {
    // Subtract both polynomials and test if the difference is fully defined
    // and zero.
    Polynomial r = *this - o;
    return (r.ErrorMSBs == 0) && (!r.isFirstOrder()) && (r.A.isZero());
  }
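
  // For illustration (hypothetical polynomials): if *this represents
  // (%IDX/2)*16 + 32 and o represents (%IDX/2)*16 + 16, the B parts cancel
  // in the subtraction, leaving r.A == 16, so the proof fails; against
  // o + 16 it succeeds. Any remaining error bits or a surviving coefficient
  // B make the result conservatively false.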

  /// Print the polynomial into a stream.
  void print(raw_ostream &OS) const {
    OS << "[{#ErrBits:" << ErrorMSBs << "} ";

    if (V) {
      for (auto b : B)
        OS << "(";
      OS << "(" << *V << ") ";

      for (auto b : B) {
        switch (b.first) {
        case LShr:
          OS << "LShr ";
          break;
        case Mul:
          OS << "Mul ";
          break;
        case SExt:
          OS << "SExt ";
          break;
        case Trunc:
          OS << "Trunc ";
          break;
        }

        OS << b.second << ") ";
      }
    }

    OS << "+ " << A << "]";
  }

private:
  void deleteB() {
    V = nullptr;
    B.clear();
  }

  void pushBOperation(const BOps Op, const APInt &C) {
    if (isFirstOrder()) {
      B.push_back(std::make_pair(Op, C));
      return;
    }
  }
};

#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const Polynomial &S) {
  S.print(OS);
  return OS;
}
#endif

/// VectorInfo stores the following abstract information for each vector
/// element:
///
/// 1) the memory address loaded into the element as a Polynomial,
/// 2) a set of load instructions necessary to construct the vector,
/// 3) a set of all other instructions that are necessary to create the
///    vector, and
/// 4) a pointer value that can be used as relative base for all elements.
struct VectorInfo {
private:
  VectorInfo(const VectorInfo &c) : VTy(c.VTy) {
    llvm_unreachable(
        "Copying VectorInfo is neither implemented nor necessary,");
  }

public:
  /// Information of a Vector Element
  struct ElementInfo {
    /// Offset Polynomial.
    Polynomial Ofs;

    /// The Load Instruction used to load the entry. LI is null if the pointer
    /// of the load instruction does not point to the entry
    LoadInst *LI;

    ElementInfo(Polynomial Offset = Polynomial(), LoadInst *LI = nullptr)
        : Ofs(Offset), LI(LI) {}
  };

  /// Basic-block the load instructions are within
  BasicBlock *BB;

  /// Pointer value of all participating load instructions
  Value *PV;

  /// Participating load instructions
  std::set<LoadInst *> LIs;

  /// Participating instructions
  std::set<Instruction *> Is;

  /// Final shuffle-vector instruction
  ShuffleVectorInst *SVI;

  /// Information of the offset for each vector element
  ElementInfo *EI;

  /// Vector Type
  FixedVectorType *const VTy;

  VectorInfo(FixedVectorType *VTy)
      : BB(nullptr), PV(nullptr), LIs(), Is(), SVI(nullptr), VTy(VTy) {
    EI = new ElementInfo[VTy->getNumElements()];
  }

  virtual ~VectorInfo() { delete[] EI; }

  unsigned getDimension() const { return VTy->getNumElements(); }

  /// Test if the VectorInfo can be part of an interleaved load with the
  /// specified factor.
  ///
  /// \param Factor of the interleave
  /// \param DL Target's DataLayout
  ///
  /// \returns true if this is possible and false if not
  bool isInterleaved(unsigned Factor, const DataLayout &DL) const {
    unsigned Size = DL.getTypeAllocSize(VTy->getElementType());
    for (unsigned i = 1; i < getDimension(); i++) {
      if (!EI[i].Ofs.isProvenEqualTo(EI[0].Ofs + i * Factor * Size)) {
        return false;
      }
    }
    return true;
  }
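
  // For illustration (hypothetical numbers): for a <4 x float> VectorInfo
  // (Size = 4) and Factor = 2, the elements must provably sit at
  // EI[0].Ofs + 8, + 16, and + 24, i.e. one factor-2 stride apart, leaving
  // the gaps for the other vector of the interleaved pair.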

  /// Recursively computes the vector information stored in V.
  ///
  /// This function delegates the work to specialized implementations
  ///
  /// \param V Value to operate on
  /// \param Result Result of the computation
  ///
  /// \returns false if no sensible information can be gathered.
  static bool compute(Value *V, VectorInfo &Result, const DataLayout &DL) {
    ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V);
    if (SVI)
      return computeFromSVI(SVI, Result, DL);
    LoadInst *LI = dyn_cast<LoadInst>(V);
    if (LI)
      return computeFromLI(LI, Result, DL);
    BitCastInst *BCI = dyn_cast<BitCastInst>(V);
    if (BCI)
      return computeFromBCI(BCI, Result, DL);
    return false;
  }

  /// BitCastInst specialization to compute the vector information.
  ///
  /// \param BCI BitCastInst to operate on
  /// \param Result Result of the computation
  ///
  /// \returns false if no sensible information can be gathered.
  static bool computeFromBCI(BitCastInst *BCI, VectorInfo &Result,
                             const DataLayout &DL) {
    Instruction *Op = dyn_cast<Instruction>(BCI->getOperand(0));

    if (!Op)
      return false;

    FixedVectorType *VTy = dyn_cast<FixedVectorType>(Op->getType());
    if (!VTy)
      return false;

    // We can only cast from vectors of larger elements to vectors of smaller
    // elements.
    if (Result.VTy->getNumElements() % VTy->getNumElements())
      return false;

    unsigned Factor = Result.VTy->getNumElements() / VTy->getNumElements();
    unsigned NewSize = DL.getTypeAllocSize(Result.VTy->getElementType());
    unsigned OldSize = DL.getTypeAllocSize(VTy->getElementType());

    if (NewSize * Factor != OldSize)
      return false;

    VectorInfo Old(VTy);
    if (!compute(Op, Old, DL))
      return false;

    for (unsigned i = 0; i < Result.VTy->getNumElements(); i += Factor) {
      for (unsigned j = 0; j < Factor; j++) {
        Result.EI[i + j] =
            ElementInfo(Old.EI[i / Factor].Ofs + j * NewSize,
                        j == 0 ? Old.EI[i / Factor].LI : nullptr);
      }
    }

    Result.BB = Old.BB;
    Result.PV = Old.PV;
    Result.LIs.insert(Old.LIs.begin(), Old.LIs.end());
    Result.Is.insert(Old.Is.begin(), Old.Is.end());
    Result.Is.insert(BCI);
    Result.SVI = nullptr;

    return true;
  }
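
  // For illustration (hypothetical types): a bitcast from <2 x i64> to
  // <4 x i32> gives Factor = 2, OldSize = 8, NewSize = 4. Each old element's
  // offset polynomial is split into two entries at +0 and +4 bytes, and only
  // the first sub-element inherits the originating LoadInst.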

  /// ShuffleVectorInst specialization to compute vector information.
  ///
  /// \param SVI ShuffleVectorInst to operate on
  /// \param Result Result of the computation
  ///
  /// Compute the left and the right side vector information and merge them by
  /// applying the shuffle operation. This function also ensures that the left
  /// and right side have compatible loads. This means that all loads are
  /// within the same basic block and are based on the same pointer.
  ///
  /// \returns false if no sensible information can be gathered.
  static bool computeFromSVI(ShuffleVectorInst *SVI, VectorInfo &Result,
                             const DataLayout &DL) {
    FixedVectorType *ArgTy =
        cast<FixedVectorType>(SVI->getOperand(0)->getType());

    // Compute the left hand vector information.
    VectorInfo LHS(ArgTy);
    if (!compute(SVI->getOperand(0), LHS, DL))
      LHS.BB = nullptr;

    // Compute the right hand vector information.
    VectorInfo RHS(ArgTy);
    if (!compute(SVI->getOperand(1), RHS, DL))
      RHS.BB = nullptr;

    // Neither operand produced sensible results?
    if (!LHS.BB && !RHS.BB)
      return false;
    // Only RHS produced sensible results?
    else if (!LHS.BB) {
      Result.BB = RHS.BB;
      Result.PV = RHS.PV;
    }
    // Only LHS produced sensible results?
    else if (!RHS.BB) {
      Result.BB = LHS.BB;
      Result.PV = LHS.PV;
    }
    // Both operands produced sensible results?
    else if ((LHS.BB == RHS.BB) && (LHS.PV == RHS.PV)) {
      Result.BB = LHS.BB;
      Result.PV = LHS.PV;
    }
    // Both operands produced sensible results but they are incompatible.
    else {
      return false;
    }

    // Merge and apply the operation on the offset information.
    if (LHS.BB) {
      Result.LIs.insert(LHS.LIs.begin(), LHS.LIs.end());
      Result.Is.insert(LHS.Is.begin(), LHS.Is.end());
    }
    if (RHS.BB) {
      Result.LIs.insert(RHS.LIs.begin(), RHS.LIs.end());
      Result.Is.insert(RHS.Is.begin(), RHS.Is.end());
    }
    Result.Is.insert(SVI);
    Result.SVI = SVI;

    int j = 0;
    for (int i : SVI->getShuffleMask()) {
      assert((i < 2 * (signed)ArgTy->getNumElements()) &&
             "Invalid ShuffleVectorInst (index out of bounds)");

      if (i < 0)
        Result.EI[j] = ElementInfo();
      else if (i < (signed)ArgTy->getNumElements()) {
        if (LHS.BB)
          Result.EI[j] = LHS.EI[i];
        else
          Result.EI[j] = ElementInfo();
      } else {
        if (RHS.BB)
          Result.EI[j] = RHS.EI[i - ArgTy->getNumElements()];
        else
          Result.EI[j] = ElementInfo();
      }
      j++;
    }

    return true;
  }
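
  // For illustration (hypothetical operands): shuffling two <4 x float>
  // values with mask <0, 4, 1, 5> interleaves their low halves; result
  // elements 0 and 2 copy LHS.EI[0] and LHS.EI[1], elements 1 and 3 copy
  // RHS.EI[0] and RHS.EI[1], and a mask entry of -1 (undef) produces an
  // empty ElementInfo.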

  /// LoadInst specialization to compute vector information.
  ///
  /// This function also acts as abort condition to the recursion.
  ///
  /// \param LI LoadInst to operate on
  /// \param Result Result of the computation
  ///
  /// \returns false if no sensible information can be gathered.
  static bool computeFromLI(LoadInst *LI, VectorInfo &Result,
                            const DataLayout &DL) {
    Value *BasePtr;
    Polynomial Offset;

    if (LI->isVolatile())
      return false;

    if (LI->isAtomic())
      return false;

    // Get the base polynomial
    computePolynomialFromPointer(*LI->getPointerOperand(), Offset, BasePtr, DL);

    Result.BB = LI->getParent();
    Result.PV = BasePtr;
    Result.LIs.insert(LI);
    Result.Is.insert(LI);

    for (unsigned i = 0; i < Result.getDimension(); i++) {
      Value *Idx[2] = {
          ConstantInt::get(Type::getInt32Ty(LI->getContext()), 0),
          ConstantInt::get(Type::getInt32Ty(LI->getContext()), i),
      };
      int64_t Ofs = DL.getIndexedOffsetInType(Result.VTy, makeArrayRef(Idx, 2));
      Result.EI[i] = ElementInfo(Offset + Ofs, i == 0 ? LI : nullptr);
    }

    return true;
  }
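
  // For illustration (hypothetical input): for "load <4 x float>,
  // <4 x float>* %p" with %p decomposing into BasePtr + Offset, the element
  // polynomials become Offset + 0, + 4, + 8, and + 12, and only EI[0]
  // records the LoadInst itself.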

  /// Recursively compute polynomial of a value.
  ///
  /// \param BO Input binary operation
  /// \param Result Result polynomial
  static void computePolynomialBinOp(BinaryOperator &BO, Polynomial &Result) {
    Value *LHS = BO.getOperand(0);
    Value *RHS = BO.getOperand(1);

    // Find the RHS Constant if any
    ConstantInt *C = dyn_cast<ConstantInt>(RHS);
    if ((!C) && BO.isCommutative()) {
      C = dyn_cast<ConstantInt>(LHS);
      if (C)
        std::swap(LHS, RHS);
    }

    switch (BO.getOpcode()) {
    case Instruction::Add:
      if (!C)
        break;

      computePolynomial(*LHS, Result);
      Result.add(C->getValue());
      return;

    case Instruction::LShr:
      if (!C)
        break;

      computePolynomial(*LHS, Result);
      Result.lshr(C->getValue());
      return;

    default:
      break;
    }

    Result = Polynomial(&BO);
  }

  /// Recursively compute polynomial of a value
  ///
  /// \param V input value
  /// \param Result result polynomial
  static void computePolynomial(Value &V, Polynomial &Result) {
    if (auto *BO = dyn_cast<BinaryOperator>(&V))
      computePolynomialBinOp(*BO, Result);
    else
      Result = Polynomial(&V);
  }

  /// Compute the Polynomial representation of a Pointer type.
  ///
  /// \param Ptr input pointer value
  /// \param Result result polynomial
  /// \param BasePtr pointer the polynomial is based on
  /// \param DL DataLayout of the target machine
  static void computePolynomialFromPointer(Value &Ptr, Polynomial &Result,
                                           Value *&BasePtr,
                                           const DataLayout &DL) {
    // Not a pointer type? Return an undefined polynomial
    PointerType *PtrTy = dyn_cast<PointerType>(Ptr.getType());
    if (!PtrTy) {
      Result = Polynomial();
      BasePtr = nullptr;
      return;
    }
    unsigned PointerBits =
        DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace());

    // Skip pointer casts. Return a zero polynomial otherwise.
    if (isa<CastInst>(&Ptr)) {
      CastInst &CI = *cast<CastInst>(&Ptr);
      switch (CI.getOpcode()) {
      case Instruction::BitCast:
        computePolynomialFromPointer(*CI.getOperand(0), Result, BasePtr, DL);
        break;
      default:
        BasePtr = &Ptr;
        Result = Polynomial(PointerBits, 0);
        break;
      }
    }
    // Resolve GetElementPtrInst.
    else if (isa<GetElementPtrInst>(&Ptr)) {
      GetElementPtrInst &GEP = *cast<GetElementPtrInst>(&Ptr);

      APInt BaseOffset(PointerBits, 0);

      // Check if we can compute the Offset with accumulateConstantOffset
      if (GEP.accumulateConstantOffset(DL, BaseOffset)) {
        Result = Polynomial(BaseOffset);
        BasePtr = GEP.getPointerOperand();
        return;
      } else {
        // Otherwise we allow that the last index operand of the GEP is
        // non-constant.
        unsigned idxOperand, e;
        SmallVector<Value *, 4> Indices;
        for (idxOperand = 1, e = GEP.getNumOperands(); idxOperand < e;
             idxOperand++) {
          ConstantInt *IDX = dyn_cast<ConstantInt>(GEP.getOperand(idxOperand));
          if (!IDX)
            break;
          Indices.push_back(IDX);
        }

        // It must also be the last operand.
        if (idxOperand + 1 != e) {
          Result = Polynomial();
          BasePtr = nullptr;
          return;
        }

        // Compute the polynomial of the index operand.
        computePolynomial(*GEP.getOperand(idxOperand), Result);

        // Compute base offset from zero based index, excluding the last
        // variable operand.
        BaseOffset =
            DL.getIndexedOffsetInType(GEP.getSourceElementType(), Indices);

        // Apply the operations of GEP to the polynomial.
        unsigned ResultSize = DL.getTypeAllocSize(GEP.getResultElementType());
        Result.sextOrTrunc(PointerBits);
        Result.mul(APInt(PointerBits, ResultSize));
        Result.add(BaseOffset);
        BasePtr = GEP.getPointerOperand();
      }
    }
    // All other instructions are handled by using the value as base pointer
    // and a zero polynomial.
    else {
      BasePtr = &Ptr;
      Result = Polynomial(PointerBits, 0);
    }
  }

#ifndef NDEBUG
  void print(raw_ostream &OS) const {
    if (PV)
      OS << *PV;
    else
      OS << "(none)";
    OS << " + ";
    for (unsigned i = 0; i < getDimension(); i++)
      OS << ((i == 0) ? "[" : ", ") << EI[i].Ofs;
    OS << "]";
  }
#endif
};

} // anonymous namespace

bool InterleavedLoadCombineImpl::findPattern(
    std::list<VectorInfo> &Candidates, std::list<VectorInfo> &InterleavedLoad,
    unsigned Factor, const DataLayout &DL) {
  for (auto C0 = Candidates.begin(), E0 = Candidates.end(); C0 != E0; ++C0) {
    unsigned i;
    // Try to find an interleaved load using the candidate C0 as first line.
    unsigned Size = DL.getTypeAllocSize(C0->VTy->getElementType());

    // List containing iterators pointing to the VectorInfos of the candidates
    std::vector<std::list<VectorInfo>::iterator> Res(Factor, Candidates.end());

    for (auto C = Candidates.begin(), E = Candidates.end(); C != E; C++) {
      if (C->VTy != C0->VTy)
        continue;
      if (C->BB != C0->BB)
        continue;
      if (C->PV != C0->PV)
        continue;

      // Check if the current value matches any of the Factor - 1 remaining
      // lines.
      for (i = 1; i < Factor; i++) {
        if (C->EI[0].Ofs.isProvenEqualTo(C0->EI[0].Ofs + i * Size)) {
          Res[i] = C;
        }
      }

      for (i = 1; i < Factor; i++) {
        if (Res[i] == Candidates.end())
          break;
      }
      if (i == Factor) {
        Res[0] = C0;
        break;
      }
    }

    if (Res[0] != Candidates.end()) {
      // Move the result into the output
      for (unsigned i = 0; i < Factor; i++) {
        InterleavedLoad.splice(InterleavedLoad.end(), Candidates, Res[i]);
      }

      return true;
    }
  }
  return false;
}
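
// For illustration (hypothetical candidates): with Factor = 2 and <4 x float>
// candidates (Size = 4), a candidate C completes the pattern started by C0
// when C->EI[0].Ofs is provably C0->EI[0].Ofs + 4, i.e. C reads the elements
// sitting between those of C0 in one factor-2 interleaved region. Matching
// candidates are spliced into InterleavedLoad in line order.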

LoadInst *
InterleavedLoadCombineImpl::findFirstLoad(const std::set<LoadInst *> &LIs) {
  assert(!LIs.empty() && "No load instructions given.");

  // All LIs are within the same BB. Select the first for a reference.
  BasicBlock *BB = (*LIs.begin())->getParent();
  BasicBlock::iterator FLI = llvm::find_if(
      *BB, [&LIs](Instruction &I) -> bool { return is_contained(LIs, &I); });
  assert(FLI != BB->end());

  return cast<LoadInst>(FLI);
}

bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
                                         OptimizationRemarkEmitter &ORE) {
  LLVM_DEBUG(dbgs() << "Checking interleaved load\n");

  // The insertion point is the LoadInst which loads the first values. The
  // following tests are used to prove that the combined load can be inserted
  // just before InsertionPoint.
  LoadInst *InsertionPoint = InterleavedLoad.front().EI[0].LI;

  // Test if the offset is computed
  if (!InsertionPoint)
    return false;

  std::set<LoadInst *> LIs;
  std::set<Instruction *> Is;
  std::set<Instruction *> SVIs;

  InstructionCost InterleavedCost;
  InstructionCost InstructionCost = 0;
  const TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_SizeAndLatency;

  // Get the interleave factor
  unsigned Factor = InterleavedLoad.size();

  // Merge all input sets used in analysis
  for (auto &VI : InterleavedLoad) {
    // Generate a set of all load instructions to be combined
    LIs.insert(VI.LIs.begin(), VI.LIs.end());

    // Generate a set of all instructions taking part in load
    // interleaved. This list excludes the instructions necessary for the
    // polynomial construction.
    Is.insert(VI.Is.begin(), VI.Is.end());

    // Generate the set of the final ShuffleVectorInst.
    SVIs.insert(VI.SVI);
  }

  // There is nothing to combine.
  if (LIs.size() < 2)
    return false;

  // Test if all participating instructions will be dead after the
  // transformation. If intermediate results are used, no performance gain can
  // be expected. Also sum the cost of the instructions being left dead.
  for (auto &I : Is) {
    // Compute the old cost
    InstructionCost += TTI.getInstructionCost(I, CostKind);

    // The final SVIs are allowed not to be dead, all uses will be replaced
    if (SVIs.find(I) != SVIs.end())
      continue;

    // If there are users outside the set to be eliminated, we abort the
    // transformation. No gain can be expected.
    for (auto *U : I->users()) {
      if (Is.find(dyn_cast<Instruction>(U)) == Is.end())
        return false;
    }
  }

  // We need to have a valid cost in order to proceed.
  if (!InstructionCost.isValid())
    return false;

  // We know that all LoadInst are within the same BB. This guarantees that
  // either everything or nothing is loaded.
  LoadInst *First = findFirstLoad(LIs);

  // To be safe that the loads can be combined, iterate over all loads and test
  // that the corresponding defining access dominates first LI. This guarantees
  // that there are no aliasing stores in between the loads.
  auto FMA = MSSA.getMemoryAccess(First);
  for (auto LI : LIs) {
    auto MADef = MSSA.getMemoryAccess(LI)->getDefiningAccess();
    if (!MSSA.dominates(MADef, FMA))
      return false;
  }
  assert(!LIs.empty() && "There are no LoadInst to combine");

  // It is necessary that the insertion point dominates all final
  // ShuffleVectorInst.
  for (auto &VI : InterleavedLoad) {
    if (!DT.dominates(InsertionPoint, VI.SVI))
      return false;
  }

  // All checks are done. Add instructions detectable by InterleavedAccessPass.
  // The old instructions are left dead.
  IRBuilder<> Builder(InsertionPoint);
  Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType();
  unsigned ElementsPerSVI =
      cast<FixedVectorType>(InterleavedLoad.front().SVI->getType())
          ->getNumElements();
  FixedVectorType *ILTy = FixedVectorType::get(ETy, Factor * ElementsPerSVI);

  SmallVector<unsigned, 4> Indices;
  for (unsigned i = 0; i < Factor; i++)
    Indices.push_back(i);
  InterleavedCost = TTI.getInterleavedMemoryOpCost(
      Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlign(),
      InsertionPoint->getPointerAddressSpace(), CostKind);

  if (InterleavedCost >= InstructionCost) {
    return false;
  }

  // Create a pointer cast for the wide load.
  auto CI = Builder.CreatePointerCast(InsertionPoint->getOperand(0),
                                      ILTy->getPointerTo(),
                                      "interleaved.wide.ptrcast");

  // Create the wide load and update the MemorySSA.
  auto LI = Builder.CreateAlignedLoad(ILTy, CI, InsertionPoint->getAlign(),
                                      "interleaved.wide.load");
  auto MSSAU = MemorySSAUpdater(&MSSA);
  MemoryUse *MSSALoad = cast<MemoryUse>(MSSAU.createMemoryAccessBefore(
      LI, nullptr, MSSA.getMemoryAccess(InsertionPoint)));
  MSSAU.insertUse(MSSALoad);

  // Create the final SVIs and replace all uses.
  int i = 0;
  for (auto &VI : InterleavedLoad) {
    SmallVector<int, 4> Mask;
    for (unsigned j = 0; j < ElementsPerSVI; j++)
      Mask.push_back(i + j * Factor);

    Builder.SetInsertPoint(VI.SVI);
    auto SVI = Builder.CreateShuffleVector(LI, Mask, "interleaved.shuffle");
    VI.SVI->replaceAllUsesWith(SVI);
    i++;
  }

  NumInterleavedLoadCombine++;
  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "Combined Interleaved Load", LI)
           << "Load interleaved combined with factor "
           << ore::NV("Factor", Factor);
  });

  return true;
}
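
// For illustration, a sketch (with hypothetical values) of what combine()
// emits for a factor-2 interleave of two <4 x float> shuffles:
//
//   %interleaved.wide.ptrcast = bitcast <4 x float>* %ptr to <8 x float>*
//   %interleaved.wide.load = load <8 x float>,
//                                 <8 x float>* %interleaved.wide.ptrcast
//   %interleaved.shuffle = shufflevector <8 x float> %interleaved.wide.load,
//       <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
//   %interleaved.shuffle1 = shufflevector <8 x float> %interleaved.wide.load,
//       <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
//
// This is exactly the strided-shuffle shape that InterleavedAccessPass
// recognizes and lowers to a target interleaved-load operation.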

bool InterleavedLoadCombineImpl::run() {
  OptimizationRemarkEmitter ORE(&F);
  bool changed = false;
  unsigned MaxFactor = TLI.getMaxSupportedInterleaveFactor();

  auto &DL = F.getParent()->getDataLayout();

  // Start with the highest factor to avoid combining and recombining.
  for (unsigned Factor = MaxFactor; Factor >= 2; Factor--) {
    std::list<VectorInfo> Candidates;

    for (BasicBlock &BB : F) {
      for (Instruction &I : BB) {
        if (auto SVI = dyn_cast<ShuffleVectorInst>(&I)) {
          // We don't support scalable vectors in this pass.
          if (isa<ScalableVectorType>(SVI->getType()))
            continue;

          Candidates.emplace_back(cast<FixedVectorType>(SVI->getType()));

          if (!VectorInfo::computeFromSVI(SVI, Candidates.back(), DL)) {
            Candidates.pop_back();
            continue;
          }

          if (!Candidates.back().isInterleaved(Factor, DL)) {
            Candidates.pop_back();
          }
        }
      }
    }

    std::list<VectorInfo> InterleavedLoad;
    while (findPattern(Candidates, InterleavedLoad, Factor, DL)) {
      if (combine(InterleavedLoad, ORE)) {
        changed = true;
      } else {
        // Remove the first element of the Interleaved Load but put the others
        // back on the list and continue searching
        Candidates.splice(Candidates.begin(), InterleavedLoad,
                          std::next(InterleavedLoad.begin()),
                          InterleavedLoad.end());
      }
      InterleavedLoad.clear();
    }
  }

  return changed;
}

namespace {
/// This pass combines interleaved loads into a pattern detectable by
/// InterleavedAccessPass.
struct InterleavedLoadCombine : public FunctionPass {
  static char ID;

  InterleavedLoadCombine() : FunctionPass(ID) {
    initializeInterleavedLoadCombinePass(*PassRegistry::getPassRegistry());
  }

  StringRef getPassName() const override {
    return "Interleaved Load Combine Pass";
  }

  bool runOnFunction(Function &F) override {
    if (DisableInterleavedLoadCombine)
      return false;

    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      return false;

    LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName()
                      << "\n");

    return InterleavedLoadCombineImpl(
               F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
               getAnalysis<MemorySSAWrapperPass>().getMSSA(),
               TPC->getTM<TargetMachine>())
        .run();
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MemorySSAWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    FunctionPass::getAnalysisUsage(AU);
  }

private:
};
} // anonymous namespace

char InterleavedLoadCombine::ID = 0;

INITIALIZE_PASS_BEGIN(
    InterleavedLoadCombine, DEBUG_TYPE,
    "Combine interleaved loads into wide loads and shufflevector instructions",
    false, false)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(
    InterleavedLoadCombine, DEBUG_TYPE,
    "Combine interleaved loads into wide loads and shufflevector instructions",
    false, false)

FunctionPass *
llvm::createInterleavedLoadCombinePass() {
  auto P = new InterleavedLoadCombine();
  return P;
}