LLVM  6.0.0svn
HexagonLoopIdiomRecognition.cpp
Go to the documentation of this file.
1 //===- HexagonLoopIdiomRecognition.cpp ------------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 
10 #define DEBUG_TYPE "hexagon-lir"
11 
12 #include "llvm/ADT/APInt.h"
13 #include "llvm/ADT/DenseMap.h"
14 #include "llvm/ADT/SetVector.h"
15 #include "llvm/ADT/SmallPtrSet.h"
16 #include "llvm/ADT/SmallSet.h"
17 #include "llvm/ADT/SmallVector.h"
18 #include "llvm/ADT/StringRef.h"
19 #include "llvm/ADT/Triple.h"
22 #include "llvm/Analysis/LoopInfo.h"
23 #include "llvm/Analysis/LoopPass.h"
30 #include "llvm/IR/Attributes.h"
31 #include "llvm/IR/BasicBlock.h"
32 #include "llvm/IR/Constant.h"
33 #include "llvm/IR/Constants.h"
34 #include "llvm/IR/DataLayout.h"
35 #include "llvm/IR/DebugLoc.h"
36 #include "llvm/IR/DerivedTypes.h"
37 #include "llvm/IR/Dominators.h"
38 #include "llvm/IR/Function.h"
39 #include "llvm/IR/IRBuilder.h"
40 #include "llvm/IR/InstrTypes.h"
41 #include "llvm/IR/Instruction.h"
42 #include "llvm/IR/Instructions.h"
43 #include "llvm/IR/IntrinsicInst.h"
44 #include "llvm/IR/Intrinsics.h"
45 #include "llvm/IR/Module.h"
46 #include "llvm/IR/PatternMatch.h"
47 #include "llvm/IR/Type.h"
48 #include "llvm/IR/User.h"
49 #include "llvm/IR/Value.h"
50 #include "llvm/Pass.h"
51 #include "llvm/Support/Casting.h"
53 #include "llvm/Support/Compiler.h"
54 #include "llvm/Support/Debug.h"
56 #include "llvm/Support/KnownBits.h"
58 #include "llvm/Transforms/Scalar.h"
60 #include <algorithm>
61 #include <array>
62 #include <cassert>
63 #include <cstdint>
64 #include <cstdlib>
65 #include <deque>
66 #include <functional>
67 #include <iterator>
68 #include <map>
69 #include <set>
70 #include <utility>
71 #include <vector>
72 
73 using namespace llvm;
74 
75 static cl::opt<bool> DisableMemcpyIdiom("disable-memcpy-idiom",
76  cl::Hidden, cl::init(false),
77  cl::desc("Disable generation of memcpy in loop idiom recognition"));
78 
79 static cl::opt<bool> DisableMemmoveIdiom("disable-memmove-idiom",
80  cl::Hidden, cl::init(false),
81  cl::desc("Disable generation of memmove in loop idiom recognition"));
82 
83 static cl::opt<unsigned> RuntimeMemSizeThreshold("runtime-mem-idiom-threshold",
84  cl::Hidden, cl::init(0), cl::desc("Threshold (in bytes) for the runtime "
85  "check guarding the memmove."));
86 
88  "compile-time-mem-idiom-threshold", cl::Hidden, cl::init(64),
89  cl::desc("Threshold (in bytes) to perform the transformation, if the "
90  "runtime loop count (mem transfer size) is known at compile-time."));
91 
92 static cl::opt<bool> OnlyNonNestedMemmove("only-nonnested-memmove-idiom",
93  cl::Hidden, cl::init(true),
94  cl::desc("Only enable generating memmove in non-nested loops"));
95 
96 cl::opt<bool> HexagonVolatileMemcpy("disable-hexagon-volatile-memcpy",
97  cl::Hidden, cl::init(false),
98  cl::desc("Enable Hexagon-specific memcpy for volatile destination."));
99 
100 static cl::opt<unsigned> SimplifyLimit("hlir-simplify-limit", cl::init(10000),
101  cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR"));
102 
103 static const char *HexagonVolatileMemcpyName
104  = "hexagon_memcpy_forward_vp4cp4n2";
105 
106 
107 namespace llvm {
108 
111 
112 } // end namespace llvm
113 
114 namespace {
115 
116  class HexagonLoopIdiomRecognize : public LoopPass {
117  public:
118  static char ID;
119 
120  explicit HexagonLoopIdiomRecognize() : LoopPass(ID) {
122  }
123 
124  StringRef getPassName() const override {
125  return "Recognize Hexagon-specific loop idioms";
126  }
127 
128  void getAnalysisUsage(AnalysisUsage &AU) const override {
138  }
139 
140  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
141 
142  private:
143  unsigned getStoreSizeInBytes(StoreInst *SI);
144  int getSCEVStride(const SCEVAddRecExpr *StoreEv);
145  bool isLegalStore(Loop *CurLoop, StoreInst *SI);
146  void collectStores(Loop *CurLoop, BasicBlock *BB,
148  bool processCopyingStore(Loop *CurLoop, StoreInst *SI, const SCEV *BECount);
149  bool coverLoop(Loop *L, SmallVectorImpl<Instruction*> &Insts) const;
150  bool runOnLoopBlock(Loop *CurLoop, BasicBlock *BB, const SCEV *BECount,
151  SmallVectorImpl<BasicBlock*> &ExitBlocks);
152  bool runOnCountableLoop(Loop *L);
153 
154  AliasAnalysis *AA;
155  const DataLayout *DL;
156  DominatorTree *DT;
157  LoopInfo *LF;
158  const TargetLibraryInfo *TLI;
159  ScalarEvolution *SE;
160  bool HasMemcpy, HasMemmove;
161  };
162 
163  struct Simplifier {
164  using Rule = std::function<Value * (Instruction *, LLVMContext &)>;
165 
166  void addRule(const Rule &R) { Rules.push_back(R); }
167 
168  private:
169  struct WorkListType {
170  WorkListType() = default;
171 
172  void push_back(Value* V) {
173  // Do not push back duplicates.
174  if (!S.count(V)) { Q.push_back(V); S.insert(V); }
175  }
176 
177  Value *pop_front_val() {
178  Value *V = Q.front(); Q.pop_front(); S.erase(V);
179  return V;
180  }
181 
182  bool empty() const { return Q.empty(); }
183 
184  private:
185  std::deque<Value*> Q;
186  std::set<Value*> S;
187  };
188 
189  using ValueSetType = std::set<Value *>;
190 
191  std::vector<Rule> Rules;
192 
193  public:
194  struct Context {
195  using ValueMapType = DenseMap<Value *, Value *>;
196 
197  Value *Root;
198  ValueSetType Used; // The set of all cloned values used by Root.
199  ValueSetType Clones; // The set of all cloned values.
200  LLVMContext &Ctx;
201 
202  Context(Instruction *Exp)
203  : Ctx(Exp->getParent()->getParent()->getContext()) {
204  initialize(Exp);
205  }
206 
207  ~Context() { cleanup(); }
208 
209  void print(raw_ostream &OS, const Value *V) const;
210  Value *materialize(BasicBlock *B, BasicBlock::iterator At);
211 
212  private:
213  friend struct Simplifier;
214 
215  void initialize(Instruction *Exp);
216  void cleanup();
217 
218  template <typename FuncT> void traverse(Value *V, FuncT F);
219  void record(Value *V);
220  void use(Value *V);
221  void unuse(Value *V);
222 
223  bool equal(const Instruction *I, const Instruction *J) const;
224  Value *find(Value *Tree, Value *Sub) const;
225  Value *subst(Value *Tree, Value *OldV, Value *NewV);
226  void replace(Value *OldV, Value *NewV);
227  void link(Instruction *I, BasicBlock *B, BasicBlock::iterator At);
228  };
229 
230  Value *simplify(Context &C);
231  };
232 
233  struct PE {
234  PE(const Simplifier::Context &c, Value *v = nullptr) : C(c), V(v) {}
235 
236  const Simplifier::Context &C;
237  const Value *V;
238  };
239 
241  raw_ostream &operator<< (raw_ostream &OS, const PE &P) {
242  P.C.print(OS, P.V ? P.V : P.C.Root);
243  return OS;
244  }
245 
246 } // end anonymous namespace
247 
249 
250 INITIALIZE_PASS_BEGIN(HexagonLoopIdiomRecognize, "hexagon-loop-idiom",
251  "Recognize Hexagon-specific loop idioms", false, false)
253 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
254 INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
259 INITIALIZE_PASS_END(HexagonLoopIdiomRecognize, "hexagon-loop-idiom",
260  "Recognize Hexagon-specific loop idioms", false, false)
261 
262 template <typename FuncT>
263 void Simplifier::Context::traverse(Value *V, FuncT F) {
264  WorkListType Q;
265  Q.push_back(V);
266 
267  while (!Q.empty()) {
268  Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
269  if (!U || U->getParent())
270  continue;
271  if (!F(U))
272  continue;
273  for (Value *Op : U->operands())
274  Q.push_back(Op);
275  }
276 }
277 
278 void Simplifier::Context::print(raw_ostream &OS, const Value *V) const {
279  const auto *U = dyn_cast<const Instruction>(V);
280  if (!U) {
281  OS << V << '(' << *V << ')';
282  return;
283  }
284 
285  if (U->getParent()) {
286  OS << U << '(';
287  U->printAsOperand(OS, true);
288  OS << ')';
289  return;
290  }
291 
292  unsigned N = U->getNumOperands();
293  if (N != 0)
294  OS << U << '(';
295  OS << U->getOpcodeName();
296  for (const Value *Op : U->operands()) {
297  OS << ' ';
298  print(OS, Op);
299  }
300  if (N != 0)
301  OS << ')';
302 }
303 
305  // Perform a deep clone of the expression, set Root to the root
306  // of the clone, and build a map from the cloned values to the
307  // original ones.
308  ValueMapType M;
309  BasicBlock *Block = Exp->getParent();
310  WorkListType Q;
311  Q.push_back(Exp);
312 
313  while (!Q.empty()) {
314  Value *V = Q.pop_front_val();
315  if (M.find(V) != M.end())
316  continue;
317  if (Instruction *U = dyn_cast<Instruction>(V)) {
318  if (isa<PHINode>(U) || U->getParent() != Block)
319  continue;
320  for (Value *Op : U->operands())
321  Q.push_back(Op);
322  M.insert({U, U->clone()});
323  }
324  }
325 
326  for (std::pair<Value*,Value*> P : M) {
327  Instruction *U = cast<Instruction>(P.second);
328  for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
329  auto F = M.find(U->getOperand(i));
330  if (F != M.end())
331  U->setOperand(i, F->second);
332  }
333  }
334 
335  auto R = M.find(Exp);
336  assert(R != M.end());
337  Root = R->second;
338 
339  record(Root);
340  use(Root);
341 }
342 
343 void Simplifier::Context::record(Value *V) {
344  auto Record = [this](Instruction *U) -> bool {
345  Clones.insert(U);
346  return true;
347  };
348  traverse(V, Record);
349 }
350 
352  auto Use = [this](Instruction *U) -> bool {
353  Used.insert(U);
354  return true;
355  };
356  traverse(V, Use);
357 }
358 
359 void Simplifier::Context::unuse(Value *V) {
360  if (!isa<Instruction>(V) || cast<Instruction>(V)->getParent() != nullptr)
361  return;
362 
363  auto Unuse = [this](Instruction *U) -> bool {
364  if (!U->use_empty())
365  return false;
366  Used.erase(U);
367  return true;
368  };
369  traverse(V, Unuse);
370 }
371 
372 Value *Simplifier::Context::subst(Value *Tree, Value *OldV, Value *NewV) {
373  if (Tree == OldV)
374  return NewV;
375  if (OldV == NewV)
376  return Tree;
377 
378  WorkListType Q;
379  Q.push_back(Tree);
380  while (!Q.empty()) {
381  Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
382  // If U is not an instruction, or it's not a clone, skip it.
383  if (!U || U->getParent())
384  continue;
385  for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
386  Value *Op = U->getOperand(i);
387  if (Op == OldV) {
388  U->setOperand(i, NewV);
389  unuse(OldV);
390  } else {
391  Q.push_back(Op);
392  }
393  }
394  }
395  return Tree;
396 }
397 
398 void Simplifier::Context::replace(Value *OldV, Value *NewV) {
399  if (Root == OldV) {
400  Root = NewV;
401  use(Root);
402  return;
403  }
404 
405  // NewV may be a complex tree that has just been created by one of the
406  // transformation rules. We need to make sure that it is commoned with
407  // the existing Root to the maximum extent possible.
408  // Identify all subtrees of NewV (including NewV itself) that have
409  // equivalent counterparts in Root, and replace those subtrees with
410  // these counterparts.
411  WorkListType Q;
412  Q.push_back(NewV);
413  while (!Q.empty()) {
414  Value *V = Q.pop_front_val();
416  if (!U || U->getParent())
417  continue;
418  if (Value *DupV = find(Root, V)) {
419  if (DupV != V)
420  NewV = subst(NewV, V, DupV);
421  } else {
422  for (Value *Op : U->operands())
423  Q.push_back(Op);
424  }
425  }
426 
427  // Now, simply replace OldV with NewV in Root.
428  Root = subst(Root, OldV, NewV);
429  use(Root);
430 }
431 
433  for (Value *V : Clones) {
434  Instruction *U = cast<Instruction>(V);
435  if (!U->getParent())
436  U->dropAllReferences();
437  }
438 
439  for (Value *V : Clones) {
440  Instruction *U = cast<Instruction>(V);
441  if (!U->getParent())
442  U->deleteValue();
443  }
444 }
445 
447  const Instruction *J) const {
448  if (I == J)
449  return true;
450  if (!I->isSameOperationAs(J))
451  return false;
452  if (isa<PHINode>(I))
453  return I->isIdenticalTo(J);
454 
455  for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) {
456  Value *OpI = I->getOperand(i), *OpJ = J->getOperand(i);
457  if (OpI == OpJ)
458  continue;
459  auto *InI = dyn_cast<const Instruction>(OpI);
460  auto *InJ = dyn_cast<const Instruction>(OpJ);
461  if (InI && InJ) {
462  if (!equal(InI, InJ))
463  return false;
464  } else if (InI != InJ || !InI)
465  return false;
466  }
467  return true;
468 }
469 
470 Value *Simplifier::Context::find(Value *Tree, Value *Sub) const {
471  Instruction *SubI = dyn_cast<Instruction>(Sub);
472  WorkListType Q;
473  Q.push_back(Tree);
474 
475  while (!Q.empty()) {
476  Value *V = Q.pop_front_val();
477  if (V == Sub)
478  return V;
480  if (!U || U->getParent())
481  continue;
482  if (SubI && equal(SubI, U))
483  return U;
484  assert(!isa<PHINode>(U));
485  for (Value *Op : U->operands())
486  Q.push_back(Op);
487  }
488  return nullptr;
489 }
490 
491 void Simplifier::Context::link(Instruction *I, BasicBlock *B,
493  if (I->getParent())
494  return;
495 
496  for (Value *Op : I->operands()) {
497  if (Instruction *OpI = dyn_cast<Instruction>(Op))
498  link(OpI, B, At);
499  }
500 
501  B->getInstList().insert(At, I);
502 }
503 
504 Value *Simplifier::Context::materialize(BasicBlock *B,
506  if (Instruction *RootI = dyn_cast<Instruction>(Root))
507  link(RootI, B, At);
508  return Root;
509 }
510 
512  WorkListType Q;
513  Q.push_back(C.Root);
514  unsigned Count = 0;
515  const unsigned Limit = SimplifyLimit;
516 
517  while (!Q.empty()) {
518  if (Count++ >= Limit)
519  break;
520  Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
521  if (!U || U->getParent() || !C.Used.count(U))
522  continue;
523  bool Changed = false;
524  for (Rule &R : Rules) {
525  Value *W = R(U, C.Ctx);
526  if (!W)
527  continue;
528  Changed = true;
529  C.record(W);
530  C.replace(U, W);
531  Q.push_back(C.Root);
532  break;
533  }
534  if (!Changed) {
535  for (Value *Op : U->operands())
536  Q.push_back(Op);
537  }
538  }
539  return Count < Limit ? C.Root : nullptr;
540 }
541 
542 //===----------------------------------------------------------------------===//
543 //
544 // Implementation of PolynomialMultiplyRecognize
545 //
546 //===----------------------------------------------------------------------===//
547 
548 namespace {
549 
550  class PolynomialMultiplyRecognize {
551  public:
552  explicit PolynomialMultiplyRecognize(Loop *loop, const DataLayout &dl,
553  const DominatorTree &dt, const TargetLibraryInfo &tli,
554  ScalarEvolution &se)
555  : CurLoop(loop), DL(dl), DT(dt), TLI(tli), SE(se) {}
556 
557  bool recognize();
558 
559  private:
560  using ValueSeq = SetVector<Value *>;
561 
562  IntegerType *getPmpyType() const {
563  LLVMContext &Ctx = CurLoop->getHeader()->getParent()->getContext();
564  return IntegerType::get(Ctx, 32);
565  }
566 
567  bool isPromotableTo(Value *V, IntegerType *Ty);
568  void promoteTo(Instruction *In, IntegerType *DestTy, BasicBlock *LoopB);
569  bool promoteTypes(BasicBlock *LoopB, BasicBlock *ExitB);
570 
571  Value *getCountIV(BasicBlock *BB);
572  bool findCycle(Value *Out, Value *In, ValueSeq &Cycle);
573  void classifyCycle(Instruction *DivI, ValueSeq &Cycle, ValueSeq &Early,
574  ValueSeq &Late);
575  bool classifyInst(Instruction *UseI, ValueSeq &Early, ValueSeq &Late);
576  bool commutesWithShift(Instruction *I);
577  bool highBitsAreZero(Value *V, unsigned IterCount);
578  bool keepsHighBitsZero(Value *V, unsigned IterCount);
579  bool isOperandShifted(Instruction *I, Value *Op);
580  bool convertShiftsToLeft(BasicBlock *LoopB, BasicBlock *ExitB,
581  unsigned IterCount);
582  void cleanupLoopBody(BasicBlock *LoopB);
583 
584  struct ParsedValues {
585  ParsedValues() = default;
586 
587  Value *M = nullptr;
588  Value *P = nullptr;
589  Value *Q = nullptr;
590  Value *R = nullptr;
591  Value *X = nullptr;
592  Instruction *Res = nullptr;
593  unsigned IterCount = 0;
594  bool Left = false;
595  bool Inv = false;
596  };
597 
598  bool matchLeftShift(SelectInst *SelI, Value *CIV, ParsedValues &PV);
599  bool matchRightShift(SelectInst *SelI, ParsedValues &PV);
600  bool scanSelect(SelectInst *SI, BasicBlock *LoopB, BasicBlock *PrehB,
601  Value *CIV, ParsedValues &PV, bool PreScan);
602  unsigned getInverseMxN(unsigned QP);
603  Value *generate(BasicBlock::iterator At, ParsedValues &PV);
604 
605  void setupSimplifier();
606 
607  Simplifier Simp;
608  Loop *CurLoop;
609  const DataLayout &DL;
610  const DominatorTree &DT;
611  const TargetLibraryInfo &TLI;
612  ScalarEvolution &SE;
613  };
614 
615 } // end anonymous namespace
616 
617 Value *PolynomialMultiplyRecognize::getCountIV(BasicBlock *BB) {
618  pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
619  if (std::distance(PI, PE) != 2)
620  return nullptr;
621  BasicBlock *PB = (*PI == BB) ? *std::next(PI) : *PI;
622 
623  for (auto I = BB->begin(), E = BB->end(); I != E && isa<PHINode>(I); ++I) {
624  auto *PN = cast<PHINode>(I);
625  Value *InitV = PN->getIncomingValueForBlock(PB);
626  if (!isa<ConstantInt>(InitV) || !cast<ConstantInt>(InitV)->isZero())
627  continue;
628  Value *IterV = PN->getIncomingValueForBlock(BB);
629  if (!isa<BinaryOperator>(IterV))
630  continue;
631  auto *BO = dyn_cast<BinaryOperator>(IterV);
632  if (BO->getOpcode() != Instruction::Add)
633  continue;
634  Value *IncV = nullptr;
635  if (BO->getOperand(0) == PN)
636  IncV = BO->getOperand(1);
637  else if (BO->getOperand(1) == PN)
638  IncV = BO->getOperand(0);
639  if (IncV == nullptr)
640  continue;
641 
642  if (auto *T = dyn_cast<ConstantInt>(IncV))
643  if (T->getZExtValue() == 1)
644  return PN;
645  }
646  return nullptr;
647 }
648 
649 static void replaceAllUsesOfWithIn(Value *I, Value *J, BasicBlock *BB) {
650  for (auto UI = I->user_begin(), UE = I->user_end(); UI != UE;) {
651  Use &TheUse = UI.getUse();
652  ++UI;
653  if (auto *II = dyn_cast<Instruction>(TheUse.getUser()))
654  if (BB == II->getParent())
655  II->replaceUsesOfWith(I, J);
656  }
657 }
658 
659 bool PolynomialMultiplyRecognize::matchLeftShift(SelectInst *SelI,
660  Value *CIV, ParsedValues &PV) {
661  // Match the following:
662  // select (X & (1 << i)) != 0 ? R ^ (Q << i) : R
663  // select (X & (1 << i)) == 0 ? R : R ^ (Q << i)
664  // The condition may also check for equality with the masked value, i.e
665  // select (X & (1 << i)) == (1 << i) ? R ^ (Q << i) : R
666  // select (X & (1 << i)) != (1 << i) ? R : R ^ (Q << i);
667 
668  Value *CondV = SelI->getCondition();
669  Value *TrueV = SelI->getTrueValue();
670  Value *FalseV = SelI->getFalseValue();
671 
672  using namespace PatternMatch;
673 
675  Value *A = nullptr, *B = nullptr, *C = nullptr;
676 
677  if (!match(CondV, m_ICmp(P, m_And(m_Value(A), m_Value(B)), m_Value(C))) &&
678  !match(CondV, m_ICmp(P, m_Value(C), m_And(m_Value(A), m_Value(B)))))
679  return false;
680  if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
681  return false;
682  // Matched: select (A & B) == C ? ... : ...
683  // select (A & B) != C ? ... : ...
684 
685  Value *X = nullptr, *Sh1 = nullptr;
686  // Check (A & B) for (X & (1 << i)):
687  if (match(A, m_Shl(m_One(), m_Specific(CIV)))) {
688  Sh1 = A;
689  X = B;
690  } else if (match(B, m_Shl(m_One(), m_Specific(CIV)))) {
691  Sh1 = B;
692  X = A;
693  } else {
694  // TODO: Could also check for an induction variable containing single
695  // bit shifted left by 1 in each iteration.
696  return false;
697  }
698 
699  bool TrueIfZero;
700 
701  // Check C against the possible values for comparison: 0 and (1 << i):
702  if (match(C, m_Zero()))
703  TrueIfZero = (P == CmpInst::ICMP_EQ);
704  else if (C == Sh1)
705  TrueIfZero = (P == CmpInst::ICMP_NE);
706  else
707  return false;
708 
709  // So far, matched:
710  // select (X & (1 << i)) ? ... : ...
711  // including variations of the check against zero/non-zero value.
712 
713  Value *ShouldSameV = nullptr, *ShouldXoredV = nullptr;
714  if (TrueIfZero) {
715  ShouldSameV = TrueV;
716  ShouldXoredV = FalseV;
717  } else {
718  ShouldSameV = FalseV;
719  ShouldXoredV = TrueV;
720  }
721 
722  Value *Q = nullptr, *R = nullptr, *Y = nullptr, *Z = nullptr;
723  Value *T = nullptr;
724  if (match(ShouldXoredV, m_Xor(m_Value(Y), m_Value(Z)))) {
725  // Matched: select +++ ? ... : Y ^ Z
726  // select +++ ? Y ^ Z : ...
727  // where +++ denotes previously checked matches.
728  if (ShouldSameV == Y)
729  T = Z;
730  else if (ShouldSameV == Z)
731  T = Y;
732  else
733  return false;
734  R = ShouldSameV;
735  // Matched: select +++ ? R : R ^ T
736  // select +++ ? R ^ T : R
737  // depending on TrueIfZero.
738 
739  } else if (match(ShouldSameV, m_Zero())) {
740  // Matched: select +++ ? 0 : ...
741  // select +++ ? ... : 0
742  if (!SelI->hasOneUse())
743  return false;
744  T = ShouldXoredV;
745  // Matched: select +++ ? 0 : T
746  // select +++ ? T : 0
747 
748  Value *U = *SelI->user_begin();
749  if (!match(U, m_Xor(m_Specific(SelI), m_Value(R))) &&
750  !match(U, m_Xor(m_Value(R), m_Specific(SelI))))
751  return false;
752  // Matched: xor (select +++ ? 0 : T), R
753  // xor (select +++ ? T : 0), R
754  } else
755  return false;
756 
757  // The xor input value T is isolated into its own match so that it could
758  // be checked against an induction variable containing a shifted bit
759  // (todo).
760  // For now, check against (Q << i).
761  if (!match(T, m_Shl(m_Value(Q), m_Specific(CIV))) &&
762  !match(T, m_Shl(m_ZExt(m_Value(Q)), m_ZExt(m_Specific(CIV)))))
763  return false;
764  // Matched: select +++ ? R : R ^ (Q << i)
765  // select +++ ? R ^ (Q << i) : R
766 
767  PV.X = X;
768  PV.Q = Q;
769  PV.R = R;
770  PV.Left = true;
771  return true;
772 }
773 
774 bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI,
775  ParsedValues &PV) {
776  // Match the following:
777  // select (X & 1) != 0 ? (R >> 1) ^ Q : (R >> 1)
778  // select (X & 1) == 0 ? (R >> 1) : (R >> 1) ^ Q
779  // The condition may also check for equality with the masked value, i.e
780  // select (X & 1) == 1 ? (R >> 1) ^ Q : (R >> 1)
781  // select (X & 1) != 1 ? (R >> 1) : (R >> 1) ^ Q
782 
783  Value *CondV = SelI->getCondition();
784  Value *TrueV = SelI->getTrueValue();
785  Value *FalseV = SelI->getFalseValue();
786 
787  using namespace PatternMatch;
788 
789  Value *C = nullptr;
791  bool TrueIfZero;
792 
793  if (match(CondV, m_ICmp(P, m_Value(C), m_Zero())) ||
794  match(CondV, m_ICmp(P, m_Zero(), m_Value(C)))) {
795  if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
796  return false;
797  // Matched: select C == 0 ? ... : ...
798  // select C != 0 ? ... : ...
799  TrueIfZero = (P == CmpInst::ICMP_EQ);
800  } else if (match(CondV, m_ICmp(P, m_Value(C), m_One())) ||
801  match(CondV, m_ICmp(P, m_One(), m_Value(C)))) {
802  if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
803  return false;
804  // Matched: select C == 1 ? ... : ...
805  // select C != 1 ? ... : ...
806  TrueIfZero = (P == CmpInst::ICMP_NE);
807  } else
808  return false;
809 
810  Value *X = nullptr;
811  if (!match(C, m_And(m_Value(X), m_One())) &&
812  !match(C, m_And(m_One(), m_Value(X))))
813  return false;
814  // Matched: select (X & 1) == +++ ? ... : ...
815  // select (X & 1) != +++ ? ... : ...
816 
817  Value *R = nullptr, *Q = nullptr;
818  if (TrueIfZero) {
819  // The select's condition is true if the tested bit is 0.
820  // TrueV must be the shift, FalseV must be the xor.
821  if (!match(TrueV, m_LShr(m_Value(R), m_One())))
822  return false;
823  // Matched: select +++ ? (R >> 1) : ...
824  if (!match(FalseV, m_Xor(m_Specific(TrueV), m_Value(Q))) &&
825  !match(FalseV, m_Xor(m_Value(Q), m_Specific(TrueV))))
826  return false;
827  // Matched: select +++ ? (R >> 1) : (R >> 1) ^ Q
828  // with commuting ^.
829  } else {
830  // The select's condition is true if the tested bit is 1.
831  // TrueV must be the xor, FalseV must be the shift.
832  if (!match(FalseV, m_LShr(m_Value(R), m_One())))
833  return false;
834  // Matched: select +++ ? ... : (R >> 1)
835  if (!match(TrueV, m_Xor(m_Specific(FalseV), m_Value(Q))) &&
836  !match(TrueV, m_Xor(m_Value(Q), m_Specific(FalseV))))
837  return false;
838  // Matched: select +++ ? (R >> 1) ^ Q : (R >> 1)
839  // with commuting ^.
840  }
841 
842  PV.X = X;
843  PV.Q = Q;
844  PV.R = R;
845  PV.Left = false;
846  return true;
847 }
848 
849 bool PolynomialMultiplyRecognize::scanSelect(SelectInst *SelI,
850  BasicBlock *LoopB, BasicBlock *PrehB, Value *CIV, ParsedValues &PV,
851  bool PreScan) {
852  using namespace PatternMatch;
853 
854  // The basic pattern for R = P.Q is:
855  // for i = 0..31
856  // R = phi (0, R')
857  // if (P & (1 << i)) ; test-bit(P, i)
858  // R' = R ^ (Q << i)
859  //
860  // Similarly, the basic pattern for R = (P/Q).Q - P
861  // for i = 0..31
862  // R = phi(P, R')
863  // if (R & (1 << i))
864  // R' = R ^ (Q << i)
865 
866  // There exist idioms, where instead of Q being shifted left, P is shifted
867  // right. This produces a result that is shifted right by 32 bits (the
868  // non-shifted result is 64-bit).
869  //
870  // For R = P.Q, this would be:
871  // for i = 0..31
872  // R = phi (0, R')
873  // if ((P >> i) & 1)
874  // R' = (R >> 1) ^ Q ; R is cycled through the loop, so it must
875  // else ; be shifted by 1, not i.
876  // R' = R >> 1
877  //
878  // And for the inverse:
879  // for i = 0..31
880  // R = phi (P, R')
881  // if (R & 1)
882  // R' = (R >> 1) ^ Q
883  // else
884  // R' = R >> 1
885 
886  // The left-shifting idioms share the same pattern:
887  // select (X & (1 << i)) ? R ^ (Q << i) : R
888  // Similarly for right-shifting idioms:
889  // select (X & 1) ? (R >> 1) ^ Q
890 
891  if (matchLeftShift(SelI, CIV, PV)) {
892  // If this is a pre-scan, getting this far is sufficient.
893  if (PreScan)
894  return true;
895 
896  // Need to make sure that the SelI goes back into R.
897  auto *RPhi = dyn_cast<PHINode>(PV.R);
898  if (!RPhi)
899  return false;
900  if (SelI != RPhi->getIncomingValueForBlock(LoopB))
901  return false;
902  PV.Res = SelI;
903 
904  // If X is loop invariant, it must be the input polynomial, and the
905  // idiom is the basic polynomial multiply.
906  if (CurLoop->isLoopInvariant(PV.X)) {
907  PV.P = PV.X;
908  PV.Inv = false;
909  } else {
910  // X is not loop invariant. If X == R, this is the inverse pmpy.
911  // Otherwise, check for an xor with an invariant value. If the
912  // variable argument to the xor is R, then this is still a valid
913  // inverse pmpy.
914  PV.Inv = true;
915  if (PV.X != PV.R) {
916  Value *Var = nullptr, *Inv = nullptr, *X1 = nullptr, *X2 = nullptr;
917  if (!match(PV.X, m_Xor(m_Value(X1), m_Value(X2))))
918  return false;
919  auto *I1 = dyn_cast<Instruction>(X1);
920  auto *I2 = dyn_cast<Instruction>(X2);
921  if (!I1 || I1->getParent() != LoopB) {
922  Var = X2;
923  Inv = X1;
924  } else if (!I2 || I2->getParent() != LoopB) {
925  Var = X1;
926  Inv = X2;
927  } else
928  return false;
929  if (Var != PV.R)
930  return false;
931  PV.M = Inv;
932  }
933  // The input polynomial P still needs to be determined. It will be
934  // the entry value of R.
935  Value *EntryP = RPhi->getIncomingValueForBlock(PrehB);
936  PV.P = EntryP;
937  }
938 
939  return true;
940  }
941 
942  if (matchRightShift(SelI, PV)) {
943  // If this is an inverse pattern, the Q polynomial must be known at
944  // compile time.
945  if (PV.Inv && !isa<ConstantInt>(PV.Q))
946  return false;
947  if (PreScan)
948  return true;
949  // There is no exact matching of right-shift pmpy.
950  return false;
951  }
952 
953  return false;
954 }
955 
956 bool PolynomialMultiplyRecognize::isPromotableTo(Value *Val,
957  IntegerType *DestTy) {
959  if (!T || T->getBitWidth() > DestTy->getBitWidth())
960  return false;
961  if (T->getBitWidth() == DestTy->getBitWidth())
962  return true;
963  // Non-instructions are promotable. The reason why an instruction may not
964  // be promotable is that it may produce a different result if its operands
965  // and the result are promoted, for example, it may produce more non-zero
966  // bits. While it would still be possible to represent the proper result
967  // in a wider type, it may require adding additional instructions (which
968  // we don't want to do).
970  if (!In)
971  return true;
972  // The bitwidth of the source type is smaller than the destination.
973  // Check if the individual operation can be promoted.
974  switch (In->getOpcode()) {
975  case Instruction::PHI:
976  case Instruction::ZExt:
977  case Instruction::And:
978  case Instruction::Or:
979  case Instruction::Xor:
980  case Instruction::LShr: // Shift right is ok.
981  case Instruction::Select:
982  return true;
983  case Instruction::ICmp:
984  if (CmpInst *CI = cast<CmpInst>(In))
985  return CI->isEquality() || CI->isUnsigned();
986  llvm_unreachable("Cast failed unexpectedly");
987  case Instruction::Add:
988  return In->hasNoSignedWrap() && In->hasNoUnsignedWrap();
989  }
990  return false;
991 }
992 
993 void PolynomialMultiplyRecognize::promoteTo(Instruction *In,
994  IntegerType *DestTy, BasicBlock *LoopB) {
995  // Leave boolean values alone.
996  if (!In->getType()->isIntegerTy(1))
997  In->mutateType(DestTy);
998  unsigned DestBW = DestTy->getBitWidth();
999 
1000  // Handle PHIs.
1001  if (PHINode *P = dyn_cast<PHINode>(In)) {
1002  unsigned N = P->getNumIncomingValues();
1003  for (unsigned i = 0; i != N; ++i) {
1004  BasicBlock *InB = P->getIncomingBlock(i);
1005  if (InB == LoopB)
1006  continue;
1007  Value *InV = P->getIncomingValue(i);
1008  IntegerType *Ty = cast<IntegerType>(InV->getType());
1009  // Do not promote values in PHI nodes of type i1.
1010  if (Ty != P->getType()) {
1011  // If the value type does not match the PHI type, the PHI type
1012  // must have been promoted.
1013  assert(Ty->getBitWidth() < DestBW);
1014  InV = IRBuilder<>(InB->getTerminator()).CreateZExt(InV, DestTy);
1015  P->setIncomingValue(i, InV);
1016  }
1017  }
1018  } else if (ZExtInst *Z = dyn_cast<ZExtInst>(In)) {
1019  Value *Op = Z->getOperand(0);
1020  if (Op->getType() == Z->getType())
1021  Z->replaceAllUsesWith(Op);
1022  Z->eraseFromParent();
1023  return;
1024  }
1025 
1026  // Promote immediates.
1027  for (unsigned i = 0, n = In->getNumOperands(); i != n; ++i) {
1028  if (ConstantInt *CI = dyn_cast<ConstantInt>(In->getOperand(i)))
1029  if (CI->getType()->getBitWidth() < DestBW)
1030  In->setOperand(i, ConstantInt::get(DestTy, CI->getZExtValue()));
1031  }
1032 }
1033 
1034 bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock *LoopB,
1035  BasicBlock *ExitB) {
1036  assert(LoopB);
1037  // Skip loops where the exit block has more than one predecessor. The values
1038  // coming from the loop block will be promoted to another type, and so the
1039  // values coming into the exit block from other predecessors would also have
1040  // to be promoted.
1041  if (!ExitB || (ExitB->getSinglePredecessor() != LoopB))
1042  return false;
1043  IntegerType *DestTy = getPmpyType();
1044  // Check if the exit values have types that are no wider than the type
1045  // that we want to promote to.
1046  unsigned DestBW = DestTy->getBitWidth();
1047  for (Instruction &In : *ExitB) {
1048  PHINode *P = dyn_cast<PHINode>(&In);
1049  if (!P)
1050  break;
1051  if (P->getNumIncomingValues() != 1)
1052  return false;
1053  assert(P->getIncomingBlock(0) == LoopB);
1055  if (!T || T->getBitWidth() > DestBW)
1056  return false;
1057  }
1058 
1059  // Check all instructions in the loop.
1060  for (Instruction &In : *LoopB)
1061  if (!In.isTerminator() && !isPromotableTo(&In, DestTy))
1062  return false;
1063 
1064  // Perform the promotion.
1065  std::vector<Instruction*> LoopIns;
1066  std::transform(LoopB->begin(), LoopB->end(), std::back_inserter(LoopIns),
1067  [](Instruction &In) { return &In; });
1068  for (Instruction *In : LoopIns)
1069  promoteTo(In, DestTy, LoopB);
1070 
1071  // Fix up the PHI nodes in the exit block.
1072  Instruction *EndI = ExitB->getFirstNonPHI();
1073  BasicBlock::iterator End = EndI ? EndI->getIterator() : ExitB->end();
1074  for (auto I = ExitB->begin(); I != End; ++I) {
1075  PHINode *P = dyn_cast<PHINode>(I);
1076  if (!P)
1077  break;
1078  Type *Ty0 = P->getIncomingValue(0)->getType();
1079  Type *PTy = P->getType();
1080  if (PTy != Ty0) {
1081  assert(Ty0 == DestTy);
1082  // In order to create the trunc, P must have the promoted type.
1083  P->mutateType(Ty0);
1084  Value *T = IRBuilder<>(ExitB, End).CreateTrunc(P, PTy);
1085  // In order for the RAUW to work, the types of P and T must match.
1086  P->mutateType(PTy);
1087  P->replaceAllUsesWith(T);
1088  // Final update of the P's type.
1089  P->mutateType(Ty0);
1090  cast<Instruction>(T)->setOperand(0, P);
1091  }
1092  }
1093 
1094  return true;
1095 }
1096 
1097 bool PolynomialMultiplyRecognize::findCycle(Value *Out, Value *In,
1098  ValueSeq &Cycle) {
1099  // Out = ..., In, ...
1100  if (Out == In)
1101  return true;
1102 
1103  auto *BB = cast<Instruction>(Out)->getParent();
1104  bool HadPhi = false;
1105 
1106  for (auto U : Out->users()) {
1107  auto *I = dyn_cast<Instruction>(&*U);
1108  if (I == nullptr || I->getParent() != BB)
1109  continue;
1110  // Make sure that there are no multi-iteration cycles, e.g.
1111  // p1 = phi(p2)
1112  // p2 = phi(p1)
1113  // The cycle p1->p2->p1 would span two loop iterations.
1114  // Check that there is only one phi in the cycle.
1115  bool IsPhi = isa<PHINode>(I);
1116  if (IsPhi && HadPhi)
1117  return false;
1118  HadPhi |= IsPhi;
1119  if (Cycle.count(I))
1120  return false;
1121  Cycle.insert(I);
1122  if (findCycle(I, In, Cycle))
1123  break;
1124  Cycle.remove(I);
1125  }
1126  return !Cycle.empty();
1127 }
1128 
1129 void PolynomialMultiplyRecognize::classifyCycle(Instruction *DivI,
1130  ValueSeq &Cycle, ValueSeq &Early, ValueSeq &Late) {
1131  // All the values in the cycle that are between the phi node and the
1132  // divider instruction will be classified as "early", all other values
1133  // will be "late".
1134 
1135  bool IsE = true;
1136  unsigned I, N = Cycle.size();
1137  for (I = 0; I < N; ++I) {
1138  Value *V = Cycle[I];
1139  if (DivI == V)
1140  IsE = false;
1141  else if (!isa<PHINode>(V))
1142  continue;
1143  // Stop if found either.
1144  break;
1145  }
1146  // "I" is the index of either DivI or the phi node, whichever was first.
1147  // "E" is "false" or "true" respectively.
1148  ValueSeq &First = !IsE ? Early : Late;
1149  for (unsigned J = 0; J < I; ++J)
1150  First.insert(Cycle[J]);
1151 
1152  ValueSeq &Second = IsE ? Early : Late;
1153  Second.insert(Cycle[I]);
1154  for (++I; I < N; ++I) {
1155  Value *V = Cycle[I];
1156  if (DivI == V || isa<PHINode>(V))
1157  break;
1158  Second.insert(V);
1159  }
1160 
1161  for (; I < N; ++I)
1162  First.insert(Cycle[I]);
1163 }
1164 
1165 bool PolynomialMultiplyRecognize::classifyInst(Instruction *UseI,
1166  ValueSeq &Early, ValueSeq &Late) {
1167  // Select is an exception, since the condition value does not have to be
1168  // classified in the same way as the true/false values. The true/false
1169  // values do have to be both early or both late.
1170  if (UseI->getOpcode() == Instruction::Select) {
1171  Value *TV = UseI->getOperand(1), *FV = UseI->getOperand(2);
1172  if (Early.count(TV) || Early.count(FV)) {
1173  if (Late.count(TV) || Late.count(FV))
1174  return false;
1175  Early.insert(UseI);
1176  } else if (Late.count(TV) || Late.count(FV)) {
1177  if (Early.count(TV) || Early.count(FV))
1178  return false;
1179  Late.insert(UseI);
1180  }
1181  return true;
1182  }
1183 
1184  // Not sure what would be the example of this, but the code below relies
1185  // on having at least one operand.
1186  if (UseI->getNumOperands() == 0)
1187  return true;
1188 
1189  bool AE = true, AL = true;
1190  for (auto &I : UseI->operands()) {
1191  if (Early.count(&*I))
1192  AL = false;
1193  else if (Late.count(&*I))
1194  AE = false;
1195  }
1196  // If the operands appear "all early" and "all late" at the same time,
1197  // then it means that none of them are actually classified as either.
1198  // This is harmless.
1199  if (AE && AL)
1200  return true;
1201  // Conversely, if they are neither "all early" nor "all late", then
1202  // we have a mixture of early and late operands that is not a known
1203  // exception.
1204  if (!AE && !AL)
1205  return false;
1206 
1207  // Check that we have covered the two special cases.
1208  assert(AE != AL);
1209 
1210  if (AE)
1211  Early.insert(UseI);
1212  else
1213  Late.insert(UseI);
1214  return true;
1215 }
1216 
1217 bool PolynomialMultiplyRecognize::commutesWithShift(Instruction *I) {
1218  switch (I->getOpcode()) {
1219  case Instruction::And:
1220  case Instruction::Or:
1221  case Instruction::Xor:
1222  case Instruction::LShr:
1223  case Instruction::Shl:
1224  case Instruction::Select:
1225  case Instruction::ICmp:
1226  case Instruction::PHI:
1227  break;
1228  default:
1229  return false;
1230  }
1231  return true;
1232 }
1233 
1234 bool PolynomialMultiplyRecognize::highBitsAreZero(Value *V,
1235  unsigned IterCount) {
1236  auto *T = dyn_cast<IntegerType>(V->getType());
1237  if (!T)
1238  return false;
1239 
1240  KnownBits Known(T->getBitWidth());
1241  computeKnownBits(V, Known, DL);
1242  return Known.countMinLeadingZeros() >= IterCount;
1243 }
1244 
1245 bool PolynomialMultiplyRecognize::keepsHighBitsZero(Value *V,
1246  unsigned IterCount) {
1247  // Assume that all inputs to the value have the high bits zero.
1248  // Check if the value itself preserves the zeros in the high bits.
1249  if (auto *C = dyn_cast<ConstantInt>(V))
1250  return C->getValue().countLeadingZeros() >= IterCount;
1251 
1252  if (auto *I = dyn_cast<Instruction>(V)) {
1253  switch (I->getOpcode()) {
1254  case Instruction::And:
1255  case Instruction::Or:
1256  case Instruction::Xor:
1257  case Instruction::LShr:
1258  case Instruction::Select:
1259  case Instruction::ICmp:
1260  case Instruction::PHI:
1261  case Instruction::ZExt:
1262  return true;
1263  }
1264  }
1265 
1266  return false;
1267 }
1268 
1269 bool PolynomialMultiplyRecognize::isOperandShifted(Instruction *I, Value *Op) {
1270  unsigned Opc = I->getOpcode();
1271  if (Opc == Instruction::Shl || Opc == Instruction::LShr)
1272  return Op != I->getOperand(1);
1273  return true;
1274 }
1275 
1276 bool PolynomialMultiplyRecognize::convertShiftsToLeft(BasicBlock *LoopB,
1277  BasicBlock *ExitB, unsigned IterCount) {
1278  Value *CIV = getCountIV(LoopB);
1279  if (CIV == nullptr)
1280  return false;
1281  auto *CIVTy = dyn_cast<IntegerType>(CIV->getType());
1282  if (CIVTy == nullptr)
1283  return false;
1284 
1285  ValueSeq RShifts;
1286  ValueSeq Early, Late, Cycled;
1287 
1288  // Find all value cycles that contain logical right shifts by 1.
1289  for (Instruction &I : *LoopB) {
1290  using namespace PatternMatch;
1291 
1292  Value *V = nullptr;
1293  if (!match(&I, m_LShr(m_Value(V), m_One())))
1294  continue;
1295  ValueSeq C;
1296  if (!findCycle(&I, V, C))
1297  continue;
1298 
1299  // Found a cycle.
1300  C.insert(&I);
1301  classifyCycle(&I, C, Early, Late);
1302  Cycled.insert(C.begin(), C.end());
1303  RShifts.insert(&I);
1304  }
1305 
1306  // Find the set of all values affected by the shift cycles, i.e. all
1307  // cycled values, and (recursively) all their users.
1308  ValueSeq Users(Cycled.begin(), Cycled.end());
1309  for (unsigned i = 0; i < Users.size(); ++i) {
1310  Value *V = Users[i];
1311  if (!isa<IntegerType>(V->getType()))
1312  return false;
1313  auto *R = cast<Instruction>(V);
1314  // If the instruction does not commute with shifts, the loop cannot
1315  // be unshifted.
1316  if (!commutesWithShift(R))
1317  return false;
1318  for (auto I = R->user_begin(), E = R->user_end(); I != E; ++I) {
1319  auto *T = cast<Instruction>(*I);
1320  // Skip users from outside of the loop. They will be handled later.
1321  // Also, skip the right-shifts and phi nodes, since they mix early
1322  // and late values.
1323  if (T->getParent() != LoopB || RShifts.count(T) || isa<PHINode>(T))
1324  continue;
1325 
1326  Users.insert(T);
1327  if (!classifyInst(T, Early, Late))
1328  return false;
1329  }
1330  }
1331 
1332  if (Users.empty())
1333  return false;
1334 
1335  // Verify that high bits remain zero.
1336  ValueSeq Internal(Users.begin(), Users.end());
1337  ValueSeq Inputs;
1338  for (unsigned i = 0; i < Internal.size(); ++i) {
1339  auto *R = dyn_cast<Instruction>(Internal[i]);
1340  if (!R)
1341  continue;
1342  for (Value *Op : R->operands()) {
1343  auto *T = dyn_cast<Instruction>(Op);
1344  if (T && T->getParent() != LoopB)
1345  Inputs.insert(Op);
1346  else
1347  Internal.insert(Op);
1348  }
1349  }
1350  for (Value *V : Inputs)
1351  if (!highBitsAreZero(V, IterCount))
1352  return false;
1353  for (Value *V : Internal)
1354  if (!keepsHighBitsZero(V, IterCount))
1355  return false;
1356 
1357  // Finally, the work can be done. Unshift each user.
1358  IRBuilder<> IRB(LoopB);
1359  std::map<Value*,Value*> ShiftMap;
1360 
1361  using CastMapType = std::map<std::pair<Value *, Type *>, Value *>;
1362 
1363  CastMapType CastMap;
1364 
1365  auto upcast = [] (CastMapType &CM, IRBuilder<> &IRB, Value *V,
1366  IntegerType *Ty) -> Value* {
1367  auto H = CM.find(std::make_pair(V, Ty));
1368  if (H != CM.end())
1369  return H->second;
1370  Value *CV = IRB.CreateIntCast(V, Ty, false);
1371  CM.insert(std::make_pair(std::make_pair(V, Ty), CV));
1372  return CV;
1373  };
1374 
1375  for (auto I = LoopB->begin(), E = LoopB->end(); I != E; ++I) {
1376  using namespace PatternMatch;
1377 
1378  if (isa<PHINode>(I) || !Users.count(&*I))
1379  continue;
1380 
1381  // Match lshr x, 1.
1382  Value *V = nullptr;
1383  if (match(&*I, m_LShr(m_Value(V), m_One()))) {
1384  replaceAllUsesOfWithIn(&*I, V, LoopB);
1385  continue;
1386  }
1387  // For each non-cycled operand, replace it with the corresponding
1388  // value shifted left.
1389  for (auto &J : I->operands()) {
1390  Value *Op = J.get();
1391  if (!isOperandShifted(&*I, Op))
1392  continue;
1393  if (Users.count(Op))
1394  continue;
1395  // Skip shifting zeros.
1396  if (isa<ConstantInt>(Op) && cast<ConstantInt>(Op)->isZero())
1397  continue;
1398  // Check if we have already generated a shift for this value.
1399  auto F = ShiftMap.find(Op);
1400  Value *W = (F != ShiftMap.end()) ? F->second : nullptr;
1401  if (W == nullptr) {
1402  IRB.SetInsertPoint(&*I);
1403  // First, the shift amount will be CIV or CIV+1, depending on
1404  // whether the value is early or late. Instead of creating CIV+1,
1405  // do a single shift of the value.
1406  Value *ShAmt = CIV, *ShVal = Op;
1407  auto *VTy = cast<IntegerType>(ShVal->getType());
1408  auto *ATy = cast<IntegerType>(ShAmt->getType());
1409  if (Late.count(&*I))
1410  ShVal = IRB.CreateShl(Op, ConstantInt::get(VTy, 1));
1411  // Second, the types of the shifted value and the shift amount
1412  // must match.
1413  if (VTy != ATy) {
1414  if (VTy->getBitWidth() < ATy->getBitWidth())
1415  ShVal = upcast(CastMap, IRB, ShVal, ATy);
1416  else
1417  ShAmt = upcast(CastMap, IRB, ShAmt, VTy);
1418  }
1419  // Ready to generate the shift and memoize it.
1420  W = IRB.CreateShl(ShVal, ShAmt);
1421  ShiftMap.insert(std::make_pair(Op, W));
1422  }
1423  I->replaceUsesOfWith(Op, W);
1424  }
1425  }
1426 
1427  // Update the users outside of the loop to account for having left
1428  // shifts. They would normally be shifted right in the loop, so shift
1429  // them right after the loop exit.
1430  // Take advantage of the loop-closed SSA form, which has all the post-
1431  // loop values in phi nodes.
1432  IRB.SetInsertPoint(ExitB, ExitB->getFirstInsertionPt());
1433  for (auto P = ExitB->begin(), Q = ExitB->end(); P != Q; ++P) {
1434  if (!isa<PHINode>(P))
1435  break;
1436  auto *PN = cast<PHINode>(P);
1437  Value *U = PN->getIncomingValueForBlock(LoopB);
1438  if (!Users.count(U))
1439  continue;
1440  Value *S = IRB.CreateLShr(PN, ConstantInt::get(PN->getType(), IterCount));
1441  PN->replaceAllUsesWith(S);
1442  // The above RAUW will create
1443  // S = lshr S, IterCount
1444  // so we need to fix it back into
1445  // S = lshr PN, IterCount
1446  cast<User>(S)->replaceUsesOfWith(S, PN);
1447  }
1448 
1449  return true;
1450 }
1451 
1452 void PolynomialMultiplyRecognize::cleanupLoopBody(BasicBlock *LoopB) {
1453  for (auto &I : *LoopB)
1454  if (Value *SV = SimplifyInstruction(&I, {DL, &TLI, &DT}))
1455  I.replaceAllUsesWith(SV);
1456 
1457  for (auto I = LoopB->begin(), N = I; I != LoopB->end(); I = N) {
1458  N = std::next(I);
1460  }
1461 }
1462 
1463 unsigned PolynomialMultiplyRecognize::getInverseMxN(unsigned QP) {
1464  // Arrays of coefficients of Q and the inverse, C.
1465  // Q[i] = coefficient at x^i.
1466  std::array<char,32> Q, C;
1467 
1468  for (unsigned i = 0; i < 32; ++i) {
1469  Q[i] = QP & 1;
1470  QP >>= 1;
1471  }
1472  assert(Q[0] == 1);
1473 
1474  // Find C, such that
1475  // (Q[n]*x^n + ... + Q[1]*x + Q[0]) * (C[n]*x^n + ... + C[1]*x + C[0]) = 1
1476  //
1477  // For it to have a solution, Q[0] must be 1. Since this is Z2[x], the
1478  // operations * and + are & and ^ respectively.
1479  //
1480  // Find C[i] recursively, by comparing i-th coefficient in the product
1481  // with 0 (or 1 for i=0).
1482  //
1483  // C[0] = 1, since C[0] = Q[0], and Q[0] = 1.
1484  C[0] = 1;
1485  for (unsigned i = 1; i < 32; ++i) {
1486  // Solve for C[i] in:
1487  // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i]Q[0] = 0
1488  // This is equivalent to
1489  // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i] = 0
1490  // which is
1491  // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] = C[i]
1492  unsigned T = 0;
1493  for (unsigned j = 0; j < i; ++j)
1494  T = T ^ (C[j] & Q[i-j]);
1495  C[i] = T;
1496  }
1497 
1498  unsigned QV = 0;
1499  for (unsigned i = 0; i < 32; ++i)
1500  if (C[i])
1501  QV |= (1 << i);
1502 
1503  return QV;
1504 }
1505 
1506 Value *PolynomialMultiplyRecognize::generate(BasicBlock::iterator At,
1507  ParsedValues &PV) {
1508  IRBuilder<> B(&*At);
1509  Module *M = At->getParent()->getParent()->getParent();
1510  Value *PMF = Intrinsic::getDeclaration(M, Intrinsic::hexagon_M4_pmpyw);
1511 
1512  Value *P = PV.P, *Q = PV.Q, *P0 = P;
1513  unsigned IC = PV.IterCount;
1514 
1515  if (PV.M != nullptr)
1516  P0 = P = B.CreateXor(P, PV.M);
1517 
1518  // Create a bit mask to clear the high bits beyond IterCount.
1519  auto *BMI = ConstantInt::get(P->getType(), APInt::getLowBitsSet(32, IC));
1520 
1521  if (PV.IterCount != 32)
1522  P = B.CreateAnd(P, BMI);
1523 
1524  if (PV.Inv) {
1525  auto *QI = dyn_cast<ConstantInt>(PV.Q);
1526  assert(QI && QI->getBitWidth() <= 32);
1527 
1528  // Again, clearing bits beyond IterCount.
1529  unsigned M = (1 << PV.IterCount) - 1;
1530  unsigned Tmp = (QI->getZExtValue() | 1) & M;
1531  unsigned QV = getInverseMxN(Tmp) & M;
1532  auto *QVI = ConstantInt::get(QI->getType(), QV);
1533  P = B.CreateCall(PMF, {P, QVI});
1534  P = B.CreateTrunc(P, QI->getType());
1535  if (IC != 32)
1536  P = B.CreateAnd(P, BMI);
1537  }
1538 
1539  Value *R = B.CreateCall(PMF, {P, Q});
1540 
1541  if (PV.M != nullptr)
1542  R = B.CreateXor(R, B.CreateIntCast(P0, R->getType(), false));
1543 
1544  return R;
1545 }
1546 
1547 void PolynomialMultiplyRecognize::setupSimplifier() {
1548  Simp.addRule(
1549  // Sink zext past bitwise operations.
1550  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1551  if (I->getOpcode() != Instruction::ZExt)
1552  return nullptr;
1554  if (!T)
1555  return nullptr;
1556  switch (T->getOpcode()) {
1557  case Instruction::And:
1558  case Instruction::Or:
1559  case Instruction::Xor:
1560  break;
1561  default:
1562  return nullptr;
1563  }
1564  IRBuilder<> B(Ctx);
1565  return B.CreateBinOp(cast<BinaryOperator>(T)->getOpcode(),
1566  B.CreateZExt(T->getOperand(0), I->getType()),
1567  B.CreateZExt(T->getOperand(1), I->getType()));
1568  });
1569  Simp.addRule(
1570  // (xor (and x a) (and y a)) -> (and (xor x y) a)
1571  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1572  if (I->getOpcode() != Instruction::Xor)
1573  return nullptr;
1574  Instruction *And0 = dyn_cast<Instruction>(I->getOperand(0));
1575  Instruction *And1 = dyn_cast<Instruction>(I->getOperand(1));
1576  if (!And0 || !And1)
1577  return nullptr;
1578  if (And0->getOpcode() != Instruction::And ||
1579  And1->getOpcode() != Instruction::And)
1580  return nullptr;
1581  if (And0->getOperand(1) != And1->getOperand(1))
1582  return nullptr;
1583  IRBuilder<> B(Ctx);
1584  return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1->getOperand(0)),
1585  And0->getOperand(1));
1586  });
1587  Simp.addRule(
1588  // (Op (select c x y) z) -> (select c (Op x z) (Op y z))
1589  // (Op x (select c y z)) -> (select c (Op x y) (Op x z))
1590  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1592  if (!BO)
1593  return nullptr;
1594  Instruction::BinaryOps Op = BO->getOpcode();
1595  if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(0))) {
1596  IRBuilder<> B(Ctx);
1597  Value *X = Sel->getTrueValue(), *Y = Sel->getFalseValue();
1598  Value *Z = BO->getOperand(1);
1599  return B.CreateSelect(Sel->getCondition(),
1600  B.CreateBinOp(Op, X, Z),
1601  B.CreateBinOp(Op, Y, Z));
1602  }
1603  if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(1))) {
1604  IRBuilder<> B(Ctx);
1605  Value *X = BO->getOperand(0);
1606  Value *Y = Sel->getTrueValue(), *Z = Sel->getFalseValue();
1607  return B.CreateSelect(Sel->getCondition(),
1608  B.CreateBinOp(Op, X, Y),
1609  B.CreateBinOp(Op, X, Z));
1610  }
1611  return nullptr;
1612  });
1613  Simp.addRule(
1614  // (select c (select c x y) z) -> (select c x z)
1615  // (select c x (select c y z)) -> (select c x z)
1616  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1617  SelectInst *Sel = dyn_cast<SelectInst>(I);
1618  if (!Sel)
1619  return nullptr;
1620  IRBuilder<> B(Ctx);
1621  Value *C = Sel->getCondition();
1622  if (SelectInst *Sel0 = dyn_cast<SelectInst>(Sel->getTrueValue())) {
1623  if (Sel0->getCondition() == C)
1624  return B.CreateSelect(C, Sel0->getTrueValue(), Sel->getFalseValue());
1625  }
1626  if (SelectInst *Sel1 = dyn_cast<SelectInst>(Sel->getFalseValue())) {
1627  if (Sel1->getCondition() == C)
1628  return B.CreateSelect(C, Sel->getTrueValue(), Sel1->getFalseValue());
1629  }
1630  return nullptr;
1631  });
1632  Simp.addRule(
1633  // (or (lshr x 1) 0x800.0) -> (xor (lshr x 1) 0x800.0)
1634  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1635  if (I->getOpcode() != Instruction::Or)
1636  return nullptr;
1637  Instruction *LShr = dyn_cast<Instruction>(I->getOperand(0));
1638  if (!LShr || LShr->getOpcode() != Instruction::LShr)
1639  return nullptr;
1640  ConstantInt *One = dyn_cast<ConstantInt>(LShr->getOperand(1));
1641  if (!One || One->getZExtValue() != 1)
1642  return nullptr;
1643  ConstantInt *Msb = dyn_cast<ConstantInt>(I->getOperand(1));
1644  if (!Msb || Msb->getZExtValue() != Msb->getType()->getSignBit())
1645  return nullptr;
1646  return IRBuilder<>(Ctx).CreateXor(LShr, Msb);
1647  });
1648  Simp.addRule(
1649  // (lshr (BitOp x y) c) -> (BitOp (lshr x c) (lshr y c))
1650  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1651  if (I->getOpcode() != Instruction::LShr)
1652  return nullptr;
1654  if (!BitOp)
1655  return nullptr;
1656  switch (BitOp->getOpcode()) {
1657  case Instruction::And:
1658  case Instruction::Or:
1659  case Instruction::Xor:
1660  break;
1661  default:
1662  return nullptr;
1663  }
1664  IRBuilder<> B(Ctx);
1665  Value *S = I->getOperand(1);
1666  return B.CreateBinOp(BitOp->getOpcode(),
1667  B.CreateLShr(BitOp->getOperand(0), S),
1668  B.CreateLShr(BitOp->getOperand(1), S));
1669  });
1670  Simp.addRule(
1671  // (BitOp1 (BitOp2 x a) b) -> (BitOp2 x (BitOp1 a b))
1672  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1673  auto IsBitOp = [](unsigned Op) -> bool {
1674  switch (Op) {
1675  case Instruction::And:
1676  case Instruction::Or:
1677  case Instruction::Xor:
1678  return true;
1679  }
1680  return false;
1681  };
1683  if (!BitOp1 || !IsBitOp(BitOp1->getOpcode()))
1684  return nullptr;
1685  BinaryOperator *BitOp2 = dyn_cast<BinaryOperator>(BitOp1->getOperand(0));
1686  if (!BitOp2 || !IsBitOp(BitOp2->getOpcode()))
1687  return nullptr;
1688  ConstantInt *CA = dyn_cast<ConstantInt>(BitOp2->getOperand(1));
1689  ConstantInt *CB = dyn_cast<ConstantInt>(BitOp1->getOperand(1));
1690  if (!CA || !CB)
1691  return nullptr;
1692  IRBuilder<> B(Ctx);
1693  Value *X = BitOp2->getOperand(0);
1694  return B.CreateBinOp(BitOp2->getOpcode(), X,
1695  B.CreateBinOp(BitOp1->getOpcode(), CA, CB));
1696  });
1697 }
1698 
1699 bool PolynomialMultiplyRecognize::recognize() {
1700  DEBUG(dbgs() << "Starting PolynomialMultiplyRecognize on loop\n"
1701  << *CurLoop << '\n');
1702  // Restrictions:
1703  // - The loop must consist of a single block.
1704  // - The iteration count must be known at compile-time.
1705  // - The loop must have an induction variable starting from 0, and
1706  // incremented in each iteration of the loop.
1707  BasicBlock *LoopB = CurLoop->getHeader();
1708  DEBUG(dbgs() << "Loop header:\n" << *LoopB);
1709 
1710  if (LoopB != CurLoop->getLoopLatch())
1711  return false;
1712  BasicBlock *ExitB = CurLoop->getExitBlock();
1713  if (ExitB == nullptr)
1714  return false;
1715  BasicBlock *EntryB = CurLoop->getLoopPreheader();
1716  if (EntryB == nullptr)
1717  return false;
1718 
1719  unsigned IterCount = 0;
1720  const SCEV *CT = SE.getBackedgeTakenCount(CurLoop);
1721  if (isa<SCEVCouldNotCompute>(CT))
1722  return false;
1723  if (auto *CV = dyn_cast<SCEVConstant>(CT))
1724  IterCount = CV->getValue()->getZExtValue() + 1;
1725 
1726  Value *CIV = getCountIV(LoopB);
1727  ParsedValues PV;
1728  PV.IterCount = IterCount;
1729  DEBUG(dbgs() << "Loop IV: " << *CIV << "\nIterCount: " << IterCount << '\n');
1730 
1731  setupSimplifier();
1732 
1733  // Perform a preliminary scan of select instructions to see if any of them
1734  // looks like a generator of the polynomial multiply steps. Assume that a
1735  // loop can only contain a single transformable operation, so stop the
1736  // traversal after the first reasonable candidate was found.
1737  // XXX: Currently this approach can modify the loop before being 100% sure
1738  // that the transformation can be carried out.
1739  bool FoundPreScan = false;
1740  for (Instruction &In : *LoopB) {
1742  if (!SI)
1743  continue;
1744 
1745  Simplifier::Context C(SI);
1746  Value *T = Simp.simplify(C);
1747  SelectInst *SelI = (T && isa<SelectInst>(T)) ? cast<SelectInst>(T) : SI;
1748  DEBUG(dbgs() << "scanSelect(pre-scan): " << PE(C, SelI) << '\n');
1749  if (scanSelect(SelI, LoopB, EntryB, CIV, PV, true)) {
1750  FoundPreScan = true;
1751  if (SelI != SI) {
1752  Value *NewSel = C.materialize(LoopB, SI->getIterator());
1753  SI->replaceAllUsesWith(NewSel);
1755  }
1756  break;
1757  }
1758  }
1759 
1760  if (!FoundPreScan) {
1761  DEBUG(dbgs() << "Have not found candidates for pmpy\n");
1762  return false;
1763  }
1764 
1765  if (!PV.Left) {
1766  // The right shift version actually only returns the higher bits of
1767  // the result (each iteration discards the LSB). If we want to convert it
1768  // to a left-shifting loop, the working data type must be at least as
1769  // wide as the target's pmpy instruction.
1770  if (!promoteTypes(LoopB, ExitB))
1771  return false;
1772  if (!convertShiftsToLeft(LoopB, ExitB, IterCount))
1773  return false;
1774  cleanupLoopBody(LoopB);
1775  }
1776 
1777  // Scan the loop again, find the generating select instruction.
1778  bool FoundScan = false;
1779  for (Instruction &In : *LoopB) {
1780  SelectInst *SelI = dyn_cast<SelectInst>(&In);
1781  if (!SelI)
1782  continue;
1783  DEBUG(dbgs() << "scanSelect: " << *SelI << '\n');
1784  FoundScan = scanSelect(SelI, LoopB, EntryB, CIV, PV, false);
1785  if (FoundScan)
1786  break;
1787  }
1788  assert(FoundScan);
1789 
1790  DEBUG({
1791  StringRef PP = (PV.M ? "(P+M)" : "P");
1792  if (!PV.Inv)
1793  dbgs() << "Found pmpy idiom: R = " << PP << ".Q\n";
1794  else
1795  dbgs() << "Found inverse pmpy idiom: R = (" << PP << "/Q).Q) + "
1796  << PP << "\n";
1797  dbgs() << " Res:" << *PV.Res << "\n P:" << *PV.P << "\n";
1798  if (PV.M)
1799  dbgs() << " M:" << *PV.M << "\n";
1800  dbgs() << " Q:" << *PV.Q << "\n";
1801  dbgs() << " Iteration count:" << PV.IterCount << "\n";
1802  });
1803 
1804  BasicBlock::iterator At(EntryB->getTerminator());
1805  Value *PM = generate(At, PV);
1806  if (PM == nullptr)
1807  return false;
1808 
1809  if (PM->getType() != PV.Res->getType())
1810  PM = IRBuilder<>(&*At).CreateIntCast(PM, PV.Res->getType(), false);
1811 
1812  PV.Res->replaceAllUsesWith(PM);
1813  PV.Res->eraseFromParent();
1814  return true;
1815 }
1816 
1818  uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
1819  assert(((SizeInBits & 7) || (SizeInBits >> 32) == 0) &&
1820  "Don't overflow unsigned.");
1821  return (unsigned)SizeInBits >> 3;
1822 }
1823 
1824 int HexagonLoopIdiomRecognize::getSCEVStride(const SCEVAddRecExpr *S) {
1825  if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getOperand(1)))
1826  return SC->getAPInt().getSExtValue();
1827  return 0;
1828 }
1829 
1830 bool HexagonLoopIdiomRecognize::isLegalStore(Loop *CurLoop, StoreInst *SI) {
1831  // Allow volatile stores if HexagonVolatileMemcpy is enabled.
1832  if (!(SI->isVolatile() && HexagonVolatileMemcpy) && !SI->isSimple())
1833  return false;
1834 
1835  Value *StoredVal = SI->getValueOperand();
1836  Value *StorePtr = SI->getPointerOperand();
1837 
1838  // Reject stores that are so large that they overflow an unsigned.
1839  uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
1840  if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
1841  return false;
1842 
1843  // See if the pointer expression is an AddRec like {base,+,1} on the current
1844  // loop, which indicates a strided store. If we have something else, it's a
1845  // random store we can't handle.
1846  auto *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1847  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
1848  return false;
1849 
1850  // Check to see if the stride matches the size of the store. If so, then we
1851  // know that every byte is touched in the loop.
1852  int Stride = getSCEVStride(StoreEv);
1853  if (Stride == 0)
1854  return false;
1855  unsigned StoreSize = getStoreSizeInBytes(SI);
1856  if (StoreSize != unsigned(std::abs(Stride)))
1857  return false;
1858 
1859  // The store must be feeding a non-volatile load.
1860  LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
1861  if (!LI || !LI->isSimple())
1862  return false;
1863 
1864  // See if the pointer expression is an AddRec like {base,+,1} on the current
1865  // loop, which indicates a strided load. If we have something else, it's a
1866  // random load we can't handle.
1867  Value *LoadPtr = LI->getPointerOperand();
1868  auto *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
1869  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
1870  return false;
1871 
1872  // The store and load must share the same stride.
1873  if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
1874  return false;
1875 
1876  // Success. This store can be converted into a memcpy.
1877  return true;
1878 }
1879 
1880 /// mayLoopAccessLocation - Return true if the specified loop might access the
1881 /// specified pointer location, which is a loop-strided access. The 'Access'
1882 /// argument specifies what the verboten forms of access are (read or write).
1883 static bool
1885  const SCEV *BECount, unsigned StoreSize,
1886  AliasAnalysis &AA,
1887  SmallPtrSetImpl<Instruction *> &Ignored) {
1888  // Get the location that may be stored across the loop. Since the access
1889  // is strided positively through memory, we say that the modified location
1890  // starts at the pointer and has infinite size.
1891  uint64_t AccessSize = MemoryLocation::UnknownSize;
1892 
1893  // If the loop iterates a fixed number of times, we can refine the access
1894  // size to be exactly the size of the memset, which is (BECount+1)*StoreSize
1895  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
1896  AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize;
1897 
1898  // TODO: For this to be really effective, we have to dive into the pointer
1899  // operand in the store. Store to &A[i] of 100 will always return may alias
1900  // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
1901  // which will then no-alias a store to &A[100].
1902  MemoryLocation StoreLoc(Ptr, AccessSize);
1903 
1904  for (auto *B : L->blocks())
1905  for (auto &I : *B)
1906  if (Ignored.count(&I) == 0 && (AA.getModRefInfo(&I, StoreLoc) & Access))
1907  return true;
1908 
1909  return false;
1910 }
1911 
1912 void HexagonLoopIdiomRecognize::collectStores(Loop *CurLoop, BasicBlock *BB,
1913  SmallVectorImpl<StoreInst*> &Stores) {
1914  Stores.clear();
1915  for (Instruction &I : *BB)
1916  if (StoreInst *SI = dyn_cast<StoreInst>(&I))
1917  if (isLegalStore(CurLoop, SI))
1918  Stores.push_back(SI);
1919 }
1920 
1921 bool HexagonLoopIdiomRecognize::processCopyingStore(Loop *CurLoop,
1922  StoreInst *SI, const SCEV *BECount) {
1923  assert((SI->isSimple() || (SI->isVolatile() && HexagonVolatileMemcpy)) &&
1924  "Expected only non-volatile stores, or Hexagon-specific memcpy"
1925  "to volatile destination.");
1926 
1927  Value *StorePtr = SI->getPointerOperand();
1928  auto *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1929  unsigned Stride = getSCEVStride(StoreEv);
1930  unsigned StoreSize = getStoreSizeInBytes(SI);
1931  if (Stride != StoreSize)
1932  return false;
1933 
1934  // See if the pointer expression is an AddRec like {base,+,1} on the current
1935  // loop, which indicates a strided load. If we have something else, it's a
1936  // random load we can't handle.
1937  LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
1938  auto *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
1939 
1940  // The trip count of the loop and the base pointer of the addrec SCEV is
1941  // guaranteed to be loop invariant, which means that it should dominate the
1942  // header. This allows us to insert code for it in the preheader.
1943  BasicBlock *Preheader = CurLoop->getLoopPreheader();
1944  Instruction *ExpPt = Preheader->getTerminator();
1945  IRBuilder<> Builder(ExpPt);
1946  SCEVExpander Expander(*SE, *DL, "hexagon-loop-idiom");
1947 
1948  Type *IntPtrTy = Builder.getIntPtrTy(*DL, SI->getPointerAddressSpace());
1949 
1950  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
1951  // this into a memcpy/memmove in the loop preheader now if we want. However,
1952  // this would be unsafe to do if there is anything else in the loop that may
1953  // read or write the memory region we're storing to. For memcpy, this
1954  // includes the load that feeds the stores. Check for an alias by generating
1955  // the base address and checking everything.
1956  Value *StoreBasePtr = Expander.expandCodeFor(StoreEv->getStart(),
1957  Builder.getInt8PtrTy(SI->getPointerAddressSpace()), ExpPt);
1958  Value *LoadBasePtr = nullptr;
1959 
1960  bool Overlap = false;
1961  bool DestVolatile = SI->isVolatile();
1962  Type *BECountTy = BECount->getType();
1963 
1964  if (DestVolatile) {
1965  // The trip count must fit in i32, since it is the type of the "num_words"
1966  // argument to hexagon_memcpy_forward_vp4cp4n2.
1967  if (StoreSize != 4 || DL->getTypeSizeInBits(BECountTy) > 32) {
1968 CleanupAndExit:
1969  // If we generated new code for the base pointer, clean up.
1970  Expander.clear();
1971  if (StoreBasePtr && (LoadBasePtr != StoreBasePtr)) {
1972  RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
1973  StoreBasePtr = nullptr;
1974  }
1975  if (LoadBasePtr) {
1977  LoadBasePtr = nullptr;
1978  }
1979  return false;
1980  }
1981  }
1982 
1984  Ignore1.insert(SI);
1985  if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
1986  StoreSize, *AA, Ignore1)) {
1987  // Check if the load is the offending instruction.
1988  Ignore1.insert(LI);
1989  if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
1990  StoreSize, *AA, Ignore1)) {
1991  // Still bad. Nothing we can do.
1992  goto CleanupAndExit;
1993  }
1994  // It worked with the load ignored.
1995  Overlap = true;
1996  }
1997 
1998  if (!Overlap) {
1999  if (DisableMemcpyIdiom || !HasMemcpy)
2000  goto CleanupAndExit;
2001  } else {
2002  // Don't generate memmove if this function will be inlined. This is
2003  // because the caller will undergo this transformation after inlining.
2004  Function *Func = CurLoop->getHeader()->getParent();
2005  if (Func->hasFnAttribute(Attribute::AlwaysInline))
2006  goto CleanupAndExit;
2007 
2008  // In case of a memmove, the call to memmove will be executed instead
2009  // of the loop, so we need to make sure that there is nothing else in
2010  // the loop than the load, store and instructions that these two depend
2011  // on.
2013  Insts.push_back(SI);
2014  Insts.push_back(LI);
2015  if (!coverLoop(CurLoop, Insts))
2016  goto CleanupAndExit;
2017 
2018  if (DisableMemmoveIdiom || !HasMemmove)
2019  goto CleanupAndExit;
2020  bool IsNested = CurLoop->getParentLoop() != nullptr;
2021  if (IsNested && OnlyNonNestedMemmove)
2022  goto CleanupAndExit;
2023  }
2024 
2025  // For a memcpy, we have to make sure that the input array is not being
2026  // mutated by the loop.
2027  LoadBasePtr = Expander.expandCodeFor(LoadEv->getStart(),
2028  Builder.getInt8PtrTy(LI->getPointerAddressSpace()), ExpPt);
2029 
2031  Ignore2.insert(SI);
2032  if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize,
2033  *AA, Ignore2))
2034  goto CleanupAndExit;
2035 
2036  // Check the stride.
2037  bool StridePos = getSCEVStride(LoadEv) >= 0;
2038 
2039  // Currently, the volatile memcpy only emulates traversing memory forward.
2040  if (!StridePos && DestVolatile)
2041  goto CleanupAndExit;
2042 
2043  bool RuntimeCheck = (Overlap || DestVolatile);
2044 
2045  BasicBlock *ExitB;
2046  if (RuntimeCheck) {
2047  // The runtime check needs a single exit block.
2048  SmallVector<BasicBlock*, 8> ExitBlocks;
2049  CurLoop->getUniqueExitBlocks(ExitBlocks);
2050  if (ExitBlocks.size() != 1)
2051  goto CleanupAndExit;
2052  ExitB = ExitBlocks[0];
2053  }
2054 
2055  // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
2056  // pointer size if it isn't already.
2057  LLVMContext &Ctx = SI->getContext();
2058  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
2059  unsigned Alignment = std::min(SI->getAlignment(), LI->getAlignment());
2060  DebugLoc DLoc = SI->getDebugLoc();
2061 
2062  const SCEV *NumBytesS =
2063  SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
2064  if (StoreSize != 1)
2065  NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
2066  SCEV::FlagNUW);
2067  Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtrTy, ExpPt);
2068  if (Instruction *In = dyn_cast<Instruction>(NumBytes))
2069  if (Value *Simp = SimplifyInstruction(In, {*DL, TLI, DT}))
2070  NumBytes = Simp;
2071 
2072  CallInst *NewCall;
2073 
2074  if (RuntimeCheck) {
2076  if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes)) {
2077  uint64_t C = CI->getZExtValue();
2078  if (Threshold != 0 && C < Threshold)
2079  goto CleanupAndExit;
2081  goto CleanupAndExit;
2082  }
2083 
2084  BasicBlock *Header = CurLoop->getHeader();
2085  Function *Func = Header->getParent();
2086  Loop *ParentL = LF->getLoopFor(Preheader);
2087  StringRef HeaderName = Header->getName();
2088 
2089  // Create a new (empty) preheader, and update the PHI nodes in the
2090  // header to use the new preheader.
2091  BasicBlock *NewPreheader = BasicBlock::Create(Ctx, HeaderName+".rtli.ph",
2092  Func, Header);
2093  if (ParentL)
2094  ParentL->addBasicBlockToLoop(NewPreheader, *LF);
2095  IRBuilder<>(NewPreheader).CreateBr(Header);
2096  for (auto &In : *Header) {
2097  PHINode *PN = dyn_cast<PHINode>(&In);
2098  if (!PN)
2099  break;
2100  int bx = PN->getBasicBlockIndex(Preheader);
2101  if (bx >= 0)
2102  PN->setIncomingBlock(bx, NewPreheader);
2103  }
2104  DT->addNewBlock(NewPreheader, Preheader);
2105  DT->changeImmediateDominator(Header, NewPreheader);
2106 
2107  // Check for safe conditions to execute memmove.
2108  // If stride is positive, copying things from higher to lower addresses
2109  // is equivalent to memmove. For negative stride, it's the other way
2110  // around. Copying forward in memory with positive stride may not be
2111  // same as memmove since we may be copying values that we just stored
2112  // in some previous iteration.
2113  Value *LA = Builder.CreatePtrToInt(LoadBasePtr, IntPtrTy);
2114  Value *SA = Builder.CreatePtrToInt(StoreBasePtr, IntPtrTy);
2115  Value *LowA = StridePos ? SA : LA;
2116  Value *HighA = StridePos ? LA : SA;
2117  Value *CmpA = Builder.CreateICmpULT(LowA, HighA);
2118  Value *Cond = CmpA;
2119 
2120  // Check for distance between pointers. Since the case LowA < HighA
2121  // is checked for above, assume LowA >= HighA.
2122  Value *Dist = Builder.CreateSub(LowA, HighA);
2123  Value *CmpD = Builder.CreateICmpSLE(NumBytes, Dist);
2124  Value *CmpEither = Builder.CreateOr(Cond, CmpD);
2125  Cond = CmpEither;
2126 
2127  if (Threshold != 0) {
2128  Type *Ty = NumBytes->getType();
2129  Value *Thr = ConstantInt::get(Ty, Threshold);
2130  Value *CmpB = Builder.CreateICmpULT(Thr, NumBytes);
2131  Value *CmpBoth = Builder.CreateAnd(Cond, CmpB);
2132  Cond = CmpBoth;
2133  }
2134  BasicBlock *MemmoveB = BasicBlock::Create(Ctx, Header->getName()+".rtli",
2135  Func, NewPreheader);
2136  if (ParentL)
2137  ParentL->addBasicBlockToLoop(MemmoveB, *LF);
2138  Instruction *OldT = Preheader->getTerminator();
2139  Builder.CreateCondBr(Cond, MemmoveB, NewPreheader);
2140  OldT->eraseFromParent();
2141  Preheader->setName(Preheader->getName()+".old");
2142  DT->addNewBlock(MemmoveB, Preheader);
2143  // Find the new immediate dominator of the exit block.
2144  BasicBlock *ExitD = Preheader;
2145  for (auto PI = pred_begin(ExitB), PE = pred_end(ExitB); PI != PE; ++PI) {
2146  BasicBlock *PB = *PI;
2147  ExitD = DT->findNearestCommonDominator(ExitD, PB);
2148  if (!ExitD)
2149  break;
2150  }
2151  // If the prior immediate dominator of ExitB was dominated by the
2152  // old preheader, then the old preheader becomes the new immediate
2153  // dominator. Otherwise don't change anything (because the newly
2154  // added blocks are dominated by the old preheader).
2155  if (ExitD && DT->dominates(Preheader, ExitD)) {
2156  DomTreeNode *BN = DT->getNode(ExitB);
2157  DomTreeNode *DN = DT->getNode(ExitD);
2158  BN->setIDom(DN);
2159  }
2160 
2161  // Add a call to memmove to the conditional block.
2162  IRBuilder<> CondBuilder(MemmoveB);
2163  CondBuilder.CreateBr(ExitB);
2164  CondBuilder.SetInsertPoint(MemmoveB->getTerminator());
2165 
2166  if (DestVolatile) {
2167  Type *Int32Ty = Type::getInt32Ty(Ctx);
2168  Type *Int32PtrTy = Type::getInt32PtrTy(Ctx);
2169  Type *VoidTy = Type::getVoidTy(Ctx);
2170  Module *M = Func->getParent();
2172  Int32PtrTy, Int32PtrTy, Int32Ty);
2173  Function *Fn = cast<Function>(CF);
2175 
2176  const SCEV *OneS = SE->getConstant(Int32Ty, 1);
2177  const SCEV *BECount32 = SE->getTruncateOrZeroExtend(BECount, Int32Ty);
2178  const SCEV *NumWordsS = SE->getAddExpr(BECount32, OneS, SCEV::FlagNUW);
2179  Value *NumWords = Expander.expandCodeFor(NumWordsS, Int32Ty,
2180  MemmoveB->getTerminator());
2181  if (Instruction *In = dyn_cast<Instruction>(NumWords))
2182  if (Value *Simp = SimplifyInstruction(In, {*DL, TLI, DT}))
2183  NumWords = Simp;
2184 
2185  Value *Op0 = (StoreBasePtr->getType() == Int32PtrTy)
2186  ? StoreBasePtr
2187  : CondBuilder.CreateBitCast(StoreBasePtr, Int32PtrTy);
2188  Value *Op1 = (LoadBasePtr->getType() == Int32PtrTy)
2189  ? LoadBasePtr
2190  : CondBuilder.CreateBitCast(LoadBasePtr, Int32PtrTy);
2191  NewCall = CondBuilder.CreateCall(Fn, {Op0, Op1, NumWords});
2192  } else {
2193  NewCall = CondBuilder.CreateMemMove(StoreBasePtr, LoadBasePtr,
2194  NumBytes, Alignment);
2195  }
2196  } else {
2197  NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr,
2198  NumBytes, Alignment);
2199  // Okay, the memcpy has been formed. Zap the original store and
2200  // anything that feeds into it.
2202  }
2203 
2204  NewCall->setDebugLoc(DLoc);
2205 
2206  DEBUG(dbgs() << " Formed " << (Overlap ? "memmove: " : "memcpy: ")
2207  << *NewCall << "\n"
2208  << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
2209  << " from store ptr=" << *StoreEv << " at: " << *SI << "\n");
2210 
2211  return true;
2212 }
2213 
2214 // \brief Check if the instructions in Insts, together with their dependencies
2215 // cover the loop in the sense that the loop could be safely eliminated once
2216 // the instructions in Insts are removed.
2217 bool HexagonLoopIdiomRecognize::coverLoop(Loop *L,
2218  SmallVectorImpl<Instruction*> &Insts) const {
2219  SmallSet<BasicBlock*,8> LoopBlocks;
2220  for (auto *B : L->blocks())
2221  LoopBlocks.insert(B);
2222 
2223  SetVector<Instruction*> Worklist(Insts.begin(), Insts.end());
2224 
2225  // Collect all instructions from the loop that the instructions in Insts
2226  // depend on (plus their dependencies, etc.). These instructions will
2227  // constitute the expression trees that feed those in Insts, but the trees
2228  // will be limited only to instructions contained in the loop.
2229  for (unsigned i = 0; i < Worklist.size(); ++i) {
2230  Instruction *In = Worklist[i];
2231  for (auto I = In->op_begin(), E = In->op_end(); I != E; ++I) {
2232  Instruction *OpI = dyn_cast<Instruction>(I);
2233  if (!OpI)
2234  continue;
2235  BasicBlock *PB = OpI->getParent();
2236  if (!LoopBlocks.count(PB))
2237  continue;
2238  Worklist.insert(OpI);
2239  }
2240  }
2241 
2242  // Scan all instructions in the loop, if any of them have a user outside
2243  // of the loop, or outside of the expressions collected above, then either
2244  // the loop has a side-effect visible outside of it, or there are
2245  // instructions in it that are not involved in the original set Insts.
2246  for (auto *B : L->blocks()) {
2247  for (auto &In : *B) {
2248  if (isa<BranchInst>(In) || isa<DbgInfoIntrinsic>(In))
2249  continue;
2250  if (!Worklist.count(&In) && In.mayHaveSideEffects())
2251  return false;
2252  for (const auto &K : In.users()) {
2253  Instruction *UseI = dyn_cast<Instruction>(K);
2254  if (!UseI)
2255  continue;
2256  BasicBlock *UseB = UseI->getParent();
2257  if (LF->getLoopFor(UseB) != L)
2258  return false;
2259  }
2260  }
2261  }
2262 
2263  return true;
2264 }
2265 
2266 /// runOnLoopBlock - Process the specified block, which lives in a counted loop
2267 /// with the specified backedge count. This block is known to be in the current
2268 /// loop and not in any subloops.
2269 bool HexagonLoopIdiomRecognize::runOnLoopBlock(Loop *CurLoop, BasicBlock *BB,
2270  const SCEV *BECount, SmallVectorImpl<BasicBlock*> &ExitBlocks) {
2271  // We can only promote stores in this block if they are unconditionally
2272  // executed in the loop. For a block to be unconditionally executed, it has
2273  // to dominate all the exit blocks of the loop. Verify this now.
2274  auto DominatedByBB = [this,BB] (BasicBlock *EB) -> bool {
2275  return DT->dominates(BB, EB);
2276  };
2277  if (!std::all_of(ExitBlocks.begin(), ExitBlocks.end(), DominatedByBB))
2278  return false;
2279 
2280  bool MadeChange = false;
2281  // Look for store instructions, which may be optimized to memset/memcpy.
2283  collectStores(CurLoop, BB, Stores);
2284 
2285  // Optimize the store into a memcpy, if it feeds an similarly strided load.
2286  for (auto &SI : Stores)
2287  MadeChange |= processCopyingStore(CurLoop, SI, BECount);
2288 
2289  return MadeChange;
2290 }
2291 
2292 bool HexagonLoopIdiomRecognize::runOnCountableLoop(Loop *L) {
2293  PolynomialMultiplyRecognize PMR(L, *DL, *DT, *TLI, *SE);
2294  if (PMR.recognize())
2295  return true;
2296 
2297  if (!HasMemcpy && !HasMemmove)
2298  return false;
2299 
2300  const SCEV *BECount = SE->getBackedgeTakenCount(L);
2301  assert(!isa<SCEVCouldNotCompute>(BECount) &&
2302  "runOnCountableLoop() called on a loop without a predictable"
2303  "backedge-taken count");
2304 
2305  SmallVector<BasicBlock *, 8> ExitBlocks;
2306  L->getUniqueExitBlocks(ExitBlocks);
2307 
2308  bool Changed = false;
2309 
2310  // Scan all the blocks in the loop that are not in subloops.
2311  for (auto *BB : L->getBlocks()) {
2312  // Ignore blocks in subloops.
2313  if (LF->getLoopFor(BB) != L)
2314  continue;
2315  Changed |= runOnLoopBlock(L, BB, BECount, ExitBlocks);
2316  }
2317 
2318  return Changed;
2319 }
2320 
2321 bool HexagonLoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
2322  const Module &M = *L->getHeader()->getParent()->getParent();
2323  if (Triple(M.getTargetTriple()).getArch() != Triple::hexagon)
2324  return false;
2325 
2326  if (skipLoop(L))
2327  return false;
2328 
2329  // If the loop could not be converted to canonical form, it must have an
2330  // indirectbr in it, just give up.
2331  if (!L->getLoopPreheader())
2332  return false;
2333 
2334  // Disable loop idiom recognition if the function's name is a common idiom.
2335  StringRef Name = L->getHeader()->getParent()->getName();
2336  if (Name == "memset" || Name == "memcpy" || Name == "memmove")
2337  return false;
2338 
2339  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2340  DL = &L->getHeader()->getModule()->getDataLayout();
2341  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2342  LF = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2343  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
2344  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2345 
2346  HasMemcpy = TLI->has(LibFunc_memcpy);
2347  HasMemmove = TLI->has(LibFunc_memmove);
2348 
2349  if (SE->hasLoopInvariantBackedgeTakenCount(L))
2350  return runOnCountableLoop(L);
2351  return false;
2352 }
2353 
2355  return new HexagonLoopIdiomRecognize();
2356 }
Pass interface - Implemented by all &#39;passes&#39;.
Definition: Pass.h:81
static cl::opt< unsigned > CompileTimeMemSizeThreshold("compile-time-mem-idiom-threshold", cl::Hidden, cl::init(64), cl::desc("Threshold (in bytes) to perform the transformation, if the " "runtime loop count (mem transfer size) is known at compile-time."))
static cl::opt< unsigned > RuntimeMemSizeThreshold("runtime-mem-idiom-threshold", cl::Hidden, cl::init(0), cl::desc("Threshold (in bytes) for the runtime " "check guarding the memmove."))
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
Definition: PatternMatch.h:574
uint64_t CallInst * C
Value * getValueOperand()
Definition: Instructions.h:395
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks &#39;this&#39; from the containing basic block and deletes it.
Definition: Instruction.cpp:69
IntegerType * getType() const
getType - Specialize the getType() method to always return an IntegerType, which reduces the amount o...
Definition: Constants.h:172
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:109
const std::string & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition: Module.h:233
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional &#39;br Cond, TrueDest, FalseDest&#39; instruction.
Definition: IRBuilder.h:775
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:72
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:843
bool isSimple() const
Definition: Instructions.h:262
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1108
LLVMContext & Context
void dropAllReferences()
Drop all references to operands.
Definition: User.h:279
bool isSameOperationAs(const Instruction *I, unsigned flags=0) const
This function determines if the specified instruction executes the same operation as the current one...
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
BinaryOps getOpcode() const
Definition: InstrTypes.h:523
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1558
Constant * getOrInsertFunction(StringRef Name, FunctionType *T, AttributeList AttributeList)
Look up the specified function in the module symbol table.
Definition: Module.cpp:142
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1095
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:63
LLVM_ATTRIBUTE_ALWAYS_INLINE size_type size() const
Definition: SmallVector.h:136
match_zero m_Zero()
Match an arbitrary zero/null constant.
Definition: PatternMatch.h:145
This class represents zero extension of integer types.
static cl::opt< bool > DisableMemmoveIdiom("disable-memmove-idiom", cl::Hidden, cl::init(false), cl::desc("Disable generation of memmove in loop idiom recognition"))
The main scalar evolution driver.
This class represents a function call, abstracting a target machine&#39;s calling convention.
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Get a value with low bits set.
Definition: APInt.h:641
static PointerType * getInt32PtrTy(LLVMContext &C, unsigned AS=0)
Definition: Type.cpp:228
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:106
const Value * getTrueValue() const
Externally visible function.
Definition: GlobalValue.h:49
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:697
bool isTerminator() const
Definition: Instruction.h:128
void deleteValue()
Delete a pointer to a generic Value.
Definition: Value.cpp:93
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.h:262
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:816
A debug info location.
Definition: DebugLoc.h:34
F(f)
static void cleanup(BlockFrequencyInfoImplBase &BFI)
Clear all memory not needed downstream.
An instruction for reading from memory.
Definition: Instructions.h:164
iv Induction Variable Users
Definition: IVUsers.cpp:51
static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, unsigned StoreSize, AliasAnalysis &AA, SmallPtrSetImpl< Instruction *> &Ignored)
mayLoopAccessLocation - Return true if the specified loop might access the specified pointer location...
bool hasNoSignedWrap() const
Determine whether the no signed wrap flag is set.
The access modifies the value stored in memory.
op_iterator op_begin()
Definition: User.h:214
INITIALIZE_PASS_BEGIN(HexagonLoopIdiomRecognize, "hexagon-loop-idiom", "Recognize Hexagon-specific loop idioms", false, false) INITIALIZE_PASS_END(HexagonLoopIdiomRecognize
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:252
bool isIdenticalTo(const Instruction *I) const
Return true if the specified instruction is exactly identical to the current one. ...
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
AnalysisUsage & addRequired()
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr it the function does no...
Definition: BasicBlock.cpp:116
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:51
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
Definition: PatternMatch.h:586
This class represents the LLVM &#39;select&#39; instruction.
static const char * HexagonVolatileMemcpyName
const DataLayout & getDataLayout() const
Get the data layout for the module&#39;s target platform.
Definition: Module.cpp:361
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:56
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:42
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:197
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:664
IntegerType * getIntPtrTy(const DataLayout &DL, unsigned AddrSpace=0)
Fetch the type representing a pointer to an integer value.
Definition: IRBuilder.h:391
This file contains the simple types necessary to represent the attributes associated with functions a...
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:284
zlib-gnu style compression
This file implements a class to represent arbitrary precision integral constant values and operations...
BlockT * getHeader() const
Definition: LoopInfo.h:100
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:404
User * getUser() const LLVM_READONLY
Returns the User that contains this Use.
Definition: Use.cpp:41
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1444
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:245
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
Definition: LoopInfoImpl.h:183
This node represents a polynomial recurrence on the trip count of the specified loop.
CastClass_match< OpTy, Instruction::ZExt > m_ZExt(const OpTy &Op)
Matches ZExt.
Definition: PatternMatch.h:924
#define T
CallInst * CreateMemMove(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memmove between the specified pointers.
Definition: IRBuilder.h:464
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:125
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:911
An instruction for storing to memory.
Definition: Instructions.h:306
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:428
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1383
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:66
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:140
Function * getDeclaration(Module *M, ID id, ArrayRef< Type *> Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:980
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block...
Definition: IRBuilder.h:128
Value * getOperand(unsigned i) const
Definition: User.h:154
void getUniqueExitBlocks(SmallVectorImpl< BasicBlock *> &ExitBlocks) const
Return all unique successor blocks of this loop.
Definition: LoopInfo.cpp:393
void replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:22
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1079
void clear()
Erase the contents of the InsertedExpressions map so that users trying to expand the same expression ...
#define P(N)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:406
const SCEV * getOperand(unsigned i) const
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
Definition: PatternMatch.h:598
void initializeHexagonLoopIdiomRecognizePass(PassRegistry &)
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:149
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
* if(!EatIfPresent(lltok::kw_thread_local)) return false
ParseOptionalThreadLocal := /*empty.
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:200
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:281
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:217
LLVM Basic Block Representation.
Definition: BasicBlock.h:59
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:46
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:69
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This is an important base class in LLVM.
Definition: Constant.h:42
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator begin()
Definition: SmallVector.h:116
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:36
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.h:1689
#define H(x, y, z)
Definition: MD5.cpp:57
char & LCSSAID
Definition: LCSSA.cpp:412
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:371
const std::vector< BlockT * > & getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:149
Interval::pred_iterator pred_begin(Interval *I)
pred_begin/pred_end - define methods so that Intervals may be used just like BasicBlocks can with the...
Definition: Interval.h:113
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:382
Represent the analysis usage information of a pass.
op_iterator op_end()
Definition: User.h:216
static Type * getVoidTy(LLVMContext &C)
Definition: Type.cpp:161
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
Definition: PatternMatch.h:592
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:853
Value * expandCodeFor(const SCEV *SH, Type *Ty, Instruction *I)
Insert code to directly compute the specified SCEV expression into the program.
static const unsigned End
Interval::pred_iterator pred_end(Interval *I)
Definition: Interval.h:116
op_range operands()
Definition: User.h:222
Value * getPointerOperand()
Definition: Instructions.h:270
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:101
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:382
self_iterator getIterator()
Definition: ilist_node.h:82
Class to represent integer types.
Definition: DerivedTypes.h:40
std::pair< NoneType, bool > insert(const T &V)
insert - Insert an element into the set if it isn&#39;t already there.
Definition: SmallSet.h:81
static cl::opt< bool > OnlyNonNestedMemmove("only-nonnested-memmove-idiom", cl::Hidden, cl::init(true), cl::desc("Only enable generating memmove in non-nested loops"))
const Value * getCondition() const
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function. ...
Definition: Function.cpp:194
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr)
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:401
Value * getIncomingValue(unsigned i) const
Return incoming value number x.
static void replaceAllUsesOfWithIn(Value *I, Value *J, BasicBlock *BB)
PointerType * getInt8PtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer to an 8-bit integer value.
Definition: IRBuilder.h:386
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1380
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
hexagon loop Recognize Hexagon specific loop idioms
char & LoopSimplifyID
static cl::opt< unsigned > SimplifyLimit("hlir-simplify-limit", cl::init(10000), cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR"))
Representation for a specific memory location.
const InstListType & getInstList() const
Return the underlying instruction list container.
Definition: BasicBlock.h:317
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:240
Iterator for intrusive lists based on ilist_node.
unsigned getNumOperands() const
Definition: User.h:176
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:418
This is the shared class of boolean and integer constants.
Definition: Constants.h:84
Type * getType() const
Return the LLVM type of this SCEV expression.
auto find(R &&Range, const T &Val) -> decltype(std::begin(Range))
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:837
void setIncomingBlock(unsigned i, BasicBlock *BB)
iterator end()
Definition: BasicBlock.h:254
AnalysisUsage & addRequiredID(const void *ID)
Definition: Pass.cpp:298
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:1507
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:864
Module.h This file contains the declarations for the Module class.
Provides information about what library functions are available for the current target.
CHAIN = SC CHAIN, Imm128 - System call.
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:560
unsigned getNumIncomingValues() const
Return the number of incoming edges.
void setLinkage(LinkageTypes LT)
Definition: GlobalValue.h:425
void setOperand(unsigned i, Value *Val)
Definition: User.h:159
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, ArrayRef< StringRef > StandardNames)
initialize - Initialize the set of available library functions based on the specified target triple...
ModRefInfo getModRefInfo(ImmutableCallSite CS, const MemoryLocation &Loc)
getModRefInfo (for call sites) - Return information about whether a particular call site modifies or ...
iterator_range< user_iterator > users()
Definition: Value.h:395
static cl::opt< bool > DisableMemcpyIdiom("disable-memcpy-idiom", cl::Hidden, cl::init(false), cl::desc("Disable generation of memcpy in loop idiom recognition"))
This class uses information about analyze scalars to rewrite expressions in canonical form...
const Value * getFalseValue() const
LoopT * getParentLoop() const
Definition: LoopInfo.h:101
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:529
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator end()
Definition: SmallVector.h:120
bool isVolatile() const
Return true if this is a store to a volatile memory location.
Definition: Instructions.h:339
iterator insert(iterator where, pointer New)
Definition: ilist.h:241
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:284
CallInst * CreateMemCpy(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *TBAAStructTag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memcpy between the specified pointers.
Definition: IRBuilder.h:423
unsigned getAlignment() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:226
This class represents an analyzed expression in the program.
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:176
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:420
match_one m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:194
This file provides utility analysis objects describing memory locations.
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:218
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:108
The access both references and modifies the value stored in memory.
#define I(x, y, z)
Definition: MD5.cpp:58
#define N
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1213
Pass * createHexagonLoopIdiomPass()
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:323
uint64_t getSignBit() const
Return a uint64_t with just the most significant bit set (the sign bit, if the value is treated as a ...
Definition: DerivedTypes.h:76
bool hasNoUnsignedWrap() const
Determine whether the no unsigned wrap flag is set.
raw_ostream & operator<<(raw_ostream &OS, const APInt &I)
Definition: APInt.h:2018
unsigned getAlignment() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:351
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Definition: Value.h:598
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1063
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1436
OutputIt transform(R &&Range, OutputIt d_first, UnaryPredicate P)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere...
Definition: STLExtras.h:894
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:276
static int const Threshold
TODO: Write a new FunctionPass AliasAnalysis so that it can keep a cache.
cl::opt< bool > HexagonVolatileMemcpy("disable-hexagon-volatile-memcpy", cl::Hidden, cl::init(false), cl::desc("Enable Hexagon-specific memcpy for volatile destination."))
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
user_iterator user_begin()
Definition: Value.h:371
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:545
LLVM Value Representation.
Definition: Value.h:73
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional &#39;br label X&#39; instruction.
Definition: IRBuilder.h:769
static const Function * getParent(const Value *V)
hexagon loop idiom
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1027
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:44
#define DEBUG(X)
Definition: Debug.h:118
static unsigned getStoreSizeInBytes(StoreInst *SI, const DataLayout *DL)
The legacy pass manager&#39;s analysis pass to compute loop information.
Definition: LoopInfo.h:932
bool hasOneUse() const
Return true if there is exactly one user of this value.
Definition: Value.h:408
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:49
static Expected< std::string > replace(StringRef S, StringRef From, StringRef To)
PassRegistry - This class manages the registration and intitialization of the pass subsystem as appli...
Definition: PassRegistry.h:39
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:267
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object...
bool isSimple() const
Definition: Instructions.h:387
const TerminatorInst * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:120
pgo instr use
loop simplify
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:156
Value * SimplifyInstruction(Instruction *I, const SimplifyQuery &Q, OptimizationRemarkEmitter *ORE=nullptr)
See if we can compute a simplified version of this instruction.
Value * CreateICmpSLE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1573
Value * getPointerOperand()
Definition: Instructions.h:398
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
IntegerType * Int32Ty
const BasicBlock * getParent() const
Definition: Instruction.h:66
This class represents a constant integer value.
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate > m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
Definition: PatternMatch.h:837
CallInst * CreateCall(Value *Callee, ArrayRef< Value *> Args=None, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1659
void setIDom(DomTreeNodeBase *NewIDom)
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:65
#define LLVM_ATTRIBUTE_USED
Definition: Compiler.h:117
user_iterator user_end()
Definition: Value.h:379