LLVM  14.0.0git
HexagonLoopIdiomRecognition.cpp
Go to the documentation of this file.
1 //===- HexagonLoopIdiomRecognition.cpp ------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
10 #include "llvm/ADT/APInt.h"
11 #include "llvm/ADT/DenseMap.h"
12 #include "llvm/ADT/SetVector.h"
13 #include "llvm/ADT/SmallPtrSet.h"
14 #include "llvm/ADT/SmallSet.h"
15 #include "llvm/ADT/SmallVector.h"
16 #include "llvm/ADT/StringRef.h"
17 #include "llvm/ADT/Triple.h"
21 #include "llvm/Analysis/LoopInfo.h"
22 #include "llvm/Analysis/LoopPass.h"
28 #include "llvm/IR/Attributes.h"
29 #include "llvm/IR/BasicBlock.h"
30 #include "llvm/IR/Constant.h"
31 #include "llvm/IR/Constants.h"
32 #include "llvm/IR/DataLayout.h"
33 #include "llvm/IR/DebugLoc.h"
34 #include "llvm/IR/DerivedTypes.h"
35 #include "llvm/IR/Dominators.h"
36 #include "llvm/IR/Function.h"
37 #include "llvm/IR/IRBuilder.h"
38 #include "llvm/IR/InstrTypes.h"
39 #include "llvm/IR/Instruction.h"
40 #include "llvm/IR/Instructions.h"
41 #include "llvm/IR/IntrinsicInst.h"
42 #include "llvm/IR/Intrinsics.h"
43 #include "llvm/IR/IntrinsicsHexagon.h"
44 #include "llvm/IR/Module.h"
45 #include "llvm/IR/PassManager.h"
46 #include "llvm/IR/PatternMatch.h"
47 #include "llvm/IR/Type.h"
48 #include "llvm/IR/User.h"
49 #include "llvm/IR/Value.h"
50 #include "llvm/InitializePasses.h"
51 #include "llvm/Pass.h"
52 #include "llvm/Support/Casting.h"
54 #include "llvm/Support/Compiler.h"
55 #include "llvm/Support/Debug.h"
57 #include "llvm/Support/KnownBits.h"
59 #include "llvm/Transforms/Scalar.h"
60 #include "llvm/Transforms/Utils.h"
63 #include <algorithm>
64 #include <array>
65 #include <cassert>
66 #include <cstdint>
67 #include <cstdlib>
68 #include <deque>
69 #include <functional>
70 #include <iterator>
71 #include <map>
72 #include <set>
73 #include <utility>
74 #include <vector>
75 
76 #define DEBUG_TYPE "hexagon-lir"
77 
78 using namespace llvm;
79 
80 static cl::opt<bool> DisableMemcpyIdiom("disable-memcpy-idiom",
81  cl::Hidden, cl::init(false),
82  cl::desc("Disable generation of memcpy in loop idiom recognition"));
83 
84 static cl::opt<bool> DisableMemmoveIdiom("disable-memmove-idiom",
85  cl::Hidden, cl::init(false),
86  cl::desc("Disable generation of memmove in loop idiom recognition"));
87 
88 static cl::opt<unsigned> RuntimeMemSizeThreshold("runtime-mem-idiom-threshold",
89  cl::Hidden, cl::init(0), cl::desc("Threshold (in bytes) for the runtime "
90  "check guarding the memmove."));
91 
93  "compile-time-mem-idiom-threshold", cl::Hidden, cl::init(64),
94  cl::desc("Threshold (in bytes) to perform the transformation, if the "
95  "runtime loop count (mem transfer size) is known at compile-time."));
96 
97 static cl::opt<bool> OnlyNonNestedMemmove("only-nonnested-memmove-idiom",
98  cl::Hidden, cl::init(true),
99  cl::desc("Only enable generating memmove in non-nested loops"));
100 
102  "disable-hexagon-volatile-memcpy", cl::Hidden, cl::init(false),
103  cl::desc("Enable Hexagon-specific memcpy for volatile destination."));
104 
105 static cl::opt<unsigned> SimplifyLimit("hlir-simplify-limit", cl::init(10000),
106  cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR"));
107 
108 static const char *HexagonVolatileMemcpyName
109  = "hexagon_memcpy_forward_vp4cp4n2";
110 
111 
112 namespace llvm {
113 
116 
117 } // end namespace llvm
118 
119 namespace {
120 
121 class HexagonLoopIdiomRecognize {
122 public:
123  explicit HexagonLoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
124  LoopInfo *LF, const TargetLibraryInfo *TLI,
125  ScalarEvolution *SE)
126  : AA(AA), DT(DT), LF(LF), TLI(TLI), SE(SE) {}
127 
128  bool run(Loop *L);
129 
130 private:
131  int getSCEVStride(const SCEVAddRecExpr *StoreEv);
132  bool isLegalStore(Loop *CurLoop, StoreInst *SI);
133  void collectStores(Loop *CurLoop, BasicBlock *BB,
135  bool processCopyingStore(Loop *CurLoop, StoreInst *SI, const SCEV *BECount);
136  bool coverLoop(Loop *L, SmallVectorImpl<Instruction *> &Insts) const;
137  bool runOnLoopBlock(Loop *CurLoop, BasicBlock *BB, const SCEV *BECount,
138  SmallVectorImpl<BasicBlock *> &ExitBlocks);
139  bool runOnCountableLoop(Loop *L);
140 
141  AliasAnalysis *AA;
142  const DataLayout *DL;
143  DominatorTree *DT;
144  LoopInfo *LF;
145  const TargetLibraryInfo *TLI;
146  ScalarEvolution *SE;
147  bool HasMemcpy, HasMemmove;
148 };
149 
150 class HexagonLoopIdiomRecognizeLegacyPass : public LoopPass {
151 public:
152  static char ID;
153 
154  explicit HexagonLoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
157  }
158 
159  StringRef getPassName() const override {
160  return "Recognize Hexagon-specific loop idioms";
161  }
162 
163  void getAnalysisUsage(AnalysisUsage &AU) const override {
172  }
173 
174  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
175 };
176 
177 struct Simplifier {
178  struct Rule {
179  using FuncType = std::function<Value *(Instruction *, LLVMContext &)>;
180  Rule(StringRef N, FuncType F) : Name(N), Fn(F) {}
181  StringRef Name; // For debugging.
182  FuncType Fn;
183  };
184 
185  void addRule(StringRef N, const Rule::FuncType &F) {
186  Rules.push_back(Rule(N, F));
187  }
188 
189 private:
190  struct WorkListType {
191  WorkListType() = default;
192 
193  void push_back(Value *V) {
194  // Do not push back duplicates.
195  if (!S.count(V)) {
196  Q.push_back(V);
197  S.insert(V);
198  }
199  }
200 
201  Value *pop_front_val() {
202  Value *V = Q.front();
203  Q.pop_front();
204  S.erase(V);
205  return V;
206  }
207 
208  bool empty() const { return Q.empty(); }
209 
210  private:
211  std::deque<Value *> Q;
212  std::set<Value *> S;
213  };
214 
215  using ValueSetType = std::set<Value *>;
216 
217  std::vector<Rule> Rules;
218 
219 public:
220  struct Context {
221  using ValueMapType = DenseMap<Value *, Value *>;
222 
223  Value *Root;
224  ValueSetType Used; // The set of all cloned values used by Root.
225  ValueSetType Clones; // The set of all cloned values.
226  LLVMContext &Ctx;
227 
228  Context(Instruction *Exp)
229  : Ctx(Exp->getParent()->getParent()->getContext()) {
230  initialize(Exp);
231  }
232 
233  ~Context() { cleanup(); }
234 
235  void print(raw_ostream &OS, const Value *V) const;
236  Value *materialize(BasicBlock *B, BasicBlock::iterator At);
237 
238  private:
239  friend struct Simplifier;
240 
241  void initialize(Instruction *Exp);
242  void cleanup();
243 
244  template <typename FuncT> void traverse(Value *V, FuncT F);
245  void record(Value *V);
246  void use(Value *V);
247  void unuse(Value *V);
248 
249  bool equal(const Instruction *I, const Instruction *J) const;
250  Value *find(Value *Tree, Value *Sub) const;
251  Value *subst(Value *Tree, Value *OldV, Value *NewV);
252  void replace(Value *OldV, Value *NewV);
254  };
255 
256  Value *simplify(Context &C);
257 };
258 
259  struct PE {
260  PE(const Simplifier::Context &c, Value *v = nullptr) : C(c), V(v) {}
261 
262  const Simplifier::Context &C;
263  const Value *V;
264  };
265 
267  raw_ostream &operator<<(raw_ostream &OS, const PE &P) {
268  P.C.print(OS, P.V ? P.V : P.C.Root);
269  return OS;
270  }
271 
272 } // end anonymous namespace
273 
275 
276 INITIALIZE_PASS_BEGIN(HexagonLoopIdiomRecognizeLegacyPass, "hexagon-loop-idiom",
277  "Recognize Hexagon-specific loop idioms", false, false)
279 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
280 INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
285 INITIALIZE_PASS_END(HexagonLoopIdiomRecognizeLegacyPass, "hexagon-loop-idiom",
286  "Recognize Hexagon-specific loop idioms", false, false)
287 
288 template <typename FuncT>
289 void Simplifier::Context::traverse(Value *V, FuncT F) {
290  WorkListType Q;
291  Q.push_back(V);
292 
293  while (!Q.empty()) {
294  Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
295  if (!U || U->getParent())
296  continue;
297  if (!F(U))
298  continue;
299  for (Value *Op : U->operands())
300  Q.push_back(Op);
301  }
302 }
303 
304 void Simplifier::Context::print(raw_ostream &OS, const Value *V) const {
305  const auto *U = dyn_cast<const Instruction>(V);
306  if (!U) {
307  OS << V << '(' << *V << ')';
308  return;
309  }
310 
311  if (U->getParent()) {
312  OS << U << '(';
313  U->printAsOperand(OS, true);
314  OS << ')';
315  return;
316  }
317 
318  unsigned N = U->getNumOperands();
319  if (N != 0)
320  OS << U << '(';
321  OS << U->getOpcodeName();
322  for (const Value *Op : U->operands()) {
323  OS << ' ';
324  print(OS, Op);
325  }
326  if (N != 0)
327  OS << ')';
328 }
329 
331  // Perform a deep clone of the expression, set Root to the root
332  // of the clone, and build a map from the cloned values to the
333  // original ones.
334  ValueMapType M;
335  BasicBlock *Block = Exp->getParent();
336  WorkListType Q;
337  Q.push_back(Exp);
338 
339  while (!Q.empty()) {
340  Value *V = Q.pop_front_val();
341  if (M.find(V) != M.end())
342  continue;
343  if (Instruction *U = dyn_cast<Instruction>(V)) {
344  if (isa<PHINode>(U) || U->getParent() != Block)
345  continue;
346  for (Value *Op : U->operands())
347  Q.push_back(Op);
348  M.insert({U, U->clone()});
349  }
350  }
351 
352  for (std::pair<Value*,Value*> P : M) {
353  Instruction *U = cast<Instruction>(P.second);
354  for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
355  auto F = M.find(U->getOperand(i));
356  if (F != M.end())
357  U->setOperand(i, F->second);
358  }
359  }
360 
361  auto R = M.find(Exp);
362  assert(R != M.end());
363  Root = R->second;
364 
365  record(Root);
366  use(Root);
367 }
368 
369 void Simplifier::Context::record(Value *V) {
370  auto Record = [this](Instruction *U) -> bool {
371  Clones.insert(U);
372  return true;
373  };
374  traverse(V, Record);
375 }
376 
378  auto Use = [this](Instruction *U) -> bool {
379  Used.insert(U);
380  return true;
381  };
382  traverse(V, Use);
383 }
384 
385 void Simplifier::Context::unuse(Value *V) {
386  if (!isa<Instruction>(V) || cast<Instruction>(V)->getParent() != nullptr)
387  return;
388 
389  auto Unuse = [this](Instruction *U) -> bool {
390  if (!U->use_empty())
391  return false;
392  Used.erase(U);
393  return true;
394  };
395  traverse(V, Unuse);
396 }
397 
398 Value *Simplifier::Context::subst(Value *Tree, Value *OldV, Value *NewV) {
399  if (Tree == OldV)
400  return NewV;
401  if (OldV == NewV)
402  return Tree;
403 
404  WorkListType Q;
405  Q.push_back(Tree);
406  while (!Q.empty()) {
407  Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
408  // If U is not an instruction, or it's not a clone, skip it.
409  if (!U || U->getParent())
410  continue;
411  for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
412  Value *Op = U->getOperand(i);
413  if (Op == OldV) {
414  U->setOperand(i, NewV);
415  unuse(OldV);
416  } else {
417  Q.push_back(Op);
418  }
419  }
420  }
421  return Tree;
422 }
423 
424 void Simplifier::Context::replace(Value *OldV, Value *NewV) {
425  if (Root == OldV) {
426  Root = NewV;
427  use(Root);
428  return;
429  }
430 
431  // NewV may be a complex tree that has just been created by one of the
432  // transformation rules. We need to make sure that it is commoned with
433  // the existing Root to the maximum extent possible.
434  // Identify all subtrees of NewV (including NewV itself) that have
435  // equivalent counterparts in Root, and replace those subtrees with
436  // these counterparts.
437  WorkListType Q;
438  Q.push_back(NewV);
439  while (!Q.empty()) {
440  Value *V = Q.pop_front_val();
441  Instruction *U = dyn_cast<Instruction>(V);
442  if (!U || U->getParent())
443  continue;
444  if (Value *DupV = find(Root, V)) {
445  if (DupV != V)
446  NewV = subst(NewV, V, DupV);
447  } else {
448  for (Value *Op : U->operands())
449  Q.push_back(Op);
450  }
451  }
452 
453  // Now, simply replace OldV with NewV in Root.
454  Root = subst(Root, OldV, NewV);
455  use(Root);
456 }
457 
459  for (Value *V : Clones) {
460  Instruction *U = cast<Instruction>(V);
461  if (!U->getParent())
462  U->dropAllReferences();
463  }
464 
465  for (Value *V : Clones) {
466  Instruction *U = cast<Instruction>(V);
467  if (!U->getParent())
468  U->deleteValue();
469  }
470 }
471 
473  const Instruction *J) const {
474  if (I == J)
475  return true;
476  if (!I->isSameOperationAs(J))
477  return false;
478  if (isa<PHINode>(I))
479  return I->isIdenticalTo(J);
480 
481  for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) {
482  Value *OpI = I->getOperand(i), *OpJ = J->getOperand(i);
483  if (OpI == OpJ)
484  continue;
485  auto *InI = dyn_cast<const Instruction>(OpI);
486  auto *InJ = dyn_cast<const Instruction>(OpJ);
487  if (InI && InJ) {
488  if (!equal(InI, InJ))
489  return false;
490  } else if (InI != InJ || !InI)
491  return false;
492  }
493  return true;
494 }
495 
496 Value *Simplifier::Context::find(Value *Tree, Value *Sub) const {
497  Instruction *SubI = dyn_cast<Instruction>(Sub);
498  WorkListType Q;
499  Q.push_back(Tree);
500 
501  while (!Q.empty()) {
502  Value *V = Q.pop_front_val();
503  if (V == Sub)
504  return V;
505  Instruction *U = dyn_cast<Instruction>(V);
506  if (!U || U->getParent())
507  continue;
508  if (SubI && equal(SubI, U))
509  return U;
510  assert(!isa<PHINode>(U));
511  for (Value *Op : U->operands())
512  Q.push_back(Op);
513  }
514  return nullptr;
515 }
516 
519  if (I->getParent())
520  return;
521 
522  for (Value *Op : I->operands()) {
523  if (Instruction *OpI = dyn_cast<Instruction>(Op))
524  link(OpI, B, At);
525  }
526 
527  B->getInstList().insert(At, I);
528 }
529 
530 Value *Simplifier::Context::materialize(BasicBlock *B,
532  if (Instruction *RootI = dyn_cast<Instruction>(Root))
533  link(RootI, B, At);
534  return Root;
535 }
536 
538  WorkListType Q;
539  Q.push_back(C.Root);
540  unsigned Count = 0;
541  const unsigned Limit = SimplifyLimit;
542 
543  while (!Q.empty()) {
544  if (Count++ >= Limit)
545  break;
546  Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
547  if (!U || U->getParent() || !C.Used.count(U))
548  continue;
549  bool Changed = false;
550  for (Rule &R : Rules) {
551  Value *W = R.Fn(U, C.Ctx);
552  if (!W)
553  continue;
554  Changed = true;
555  C.record(W);
556  C.replace(U, W);
557  Q.push_back(C.Root);
558  break;
559  }
560  if (!Changed) {
561  for (Value *Op : U->operands())
562  Q.push_back(Op);
563  }
564  }
565  return Count < Limit ? C.Root : nullptr;
566 }
567 
568 //===----------------------------------------------------------------------===//
569 //
570 // Implementation of PolynomialMultiplyRecognize
571 //
572 //===----------------------------------------------------------------------===//
573 
574 namespace {
575 
576  class PolynomialMultiplyRecognize {
577  public:
578  explicit PolynomialMultiplyRecognize(Loop *loop, const DataLayout &dl,
579  const DominatorTree &dt, const TargetLibraryInfo &tli,
580  ScalarEvolution &se)
581  : CurLoop(loop), DL(dl), DT(dt), TLI(tli), SE(se) {}
582 
583  bool recognize();
584 
585  private:
586  using ValueSeq = SetVector<Value *>;
587 
588  IntegerType *getPmpyType() const {
589  LLVMContext &Ctx = CurLoop->getHeader()->getParent()->getContext();
590  return IntegerType::get(Ctx, 32);
591  }
592 
593  bool isPromotableTo(Value *V, IntegerType *Ty);
594  void promoteTo(Instruction *In, IntegerType *DestTy, BasicBlock *LoopB);
595  bool promoteTypes(BasicBlock *LoopB, BasicBlock *ExitB);
596 
597  Value *getCountIV(BasicBlock *BB);
598  bool findCycle(Value *Out, Value *In, ValueSeq &Cycle);
599  void classifyCycle(Instruction *DivI, ValueSeq &Cycle, ValueSeq &Early,
600  ValueSeq &Late);
601  bool classifyInst(Instruction *UseI, ValueSeq &Early, ValueSeq &Late);
602  bool commutesWithShift(Instruction *I);
603  bool highBitsAreZero(Value *V, unsigned IterCount);
604  bool keepsHighBitsZero(Value *V, unsigned IterCount);
605  bool isOperandShifted(Instruction *I, Value *Op);
606  bool convertShiftsToLeft(BasicBlock *LoopB, BasicBlock *ExitB,
607  unsigned IterCount);
608  void cleanupLoopBody(BasicBlock *LoopB);
609 
610  struct ParsedValues {
611  ParsedValues() = default;
612 
613  Value *M = nullptr;
614  Value *P = nullptr;
615  Value *Q = nullptr;
616  Value *R = nullptr;
617  Value *X = nullptr;
618  Instruction *Res = nullptr;
619  unsigned IterCount = 0;
620  bool Left = false;
621  bool Inv = false;
622  };
623 
624  bool matchLeftShift(SelectInst *SelI, Value *CIV, ParsedValues &PV);
625  bool matchRightShift(SelectInst *SelI, ParsedValues &PV);
626  bool scanSelect(SelectInst *SI, BasicBlock *LoopB, BasicBlock *PrehB,
627  Value *CIV, ParsedValues &PV, bool PreScan);
628  unsigned getInverseMxN(unsigned QP);
629  Value *generate(BasicBlock::iterator At, ParsedValues &PV);
630 
631  void setupPreSimplifier(Simplifier &S);
632  void setupPostSimplifier(Simplifier &S);
633 
634  Loop *CurLoop;
635  const DataLayout &DL;
636  const DominatorTree &DT;
637  const TargetLibraryInfo &TLI;
638  ScalarEvolution &SE;
639  };
640 
641 } // end anonymous namespace
642 
643 Value *PolynomialMultiplyRecognize::getCountIV(BasicBlock *BB) {
644  pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
645  if (std::distance(PI, PE) != 2)
646  return nullptr;
647  BasicBlock *PB = (*PI == BB) ? *std::next(PI) : *PI;
648 
649  for (auto I = BB->begin(), E = BB->end(); I != E && isa<PHINode>(I); ++I) {
650  auto *PN = cast<PHINode>(I);
651  Value *InitV = PN->getIncomingValueForBlock(PB);
652  if (!isa<ConstantInt>(InitV) || !cast<ConstantInt>(InitV)->isZero())
653  continue;
654  Value *IterV = PN->getIncomingValueForBlock(BB);
655  auto *BO = dyn_cast<BinaryOperator>(IterV);
656  if (!BO)
657  continue;
658  if (BO->getOpcode() != Instruction::Add)
659  continue;
660  Value *IncV = nullptr;
661  if (BO->getOperand(0) == PN)
662  IncV = BO->getOperand(1);
663  else if (BO->getOperand(1) == PN)
664  IncV = BO->getOperand(0);
665  if (IncV == nullptr)
666  continue;
667 
668  if (auto *T = dyn_cast<ConstantInt>(IncV))
669  if (T->getZExtValue() == 1)
670  return PN;
671  }
672  return nullptr;
673 }
674 
676  for (auto UI = I->user_begin(), UE = I->user_end(); UI != UE;) {
677  Use &TheUse = UI.getUse();
678  ++UI;
679  if (auto *II = dyn_cast<Instruction>(TheUse.getUser()))
680  if (BB == II->getParent())
681  II->replaceUsesOfWith(I, J);
682  }
683 }
684 
685 bool PolynomialMultiplyRecognize::matchLeftShift(SelectInst *SelI,
686  Value *CIV, ParsedValues &PV) {
687  // Match the following:
688  // select (X & (1 << i)) != 0 ? R ^ (Q << i) : R
689  // select (X & (1 << i)) == 0 ? R : R ^ (Q << i)
690  // The condition may also check for equality with the masked value, i.e
691  // select (X & (1 << i)) == (1 << i) ? R ^ (Q << i) : R
692  // select (X & (1 << i)) != (1 << i) ? R : R ^ (Q << i);
693 
694  Value *CondV = SelI->getCondition();
695  Value *TrueV = SelI->getTrueValue();
696  Value *FalseV = SelI->getFalseValue();
697 
698  using namespace PatternMatch;
699 
701  Value *A = nullptr, *B = nullptr, *C = nullptr;
702 
703  if (!match(CondV, m_ICmp(P, m_And(m_Value(A), m_Value(B)), m_Value(C))) &&
704  !match(CondV, m_ICmp(P, m_Value(C), m_And(m_Value(A), m_Value(B)))))
705  return false;
706  if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
707  return false;
708  // Matched: select (A & B) == C ? ... : ...
709  // select (A & B) != C ? ... : ...
710 
711  Value *X = nullptr, *Sh1 = nullptr;
712  // Check (A & B) for (X & (1 << i)):
713  if (match(A, m_Shl(m_One(), m_Specific(CIV)))) {
714  Sh1 = A;
715  X = B;
716  } else if (match(B, m_Shl(m_One(), m_Specific(CIV)))) {
717  Sh1 = B;
718  X = A;
719  } else {
720  // TODO: Could also check for an induction variable containing single
721  // bit shifted left by 1 in each iteration.
722  return false;
723  }
724 
725  bool TrueIfZero;
726 
727  // Check C against the possible values for comparison: 0 and (1 << i):
728  if (match(C, m_Zero()))
729  TrueIfZero = (P == CmpInst::ICMP_EQ);
730  else if (C == Sh1)
731  TrueIfZero = (P == CmpInst::ICMP_NE);
732  else
733  return false;
734 
735  // So far, matched:
736  // select (X & (1 << i)) ? ... : ...
737  // including variations of the check against zero/non-zero value.
738 
739  Value *ShouldSameV = nullptr, *ShouldXoredV = nullptr;
740  if (TrueIfZero) {
741  ShouldSameV = TrueV;
742  ShouldXoredV = FalseV;
743  } else {
744  ShouldSameV = FalseV;
745  ShouldXoredV = TrueV;
746  }
747 
748  Value *Q = nullptr, *R = nullptr, *Y = nullptr, *Z = nullptr;
749  Value *T = nullptr;
750  if (match(ShouldXoredV, m_Xor(m_Value(Y), m_Value(Z)))) {
751  // Matched: select +++ ? ... : Y ^ Z
752  // select +++ ? Y ^ Z : ...
753  // where +++ denotes previously checked matches.
754  if (ShouldSameV == Y)
755  T = Z;
756  else if (ShouldSameV == Z)
757  T = Y;
758  else
759  return false;
760  R = ShouldSameV;
761  // Matched: select +++ ? R : R ^ T
762  // select +++ ? R ^ T : R
763  // depending on TrueIfZero.
764 
765  } else if (match(ShouldSameV, m_Zero())) {
766  // Matched: select +++ ? 0 : ...
767  // select +++ ? ... : 0
768  if (!SelI->hasOneUse())
769  return false;
770  T = ShouldXoredV;
771  // Matched: select +++ ? 0 : T
772  // select +++ ? T : 0
773 
774  Value *U = *SelI->user_begin();
775  if (!match(U, m_Xor(m_Specific(SelI), m_Value(R))) &&
776  !match(U, m_Xor(m_Value(R), m_Specific(SelI))))
777  return false;
778  // Matched: xor (select +++ ? 0 : T), R
779  // xor (select +++ ? T : 0), R
780  } else
781  return false;
782 
783  // The xor input value T is isolated into its own match so that it could
784  // be checked against an induction variable containing a shifted bit
785  // (todo).
786  // For now, check against (Q << i).
787  if (!match(T, m_Shl(m_Value(Q), m_Specific(CIV))) &&
788  !match(T, m_Shl(m_ZExt(m_Value(Q)), m_ZExt(m_Specific(CIV)))))
789  return false;
790  // Matched: select +++ ? R : R ^ (Q << i)
791  // select +++ ? R ^ (Q << i) : R
792 
793  PV.X = X;
794  PV.Q = Q;
795  PV.R = R;
796  PV.Left = true;
797  return true;
798 }
799 
800 bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI,
801  ParsedValues &PV) {
802  // Match the following:
803  // select (X & 1) != 0 ? (R >> 1) ^ Q : (R >> 1)
804  // select (X & 1) == 0 ? (R >> 1) : (R >> 1) ^ Q
805  // The condition may also check for equality with the masked value, i.e
806  // select (X & 1) == 1 ? (R >> 1) ^ Q : (R >> 1)
807  // select (X & 1) != 1 ? (R >> 1) : (R >> 1) ^ Q
808 
809  Value *CondV = SelI->getCondition();
810  Value *TrueV = SelI->getTrueValue();
811  Value *FalseV = SelI->getFalseValue();
812 
813  using namespace PatternMatch;
814 
815  Value *C = nullptr;
817  bool TrueIfZero;
818 
819  if (match(CondV, m_ICmp(P, m_Value(C), m_Zero())) ||
820  match(CondV, m_ICmp(P, m_Zero(), m_Value(C)))) {
821  if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
822  return false;
823  // Matched: select C == 0 ? ... : ...
824  // select C != 0 ? ... : ...
825  TrueIfZero = (P == CmpInst::ICMP_EQ);
826  } else if (match(CondV, m_ICmp(P, m_Value(C), m_One())) ||
827  match(CondV, m_ICmp(P, m_One(), m_Value(C)))) {
828  if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
829  return false;
830  // Matched: select C == 1 ? ... : ...
831  // select C != 1 ? ... : ...
832  TrueIfZero = (P == CmpInst::ICMP_NE);
833  } else
834  return false;
835 
836  Value *X = nullptr;
837  if (!match(C, m_And(m_Value(X), m_One())) &&
838  !match(C, m_And(m_One(), m_Value(X))))
839  return false;
840  // Matched: select (X & 1) == +++ ? ... : ...
841  // select (X & 1) != +++ ? ... : ...
842 
843  Value *R = nullptr, *Q = nullptr;
844  if (TrueIfZero) {
845  // The select's condition is true if the tested bit is 0.
846  // TrueV must be the shift, FalseV must be the xor.
847  if (!match(TrueV, m_LShr(m_Value(R), m_One())))
848  return false;
849  // Matched: select +++ ? (R >> 1) : ...
850  if (!match(FalseV, m_Xor(m_Specific(TrueV), m_Value(Q))) &&
851  !match(FalseV, m_Xor(m_Value(Q), m_Specific(TrueV))))
852  return false;
853  // Matched: select +++ ? (R >> 1) : (R >> 1) ^ Q
854  // with commuting ^.
855  } else {
856  // The select's condition is true if the tested bit is 1.
857  // TrueV must be the xor, FalseV must be the shift.
858  if (!match(FalseV, m_LShr(m_Value(R), m_One())))
859  return false;
860  // Matched: select +++ ? ... : (R >> 1)
861  if (!match(TrueV, m_Xor(m_Specific(FalseV), m_Value(Q))) &&
862  !match(TrueV, m_Xor(m_Value(Q), m_Specific(FalseV))))
863  return false;
864  // Matched: select +++ ? (R >> 1) ^ Q : (R >> 1)
865  // with commuting ^.
866  }
867 
868  PV.X = X;
869  PV.Q = Q;
870  PV.R = R;
871  PV.Left = false;
872  return true;
873 }
874 
875 bool PolynomialMultiplyRecognize::scanSelect(SelectInst *SelI,
876  BasicBlock *LoopB, BasicBlock *PrehB, Value *CIV, ParsedValues &PV,
877  bool PreScan) {
878  using namespace PatternMatch;
879 
880  // The basic pattern for R = P.Q is:
881  // for i = 0..31
882  // R = phi (0, R')
883  // if (P & (1 << i)) ; test-bit(P, i)
884  // R' = R ^ (Q << i)
885  //
886  // Similarly, the basic pattern for R = (P/Q).Q - P
887  // for i = 0..31
888  // R = phi(P, R')
889  // if (R & (1 << i))
890  // R' = R ^ (Q << i)
891 
892  // There exist idioms, where instead of Q being shifted left, P is shifted
893  // right. This produces a result that is shifted right by 32 bits (the
894  // non-shifted result is 64-bit).
895  //
896  // For R = P.Q, this would be:
897  // for i = 0..31
898  // R = phi (0, R')
899  // if ((P >> i) & 1)
900  // R' = (R >> 1) ^ Q ; R is cycled through the loop, so it must
901  // else ; be shifted by 1, not i.
902  // R' = R >> 1
903  //
904  // And for the inverse:
905  // for i = 0..31
906  // R = phi (P, R')
907  // if (R & 1)
908  // R' = (R >> 1) ^ Q
909  // else
910  // R' = R >> 1
911 
912  // The left-shifting idioms share the same pattern:
913  // select (X & (1 << i)) ? R ^ (Q << i) : R
914  // Similarly for right-shifting idioms:
915  // select (X & 1) ? (R >> 1) ^ Q
916 
917  if (matchLeftShift(SelI, CIV, PV)) {
918  // If this is a pre-scan, getting this far is sufficient.
919  if (PreScan)
920  return true;
921 
922  // Need to make sure that the SelI goes back into R.
923  auto *RPhi = dyn_cast<PHINode>(PV.R);
924  if (!RPhi)
925  return false;
926  if (SelI != RPhi->getIncomingValueForBlock(LoopB))
927  return false;
928  PV.Res = SelI;
929 
930  // If X is loop invariant, it must be the input polynomial, and the
931  // idiom is the basic polynomial multiply.
932  if (CurLoop->isLoopInvariant(PV.X)) {
933  PV.P = PV.X;
934  PV.Inv = false;
935  } else {
936  // X is not loop invariant. If X == R, this is the inverse pmpy.
937  // Otherwise, check for an xor with an invariant value. If the
938  // variable argument to the xor is R, then this is still a valid
939  // inverse pmpy.
940  PV.Inv = true;
941  if (PV.X != PV.R) {
942  Value *Var = nullptr, *Inv = nullptr, *X1 = nullptr, *X2 = nullptr;
943  if (!match(PV.X, m_Xor(m_Value(X1), m_Value(X2))))
944  return false;
945  auto *I1 = dyn_cast<Instruction>(X1);
946  auto *I2 = dyn_cast<Instruction>(X2);
947  if (!I1 || I1->getParent() != LoopB) {
948  Var = X2;
949  Inv = X1;
950  } else if (!I2 || I2->getParent() != LoopB) {
951  Var = X1;
952  Inv = X2;
953  } else
954  return false;
955  if (Var != PV.R)
956  return false;
957  PV.M = Inv;
958  }
959  // The input polynomial P still needs to be determined. It will be
960  // the entry value of R.
961  Value *EntryP = RPhi->getIncomingValueForBlock(PrehB);
962  PV.P = EntryP;
963  }
964 
965  return true;
966  }
967 
968  if (matchRightShift(SelI, PV)) {
969  // If this is an inverse pattern, the Q polynomial must be known at
970  // compile time.
971  if (PV.Inv && !isa<ConstantInt>(PV.Q))
972  return false;
973  if (PreScan)
974  return true;
975  // There is no exact matching of right-shift pmpy.
976  return false;
977  }
978 
979  return false;
980 }
981 
982 bool PolynomialMultiplyRecognize::isPromotableTo(Value *Val,
983  IntegerType *DestTy) {
984  IntegerType *T = dyn_cast<IntegerType>(Val->getType());
985  if (!T || T->getBitWidth() > DestTy->getBitWidth())
986  return false;
987  if (T->getBitWidth() == DestTy->getBitWidth())
988  return true;
989  // Non-instructions are promotable. The reason why an instruction may not
990  // be promotable is that it may produce a different result if its operands
991  // and the result are promoted, for example, it may produce more non-zero
992  // bits. While it would still be possible to represent the proper result
993  // in a wider type, it may require adding additional instructions (which
994  // we don't want to do).
995  Instruction *In = dyn_cast<Instruction>(Val);
996  if (!In)
997  return true;
998  // The bitwidth of the source type is smaller than the destination.
999  // Check if the individual operation can be promoted.
1000  switch (In->getOpcode()) {
1001  case Instruction::PHI:
1002  case Instruction::ZExt:
1003  case Instruction::And:
1004  case Instruction::Or:
1005  case Instruction::Xor:
1006  case Instruction::LShr: // Shift right is ok.
1007  case Instruction::Select:
1008  case Instruction::Trunc:
1009  return true;
1010  case Instruction::ICmp:
1011  if (CmpInst *CI = cast<CmpInst>(In))
1012  return CI->isEquality() || CI->isUnsigned();
1013  llvm_unreachable("Cast failed unexpectedly");
1014  case Instruction::Add:
1015  return In->hasNoSignedWrap() && In->hasNoUnsignedWrap();
1016  }
1017  return false;
1018 }
1019 
1020 void PolynomialMultiplyRecognize::promoteTo(Instruction *In,
1021  IntegerType *DestTy, BasicBlock *LoopB) {
1022  Type *OrigTy = In->getType();
1023  assert(!OrigTy->isVoidTy() && "Invalid instruction to promote");
1024 
1025  // Leave boolean values alone.
1026  if (!In->getType()->isIntegerTy(1))
1027  In->mutateType(DestTy);
1028  unsigned DestBW = DestTy->getBitWidth();
1029 
1030  // Handle PHIs.
1031  if (PHINode *P = dyn_cast<PHINode>(In)) {
1032  unsigned N = P->getNumIncomingValues();
1033  for (unsigned i = 0; i != N; ++i) {
1034  BasicBlock *InB = P->getIncomingBlock(i);
1035  if (InB == LoopB)
1036  continue;
1037  Value *InV = P->getIncomingValue(i);
1038  IntegerType *Ty = cast<IntegerType>(InV->getType());
1039  // Do not promote values in PHI nodes of type i1.
1040  if (Ty != P->getType()) {
1041  // If the value type does not match the PHI type, the PHI type
1042  // must have been promoted.
1043  assert(Ty->getBitWidth() < DestBW);
1044  InV = IRBuilder<>(InB->getTerminator()).CreateZExt(InV, DestTy);
1045  P->setIncomingValue(i, InV);
1046  }
1047  }
1048  } else if (ZExtInst *Z = dyn_cast<ZExtInst>(In)) {
1049  Value *Op = Z->getOperand(0);
1050  if (Op->getType() == Z->getType())
1051  Z->replaceAllUsesWith(Op);
1052  Z->eraseFromParent();
1053  return;
1054  }
1055  if (TruncInst *T = dyn_cast<TruncInst>(In)) {
1056  IntegerType *TruncTy = cast<IntegerType>(OrigTy);
1057  Value *Mask = ConstantInt::get(DestTy, (1u << TruncTy->getBitWidth()) - 1);
1058  Value *And = IRBuilder<>(In).CreateAnd(T->getOperand(0), Mask);
1059  T->replaceAllUsesWith(And);
1060  T->eraseFromParent();
1061  return;
1062  }
1063 
1064  // Promote immediates.
1065  for (unsigned i = 0, n = In->getNumOperands(); i != n; ++i) {
1066  if (ConstantInt *CI = dyn_cast<ConstantInt>(In->getOperand(i)))
1067  if (CI->getType()->getBitWidth() < DestBW)
1068  In->setOperand(i, ConstantInt::get(DestTy, CI->getZExtValue()));
1069  }
1070 }
1071 
1072 bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock *LoopB,
1073  BasicBlock *ExitB) {
1074  assert(LoopB);
1075  // Skip loops where the exit block has more than one predecessor. The values
1076  // coming from the loop block will be promoted to another type, and so the
1077  // values coming into the exit block from other predecessors would also have
1078  // to be promoted.
1079  if (!ExitB || (ExitB->getSinglePredecessor() != LoopB))
1080  return false;
1081  IntegerType *DestTy = getPmpyType();
1082  // Check if the exit values have types that are no wider than the type
1083  // that we want to promote to.
1084  unsigned DestBW = DestTy->getBitWidth();
1085  for (PHINode &P : ExitB->phis()) {
1086  if (P.getNumIncomingValues() != 1)
1087  return false;
1088  assert(P.getIncomingBlock(0) == LoopB);
1089  IntegerType *T = dyn_cast<IntegerType>(P.getType());
1090  if (!T || T->getBitWidth() > DestBW)
1091  return false;
1092  }
1093 
1094  // Check all instructions in the loop.
1095  for (Instruction &In : *LoopB)
1096  if (!In.isTerminator() && !isPromotableTo(&In, DestTy))
1097  return false;
1098 
1099  // Perform the promotion.
1100  std::vector<Instruction*> LoopIns;
1101  std::transform(LoopB->begin(), LoopB->end(), std::back_inserter(LoopIns),
1102  [](Instruction &In) { return &In; });
1103  for (Instruction *In : LoopIns)
1104  if (!In->isTerminator())
1105  promoteTo(In, DestTy, LoopB);
1106 
1107  // Fix up the PHI nodes in the exit block.
1108  Instruction *EndI = ExitB->getFirstNonPHI();
1109  BasicBlock::iterator End = EndI ? EndI->getIterator() : ExitB->end();
1110  for (auto I = ExitB->begin(); I != End; ++I) {
1111  PHINode *P = dyn_cast<PHINode>(I);
1112  if (!P)
1113  break;
1114  Type *Ty0 = P->getIncomingValue(0)->getType();
1115  Type *PTy = P->getType();
1116  if (PTy != Ty0) {
1117  assert(Ty0 == DestTy);
1118  // In order to create the trunc, P must have the promoted type.
1119  P->mutateType(Ty0);
1120  Value *T = IRBuilder<>(ExitB, End).CreateTrunc(P, PTy);
1121  // In order for the RAUW to work, the types of P and T must match.
1122  P->mutateType(PTy);
1123  P->replaceAllUsesWith(T);
1124  // Final update of the P's type.
1125  P->mutateType(Ty0);
1126  cast<Instruction>(T)->setOperand(0, P);
1127  }
1128  }
1129 
1130  return true;
1131 }
1132 
1133 bool PolynomialMultiplyRecognize::findCycle(Value *Out, Value *In,
1134  ValueSeq &Cycle) {
1135  // Out = ..., In, ...
1136  if (Out == In)
1137  return true;
1138 
1139  auto *BB = cast<Instruction>(Out)->getParent();
1140  bool HadPhi = false;
1141 
1142  for (auto U : Out->users()) {
1143  auto *I = dyn_cast<Instruction>(&*U);
1144  if (I == nullptr || I->getParent() != BB)
1145  continue;
1146  // Make sure that there are no multi-iteration cycles, e.g.
1147  // p1 = phi(p2)
1148  // p2 = phi(p1)
1149  // The cycle p1->p2->p1 would span two loop iterations.
1150  // Check that there is only one phi in the cycle.
1151  bool IsPhi = isa<PHINode>(I);
1152  if (IsPhi && HadPhi)
1153  return false;
1154  HadPhi |= IsPhi;
1155  if (Cycle.count(I))
1156  return false;
1157  Cycle.insert(I);
1158  if (findCycle(I, In, Cycle))
1159  break;
1160  Cycle.remove(I);
1161  }
1162  return !Cycle.empty();
1163 }
1164 
1165 void PolynomialMultiplyRecognize::classifyCycle(Instruction *DivI,
1166  ValueSeq &Cycle, ValueSeq &Early, ValueSeq &Late) {
1167  // All the values in the cycle that are between the phi node and the
1168  // divider instruction will be classified as "early", all other values
1169  // will be "late".
1170 
1171  bool IsE = true;
1172  unsigned I, N = Cycle.size();
1173  for (I = 0; I < N; ++I) {
1174  Value *V = Cycle[I];
1175  if (DivI == V)
1176  IsE = false;
1177  else if (!isa<PHINode>(V))
1178  continue;
1179  // Stop if found either.
1180  break;
1181  }
1182  // "I" is the index of either DivI or the phi node, whichever was first.
1183  // "E" is "false" or "true" respectively.
1184  ValueSeq &First = !IsE ? Early : Late;
1185  for (unsigned J = 0; J < I; ++J)
1186  First.insert(Cycle[J]);
1187 
1188  ValueSeq &Second = IsE ? Early : Late;
1189  Second.insert(Cycle[I]);
1190  for (++I; I < N; ++I) {
1191  Value *V = Cycle[I];
1192  if (DivI == V || isa<PHINode>(V))
1193  break;
1194  Second.insert(V);
1195  }
1196 
1197  for (; I < N; ++I)
1198  First.insert(Cycle[I]);
1199 }
1200 
1201 bool PolynomialMultiplyRecognize::classifyInst(Instruction *UseI,
1202  ValueSeq &Early, ValueSeq &Late) {
1203  // Select is an exception, since the condition value does not have to be
1204  // classified in the same way as the true/false values. The true/false
1205  // values do have to be both early or both late.
1206  if (UseI->getOpcode() == Instruction::Select) {
1207  Value *TV = UseI->getOperand(1), *FV = UseI->getOperand(2);
1208  if (Early.count(TV) || Early.count(FV)) {
1209  if (Late.count(TV) || Late.count(FV))
1210  return false;
1211  Early.insert(UseI);
1212  } else if (Late.count(TV) || Late.count(FV)) {
1213  if (Early.count(TV) || Early.count(FV))
1214  return false;
1215  Late.insert(UseI);
1216  }
1217  return true;
1218  }
1219 
1220  // Not sure what would be the example of this, but the code below relies
1221  // on having at least one operand.
1222  if (UseI->getNumOperands() == 0)
1223  return true;
1224 
1225  bool AE = true, AL = true;
1226  for (auto &I : UseI->operands()) {
1227  if (Early.count(&*I))
1228  AL = false;
1229  else if (Late.count(&*I))
1230  AE = false;
1231  }
1232  // If the operands appear "all early" and "all late" at the same time,
1233  // then it means that none of them are actually classified as either.
1234  // This is harmless.
1235  if (AE && AL)
1236  return true;
1237  // Conversely, if they are neither "all early" nor "all late", then
1238  // we have a mixture of early and late operands that is not a known
1239  // exception.
1240  if (!AE && !AL)
1241  return false;
1242 
1243  // Check that we have covered the two special cases.
1244  assert(AE != AL);
1245 
1246  if (AE)
1247  Early.insert(UseI);
1248  else
1249  Late.insert(UseI);
1250  return true;
1251 }
1252 
1253 bool PolynomialMultiplyRecognize::commutesWithShift(Instruction *I) {
1254  switch (I->getOpcode()) {
1255  case Instruction::And:
1256  case Instruction::Or:
1257  case Instruction::Xor:
1258  case Instruction::LShr:
1259  case Instruction::Shl:
1260  case Instruction::Select:
1261  case Instruction::ICmp:
1262  case Instruction::PHI:
1263  break;
1264  default:
1265  return false;
1266  }
1267  return true;
1268 }
1269 
1270 bool PolynomialMultiplyRecognize::highBitsAreZero(Value *V,
1271  unsigned IterCount) {
1272  auto *T = dyn_cast<IntegerType>(V->getType());
1273  if (!T)
1274  return false;
1275 
1276  KnownBits Known(T->getBitWidth());
1277  computeKnownBits(V, Known, DL);
1278  return Known.countMinLeadingZeros() >= IterCount;
1279 }
1280 
1281 bool PolynomialMultiplyRecognize::keepsHighBitsZero(Value *V,
1282  unsigned IterCount) {
1283  // Assume that all inputs to the value have the high bits zero.
1284  // Check if the value itself preserves the zeros in the high bits.
1285  if (auto *C = dyn_cast<ConstantInt>(V))
1286  return C->getValue().countLeadingZeros() >= IterCount;
1287 
1288  if (auto *I = dyn_cast<Instruction>(V)) {
1289  switch (I->getOpcode()) {
1290  case Instruction::And:
1291  case Instruction::Or:
1292  case Instruction::Xor:
1293  case Instruction::LShr:
1294  case Instruction::Select:
1295  case Instruction::ICmp:
1296  case Instruction::PHI:
1297  case Instruction::ZExt:
1298  return true;
1299  }
1300  }
1301 
1302  return false;
1303 }
1304 
1305 bool PolynomialMultiplyRecognize::isOperandShifted(Instruction *I, Value *Op) {
1306  unsigned Opc = I->getOpcode();
1307  if (Opc == Instruction::Shl || Opc == Instruction::LShr)
1308  return Op != I->getOperand(1);
1309  return true;
1310 }
1311 
1312 bool PolynomialMultiplyRecognize::convertShiftsToLeft(BasicBlock *LoopB,
1313  BasicBlock *ExitB, unsigned IterCount) {
1314  Value *CIV = getCountIV(LoopB);
1315  if (CIV == nullptr)
1316  return false;
1317  auto *CIVTy = dyn_cast<IntegerType>(CIV->getType());
1318  if (CIVTy == nullptr)
1319  return false;
1320 
1321  ValueSeq RShifts;
1322  ValueSeq Early, Late, Cycled;
1323 
1324  // Find all value cycles that contain logical right shifts by 1.
1325  for (Instruction &I : *LoopB) {
1326  using namespace PatternMatch;
1327 
1328  Value *V = nullptr;
1329  if (!match(&I, m_LShr(m_Value(V), m_One())))
1330  continue;
1331  ValueSeq C;
1332  if (!findCycle(&I, V, C))
1333  continue;
1334 
1335  // Found a cycle.
1336  C.insert(&I);
1337  classifyCycle(&I, C, Early, Late);
1338  Cycled.insert(C.begin(), C.end());
1339  RShifts.insert(&I);
1340  }
1341 
1342  // Find the set of all values affected by the shift cycles, i.e. all
1343  // cycled values, and (recursively) all their users.
1344  ValueSeq Users(Cycled.begin(), Cycled.end());
1345  for (unsigned i = 0; i < Users.size(); ++i) {
1346  Value *V = Users[i];
1347  if (!isa<IntegerType>(V->getType()))
1348  return false;
1349  auto *R = cast<Instruction>(V);
1350  // If the instruction does not commute with shifts, the loop cannot
1351  // be unshifted.
1352  if (!commutesWithShift(R))
1353  return false;
1354  for (User *U : R->users()) {
1355  auto *T = cast<Instruction>(U);
1356  // Skip users from outside of the loop. They will be handled later.
1357  // Also, skip the right-shifts and phi nodes, since they mix early
1358  // and late values.
1359  if (T->getParent() != LoopB || RShifts.count(T) || isa<PHINode>(T))
1360  continue;
1361 
1362  Users.insert(T);
1363  if (!classifyInst(T, Early, Late))
1364  return false;
1365  }
1366  }
1367 
1368  if (Users.empty())
1369  return false;
1370 
1371  // Verify that high bits remain zero.
1372  ValueSeq Internal(Users.begin(), Users.end());
1373  ValueSeq Inputs;
1374  for (unsigned i = 0; i < Internal.size(); ++i) {
1375  auto *R = dyn_cast<Instruction>(Internal[i]);
1376  if (!R)
1377  continue;
1378  for (Value *Op : R->operands()) {
1379  auto *T = dyn_cast<Instruction>(Op);
1380  if (T && T->getParent() != LoopB)
1381  Inputs.insert(Op);
1382  else
1383  Internal.insert(Op);
1384  }
1385  }
1386  for (Value *V : Inputs)
1387  if (!highBitsAreZero(V, IterCount))
1388  return false;
1389  for (Value *V : Internal)
1390  if (!keepsHighBitsZero(V, IterCount))
1391  return false;
1392 
1393  // Finally, the work can be done. Unshift each user.
1394  IRBuilder<> IRB(LoopB);
1395  std::map<Value*,Value*> ShiftMap;
1396 
1397  using CastMapType = std::map<std::pair<Value *, Type *>, Value *>;
1398 
1399  CastMapType CastMap;
1400 
1401  auto upcast = [] (CastMapType &CM, IRBuilder<> &IRB, Value *V,
1402  IntegerType *Ty) -> Value* {
1403  auto H = CM.find(std::make_pair(V, Ty));
1404  if (H != CM.end())
1405  return H->second;
1406  Value *CV = IRB.CreateIntCast(V, Ty, false);
1407  CM.insert(std::make_pair(std::make_pair(V, Ty), CV));
1408  return CV;
1409  };
1410 
1411  for (auto I = LoopB->begin(), E = LoopB->end(); I != E; ++I) {
1412  using namespace PatternMatch;
1413 
1414  if (isa<PHINode>(I) || !Users.count(&*I))
1415  continue;
1416 
1417  // Match lshr x, 1.
1418  Value *V = nullptr;
1419  if (match(&*I, m_LShr(m_Value(V), m_One()))) {
1420  replaceAllUsesOfWithIn(&*I, V, LoopB);
1421  continue;
1422  }
1423  // For each non-cycled operand, replace it with the corresponding
1424  // value shifted left.
1425  for (auto &J : I->operands()) {
1426  Value *Op = J.get();
1427  if (!isOperandShifted(&*I, Op))
1428  continue;
1429  if (Users.count(Op))
1430  continue;
1431  // Skip shifting zeros.
1432  if (isa<ConstantInt>(Op) && cast<ConstantInt>(Op)->isZero())
1433  continue;
1434  // Check if we have already generated a shift for this value.
1435  auto F = ShiftMap.find(Op);
1436  Value *W = (F != ShiftMap.end()) ? F->second : nullptr;
1437  if (W == nullptr) {
1438  IRB.SetInsertPoint(&*I);
1439  // First, the shift amount will be CIV or CIV+1, depending on
1440  // whether the value is early or late. Instead of creating CIV+1,
1441  // do a single shift of the value.
1442  Value *ShAmt = CIV, *ShVal = Op;
1443  auto *VTy = cast<IntegerType>(ShVal->getType());
1444  auto *ATy = cast<IntegerType>(ShAmt->getType());
1445  if (Late.count(&*I))
1446  ShVal = IRB.CreateShl(Op, ConstantInt::get(VTy, 1));
1447  // Second, the types of the shifted value and the shift amount
1448  // must match.
1449  if (VTy != ATy) {
1450  if (VTy->getBitWidth() < ATy->getBitWidth())
1451  ShVal = upcast(CastMap, IRB, ShVal, ATy);
1452  else
1453  ShAmt = upcast(CastMap, IRB, ShAmt, VTy);
1454  }
1455  // Ready to generate the shift and memoize it.
1456  W = IRB.CreateShl(ShVal, ShAmt);
1457  ShiftMap.insert(std::make_pair(Op, W));
1458  }
1459  I->replaceUsesOfWith(Op, W);
1460  }
1461  }
1462 
1463  // Update the users outside of the loop to account for having left
1464  // shifts. They would normally be shifted right in the loop, so shift
1465  // them right after the loop exit.
1466  // Take advantage of the loop-closed SSA form, which has all the post-
1467  // loop values in phi nodes.
1468  IRB.SetInsertPoint(ExitB, ExitB->getFirstInsertionPt());
1469  for (auto P = ExitB->begin(), Q = ExitB->end(); P != Q; ++P) {
1470  if (!isa<PHINode>(P))
1471  break;
1472  auto *PN = cast<PHINode>(P);
1473  Value *U = PN->getIncomingValueForBlock(LoopB);
1474  if (!Users.count(U))
1475  continue;
1476  Value *S = IRB.CreateLShr(PN, ConstantInt::get(PN->getType(), IterCount));
1477  PN->replaceAllUsesWith(S);
1478  // The above RAUW will create
1479  // S = lshr S, IterCount
1480  // so we need to fix it back into
1481  // S = lshr PN, IterCount
1482  cast<User>(S)->replaceUsesOfWith(S, PN);
1483  }
1484 
1485  return true;
1486 }
1487 
1488 void PolynomialMultiplyRecognize::cleanupLoopBody(BasicBlock *LoopB) {
1489  for (auto &I : *LoopB)
1490  if (Value *SV = SimplifyInstruction(&I, {DL, &TLI, &DT}))
1491  I.replaceAllUsesWith(SV);
1492 
1493  for (Instruction &I : llvm::make_early_inc_range(*LoopB))
1495 }
1496 
1497 unsigned PolynomialMultiplyRecognize::getInverseMxN(unsigned QP) {
1498  // Arrays of coefficients of Q and the inverse, C.
1499  // Q[i] = coefficient at x^i.
1500  std::array<char,32> Q, C;
1501 
1502  for (unsigned i = 0; i < 32; ++i) {
1503  Q[i] = QP & 1;
1504  QP >>= 1;
1505  }
1506  assert(Q[0] == 1);
1507 
1508  // Find C, such that
1509  // (Q[n]*x^n + ... + Q[1]*x + Q[0]) * (C[n]*x^n + ... + C[1]*x + C[0]) = 1
1510  //
1511  // For it to have a solution, Q[0] must be 1. Since this is Z2[x], the
1512  // operations * and + are & and ^ respectively.
1513  //
1514  // Find C[i] recursively, by comparing i-th coefficient in the product
1515  // with 0 (or 1 for i=0).
1516  //
1517  // C[0] = 1, since C[0] = Q[0], and Q[0] = 1.
1518  C[0] = 1;
1519  for (unsigned i = 1; i < 32; ++i) {
1520  // Solve for C[i] in:
1521  // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i]Q[0] = 0
1522  // This is equivalent to
1523  // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i] = 0
1524  // which is
1525  // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] = C[i]
1526  unsigned T = 0;
1527  for (unsigned j = 0; j < i; ++j)
1528  T = T ^ (C[j] & Q[i-j]);
1529  C[i] = T;
1530  }
1531 
1532  unsigned QV = 0;
1533  for (unsigned i = 0; i < 32; ++i)
1534  if (C[i])
1535  QV |= (1 << i);
1536 
1537  return QV;
1538 }
1539 
1541  ParsedValues &PV) {
1542  IRBuilder<> B(&*At);
1543  Module *M = At->getParent()->getParent()->getParent();
1544  Function *PMF = Intrinsic::getDeclaration(M, Intrinsic::hexagon_M4_pmpyw);
1545 
1546  Value *P = PV.P, *Q = PV.Q, *P0 = P;
1547  unsigned IC = PV.IterCount;
1548 
1549  if (PV.M != nullptr)
1550  P0 = P = B.CreateXor(P, PV.M);
1551 
1552  // Create a bit mask to clear the high bits beyond IterCount.
1553  auto *BMI = ConstantInt::get(P->getType(), APInt::getLowBitsSet(32, IC));
1554 
1555  if (PV.IterCount != 32)
1556  P = B.CreateAnd(P, BMI);
1557 
1558  if (PV.Inv) {
1559  auto *QI = dyn_cast<ConstantInt>(PV.Q);
1560  assert(QI && QI->getBitWidth() <= 32);
1561 
1562  // Again, clearing bits beyond IterCount.
1563  unsigned M = (1 << PV.IterCount) - 1;
1564  unsigned Tmp = (QI->getZExtValue() | 1) & M;
1565  unsigned QV = getInverseMxN(Tmp) & M;
1566  auto *QVI = ConstantInt::get(QI->getType(), QV);
1567  P = B.CreateCall(PMF, {P, QVI});
1568  P = B.CreateTrunc(P, QI->getType());
1569  if (IC != 32)
1570  P = B.CreateAnd(P, BMI);
1571  }
1572 
1573  Value *R = B.CreateCall(PMF, {P, Q});
1574 
1575  if (PV.M != nullptr)
1576  R = B.CreateXor(R, B.CreateIntCast(P0, R->getType(), false));
1577 
1578  return R;
1579 }
1580 
1581 static bool hasZeroSignBit(const Value *V) {
1582  if (const auto *CI = dyn_cast<const ConstantInt>(V))
1583  return (CI->getType()->getSignBit() & CI->getSExtValue()) == 0;
1584  const Instruction *I = dyn_cast<const Instruction>(V);
1585  if (!I)
1586  return false;
1587  switch (I->getOpcode()) {
1588  case Instruction::LShr:
1589  if (const auto SI = dyn_cast<const ConstantInt>(I->getOperand(1)))
1590  return SI->getZExtValue() > 0;
1591  return false;
1592  case Instruction::Or:
1593  case Instruction::Xor:
1594  return hasZeroSignBit(I->getOperand(0)) &&
1595  hasZeroSignBit(I->getOperand(1));
1596  case Instruction::And:
1597  return hasZeroSignBit(I->getOperand(0)) ||
1598  hasZeroSignBit(I->getOperand(1));
1599  }
1600  return false;
1601 }
1602 
1603 void PolynomialMultiplyRecognize::setupPreSimplifier(Simplifier &S) {
1604  S.addRule("sink-zext",
1605  // Sink zext past bitwise operations.
1606  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1607  if (I->getOpcode() != Instruction::ZExt)
1608  return nullptr;
1609  Instruction *T = dyn_cast<Instruction>(I->getOperand(0));
1610  if (!T)
1611  return nullptr;
1612  switch (T->getOpcode()) {
1613  case Instruction::And:
1614  case Instruction::Or:
1615  case Instruction::Xor:
1616  break;
1617  default:
1618  return nullptr;
1619  }
1620  IRBuilder<> B(Ctx);
1621  return B.CreateBinOp(cast<BinaryOperator>(T)->getOpcode(),
1622  B.CreateZExt(T->getOperand(0), I->getType()),
1623  B.CreateZExt(T->getOperand(1), I->getType()));
1624  });
1625  S.addRule("xor/and -> and/xor",
1626  // (xor (and x a) (and y a)) -> (and (xor x y) a)
1627  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1628  if (I->getOpcode() != Instruction::Xor)
1629  return nullptr;
1630  Instruction *And0 = dyn_cast<Instruction>(I->getOperand(0));
1631  Instruction *And1 = dyn_cast<Instruction>(I->getOperand(1));
1632  if (!And0 || !And1)
1633  return nullptr;
1634  if (And0->getOpcode() != Instruction::And ||
1635  And1->getOpcode() != Instruction::And)
1636  return nullptr;
1637  if (And0->getOperand(1) != And1->getOperand(1))
1638  return nullptr;
1639  IRBuilder<> B(Ctx);
1640  return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1->getOperand(0)),
1641  And0->getOperand(1));
1642  });
1643  S.addRule("sink binop into select",
1644  // (Op (select c x y) z) -> (select c (Op x z) (Op y z))
1645  // (Op x (select c y z)) -> (select c (Op x y) (Op x z))
1646  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1647  BinaryOperator *BO = dyn_cast<BinaryOperator>(I);
1648  if (!BO)
1649  return nullptr;
1651  if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(0))) {
1652  IRBuilder<> B(Ctx);
1653  Value *X = Sel->getTrueValue(), *Y = Sel->getFalseValue();
1654  Value *Z = BO->getOperand(1);
1655  return B.CreateSelect(Sel->getCondition(),
1656  B.CreateBinOp(Op, X, Z),
1657  B.CreateBinOp(Op, Y, Z));
1658  }
1659  if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(1))) {
1660  IRBuilder<> B(Ctx);
1661  Value *X = BO->getOperand(0);
1662  Value *Y = Sel->getTrueValue(), *Z = Sel->getFalseValue();
1663  return B.CreateSelect(Sel->getCondition(),
1664  B.CreateBinOp(Op, X, Y),
1665  B.CreateBinOp(Op, X, Z));
1666  }
1667  return nullptr;
1668  });
1669  S.addRule("fold select-select",
1670  // (select c (select c x y) z) -> (select c x z)
1671  // (select c x (select c y z)) -> (select c x z)
1672  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1673  SelectInst *Sel = dyn_cast<SelectInst>(I);
1674  if (!Sel)
1675  return nullptr;
1676  IRBuilder<> B(Ctx);
1677  Value *C = Sel->getCondition();
1678  if (SelectInst *Sel0 = dyn_cast<SelectInst>(Sel->getTrueValue())) {
1679  if (Sel0->getCondition() == C)
1680  return B.CreateSelect(C, Sel0->getTrueValue(), Sel->getFalseValue());
1681  }
1682  if (SelectInst *Sel1 = dyn_cast<SelectInst>(Sel->getFalseValue())) {
1683  if (Sel1->getCondition() == C)
1684  return B.CreateSelect(C, Sel->getTrueValue(), Sel1->getFalseValue());
1685  }
1686  return nullptr;
1687  });
1688  S.addRule("or-signbit -> xor-signbit",
1689  // (or (lshr x 1) 0x800.0) -> (xor (lshr x 1) 0x800.0)
1690  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1691  if (I->getOpcode() != Instruction::Or)
1692  return nullptr;
1693  ConstantInt *Msb = dyn_cast<ConstantInt>(I->getOperand(1));
1694  if (!Msb || Msb->getZExtValue() != Msb->getType()->getSignBit())
1695  return nullptr;
1696  if (!hasZeroSignBit(I->getOperand(0)))
1697  return nullptr;
1698  return IRBuilder<>(Ctx).CreateXor(I->getOperand(0), Msb);
1699  });
1700  S.addRule("sink lshr into binop",
1701  // (lshr (BitOp x y) c) -> (BitOp (lshr x c) (lshr y c))
1702  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1703  if (I->getOpcode() != Instruction::LShr)
1704  return nullptr;
1705  BinaryOperator *BitOp = dyn_cast<BinaryOperator>(I->getOperand(0));
1706  if (!BitOp)
1707  return nullptr;
1708  switch (BitOp->getOpcode()) {
1709  case Instruction::And:
1710  case Instruction::Or:
1711  case Instruction::Xor:
1712  break;
1713  default:
1714  return nullptr;
1715  }
1716  IRBuilder<> B(Ctx);
1717  Value *S = I->getOperand(1);
1718  return B.CreateBinOp(BitOp->getOpcode(),
1719  B.CreateLShr(BitOp->getOperand(0), S),
1720  B.CreateLShr(BitOp->getOperand(1), S));
1721  });
1722  S.addRule("expose bitop-const",
1723  // (BitOp1 (BitOp2 x a) b) -> (BitOp2 x (BitOp1 a b))
1724  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1725  auto IsBitOp = [](unsigned Op) -> bool {
1726  switch (Op) {
1727  case Instruction::And:
1728  case Instruction::Or:
1729  case Instruction::Xor:
1730  return true;
1731  }
1732  return false;
1733  };
1734  BinaryOperator *BitOp1 = dyn_cast<BinaryOperator>(I);
1735  if (!BitOp1 || !IsBitOp(BitOp1->getOpcode()))
1736  return nullptr;
1737  BinaryOperator *BitOp2 = dyn_cast<BinaryOperator>(BitOp1->getOperand(0));
1738  if (!BitOp2 || !IsBitOp(BitOp2->getOpcode()))
1739  return nullptr;
1740  ConstantInt *CA = dyn_cast<ConstantInt>(BitOp2->getOperand(1));
1741  ConstantInt *CB = dyn_cast<ConstantInt>(BitOp1->getOperand(1));
1742  if (!CA || !CB)
1743  return nullptr;
1744  IRBuilder<> B(Ctx);
1745  Value *X = BitOp2->getOperand(0);
1746  return B.CreateBinOp(BitOp2->getOpcode(), X,
1747  B.CreateBinOp(BitOp1->getOpcode(), CA, CB));
1748  });
1749 }
1750 
1751 void PolynomialMultiplyRecognize::setupPostSimplifier(Simplifier &S) {
1752  S.addRule("(and (xor (and x a) y) b) -> (and (xor x y) b), if b == b&a",
1753  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1754  if (I->getOpcode() != Instruction::And)
1755  return nullptr;
1756  Instruction *Xor = dyn_cast<Instruction>(I->getOperand(0));
1757  ConstantInt *C0 = dyn_cast<ConstantInt>(I->getOperand(1));
1758  if (!Xor || !C0)
1759  return nullptr;
1760  if (Xor->getOpcode() != Instruction::Xor)
1761  return nullptr;
1762  Instruction *And0 = dyn_cast<Instruction>(Xor->getOperand(0));
1763  Instruction *And1 = dyn_cast<Instruction>(Xor->getOperand(1));
1764  // Pick the first non-null and.
1765  if (!And0 || And0->getOpcode() != Instruction::And)
1766  std::swap(And0, And1);
1767  ConstantInt *C1 = dyn_cast<ConstantInt>(And0->getOperand(1));
1768  if (!C1)
1769  return nullptr;
1770  uint32_t V0 = C0->getZExtValue();
1771  uint32_t V1 = C1->getZExtValue();
1772  if (V0 != (V0 & V1))
1773  return nullptr;
1774  IRBuilder<> B(Ctx);
1775  return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1), C0);
1776  });
1777 }
1778 
1779 bool PolynomialMultiplyRecognize::recognize() {
1780  LLVM_DEBUG(dbgs() << "Starting PolynomialMultiplyRecognize on loop\n"
1781  << *CurLoop << '\n');
1782  // Restrictions:
1783  // - The loop must consist of a single block.
1784  // - The iteration count must be known at compile-time.
1785  // - The loop must have an induction variable starting from 0, and
1786  // incremented in each iteration of the loop.
1787  BasicBlock *LoopB = CurLoop->getHeader();
1788  LLVM_DEBUG(dbgs() << "Loop header:\n" << *LoopB);
1789 
1790  if (LoopB != CurLoop->getLoopLatch())
1791  return false;
1792  BasicBlock *ExitB = CurLoop->getExitBlock();
1793  if (ExitB == nullptr)
1794  return false;
1795  BasicBlock *EntryB = CurLoop->getLoopPreheader();
1796  if (EntryB == nullptr)
1797  return false;
1798 
1799  unsigned IterCount = 0;
1800  const SCEV *CT = SE.getBackedgeTakenCount(CurLoop);
1801  if (isa<SCEVCouldNotCompute>(CT))
1802  return false;
1803  if (auto *CV = dyn_cast<SCEVConstant>(CT))
1804  IterCount = CV->getValue()->getZExtValue() + 1;
1805 
1806  Value *CIV = getCountIV(LoopB);
1807  ParsedValues PV;
1808  Simplifier PreSimp;
1809  PV.IterCount = IterCount;
1810  LLVM_DEBUG(dbgs() << "Loop IV: " << *CIV << "\nIterCount: " << IterCount
1811  << '\n');
1812 
1813  setupPreSimplifier(PreSimp);
1814 
1815  // Perform a preliminary scan of select instructions to see if any of them
1816  // looks like a generator of the polynomial multiply steps. Assume that a
1817  // loop can only contain a single transformable operation, so stop the
1818  // traversal after the first reasonable candidate was found.
1819  // XXX: Currently this approach can modify the loop before being 100% sure
1820  // that the transformation can be carried out.
1821  bool FoundPreScan = false;
1822  auto FeedsPHI = [LoopB](const Value *V) -> bool {
1823  for (const Value *U : V->users()) {
1824  if (const auto *P = dyn_cast<const PHINode>(U))
1825  if (P->getParent() == LoopB)
1826  return true;
1827  }
1828  return false;
1829  };
1830  for (Instruction &In : *LoopB) {
1831  SelectInst *SI = dyn_cast<SelectInst>(&In);
1832  if (!SI || !FeedsPHI(SI))
1833  continue;
1834 
1836  Value *T = PreSimp.simplify(C);
1837  SelectInst *SelI = (T && isa<SelectInst>(T)) ? cast<SelectInst>(T) : SI;
1838  LLVM_DEBUG(dbgs() << "scanSelect(pre-scan): " << PE(C, SelI) << '\n');
1839  if (scanSelect(SelI, LoopB, EntryB, CIV, PV, true)) {
1840  FoundPreScan = true;
1841  if (SelI != SI) {
1842  Value *NewSel = C.materialize(LoopB, SI->getIterator());
1843  SI->replaceAllUsesWith(NewSel);
1845  }
1846  break;
1847  }
1848  }
1849 
1850  if (!FoundPreScan) {
1851  LLVM_DEBUG(dbgs() << "Have not found candidates for pmpy\n");
1852  return false;
1853  }
1854 
1855  if (!PV.Left) {
1856  // The right shift version actually only returns the higher bits of
1857  // the result (each iteration discards the LSB). If we want to convert it
1858  // to a left-shifting loop, the working data type must be at least as
1859  // wide as the target's pmpy instruction.
1860  if (!promoteTypes(LoopB, ExitB))
1861  return false;
1862  // Run post-promotion simplifications.
1863  Simplifier PostSimp;
1864  setupPostSimplifier(PostSimp);
1865  for (Instruction &In : *LoopB) {
1866  SelectInst *SI = dyn_cast<SelectInst>(&In);
1867  if (!SI || !FeedsPHI(SI))
1868  continue;
1870  Value *T = PostSimp.simplify(C);
1871  SelectInst *SelI = dyn_cast_or_null<SelectInst>(T);
1872  if (SelI != SI) {
1873  Value *NewSel = C.materialize(LoopB, SI->getIterator());
1874  SI->replaceAllUsesWith(NewSel);
1876  }
1877  break;
1878  }
1879 
1880  if (!convertShiftsToLeft(LoopB, ExitB, IterCount))
1881  return false;
1882  cleanupLoopBody(LoopB);
1883  }
1884 
1885  // Scan the loop again, find the generating select instruction.
1886  bool FoundScan = false;
1887  for (Instruction &In : *LoopB) {
1888  SelectInst *SelI = dyn_cast<SelectInst>(&In);
1889  if (!SelI)
1890  continue;
1891  LLVM_DEBUG(dbgs() << "scanSelect: " << *SelI << '\n');
1892  FoundScan = scanSelect(SelI, LoopB, EntryB, CIV, PV, false);
1893  if (FoundScan)
1894  break;
1895  }
1896  assert(FoundScan);
1897 
1898  LLVM_DEBUG({
1899  StringRef PP = (PV.M ? "(P+M)" : "P");
1900  if (!PV.Inv)
1901  dbgs() << "Found pmpy idiom: R = " << PP << ".Q\n";
1902  else
1903  dbgs() << "Found inverse pmpy idiom: R = (" << PP << "/Q).Q) + "
1904  << PP << "\n";
1905  dbgs() << " Res:" << *PV.Res << "\n P:" << *PV.P << "\n";
1906  if (PV.M)
1907  dbgs() << " M:" << *PV.M << "\n";
1908  dbgs() << " Q:" << *PV.Q << "\n";
1909  dbgs() << " Iteration count:" << PV.IterCount << "\n";
1910  });
1911 
1912  BasicBlock::iterator At(EntryB->getTerminator());
1913  Value *PM = generate(At, PV);
1914  if (PM == nullptr)
1915  return false;
1916 
1917  if (PM->getType() != PV.Res->getType())
1918  PM = IRBuilder<>(&*At).CreateIntCast(PM, PV.Res->getType(), false);
1919 
1920  PV.Res->replaceAllUsesWith(PM);
1921  PV.Res->eraseFromParent();
1922  return true;
1923 }
1924 
1925 int HexagonLoopIdiomRecognize::getSCEVStride(const SCEVAddRecExpr *S) {
1926  if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getOperand(1)))
1927  return SC->getAPInt().getSExtValue();
1928  return 0;
1929 }
1930 
1931 bool HexagonLoopIdiomRecognize::isLegalStore(Loop *CurLoop, StoreInst *SI) {
1932  // Allow volatile stores if HexagonVolatileMemcpy is enabled.
1933  if (!(SI->isVolatile() && HexagonVolatileMemcpy) && !SI->isSimple())
1934  return false;
1935 
1936  Value *StoredVal = SI->getValueOperand();
1937  Value *StorePtr = SI->getPointerOperand();
1938 
1939  // Reject stores that are so large that they overflow an unsigned.
1940  uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
1941  if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
1942  return false;
1943 
1944  // See if the pointer expression is an AddRec like {base,+,1} on the current
1945  // loop, which indicates a strided store. If we have something else, it's a
1946  // random store we can't handle.
1947  auto *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1948  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
1949  return false;
1950 
1951  // Check to see if the stride matches the size of the store. If so, then we
1952  // know that every byte is touched in the loop.
1953  int Stride = getSCEVStride(StoreEv);
1954  if (Stride == 0)
1955  return false;
1956  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
1957  if (StoreSize != unsigned(std::abs(Stride)))
1958  return false;
1959 
1960  // The store must be feeding a non-volatile load.
1961  LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
1962  if (!LI || !LI->isSimple())
1963  return false;
1964 
1965  // See if the pointer expression is an AddRec like {base,+,1} on the current
1966  // loop, which indicates a strided load. If we have something else, it's a
1967  // random load we can't handle.
1968  Value *LoadPtr = LI->getPointerOperand();
1969  auto *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
1970  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
1971  return false;
1972 
1973  // The store and load must share the same stride.
1974  if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
1975  return false;
1976 
1977  // Success. This store can be converted into a memcpy.
1978  return true;
1979 }
1980 
1981 /// mayLoopAccessLocation - Return true if the specified loop might access the
1982 /// specified pointer location, which is a loop-strided access. The 'Access'
1983 /// argument specifies what the verboten forms of access are (read or write).
1984 static bool
1986  const SCEV *BECount, unsigned StoreSize,
1987  AliasAnalysis &AA,
1988  SmallPtrSetImpl<Instruction *> &Ignored) {
1989  // Get the location that may be stored across the loop. Since the access
1990  // is strided positively through memory, we say that the modified location
1991  // starts at the pointer and has infinite size.
1993 
1994  // If the loop iterates a fixed number of times, we can refine the access
1995  // size to be exactly the size of the memset, which is (BECount+1)*StoreSize
1996  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
1997  AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
1998  StoreSize);
1999 
2000  // TODO: For this to be really effective, we have to dive into the pointer
2001  // operand in the store. Store to &A[i] of 100 will always return may alias
2002  // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
2003  // which will then no-alias a store to &A[100].
2004  MemoryLocation StoreLoc(Ptr, AccessSize);
2005 
2006  for (auto *B : L->blocks())
2007  for (auto &I : *B)
2008  if (Ignored.count(&I) == 0 &&
2009  isModOrRefSet(
2010  intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
2011  return true;
2012 
2013  return false;
2014 }
2015 
2016 void HexagonLoopIdiomRecognize::collectStores(Loop *CurLoop, BasicBlock *BB,
2017  SmallVectorImpl<StoreInst*> &Stores) {
2018  Stores.clear();
2019  for (Instruction &I : *BB)
2020  if (StoreInst *SI = dyn_cast<StoreInst>(&I))
2021  if (isLegalStore(CurLoop, SI))
2022  Stores.push_back(SI);
2023 }
2024 
2025 bool HexagonLoopIdiomRecognize::processCopyingStore(Loop *CurLoop,
2026  StoreInst *SI, const SCEV *BECount) {
2027  assert((SI->isSimple() || (SI->isVolatile() && HexagonVolatileMemcpy)) &&
2028  "Expected only non-volatile stores, or Hexagon-specific memcpy"
2029  "to volatile destination.");
2030 
2031  Value *StorePtr = SI->getPointerOperand();
2032  auto *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
2033  unsigned Stride = getSCEVStride(StoreEv);
2034  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
2035  if (Stride != StoreSize)
2036  return false;
2037 
2038  // See if the pointer expression is an AddRec like {base,+,1} on the current
2039  // loop, which indicates a strided load. If we have something else, it's a
2040  // random load we can't handle.
2041  auto *LI = cast<LoadInst>(SI->getValueOperand());
2042  auto *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
2043 
2044  // The trip count of the loop and the base pointer of the addrec SCEV is
2045  // guaranteed to be loop invariant, which means that it should dominate the
2046  // header. This allows us to insert code for it in the preheader.
2047  BasicBlock *Preheader = CurLoop->getLoopPreheader();
2048  Instruction *ExpPt = Preheader->getTerminator();
2049  IRBuilder<> Builder(ExpPt);
2050  SCEVExpander Expander(*SE, *DL, "hexagon-loop-idiom");
2051 
2052  Type *IntPtrTy = Builder.getIntPtrTy(*DL, SI->getPointerAddressSpace());
2053 
2054  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
2055  // this into a memcpy/memmove in the loop preheader now if we want. However,
2056  // this would be unsafe to do if there is anything else in the loop that may
2057  // read or write the memory region we're storing to. For memcpy, this
2058  // includes the load that feeds the stores. Check for an alias by generating
2059  // the base address and checking everything.
2060  Value *StoreBasePtr = Expander.expandCodeFor(StoreEv->getStart(),
2061  Builder.getInt8PtrTy(SI->getPointerAddressSpace()), ExpPt);
2062  Value *LoadBasePtr = nullptr;
2063 
2064  bool Overlap = false;
2065  bool DestVolatile = SI->isVolatile();
2066  Type *BECountTy = BECount->getType();
2067 
2068  if (DestVolatile) {
2069  // The trip count must fit in i32, since it is the type of the "num_words"
2070  // argument to hexagon_memcpy_forward_vp4cp4n2.
2071  if (StoreSize != 4 || DL->getTypeSizeInBits(BECountTy) > 32) {
2072 CleanupAndExit:
2073  // If we generated new code for the base pointer, clean up.
2074  Expander.clear();
2075  if (StoreBasePtr && (LoadBasePtr != StoreBasePtr)) {
2076  RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
2077  StoreBasePtr = nullptr;
2078  }
2079  if (LoadBasePtr) {
2081  LoadBasePtr = nullptr;
2082  }
2083  return false;
2084  }
2085  }
2086 
2088  Ignore1.insert(SI);
2089  if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
2090  StoreSize, *AA, Ignore1)) {
2091  // Check if the load is the offending instruction.
2092  Ignore1.insert(LI);
2093  if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
2094  BECount, StoreSize, *AA, Ignore1)) {
2095  // Still bad. Nothing we can do.
2096  goto CleanupAndExit;
2097  }
2098  // It worked with the load ignored.
2099  Overlap = true;
2100  }
2101 
2102  if (!Overlap) {
2103  if (DisableMemcpyIdiom || !HasMemcpy)
2104  goto CleanupAndExit;
2105  } else {
2106  // Don't generate memmove if this function will be inlined. This is
2107  // because the caller will undergo this transformation after inlining.
2108  Function *Func = CurLoop->getHeader()->getParent();
2109  if (Func->hasFnAttribute(Attribute::AlwaysInline))
2110  goto CleanupAndExit;
2111 
2112  // In case of a memmove, the call to memmove will be executed instead
2113  // of the loop, so we need to make sure that there is nothing else in
2114  // the loop than the load, store and instructions that these two depend
2115  // on.
2117  Insts.push_back(SI);
2118  Insts.push_back(LI);
2119  if (!coverLoop(CurLoop, Insts))
2120  goto CleanupAndExit;
2121 
2122  if (DisableMemmoveIdiom || !HasMemmove)
2123  goto CleanupAndExit;
2124  bool IsNested = CurLoop->getParentLoop() != nullptr;
2125  if (IsNested && OnlyNonNestedMemmove)
2126  goto CleanupAndExit;
2127  }
2128 
2129  // For a memcpy, we have to make sure that the input array is not being
2130  // mutated by the loop.
2131  LoadBasePtr = Expander.expandCodeFor(LoadEv->getStart(),
2132  Builder.getInt8PtrTy(LI->getPointerAddressSpace()), ExpPt);
2133 
2135  Ignore2.insert(SI);
2136  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
2137  StoreSize, *AA, Ignore2))
2138  goto CleanupAndExit;
2139 
2140  // Check the stride.
2141  bool StridePos = getSCEVStride(LoadEv) >= 0;
2142 
2143  // Currently, the volatile memcpy only emulates traversing memory forward.
2144  if (!StridePos && DestVolatile)
2145  goto CleanupAndExit;
2146 
2147  bool RuntimeCheck = (Overlap || DestVolatile);
2148 
2149  BasicBlock *ExitB;
2150  if (RuntimeCheck) {
2151  // The runtime check needs a single exit block.
2152  SmallVector<BasicBlock*, 8> ExitBlocks;
2153  CurLoop->getUniqueExitBlocks(ExitBlocks);
2154  if (ExitBlocks.size() != 1)
2155  goto CleanupAndExit;
2156  ExitB = ExitBlocks[0];
2157  }
2158 
2159  // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
2160  // pointer size if it isn't already.
2161  LLVMContext &Ctx = SI->getContext();
2162  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
2163  DebugLoc DLoc = SI->getDebugLoc();
2164 
2165  const SCEV *NumBytesS =
2166  SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
2167  if (StoreSize != 1)
2168  NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
2169  SCEV::FlagNUW);
2170  Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtrTy, ExpPt);
2171  if (Instruction *In = dyn_cast<Instruction>(NumBytes))
2172  if (Value *Simp = SimplifyInstruction(In, {*DL, TLI, DT}))
2173  NumBytes = Simp;
2174 
2175  CallInst *NewCall;
2176 
2177  if (RuntimeCheck) {
2179  if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes)) {
2180  uint64_t C = CI->getZExtValue();
2181  if (Threshold != 0 && C < Threshold)
2182  goto CleanupAndExit;
2184  goto CleanupAndExit;
2185  }
2186 
2187  BasicBlock *Header = CurLoop->getHeader();
2188  Function *Func = Header->getParent();
2189  Loop *ParentL = LF->getLoopFor(Preheader);
2190  StringRef HeaderName = Header->getName();
2191 
2192  // Create a new (empty) preheader, and update the PHI nodes in the
2193  // header to use the new preheader.
2194  BasicBlock *NewPreheader = BasicBlock::Create(Ctx, HeaderName+".rtli.ph",
2195  Func, Header);
2196  if (ParentL)
2197  ParentL->addBasicBlockToLoop(NewPreheader, *LF);
2198  IRBuilder<>(NewPreheader).CreateBr(Header);
2199  for (auto &In : *Header) {
2200  PHINode *PN = dyn_cast<PHINode>(&In);
2201  if (!PN)
2202  break;
2203  int bx = PN->getBasicBlockIndex(Preheader);
2204  if (bx >= 0)
2205  PN->setIncomingBlock(bx, NewPreheader);
2206  }
2207  DT->addNewBlock(NewPreheader, Preheader);
2208  DT->changeImmediateDominator(Header, NewPreheader);
2209 
2210  // Check for safe conditions to execute memmove.
2211  // If stride is positive, copying things from higher to lower addresses
2212  // is equivalent to memmove. For negative stride, it's the other way
2213  // around. Copying forward in memory with positive stride may not be
2214  // same as memmove since we may be copying values that we just stored
2215  // in some previous iteration.
2216  Value *LA = Builder.CreatePtrToInt(LoadBasePtr, IntPtrTy);
2217  Value *SA = Builder.CreatePtrToInt(StoreBasePtr, IntPtrTy);
2218  Value *LowA = StridePos ? SA : LA;
2219  Value *HighA = StridePos ? LA : SA;
2220  Value *CmpA = Builder.CreateICmpULT(LowA, HighA);
2221  Value *Cond = CmpA;
2222 
2223  // Check for distance between pointers. Since the case LowA < HighA
2224  // is checked for above, assume LowA >= HighA.
2225  Value *Dist = Builder.CreateSub(LowA, HighA);
2226  Value *CmpD = Builder.CreateICmpSLE(NumBytes, Dist);
2227  Value *CmpEither = Builder.CreateOr(Cond, CmpD);
2228  Cond = CmpEither;
2229 
2230  if (Threshold != 0) {
2231  Type *Ty = NumBytes->getType();
2232  Value *Thr = ConstantInt::get(Ty, Threshold);
2233  Value *CmpB = Builder.CreateICmpULT(Thr, NumBytes);
2234  Value *CmpBoth = Builder.CreateAnd(Cond, CmpB);
2235  Cond = CmpBoth;
2236  }
2237  BasicBlock *MemmoveB = BasicBlock::Create(Ctx, Header->getName()+".rtli",
2238  Func, NewPreheader);
2239  if (ParentL)
2240  ParentL->addBasicBlockToLoop(MemmoveB, *LF);
2241  Instruction *OldT = Preheader->getTerminator();
2242  Builder.CreateCondBr(Cond, MemmoveB, NewPreheader);
2243  OldT->eraseFromParent();
2244  Preheader->setName(Preheader->getName()+".old");
2245  DT->addNewBlock(MemmoveB, Preheader);
2246  // Find the new immediate dominator of the exit block.
2247  BasicBlock *ExitD = Preheader;
2248  for (BasicBlock *PB : predecessors(ExitB)) {
2249  ExitD = DT->findNearestCommonDominator(ExitD, PB);
2250  if (!ExitD)
2251  break;
2252  }
2253  // If the prior immediate dominator of ExitB was dominated by the
2254  // old preheader, then the old preheader becomes the new immediate
2255  // dominator. Otherwise don't change anything (because the newly
2256  // added blocks are dominated by the old preheader).
2257  if (ExitD && DT->dominates(Preheader, ExitD)) {
2258  DomTreeNode *BN = DT->getNode(ExitB);
2259  DomTreeNode *DN = DT->getNode(ExitD);
2260  BN->setIDom(DN);
2261  }
2262 
2263  // Add a call to memmove to the conditional block.
2264  IRBuilder<> CondBuilder(MemmoveB);
2265  CondBuilder.CreateBr(ExitB);
2266  CondBuilder.SetInsertPoint(MemmoveB->getTerminator());
2267 
2268  if (DestVolatile) {
2269  Type *Int32Ty = Type::getInt32Ty(Ctx);
2270  Type *Int32PtrTy = Type::getInt32PtrTy(Ctx);
2271  Type *VoidTy = Type::getVoidTy(Ctx);
2272  Module *M = Func->getParent();
2273  FunctionCallee Fn = M->getOrInsertFunction(
2274  HexagonVolatileMemcpyName, VoidTy, Int32PtrTy, Int32PtrTy, Int32Ty);
2275 
2276  const SCEV *OneS = SE->getConstant(Int32Ty, 1);
2277  const SCEV *BECount32 = SE->getTruncateOrZeroExtend(BECount, Int32Ty);
2278  const SCEV *NumWordsS = SE->getAddExpr(BECount32, OneS, SCEV::FlagNUW);
2279  Value *NumWords = Expander.expandCodeFor(NumWordsS, Int32Ty,
2280  MemmoveB->getTerminator());
2281  if (Instruction *In = dyn_cast<Instruction>(NumWords))
2282  if (Value *Simp = SimplifyInstruction(In, {*DL, TLI, DT}))
2283  NumWords = Simp;
2284 
2285  Value *Op0 = (StoreBasePtr->getType() == Int32PtrTy)
2286  ? StoreBasePtr
2287  : CondBuilder.CreateBitCast(StoreBasePtr, Int32PtrTy);
2288  Value *Op1 = (LoadBasePtr->getType() == Int32PtrTy)
2289  ? LoadBasePtr
2290  : CondBuilder.CreateBitCast(LoadBasePtr, Int32PtrTy);
2291  NewCall = CondBuilder.CreateCall(Fn, {Op0, Op1, NumWords});
2292  } else {
2293  NewCall = CondBuilder.CreateMemMove(
2294  StoreBasePtr, SI->getAlign(), LoadBasePtr, LI->getAlign(), NumBytes);
2295  }
2296  } else {
2297  NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr,
2298  LI->getAlign(), NumBytes);
2299  // Okay, the memcpy has been formed. Zap the original store and
2300  // anything that feeds into it.
2302  }
2303 
2304  NewCall->setDebugLoc(DLoc);
2305 
2306  LLVM_DEBUG(dbgs() << " Formed " << (Overlap ? "memmove: " : "memcpy: ")
2307  << *NewCall << "\n"
2308  << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
2309  << " from store ptr=" << *StoreEv << " at: " << *SI
2310  << "\n");
2311 
2312  return true;
2313 }
2314 
2315 // Check if the instructions in Insts, together with their dependencies
2316 // cover the loop in the sense that the loop could be safely eliminated once
2317 // the instructions in Insts are removed.
2318 bool HexagonLoopIdiomRecognize::coverLoop(Loop *L,
2319  SmallVectorImpl<Instruction*> &Insts) const {
2320  SmallSet<BasicBlock*,8> LoopBlocks;
2321  for (auto *B : L->blocks())
2322  LoopBlocks.insert(B);
2323 
2324  SetVector<Instruction*> Worklist(Insts.begin(), Insts.end());
2325 
2326  // Collect all instructions from the loop that the instructions in Insts
2327  // depend on (plus their dependencies, etc.). These instructions will
2328  // constitute the expression trees that feed those in Insts, but the trees
2329  // will be limited only to instructions contained in the loop.
2330  for (unsigned i = 0; i < Worklist.size(); ++i) {
2331  Instruction *In = Worklist[i];
2332  for (auto I = In->op_begin(), E = In->op_end(); I != E; ++I) {
2333  Instruction *OpI = dyn_cast<Instruction>(I);
2334  if (!OpI)
2335  continue;
2336  BasicBlock *PB = OpI->getParent();
2337  if (!LoopBlocks.count(PB))
2338  continue;
2339  Worklist.insert(OpI);
2340  }
2341  }
2342 
2343  // Scan all instructions in the loop, if any of them have a user outside
2344  // of the loop, or outside of the expressions collected above, then either
2345  // the loop has a side-effect visible outside of it, or there are
2346  // instructions in it that are not involved in the original set Insts.
2347  for (auto *B : L->blocks()) {
2348  for (auto &In : *B) {
2349  if (isa<BranchInst>(In) || isa<DbgInfoIntrinsic>(In))
2350  continue;
2351  if (!Worklist.count(&In) && In.mayHaveSideEffects())
2352  return false;
2353  for (auto K : In.users()) {
2354  Instruction *UseI = dyn_cast<Instruction>(K);
2355  if (!UseI)
2356  continue;
2357  BasicBlock *UseB = UseI->getParent();
2358  if (LF->getLoopFor(UseB) != L)
2359  return false;
2360  }
2361  }
2362  }
2363 
2364  return true;
2365 }
2366 
2367 /// runOnLoopBlock - Process the specified block, which lives in a counted loop
2368 /// with the specified backedge count. This block is known to be in the current
2369 /// loop and not in any subloops.
2370 bool HexagonLoopIdiomRecognize::runOnLoopBlock(Loop *CurLoop, BasicBlock *BB,
2371  const SCEV *BECount, SmallVectorImpl<BasicBlock*> &ExitBlocks) {
2372  // We can only promote stores in this block if they are unconditionally
2373  // executed in the loop. For a block to be unconditionally executed, it has
2374  // to dominate all the exit blocks of the loop. Verify this now.
2375  auto DominatedByBB = [this,BB] (BasicBlock *EB) -> bool {
2376  return DT->dominates(BB, EB);
2377  };
2378  if (!all_of(ExitBlocks, DominatedByBB))
2379  return false;
2380 
2381  bool MadeChange = false;
2382  // Look for store instructions, which may be optimized to memset/memcpy.
2384  collectStores(CurLoop, BB, Stores);
2385 
2386  // Optimize the store into a memcpy, if it feeds an similarly strided load.
2387  for (auto &SI : Stores)
2388  MadeChange |= processCopyingStore(CurLoop, SI, BECount);
2389 
2390  return MadeChange;
2391 }
2392 
2393 bool HexagonLoopIdiomRecognize::runOnCountableLoop(Loop *L) {
2394  PolynomialMultiplyRecognize PMR(L, *DL, *DT, *TLI, *SE);
2395  if (PMR.recognize())
2396  return true;
2397 
2398  if (!HasMemcpy && !HasMemmove)
2399  return false;
2400 
2401  const SCEV *BECount = SE->getBackedgeTakenCount(L);
2402  assert(!isa<SCEVCouldNotCompute>(BECount) &&
2403  "runOnCountableLoop() called on a loop without a predictable"
2404  "backedge-taken count");
2405 
2406  SmallVector<BasicBlock *, 8> ExitBlocks;
2407  L->getUniqueExitBlocks(ExitBlocks);
2408 
2409  bool Changed = false;
2410 
2411  // Scan all the blocks in the loop that are not in subloops.
2412  for (auto *BB : L->getBlocks()) {
2413  // Ignore blocks in subloops.
2414  if (LF->getLoopFor(BB) != L)
2415  continue;
2416  Changed |= runOnLoopBlock(L, BB, BECount, ExitBlocks);
2417  }
2418 
2419  return Changed;
2420 }
2421 
2422 bool HexagonLoopIdiomRecognize::run(Loop *L) {
2423  const Module &M = *L->getHeader()->getParent()->getParent();
2424  if (Triple(M.getTargetTriple()).getArch() != Triple::hexagon)
2425  return false;
2426 
2427  // If the loop could not be converted to canonical form, it must have an
2428  // indirectbr in it, just give up.
2429  if (!L->getLoopPreheader())
2430  return false;
2431 
2432  // Disable loop idiom recognition if the function's name is a common idiom.
2433  StringRef Name = L->getHeader()->getParent()->getName();
2434  if (Name == "memset" || Name == "memcpy" || Name == "memmove")
2435  return false;
2436 
2437  DL = &L->getHeader()->getModule()->getDataLayout();
2438 
2439  HasMemcpy = TLI->has(LibFunc_memcpy);
2440  HasMemmove = TLI->has(LibFunc_memmove);
2441 
2442  if (SE->hasLoopInvariantBackedgeTakenCount(L))
2443  return runOnCountableLoop(L);
2444  return false;
2445 }
2446 
2447 bool HexagonLoopIdiomRecognizeLegacyPass::runOnLoop(Loop *L,
2448  LPPassManager &LPM) {
2449  if (skipLoop(L))
2450  return false;
2451 
2452  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2453  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2454  auto *LF = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2455  auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
2456  *L->getHeader()->getParent());
2457  auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2458  return HexagonLoopIdiomRecognize(AA, DT, LF, TLI, SE).run(L);
2459 }
2460 
2462  return new HexagonLoopIdiomRecognizeLegacyPass();
2463 }
2464 
2468  LPMUpdater &U) {
2469  return HexagonLoopIdiomRecognize(&AR.AA, &AR.DT, &AR.LI, &AR.TLI, &AR.SE)
2470  .run(&L)
2473 }
i
i
Definition: README.txt:29
use
Move duplicate certain instructions close to their use
Definition: Localizer.cpp:31
llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:155
Int32Ty
IntegerType * Int32Ty
Definition: NVVMIntrRange.cpp:67
llvm::RecursivelyDeleteTriviallyDeadInstructions
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:511
llvm::IRBuilderBase::CreateIntCast
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2061
LLVM_ATTRIBUTE_USED
#define LLVM_ATTRIBUTE_USED
Definition: Compiler.h:144
llvm::BasicBlock::end
iterator end()
Definition: BasicBlock.h:298
llvm::predecessors
pred_range predecessors(BasicBlock *BB)
Definition: CFG.h:127
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AllocatorList.h:23
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::AArch64CC::AL
@ AL
Definition: AArch64BaseInfo.h:269
llvm::createHexagonLoopIdiomPass
Pass * createHexagonLoopIdiomPass()
Definition: HexagonLoopIdiomRecognition.cpp:2461
llvm::CmpInst::ICMP_EQ
@ ICMP_EQ
equal
Definition: InstrTypes.h:742
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
llvm::LocationSize::afterPointer
constexpr static LocationSize afterPointer()
Any location after the base pointer (but still within the underlying object).
Definition: MemoryLocation.h:123
print
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
Definition: ArchiveWriter.cpp:147
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1384
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:721
llvm::User::operands
op_range operands()
Definition: User.h:242
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:90
llvm::ConstantInt::getType
IntegerType * getType() const
getType - Specialize the getType() method to always return an IntegerType, which reduces the amount o...
Definition: Constants.h:173
llvm::isModOrRefSet
LLVM_NODISCARD bool isModOrRefSet(const ModRefInfo MRI)
Definition: AliasAnalysis.h:190
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
IntrinsicInst.h
ScalarEvolutionExpander.h
simplify
hexagon bit simplify
Definition: HexagonBitSimplify.cpp:261
Scalar.h
T
llvm::Function
Definition: Function.h:62
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:530
StringRef.h
P
This currently compiles esp xmm0 movsd esp eax eax esp ret We should use not the dag combiner This is because dagcombine2 needs to be able to see through the X86ISD::Wrapper which DAGCombine can t really do The code for turning x load into a single vector load is target independent and should be moved to the dag combiner The code for turning x load into a vector load can only handle a direct load from a global or a direct load from the stack It should be generalized to handle any load from P
Definition: README-SSE.txt:411
Pass.h
llvm::PatternMatch::m_LShr
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1124
llvm::SCEVExpander
This class uses information about analyze scalars to rewrite expressions in canonical form.
Definition: ScalarEvolutionExpander.h:65
C1
instcombine should handle this C2 when C1
Definition: README.txt:263
llvm::IRBuilderBase::CreateXor
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1414
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::RISCVFenceField::W
@ W
Definition: RISCVBaseInfo.h:208
llvm::User::dropAllReferences
void dropAllReferences()
Drop all references to operands.
Definition: User.h:299
RuntimeMemSizeThreshold
static cl::opt< unsigned > RuntimeMemSizeThreshold("runtime-mem-idiom-threshold", cl::Hidden, cl::init(0), cl::desc("Threshold (in bytes) for the runtime " "check guarding the memmove."))
ErrorHandling.h
llvm::IRBuilder<>
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:460
llvm::CmpInst::ICMP_NE
@ ICMP_NE
not equal
Definition: InstrTypes.h:743
ValueTracking.h
Local.h
llvm::IRBuilderBase::CreateBr
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:989
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:151
llvm::Triple
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:45
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:143
cleanup
static void cleanup(BlockFrequencyInfoImplBase &BFI)
Clear all memory not needed downstream.
Definition: BlockFrequencyInfoImpl.cpp:304
APInt.h
ScalarEvolution.h
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
DenseMap.h
Module.h
HexagonVolatileMemcpy
static cl::opt< bool > HexagonVolatileMemcpy("disable-hexagon-volatile-memcpy", cl::Hidden, cl::init(false), cl::desc("Enable Hexagon-specific memcpy for volatile destination."))
llvm::LoopInfoWrapperPass
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:1268
llvm::LoopStandardAnalysisResults
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Definition: LoopAnalysisManager.h:52
llvm::SmallSet
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:134
llvm::Triple::hexagon
@ hexagon
Definition: Triple.h:60
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:449
llvm::SelectInst::getFalseValue
const Value * getFalseValue() const
Definition: Instructions.h:1781
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:397
idiom
hexagon loop idiom
Definition: HexagonLoopIdiomRecognition.cpp:285
llvm::BasicBlock::getSinglePredecessor
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:268
initialize
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, ArrayRef< StringLiteral > StandardNames)
Initialize the set of available library functions based on the specified target triple.
Definition: TargetLibraryInfo.cpp:116
llvm::LoadInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:267
llvm::LoopStandardAnalysisResults::DT
DominatorTree & DT
Definition: LoopAnalysisManager.h:55
replace
static void replace(Module &M, GlobalVariable *Old, GlobalVariable *New)
Definition: ConstantMerge.cpp:116
llvm::LoadInst::getAlign
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:223
llvm::Type::getInt32PtrTy
static PointerType * getInt32PtrTy(LLVMContext &C, unsigned AS=0)
Definition: Type.cpp:301
llvm::BitmaskEnumDetail::Mask
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:241
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::RISCVFenceField::R
@ R
Definition: RISCVBaseInfo.h:207
KnownBits.h
loop
Analysis the ScalarEvolution expression for r is< loop > Outside the loop
Definition: README.txt:8
Context
ManagedStatic< detail::RecordContext > Context
Definition: Record.cpp:96
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
llvm::lltok::equal
@ equal
Definition: LLToken.h:25
LoopAnalysisManager.h
AliasAnalysis.h
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::AlignStyle::Left
@ Left
Instruction.h
CommandLine.h
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
idioms
hexagon loop Recognize Hexagon specific loop idioms
Definition: HexagonLoopIdiomRecognition.cpp:286
llvm::LoopBase::getParentLoop
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Definition: LoopInfo.h:113
hasZeroSignBit
static bool hasZeroSignBit(const Value *V)
Definition: HexagonLoopIdiomRecognition.cpp:1581
llvm::Instruction::getOpcode
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:160
llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1600
llvm::SelectInst::getCondition
const Value * getCondition() const
Definition: Instructions.h:1779
llvm::RecurKind::And
@ And
Bitwise or logical AND of integers.
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:31
llvm::PPCISD::SC
@ SC
CHAIN = SC CHAIN, Imm128 - System call.
Definition: PPCISelLowering.h:418
CompileTimeMemSizeThreshold
static cl::opt< unsigned > CompileTimeMemSizeThreshold("compile-time-mem-idiom-threshold", cl::Hidden, cl::init(64), cl::desc("Threshold (in bytes) to perform the transformation, if the " "runtime loop count (mem transfer size) is known at compile-time."))
Constants.h
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
isZero
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:519
llvm::AAResults
Definition: AliasAnalysis.h:508
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::HexagonLoopIdiomRecognitionPass::run
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Definition: HexagonLoopIdiomRecognition.cpp:2466
llvm::LocationSize
Definition: MemoryLocation.h:65
llvm::User
Definition: User.h:44
Intrinsics.h
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::ARM_PROC::A
@ A
Definition: ARMBaseInfo.h:34
InstrTypes.h
llvm::BasicBlock::begin
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:296
Y
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
getOpcode
static Optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:199
llvm::replace
void replace(Container &Cont, typename Container::iterator ContIt, typename Container::iterator ContEnd, RandomAccessIterator ValIt, RandomAccessIterator ValEnd)
Given a sequence container Cont, replace the range [ContIt, ContEnd) with the range [ValIt,...
Definition: STLExtras.h:1804
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:178
TargetLibraryInfo.h
llvm::BasicBlock::getFirstInsertionPt
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:253
false
Definition: StackSlotColoring.cpp:142
llvm::PatternMatch::m_ZExt
CastClass_match< OpTy, Instruction::ZExt > m_ZExt(const OpTy &Op)
Matches ZExt.
Definition: PatternMatch.h:1636
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
First
into llvm powi allowing the code generator to produce balanced multiplication trees First
Definition: README.txt:54
llvm::IntegerType
Class to represent integer types.
Definition: DerivedTypes.h:40
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:171
llvm::BinaryOperator::getOpcode
BinaryOps getOpcode() const
Definition: InstrTypes.h:394
llvm::Instruction
Definition: Instruction.h:45
llvm::SimplifyInstruction
Value * SimplifyInstruction(Instruction *I, const SimplifyQuery &Q, OptimizationRemarkEmitter *ORE=nullptr)
See if we can compute a simplified version of this instruction.
Definition: InstructionSimplify.cpp:6310
llvm::DominatorTreeWrapperPass
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:287
llvm::PassRegistry
PassRegistry - This class manages the registration and intitialization of the pass subsystem as appli...
Definition: PassRegistry.h:38
llvm::BasicBlock::phis
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:354
llvm::raw_ostream
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:53
llvm::Value::setName
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:376
llvm::operator<<
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition: APFixedPoint.h:230
llvm::LocationSize::precise
static LocationSize precise(uint64_t Value)
Definition: MemoryLocation.h:100
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:925
llvm::Use::getUser
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:73
llvm::ScalarEvolutionWrapperPass
Definition: ScalarEvolution.h:2123
DebugLoc.h
llvm::LPPassManager
Definition: LoopPass.h:75
SmallPtrSet.h
llvm::BasicBlock::getModule
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:148
DisableMemcpyIdiom
static cl::opt< bool > DisableMemcpyIdiom("disable-memcpy-idiom", cl::Hidden, cl::init(false), cl::desc("Disable generation of memcpy in loop idiom recognition"))
PatternMatch.h
llvm::BasicBlock::getFirstNonPHI
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:216
llvm::IntegerType::getSignBit
uint64_t getSignBit() const
Return a uint64_t with just the most significant bit set (the sign bit, if the value is treated as a ...
Definition: DerivedTypes.h:82
llvm::Triple::getArch
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:312
Utils.h
c
the resulting code requires compare and branches when and if the revised code is with conditional branches instead of More there is a byte word extend before each where there should be only and the condition codes are not remembered when the same two values are compared twice More LSR enhancements i8 and i32 load store addressing modes are identical int int c
Definition: README.txt:418
OnlyNonNestedMemmove
static cl::opt< bool > OnlyNonNestedMemmove("only-nonnested-memmove-idiom", cl::Hidden, cl::init(true), cl::desc("Only enable generating memmove in non-nested loops"))
llvm::Value::use_empty
bool use_empty() const
Definition: Value.h:344
Type.h
llvm::IRBuilderBase::CreateAnd
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1362
X
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
llvm::PatternMatch::m_One
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:513
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
LoopInfo.h
llvm::IRBuilderBase::CreateZExt
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1900
llvm::CmpInst
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:711
llvm::PatternMatch::m_Xor
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1112
PB
PassBuilder PB(Machine, PassOpts->PTO, None, &PIC)
llvm::tgtok::In
@ In
Definition: TGLexer.h:51
llvm::getLoopPassPreservedAnalyses
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
Definition: LoopAnalysisManager.cpp:140
BasicBlock.h
llvm::cl::opt< bool >
llvm::LoadInst::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:273
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:77
llvm::PatternMatch::m_Zero
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:535
llvm::StoreInst
An instruction for storing to memory.
Definition: Instructions.h:304
llvm::SmallSet::count
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:164
llvm::Instruction::eraseFromParent
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:78
llvm::TargetLibraryInfoWrapperPass
Definition: TargetLibraryInfo.h:465
uint64_t
llvm::LoopPass
Definition: LoopPass.h:27
llvm::equal
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition: STLExtras.h:1759
llvm::GlobalValue::getParent
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:578
llvm::LPMUpdater
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
Definition: LoopPassManager.h:263
llvm::find
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1620
llvm::LoopBase::getUniqueExitBlocks
void getUniqueExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all unique successor blocks of this loop.
Definition: LoopInfoImpl.h:122
llvm::TruncInst
This class represents a truncation of integer types.
Definition: Instructions.h:4764
llvm::PHINode::setIncomingBlock
void setIncomingBlock(unsigned i, BasicBlock *BB)
Definition: Instructions.h:2783
llvm::initializeHexagonLoopIdiomRecognizeLegacyPassPass
void initializeHexagonLoopIdiomRecognizeLegacyPassPass(PassRegistry &)
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
MemoryLocation.h
llvm::DenseMap
Definition: DenseMap.h:714
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:441
llvm::make_early_inc_range
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:593
llvm::PHINode::getBasicBlockIndex
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
Definition: Instructions.h:2826
llvm::SelectInst::getTrueValue
const Value * getTrueValue() const
Definition: Instructions.h:1780
llvm::LoopBase::getLoopPreheader
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:167
llvm::PatternMatch::m_And
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1100
llvm::Instruction::setDebugLoc
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:367
llvm::SCEVConstant
This class represents a constant integer value.
Definition: ScalarEvolutionExpressions.h:47
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(HexagonLoopIdiomRecognizeLegacyPass, "hexagon-loop-idiom", "Recognize Hexagon-specific loop idioms", false, false) INITIALIZE_PASS_END(HexagonLoopIdiomRecognizeLegacyPass
llvm::computeKnownBits
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
Definition: ValueTracking.cpp:224
llvm::ModRefInfo
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: AliasAnalysis.h:149
IRBuilder.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:840
SI
StandardInstrumentations SI(Debug, VerifyEach)
llvm::SelectInst
This class represents the LLVM 'select' instruction.
Definition: Instructions.h:1732
function
print Print MemDeps of function
Definition: MemDepPrinter.cpp:83
llvm::Type::isVoidTy
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:138
llvm::Record
Definition: Record.h:1486
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:382
Builder
assume Assume Builder
Definition: AssumeBundleBuilder.cpp:650
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
llvm::ZExtInst
This class represents zero extension of integer types.
Definition: Instructions.h:4803
llvm::User::setOperand
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Triple.h
llvm::LoopInfo
Definition: LoopInfo.h:1083
llvm::BinaryOperator
Definition: InstrTypes.h:190
mayLoopAccessLocation
static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, unsigned StoreSize, AliasAnalysis &AA, SmallPtrSetImpl< Instruction * > &Ignored)
mayLoopAccessLocation - Return true if the specified loop might access the specified pointer location...
Definition: HexagonLoopIdiomRecognition.cpp:1985
QP
So we should use XX3Form_Rcr to implement instrinsic Convert DP QP
Definition: README_P9.txt:329
DataLayout.h
Cond
SmallVector< MachineOperand, 4 > Cond
Definition: BasicBlockSections.cpp:179
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:57
llvm::LoadInst::isSimple
bool isSimple() const
Definition: Instructions.h:259
LoopPass.h
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:134
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
if
if(llvm_vc STREQUAL "") set(fake_version_inc "$
Definition: CMakeLists.txt:14
llvm::AnalysisUsage::addPreserved
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: PassAnalysisSupport.h:98
llvm::BasicBlock::Create
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:100
getParent
static const Function * getParent(const Value *V)
Definition: BasicAliasAnalysis.cpp:870
uint32_t
Compiler.h
llvm::ModRefInfo::Mod
@ Mod
The access may modify the value stored in memory.
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:81
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::SCEV::FlagNUW
@ FlagNUW
Definition: ScalarEvolution.h:135
llvm::LoopStandardAnalysisResults::LI
LoopInfo & LI
Definition: LoopAnalysisManager.h:56
S
add sub stmia L5 ldr r0 bl L_printf $stub Instead of a and a wouldn t it be better to do three moves *Return an aggregate type is even return S
Definition: README.txt:210
llvm::SmallSet::insert
std::pair< NoneType, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:180
llvm::PredIterator
Definition: CFG.h:43
llvm::ifs::IFSSymbolType::Func
@ Func
llvm::DomTreeNodeBase< BasicBlock >
llvm::Value::getName
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:175
replaceAllUsesOfWithIn
static void replaceAllUsesOfWithIn(Value *I, Value *J, BasicBlock *BB)
Definition: HexagonLoopIdiomRecognition.cpp:675
llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:152
llvm::IRBuilderBase::CreateTrunc
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:1896
llvm::ConstantInt::getZExtValue
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:142
llvm::MCID::Select
@ Select
Definition: MCInstrDesc.h:162
Attributes.h
j
return j(j<< 16)
llvm::LCSSAID
char & LCSSAID
Definition: LCSSA.cpp:485
HexagonLoopIdiomRecognition.h
Constant.h
llvm::DomTreeNodeBase::setIDom
void setIDom(DomTreeNodeBase *NewIDom)
Definition: GenericDomTree.h:123
llvm::LoopSimplifyID
char & LoopSimplifyID
Definition: LoopSimplify.cpp:799
llvm::empty
constexpr bool empty(const T &RangeOrContainer)
Test whether RangeOrContainer is empty. Similar to C++17 std::empty.
Definition: STLExtras.h:254
llvm::GraphProgram::Name
Name
Definition: GraphWriter.h:52
llvm::KnownBits
Definition: KnownBits.h:23
H
#define H(x, y, z)
Definition: MD5.cpp:58
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:325
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:161
llvm::pred_end
Interval::pred_iterator pred_end(Interval *I)
Definition: Interval.h:112
llvm::SCEVAddRecExpr
This node represents a polynomial recurrence on the trip count of the specified loop.
Definition: ScalarEvolutionExpressions.h:352
Casting.h
Function.h
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
PassManager.h
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:221
llvm::ARCCC::Z
@ Z
Definition: ARCInfo.h:41
transform
instcombine should handle this transform
Definition: README.txt:262
llvm::SmallVectorImpl::clear
void clear()
Definition: SmallVector.h:585
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:183
llvm::LoopStandardAnalysisResults::SE
ScalarEvolution & SE
Definition: LoopAnalysisManager.h:57
llvm::LoopStandardAnalysisResults::AA
AAResults & AA
Definition: LoopAnalysisManager.h:53
ScalarEvolutionExpressions.h
llvm::Instruction::BinaryOps
BinaryOps
Definition: Instruction.h:789
llvm::pred_begin
Interval::pred_iterator pred_begin(Interval *I)
pred_begin/pred_end - define methods so that Intervals may be used just like BasicBlocks can with the...
Definition: Interval.h:109
llvm::Type::getVoidTy
static Type * getVoidTy(LLVMContext &C)
Definition: Type.cpp:224
llvm::Pass
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:91
Instructions.h
llvm::User::getNumOperands
unsigned getNumOperands() const
Definition: User.h:191
SmallVector.h
llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:777
User.h
llvm::LoopBase::addBasicBlockToLoop
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
Definition: LoopInfoImpl.h:242
llvm::PatternMatch::m_ICmp
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate > m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
Definition: PatternMatch.h:1401
Dominators.h
N
#define N
llvm::AAResultsWrapperPass
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Definition: AliasAnalysis.h:1336
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:94
InstructionSimplify.h
llvm::Value::deleteValue
void deleteValue()
Delete a pointer to a generic Value.
Definition: Value.cpp:110
llvm::IntegerType::getBitWidth
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:72
Users
iv Induction Variable Users
Definition: IVUsers.cpp:52
generate
We currently generate
Definition: README.txt:597
llvm::FunctionCallee
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:165
llvm::codeview::CompileSym3Flags::Exp
@ Exp
llvm::PHINode
Definition: Instructions.h:2648
Threshold
static cl::opt< unsigned > Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), cl::init(100), cl::Hidden)
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:43
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:401
DerivedTypes.h
llvm::SmallPtrSetImpl< Instruction * >
llvm::SCEV::getType
Type * getType() const
Return the LLVM type of this SCEV expression.
Definition: ScalarEvolution.cpp:377
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: InstructionSimplify.h:44
llvm::APInt::getLowBitsSet
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:289
llvm::IntegerType::get
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:313
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1469
llvm::AnalysisUsage::addRequiredID
AnalysisUsage & addRequiredID(const void *ID)
Definition: Pass.cpp:267
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
SimplifyLimit
static cl::opt< unsigned > SimplifyLimit("hlir-simplify-limit", cl::init(10000), cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR"))
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:33
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:412
raw_ostream.h
n
The same transformation can work with an even modulo with the addition of a and shrink the compare RHS by the same amount Unless the target supports that transformation probably isn t worthwhile The transformation can also easily be made to work with non zero equality for n
Definition: README.txt:685
HexagonVolatileMemcpyName
static const char * HexagonVolatileMemcpyName
Definition: HexagonLoopIdiomRecognition.cpp:109
llvm::SetVector< Value * >
llvm::intersectModRef
LLVM_NODISCARD ModRefInfo intersectModRef(const ModRefInfo MRI1, const ModRefInfo MRI2)
Definition: AliasAnalysis.h:237
Value.h
llvm::pdb::PDB_SymType::Block
@ Block
llvm::abs
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1282
llvm::LoopStandardAnalysisResults::TLI
TargetLibraryInfo & TLI
Definition: LoopAnalysisManager.h:58
InitializePasses.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::PatternMatch::m_Shl
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1118
Debug.h
llvm::Value::users
iterator_range< user_iterator > users()
Definition: Value.h:421
llvm::AAResults::getModRefInfo
ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc)
getModRefInfo (for call sites) - Return information about whether a particular call site modifies or ...
Definition: AliasAnalysis.cpp:218
llvm::ModRefInfo::ModRef
@ ModRef
The access may reference and may modify the value stored in memory.
llvm::MemoryLocation
Representation for a specific memory location.
Definition: MemoryLocation.h:209
llvm::RecurKind::Xor
@ Xor
Bitwise or logical XOR of integers.
SetVector.h
DisableMemmoveIdiom
static cl::opt< bool > DisableMemmoveIdiom("disable-memmove-idiom", cl::Hidden, cl::init(false), cl::desc("Disable generation of memmove in loop idiom recognition"))
llvm::Use
A Use represents the edge between a Value definition and its users.
Definition: Use.h:44
SmallSet.h
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:364
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38