LLVM  8.0.0svn
LoopIdiomRecognize.cpp
Go to the documentation of this file.
1 //===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass implements an idiom recognizer that transforms simple loops into a
11 // non-loop form. In cases that this kicks in, it can be a significant
12 // performance win.
13 //
14 // If compiling for code size we avoid idiom recognition if the resulting
15 // code could be larger than the code for the original loop. One way this could
16 // happen is if the loop is not removable after idiom recognition due to the
17 // presence of non-idiom instructions. The initial implementation of the
18 // heuristics applies to idioms in multi-block loops.
19 //
20 //===----------------------------------------------------------------------===//
21 //
22 // TODO List:
23 //
24 // Future loop memory idioms to recognize:
25 // memcmp, memmove, strlen, etc.
26 // Future floating point idioms to recognize in -ffast-math mode:
27 // fpowi
28 // Future integer operation idioms to recognize:
29 // ctpop, ctlz, cttz
30 //
31 // Beware that isel's default lowering for ctpop is highly inefficient for
32 // i64 and larger types when i64 is legal and the value has few bits set. It
33 // would be good to enhance isel to emit a loop for ctpop in this case.
34 //
35 // This could recognize common matrix multiplies and dot product idioms and
36 // replace them with calls to BLAS (if linked in??).
37 //
38 //===----------------------------------------------------------------------===//
39 
40 #include "llvm/ADT/APInt.h"
41 #include "llvm/ADT/ArrayRef.h"
42 #include "llvm/ADT/DenseMap.h"
43 #include "llvm/ADT/MapVector.h"
44 #include "llvm/ADT/SetVector.h"
45 #include "llvm/ADT/SmallPtrSet.h"
46 #include "llvm/ADT/SmallVector.h"
47 #include "llvm/ADT/Statistic.h"
48 #include "llvm/ADT/StringRef.h"
51 #include "llvm/Analysis/LoopInfo.h"
52 #include "llvm/Analysis/LoopPass.h"
61 #include "llvm/IR/Attributes.h"
62 #include "llvm/IR/BasicBlock.h"
63 #include "llvm/IR/Constant.h"
64 #include "llvm/IR/Constants.h"
65 #include "llvm/IR/DataLayout.h"
66 #include "llvm/IR/DebugLoc.h"
67 #include "llvm/IR/DerivedTypes.h"
68 #include "llvm/IR/Dominators.h"
69 #include "llvm/IR/GlobalValue.h"
70 #include "llvm/IR/GlobalVariable.h"
71 #include "llvm/IR/IRBuilder.h"
72 #include "llvm/IR/InstrTypes.h"
73 #include "llvm/IR/Instruction.h"
74 #include "llvm/IR/Instructions.h"
75 #include "llvm/IR/IntrinsicInst.h"
76 #include "llvm/IR/Intrinsics.h"
77 #include "llvm/IR/LLVMContext.h"
78 #include "llvm/IR/Module.h"
79 #include "llvm/IR/PassManager.h"
80 #include "llvm/IR/Type.h"
81 #include "llvm/IR/User.h"
82 #include "llvm/IR/Value.h"
83 #include "llvm/IR/ValueHandle.h"
84 #include "llvm/Pass.h"
85 #include "llvm/Support/Casting.h"
87 #include "llvm/Support/Debug.h"
89 #include "llvm/Transforms/Scalar.h"
93 #include <algorithm>
94 #include <cassert>
95 #include <cstdint>
96 #include <utility>
97 #include <vector>
98 
99 using namespace llvm;
100 
101 #define DEBUG_TYPE "loop-idiom"
102 
103 STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
104 STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
105 
107  "use-lir-code-size-heurs",
108  cl::desc("Use loop idiom recognition code size heuristics when compiling"
109  "with -Os/-Oz"),
110  cl::init(true), cl::Hidden);
111 
112 namespace {
113 
114 class LoopIdiomRecognize {
115  Loop *CurLoop = nullptr;
116  AliasAnalysis *AA;
117  DominatorTree *DT;
118  LoopInfo *LI;
119  ScalarEvolution *SE;
120  TargetLibraryInfo *TLI;
121  const TargetTransformInfo *TTI;
122  const DataLayout *DL;
123  bool ApplyCodeSizeHeuristics;
124 
125 public:
126  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
127  LoopInfo *LI, ScalarEvolution *SE,
128  TargetLibraryInfo *TLI,
129  const TargetTransformInfo *TTI,
130  const DataLayout *DL)
131  : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL) {}
132 
133  bool runOnLoop(Loop *L);
134 
135 private:
136  using StoreList = SmallVector<StoreInst *, 8>;
137  using StoreListMap = MapVector<Value *, StoreList>;
138 
139  StoreListMap StoreRefsForMemset;
140  StoreListMap StoreRefsForMemsetPattern;
141  StoreList StoreRefsForMemcpy;
142  bool HasMemset;
143  bool HasMemsetPattern;
144  bool HasMemcpy;
145 
146  /// Return code for isLegalStore()
147  enum LegalStoreKind {
148  None = 0,
149  Memset,
150  MemsetPattern,
151  Memcpy,
152  UnorderedAtomicMemcpy,
153  DontUse // Dummy retval never to be used. Allows catching errors in retval
154  // handling.
155  };
156 
157  /// \name Countable Loop Idiom Handling
158  /// @{
159 
160  bool runOnCountableLoop();
161  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
162  SmallVectorImpl<BasicBlock *> &ExitBlocks);
163 
164  void collectStores(BasicBlock *BB);
165  LegalStoreKind isLegalStore(StoreInst *SI);
166  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
167  bool ForMemset);
168  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
169 
170  bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
171  unsigned StoreAlignment, Value *StoredVal,
172  Instruction *TheStore,
174  const SCEVAddRecExpr *Ev, const SCEV *BECount,
175  bool NegStride, bool IsLoopMemset = false);
176  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
177  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
178  bool IsLoopMemset = false);
179 
180  /// @}
181  /// \name Noncountable Loop Idiom Handling
182  /// @{
183 
184  bool runOnNoncountableLoop();
185 
186  bool recognizePopcount();
187  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
188  PHINode *CntPhi, Value *Var);
189  bool recognizeAndInsertCTLZ();
190  void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
191  PHINode *CntPhi, Value *Var, Instruction *DefX,
192  const DebugLoc &DL, bool ZeroCheck,
193  bool IsCntPhiUsedOutsideLoop);
194 
195  /// @}
196 };
197 
198 class LoopIdiomRecognizeLegacyPass : public LoopPass {
199 public:
200  static char ID;
201 
202  explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
205  }
206 
207  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
208  if (skipLoop(L))
209  return false;
210 
211  AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
212  DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
213  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
214  ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
215  TargetLibraryInfo *TLI =
216  &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
217  const TargetTransformInfo *TTI =
218  &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
219  *L->getHeader()->getParent());
220  const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
221 
222  LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL);
223  return LIR.runOnLoop(L);
224  }
225 
226  /// This transformation requires natural loop information & requires that
227  /// loop preheaders be inserted into the CFG.
228  void getAnalysisUsage(AnalysisUsage &AU) const override {
232  }
233 };
234 
235 } // end anonymous namespace
236 
238 
241  LPMUpdater &) {
242  const auto *DL = &L.getHeader()->getModule()->getDataLayout();
243 
244  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL);
245  if (!LIR.runOnLoop(&L))
246  return PreservedAnalyses::all();
247 
249 }
250 
251 INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
252  "Recognize loop idioms", false, false)
256 INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
257  "Recognize loop idioms", false, false)
258 
259 Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
260 
263  I->eraseFromParent();
264 }
265 
266 //===----------------------------------------------------------------------===//
267 //
268 // Implementation of LoopIdiomRecognize
269 //
270 //===----------------------------------------------------------------------===//
271 
272 bool LoopIdiomRecognize::runOnLoop(Loop *L) {
273  CurLoop = L;
274  // If the loop could not be converted to canonical form, it must have an
275  // indirectbr in it, just give up.
276  if (!L->getLoopPreheader())
277  return false;
278 
279  // Disable loop idiom recognition if the function's name is a common idiom.
281  if (Name == "memset" || Name == "memcpy")
282  return false;
283 
284  // Determine if code size heuristics need to be applied.
285  ApplyCodeSizeHeuristics =
287 
288  HasMemset = TLI->has(LibFunc_memset);
289  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
290  HasMemcpy = TLI->has(LibFunc_memcpy);
291 
292  if (HasMemset || HasMemsetPattern || HasMemcpy)
294  return runOnCountableLoop();
295 
296  return runOnNoncountableLoop();
297 }
298 
299 bool LoopIdiomRecognize::runOnCountableLoop() {
300  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
301  assert(!isa<SCEVCouldNotCompute>(BECount) &&
302  "runOnCountableLoop() called on a loop without a predictable"
303  "backedge-taken count");
304 
305  // If this loop executes exactly one time, then it should be peeled, not
306  // optimized by this pass.
307  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
308  if (BECst->getAPInt() == 0)
309  return false;
310 
311  SmallVector<BasicBlock *, 8> ExitBlocks;
312  CurLoop->getUniqueExitBlocks(ExitBlocks);
313 
314  LLVM_DEBUG(dbgs() << "loop-idiom Scanning: F["
315  << CurLoop->getHeader()->getParent()->getName()
316  << "] Loop %" << CurLoop->getHeader()->getName() << "\n");
317 
318  bool MadeChange = false;
319 
320  // The following transforms hoist stores/memsets into the loop pre-header.
321  // Give up if the loop has instructions may throw.
322  LoopSafetyInfo SafetyInfo;
323  SafetyInfo.computeLoopSafetyInfo(CurLoop);
324  if (SafetyInfo.anyBlockMayThrow())
325  return MadeChange;
326 
327  // Scan all the blocks in the loop that are not in subloops.
328  for (auto *BB : CurLoop->getBlocks()) {
329  // Ignore blocks in subloops.
330  if (LI->getLoopFor(BB) != CurLoop)
331  continue;
332 
333  MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
334  }
335  return MadeChange;
336 }
337 
338 static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
339  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
340  return ConstStride->getAPInt();
341 }
342 
343 /// getMemSetPatternValue - If a strided store of the specified value is safe to
344 /// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
345 /// be passed in. Otherwise, return null.
346 ///
347 /// Note that we don't ever attempt to use memset_pattern8 or 4, because these
348 /// just replicate their input array and then pass on to memset_pattern16.
350  // If the value isn't a constant, we can't promote it to being in a constant
351  // array. We could theoretically do a store to an alloca or something, but
352  // that doesn't seem worthwhile.
353  Constant *C = dyn_cast<Constant>(V);
354  if (!C)
355  return nullptr;
356 
357  // Only handle simple values that are a power of two bytes in size.
358  uint64_t Size = DL->getTypeSizeInBits(V->getType());
359  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
360  return nullptr;
361 
362  // Don't care enough about darwin/ppc to implement this.
363  if (DL->isBigEndian())
364  return nullptr;
365 
366  // Convert to size in bytes.
367  Size /= 8;
368 
369  // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
370  // if the top and bottom are the same (e.g. for vectors and large integers).
371  if (Size > 16)
372  return nullptr;
373 
374  // If the constant is exactly 16 bytes, just use it.
375  if (Size == 16)
376  return C;
377 
378  // Otherwise, we'll use an array of the constants.
379  unsigned ArraySize = 16 / Size;
380  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
381  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
382 }
383 
384 LoopIdiomRecognize::LegalStoreKind
385 LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
386  // Don't touch volatile stores.
387  if (SI->isVolatile())
388  return LegalStoreKind::None;
389  // We only want simple or unordered-atomic stores.
390  if (!SI->isUnordered())
391  return LegalStoreKind::None;
392 
393  // Don't convert stores of non-integral pointer types to memsets (which stores
394  // integers).
395  if (DL->isNonIntegralPointerType(SI->getValueOperand()->getType()))
396  return LegalStoreKind::None;
397 
398  // Avoid merging nontemporal stores.
400  return LegalStoreKind::None;
401 
402  Value *StoredVal = SI->getValueOperand();
403  Value *StorePtr = SI->getPointerOperand();
404 
405  // Reject stores that are so large that they overflow an unsigned.
406  uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
407  if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
408  return LegalStoreKind::None;
409 
410  // See if the pointer expression is an AddRec like {base,+,1} on the current
411  // loop, which indicates a strided store. If we have something else, it's a
412  // random store we can't handle.
413  const SCEVAddRecExpr *StoreEv =
414  dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
415  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
416  return LegalStoreKind::None;
417 
418  // Check to see if we have a constant stride.
419  if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
420  return LegalStoreKind::None;
421 
422  // See if the store can be turned into a memset.
423 
424  // If the stored value is a byte-wise value (like i32 -1), then it may be
425  // turned into a memset of i8 -1, assuming that all the consecutive bytes
426  // are stored. A store of i32 0x01020304 can never be turned into a memset,
427  // but it can be turned into memset_pattern if the target supports it.
428  Value *SplatValue = isBytewiseValue(StoredVal);
429  Constant *PatternValue = nullptr;
430 
431  // Note: memset and memset_pattern on unordered-atomic is yet not supported
432  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
433 
434  // If we're allowed to form a memset, and the stored value would be
435  // acceptable for memset, use it.
436  if (!UnorderedAtomic && HasMemset && SplatValue &&
437  // Verify that the stored value is loop invariant. If not, we can't
438  // promote the memset.
439  CurLoop->isLoopInvariant(SplatValue)) {
440  // It looks like we can use SplatValue.
441  return LegalStoreKind::Memset;
442  } else if (!UnorderedAtomic && HasMemsetPattern &&
443  // Don't create memset_pattern16s with address spaces.
444  StorePtr->getType()->getPointerAddressSpace() == 0 &&
445  (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
446  // It looks like we can use PatternValue!
447  return LegalStoreKind::MemsetPattern;
448  }
449 
450  // Otherwise, see if the store can be turned into a memcpy.
451  if (HasMemcpy) {
452  // Check to see if the stride matches the size of the store. If so, then we
453  // know that every byte is touched in the loop.
454  APInt Stride = getStoreStride(StoreEv);
455  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
456  if (StoreSize != Stride && StoreSize != -Stride)
457  return LegalStoreKind::None;
458 
459  // The store must be feeding a non-volatile load.
461 
462  // Only allow non-volatile loads
463  if (!LI || LI->isVolatile())
464  return LegalStoreKind::None;
465  // Only allow simple or unordered-atomic loads
466  if (!LI->isUnordered())
467  return LegalStoreKind::None;
468 
469  // See if the pointer expression is an AddRec like {base,+,1} on the current
470  // loop, which indicates a strided load. If we have something else, it's a
471  // random load we can't handle.
472  const SCEVAddRecExpr *LoadEv =
474  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
475  return LegalStoreKind::None;
476 
477  // The store and load must share the same stride.
478  if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
479  return LegalStoreKind::None;
480 
481  // Success. This store can be converted into a memcpy.
482  UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
483  return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
484  : LegalStoreKind::Memcpy;
485  }
486  // This store can't be transformed into a memset/memcpy.
487  return LegalStoreKind::None;
488 }
489 
490 void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
491  StoreRefsForMemset.clear();
492  StoreRefsForMemsetPattern.clear();
493  StoreRefsForMemcpy.clear();
494  for (Instruction &I : *BB) {
495  StoreInst *SI = dyn_cast<StoreInst>(&I);
496  if (!SI)
497  continue;
498 
499  // Make sure this is a strided store with a constant stride.
500  switch (isLegalStore(SI)) {
502  // Nothing to do
503  break;
504  case LegalStoreKind::Memset: {
505  // Find the base pointer.
507  StoreRefsForMemset[Ptr].push_back(SI);
508  } break;
509  case LegalStoreKind::MemsetPattern: {
510  // Find the base pointer.
512  StoreRefsForMemsetPattern[Ptr].push_back(SI);
513  } break;
514  case LegalStoreKind::Memcpy:
515  case LegalStoreKind::UnorderedAtomicMemcpy:
516  StoreRefsForMemcpy.push_back(SI);
517  break;
518  default:
519  assert(false && "unhandled return value");
520  break;
521  }
522  }
523 }
524 
525 /// runOnLoopBlock - Process the specified block, which lives in a counted loop
526 /// with the specified backedge count. This block is known to be in the current
527 /// loop and not in any subloops.
528 bool LoopIdiomRecognize::runOnLoopBlock(
529  BasicBlock *BB, const SCEV *BECount,
530  SmallVectorImpl<BasicBlock *> &ExitBlocks) {
531  // We can only promote stores in this block if they are unconditionally
532  // executed in the loop. For a block to be unconditionally executed, it has
533  // to dominate all the exit blocks of the loop. Verify this now.
534  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
535  if (!DT->dominates(BB, ExitBlocks[i]))
536  return false;
537 
538  bool MadeChange = false;
539  // Look for store instructions, which may be optimized to memset/memcpy.
540  collectStores(BB);
541 
542  // Look for a single store or sets of stores with a common base, which can be
543  // optimized into a memset (memset_pattern). The latter most commonly happens
544  // with structs and handunrolled loops.
545  for (auto &SL : StoreRefsForMemset)
546  MadeChange |= processLoopStores(SL.second, BECount, true);
547 
548  for (auto &SL : StoreRefsForMemsetPattern)
549  MadeChange |= processLoopStores(SL.second, BECount, false);
550 
551  // Optimize the store into a memcpy, if it feeds an similarly strided load.
552  for (auto &SI : StoreRefsForMemcpy)
553  MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
554 
555  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
556  Instruction *Inst = &*I++;
557  // Look for memset instructions, which may be optimized to a larger memset.
558  if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
559  WeakTrackingVH InstPtr(&*I);
560  if (!processLoopMemSet(MSI, BECount))
561  continue;
562  MadeChange = true;
563 
564  // If processing the memset invalidated our iterator, start over from the
565  // top of the block.
566  if (!InstPtr)
567  I = BB->begin();
568  continue;
569  }
570  }
571 
572  return MadeChange;
573 }
574 
575 /// processLoopStores - See if this store(s) can be promoted to a memset.
576 bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
577  const SCEV *BECount,
578  bool ForMemset) {
579  // Try to find consecutive stores that can be transformed into memsets.
580  SetVector<StoreInst *> Heads, Tails;
582 
583  // Do a quadratic search on all of the given stores and find
584  // all of the pairs of stores that follow each other.
585  SmallVector<unsigned, 16> IndexQueue;
586  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
587  assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
588 
589  Value *FirstStoredVal = SL[i]->getValueOperand();
590  Value *FirstStorePtr = SL[i]->getPointerOperand();
591  const SCEVAddRecExpr *FirstStoreEv =
592  cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
593  APInt FirstStride = getStoreStride(FirstStoreEv);
594  unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
595 
596  // See if we can optimize just this store in isolation.
597  if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
598  Heads.insert(SL[i]);
599  continue;
600  }
601 
602  Value *FirstSplatValue = nullptr;
603  Constant *FirstPatternValue = nullptr;
604 
605  if (ForMemset)
606  FirstSplatValue = isBytewiseValue(FirstStoredVal);
607  else
608  FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
609 
610  assert((FirstSplatValue || FirstPatternValue) &&
611  "Expected either splat value or pattern value.");
612 
613  IndexQueue.clear();
614  // If a store has multiple consecutive store candidates, search Stores
615  // array according to the sequence: from i+1 to e, then from i-1 to 0.
616  // This is because usually pairing with immediate succeeding or preceding
617  // candidate create the best chance to find memset opportunity.
618  unsigned j = 0;
619  for (j = i + 1; j < e; ++j)
620  IndexQueue.push_back(j);
621  for (j = i; j > 0; --j)
622  IndexQueue.push_back(j - 1);
623 
624  for (auto &k : IndexQueue) {
625  assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
626  Value *SecondStorePtr = SL[k]->getPointerOperand();
627  const SCEVAddRecExpr *SecondStoreEv =
628  cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
629  APInt SecondStride = getStoreStride(SecondStoreEv);
630 
631  if (FirstStride != SecondStride)
632  continue;
633 
634  Value *SecondStoredVal = SL[k]->getValueOperand();
635  Value *SecondSplatValue = nullptr;
636  Constant *SecondPatternValue = nullptr;
637 
638  if (ForMemset)
639  SecondSplatValue = isBytewiseValue(SecondStoredVal);
640  else
641  SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
642 
643  assert((SecondSplatValue || SecondPatternValue) &&
644  "Expected either splat value or pattern value.");
645 
646  if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
647  if (ForMemset) {
648  if (FirstSplatValue != SecondSplatValue)
649  continue;
650  } else {
651  if (FirstPatternValue != SecondPatternValue)
652  continue;
653  }
654  Tails.insert(SL[k]);
655  Heads.insert(SL[i]);
656  ConsecutiveChain[SL[i]] = SL[k];
657  break;
658  }
659  }
660  }
661 
662  // We may run into multiple chains that merge into a single chain. We mark the
663  // stores that we transformed so that we don't visit the same store twice.
664  SmallPtrSet<Value *, 16> TransformedStores;
665  bool Changed = false;
666 
667  // For stores that start but don't end a link in the chain:
668  for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
669  it != e; ++it) {
670  if (Tails.count(*it))
671  continue;
672 
673  // We found a store instr that starts a chain. Now follow the chain and try
674  // to transform it.
675  SmallPtrSet<Instruction *, 8> AdjacentStores;
676  StoreInst *I = *it;
677 
678  StoreInst *HeadStore = I;
679  unsigned StoreSize = 0;
680 
681  // Collect the chain into a list.
682  while (Tails.count(I) || Heads.count(I)) {
683  if (TransformedStores.count(I))
684  break;
685  AdjacentStores.insert(I);
686 
687  StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
688  // Move to the next value in the chain.
689  I = ConsecutiveChain[I];
690  }
691 
692  Value *StoredVal = HeadStore->getValueOperand();
693  Value *StorePtr = HeadStore->getPointerOperand();
694  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
695  APInt Stride = getStoreStride(StoreEv);
696 
697  // Check to see if the stride matches the size of the stores. If so, then
698  // we know that every byte is touched in the loop.
699  if (StoreSize != Stride && StoreSize != -Stride)
700  continue;
701 
702  bool NegStride = StoreSize == -Stride;
703 
704  if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(),
705  StoredVal, HeadStore, AdjacentStores, StoreEv,
706  BECount, NegStride)) {
707  TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
708  Changed = true;
709  }
710  }
711 
712  return Changed;
713 }
714 
715 /// processLoopMemSet - See if this memset can be promoted to a large memset.
716 bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
717  const SCEV *BECount) {
718  // We can only handle non-volatile memsets with a constant size.
719  if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
720  return false;
721 
722  // If we're not allowed to hack on memset, we fail.
723  if (!HasMemset)
724  return false;
725 
726  Value *Pointer = MSI->getDest();
727 
728  // See if the pointer expression is an AddRec like {base,+,1} on the current
729  // loop, which indicates a strided store. If we have something else, it's a
730  // random store we can't handle.
731  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
732  if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
733  return false;
734 
735  // Reject memsets that are so large that they overflow an unsigned.
736  uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
737  if ((SizeInBytes >> 32) != 0)
738  return false;
739 
740  // Check to see if the stride matches the size of the memset. If so, then we
741  // know that every byte is touched in the loop.
742  const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
743  if (!ConstStride)
744  return false;
745 
746  APInt Stride = ConstStride->getAPInt();
747  if (SizeInBytes != Stride && SizeInBytes != -Stride)
748  return false;
749 
750  // Verify that the memset value is loop invariant. If not, we can't promote
751  // the memset.
752  Value *SplatValue = MSI->getValue();
753  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
754  return false;
755 
757  MSIs.insert(MSI);
758  bool NegStride = SizeInBytes == -Stride;
759  return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
760  MSI->getDestAlignment(), SplatValue, MSI, MSIs,
761  Ev, BECount, NegStride, /*IsLoopMemset=*/true);
762 }
763 
764 /// mayLoopAccessLocation - Return true if the specified loop might access the
765 /// specified pointer location, which is a loop-strided access. The 'Access'
766 /// argument specifies what the verboten forms of access are (read or write).
767 static bool
769  const SCEV *BECount, unsigned StoreSize,
770  AliasAnalysis &AA,
771  SmallPtrSetImpl<Instruction *> &IgnoredStores) {
772  // Get the location that may be stored across the loop. Since the access is
773  // strided positively through memory, we say that the modified location starts
774  // at the pointer and has infinite size.
775  uint64_t AccessSize = MemoryLocation::UnknownSize;
776 
777  // If the loop iterates a fixed number of times, we can refine the access size
778  // to be exactly the size of the memset, which is (BECount+1)*StoreSize
779  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
780  AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize;
781 
782  // TODO: For this to be really effective, we have to dive into the pointer
783  // operand in the store. Store to &A[i] of 100 will always return may alias
784  // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
785  // which will then no-alias a store to &A[100].
786  MemoryLocation StoreLoc(Ptr, AccessSize);
787 
788  for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
789  ++BI)
790  for (Instruction &I : **BI)
791  if (IgnoredStores.count(&I) == 0 &&
793  intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
794  return true;
795 
796  return false;
797 }
798 
799 // If we have a negative stride, Start refers to the end of the memory location
800 // we're trying to memset. Therefore, we need to recompute the base pointer,
801 // which is just Start - BECount*Size.
802 static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
803  Type *IntPtr, unsigned StoreSize,
804  ScalarEvolution *SE) {
805  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
806  if (StoreSize != 1)
807  Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
808  SCEV::FlagNUW);
809  return SE->getMinusSCEV(Start, Index);
810 }
811 
812 /// Compute the number of bytes as a SCEV from the backedge taken count.
813 ///
814 /// This also maps the SCEV into the provided type and tries to handle the
815 /// computation in a way that will fold cleanly.
816 static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
817  unsigned StoreSize, Loop *CurLoop,
818  const DataLayout *DL, ScalarEvolution *SE) {
819  const SCEV *NumBytesS;
820  // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
821  // pointer size if it isn't already.
822  //
823  // If we're going to need to zero extend the BE count, check if we can add
824  // one to it prior to zero extending without overflow. Provided this is safe,
825  // it allows better simplification of the +1.
826  if (DL->getTypeSizeInBits(BECount->getType()) <
827  DL->getTypeSizeInBits(IntPtr) &&
829  CurLoop, ICmpInst::ICMP_NE, BECount,
830  SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
831  NumBytesS = SE->getZeroExtendExpr(
832  SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
833  IntPtr);
834  } else {
835  NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
836  SE->getOne(IntPtr), SCEV::FlagNUW);
837  }
838 
839  // And scale it based on the store size.
840  if (StoreSize != 1) {
841  NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
842  SCEV::FlagNUW);
843  }
844  return NumBytesS;
845 }
846 
847 /// processLoopStridedStore - We see a strided store of some value. If we can
848 /// transform this into a memset or memset_pattern in the loop preheader, do so.
849 bool LoopIdiomRecognize::processLoopStridedStore(
850  Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment,
851  Value *StoredVal, Instruction *TheStore,
853  const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
854  Value *SplatValue = isBytewiseValue(StoredVal);
855  Constant *PatternValue = nullptr;
856 
857  if (!SplatValue)
858  PatternValue = getMemSetPatternValue(StoredVal, DL);
859 
860  assert((SplatValue || PatternValue) &&
861  "Expected either splat value or pattern value.");
862 
863  // The trip count of the loop and the base pointer of the addrec SCEV is
864  // guaranteed to be loop invariant, which means that it should dominate the
865  // header. This allows us to insert code for it in the preheader.
866  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
867  BasicBlock *Preheader = CurLoop->getLoopPreheader();
868  IRBuilder<> Builder(Preheader->getTerminator());
869  SCEVExpander Expander(*SE, *DL, "loop-idiom");
870 
871  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
872  Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS);
873 
874  const SCEV *Start = Ev->getStart();
875  // Handle negative strided loops.
876  if (NegStride)
877  Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE);
878 
879  // TODO: ideally we should still be able to generate memset if SCEV expander
880  // is taught to generate the dependencies at the latest point.
881  if (!isSafeToExpand(Start, *SE))
882  return false;
883 
884  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
885  // this into a memset in the loop preheader now if we want. However, this
886  // would be unsafe to do if there is anything else in the loop that may read
887  // or write to the aliased location. Check for any overlap by generating the
888  // base pointer and checking the region.
889  Value *BasePtr =
890  Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
891  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
892  StoreSize, *AA, Stores)) {
893  Expander.clear();
894  // If we generated new code for the base pointer, clean up.
896  return false;
897  }
898 
899  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
900  return false;
901 
902  // Okay, everything looks good, insert the memset.
903 
904  const SCEV *NumBytesS =
905  getNumBytes(BECount, IntPtr, StoreSize, CurLoop, DL, SE);
906 
907  // TODO: ideally we should still be able to generate memset if SCEV expander
908  // is taught to generate the dependencies at the latest point.
909  if (!isSafeToExpand(NumBytesS, *SE))
910  return false;
911 
912  Value *NumBytes =
913  Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
914 
915  CallInst *NewCall;
916  if (SplatValue) {
917  NewCall =
918  Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment);
919  } else {
920  // Everything is emitted in default address space
921  Type *Int8PtrTy = DestInt8PtrTy;
922 
923  Module *M = TheStore->getModule();
924  Value *MSP =
925  M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(),
926  Int8PtrTy, Int8PtrTy, IntPtr);
927  inferLibFuncAttributes(*M->getFunction("memset_pattern16"), *TLI);
928 
929  // Otherwise we should form a memset_pattern16. PatternValue is known to be
930  // an constant array of 16-bytes. Plop the value into a mergable global.
931  GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
933  PatternValue, ".memset_pattern");
934  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
935  GV->setAlignment(16);
936  Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
937  NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
938  }
939 
940  LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
941  << " from store to: " << *Ev << " at: " << *TheStore
942  << "\n");
943  NewCall->setDebugLoc(TheStore->getDebugLoc());
944 
945  // Okay, the memset has been formed. Zap the original store and anything that
946  // feeds into it.
947  for (auto *I : Stores)
949  ++NumMemSet;
950  return true;
951 }
952 
953 /// If the stored value is a strided load in the same loop with the same stride
954 /// this may be transformable into a memcpy. This kicks in for stuff like
955 /// for (i) A[i] = B[i];
956 bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
957  const SCEV *BECount) {
958  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
959 
960  Value *StorePtr = SI->getPointerOperand();
961  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
962  APInt Stride = getStoreStride(StoreEv);
963  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
964  bool NegStride = StoreSize == -Stride;
965 
966  // The store must be feeding a non-volatile load.
967  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
968  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
969 
970  // See if the pointer expression is an AddRec like {base,+,1} on the current
971  // loop, which indicates a strided load. If we have something else, it's a
972  // random load we can't handle.
973  const SCEVAddRecExpr *LoadEv =
974  cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
975 
976  // The trip count of the loop and the base pointer of the addrec SCEV is
977  // guaranteed to be loop invariant, which means that it should dominate the
978  // header. This allows us to insert code for it in the preheader.
979  BasicBlock *Preheader = CurLoop->getLoopPreheader();
980  IRBuilder<> Builder(Preheader->getTerminator());
981  SCEVExpander Expander(*SE, *DL, "loop-idiom");
982 
983  const SCEV *StrStart = StoreEv->getStart();
984  unsigned StrAS = SI->getPointerAddressSpace();
985  Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS);
986 
987  // Handle negative strided loops.
988  if (NegStride)
989  StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE);
990 
991  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
992  // this into a memcpy in the loop preheader now if we want. However, this
993  // would be unsafe to do if there is anything else in the loop that may read
994  // or write the memory region we're storing to. This includes the load that
995  // feeds the stores. Check for an alias by generating the base address and
996  // checking everything.
997  Value *StoreBasePtr = Expander.expandCodeFor(
998  StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
999 
1001  Stores.insert(SI);
1002  if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1003  StoreSize, *AA, Stores)) {
1004  Expander.clear();
1005  // If we generated new code for the base pointer, clean up.
1006  RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
1007  return false;
1008  }
1009 
1010  const SCEV *LdStart = LoadEv->getStart();
1011  unsigned LdAS = LI->getPointerAddressSpace();
1012 
1013  // Handle negative strided loops.
1014  if (NegStride)
1015  LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE);
1016 
1017  // For a memcpy, we have to make sure that the input array is not being
1018  // mutated by the loop.
1019  Value *LoadBasePtr = Expander.expandCodeFor(
1020  LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
1021 
1022  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
1023  StoreSize, *AA, Stores)) {
1024  Expander.clear();
1025  // If we generated new code for the base pointer, clean up.
1027  RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
1028  return false;
1029  }
1030 
1031  if (avoidLIRForMultiBlockLoop())
1032  return false;
1033 
1034  // Okay, everything is safe, we can transform this!
1035 
1036  const SCEV *NumBytesS =
1037  getNumBytes(BECount, IntPtrTy, StoreSize, CurLoop, DL, SE);
1038 
1039  Value *NumBytes =
1040  Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
1041 
1042  CallInst *NewCall = nullptr;
1043  // Check whether to generate an unordered atomic memcpy:
1044  // If the load or store are atomic, then they must necessarily be unordered
1045  // by previous checks.
1046  if (!SI->isAtomic() && !LI->isAtomic())
1047  NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlignment(),
1048  LoadBasePtr, LI->getAlignment(), NumBytes);
1049  else {
1050  // We cannot allow unaligned ops for unordered load/store, so reject
1051  // anything where the alignment isn't at least the element size.
1052  unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
1053  if (Align < StoreSize)
1054  return false;
1055 
1056  // If the element.atomic memcpy is not lowered into explicit
1057  // loads/stores later, then it will be lowered into an element-size
1058  // specific lib call. If the lib call doesn't exist for our store size, then
1059  // we shouldn't generate the memcpy.
1060  if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1061  return false;
1062 
1063  // Create the call.
1064  // Note that unordered atomic loads/stores are *required* by the spec to
1065  // have an alignment but non-atomic loads/stores may not.
1066  NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1067  StoreBasePtr, SI->getAlignment(), LoadBasePtr, LI->getAlignment(),
1068  NumBytes, StoreSize);
1069  }
1070  NewCall->setDebugLoc(SI->getDebugLoc());
1071 
1072  LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
1073  << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
1074  << " from store ptr=" << *StoreEv << " at: " << *SI
1075  << "\n");
1076 
1077  // Okay, the memcpy has been formed. Zap the original store and anything that
1078  // feeds into it.
1080  ++NumMemCpy;
1081  return true;
1082 }
1083 
1084 // When compiling for codesize we avoid idiom recognition for a multi-block loop
1085 // unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1086 //
1087 bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1088  bool IsLoopMemset) {
1089  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1090  if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) {
1091  LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
1092  << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
1093  << " avoided: multi-block top-level loop\n");
1094  return true;
1095  }
1096  }
1097 
1098  return false;
1099 }
1100 
1101 bool LoopIdiomRecognize::runOnNoncountableLoop() {
1102  return recognizePopcount() || recognizeAndInsertCTLZ();
1103 }
1104 
1105 /// Check if the given conditional branch is based on the comparison between
1106 /// a variable and zero, and if the variable is non-zero, the control yields to
1107 /// the loop entry. If the branch matches the behavior, the variable involved
1108 /// in the comparison is returned. This function will be called to see if the
1109 /// precondition and postcondition of the loop are in desirable form.
1110 static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) {
1111  if (!BI || !BI->isConditional())
1112  return nullptr;
1113 
1114  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1115  if (!Cond)
1116  return nullptr;
1117 
1118  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1119  if (!CmpZero || !CmpZero->isZero())
1120  return nullptr;
1121 
1122  ICmpInst::Predicate Pred = Cond->getPredicate();
1123  if ((Pred == ICmpInst::ICMP_NE && BI->getSuccessor(0) == LoopEntry) ||
1124  (Pred == ICmpInst::ICMP_EQ && BI->getSuccessor(1) == LoopEntry))
1125  return Cond->getOperand(0);
1126 
1127  return nullptr;
1128 }
1129 
1130 // Check if the recurrence variable `VarX` is in the right form to create
1131 // the idiom. Returns the value coerced to a PHINode if so.
1133  BasicBlock *LoopEntry) {
1134  auto *PhiX = dyn_cast<PHINode>(VarX);
1135  if (PhiX && PhiX->getParent() == LoopEntry &&
1136  (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1137  return PhiX;
1138  return nullptr;
1139 }
1140 
1141 /// Return true iff the idiom is detected in the loop.
1142 ///
1143 /// Additionally:
1144 /// 1) \p CntInst is set to the instruction counting the population bit.
1145 /// 2) \p CntPhi is set to the corresponding phi node.
1146 /// 3) \p Var is set to the value whose population bits are being counted.
1147 ///
1148 /// The core idiom we are trying to detect is:
1149 /// \code
1150 /// if (x0 != 0)
1151 /// goto loop-exit // the precondition of the loop
1152 /// cnt0 = init-val;
1153 /// do {
1154 /// x1 = phi (x0, x2);
1155 /// cnt1 = phi(cnt0, cnt2);
1156 ///
1157 /// cnt2 = cnt1 + 1;
1158 /// ...
1159 /// x2 = x1 & (x1 - 1);
1160 /// ...
1161 /// } while(x != 0);
1162 ///
1163 /// loop-exit:
1164 /// \endcode
1165 static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1166  Instruction *&CntInst, PHINode *&CntPhi,
1167  Value *&Var) {
1168  // step 1: Check to see if the look-back branch match this pattern:
1169  // "if (a!=0) goto loop-entry".
1170  BasicBlock *LoopEntry;
1171  Instruction *DefX2, *CountInst;
1172  Value *VarX1, *VarX0;
1173  PHINode *PhiX, *CountPhi;
1174 
1175  DefX2 = CountInst = nullptr;
1176  VarX1 = VarX0 = nullptr;
1177  PhiX = CountPhi = nullptr;
1178  LoopEntry = *(CurLoop->block_begin());
1179 
1180  // step 1: Check if the loop-back branch is in desirable form.
1181  {
1182  if (Value *T = matchCondition(
1183  dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1184  DefX2 = dyn_cast<Instruction>(T);
1185  else
1186  return false;
1187  }
1188 
1189  // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1190  {
1191  if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1192  return false;
1193 
1194  BinaryOperator *SubOneOp;
1195 
1196  if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1197  VarX1 = DefX2->getOperand(1);
1198  else {
1199  VarX1 = DefX2->getOperand(0);
1200  SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1201  }
1202  if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1203  return false;
1204 
1205  ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1206  if (!Dec ||
1207  !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1208  (SubOneOp->getOpcode() == Instruction::Add &&
1209  Dec->isMinusOne()))) {
1210  return false;
1211  }
1212  }
1213 
1214  // step 3: Check the recurrence of variable X
1215  PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1216  if (!PhiX)
1217  return false;
1218 
1219  // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
1220  {
1221  CountInst = nullptr;
1222  for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1223  IterE = LoopEntry->end();
1224  Iter != IterE; Iter++) {
1225  Instruction *Inst = &*Iter;
1226  if (Inst->getOpcode() != Instruction::Add)
1227  continue;
1228 
1229  ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1230  if (!Inc || !Inc->isOne())
1231  continue;
1232 
1233  PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1234  if (!Phi)
1235  continue;
1236 
1237  // Check if the result of the instruction is live of the loop.
1238  bool LiveOutLoop = false;
1239  for (User *U : Inst->users()) {
1240  if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1241  LiveOutLoop = true;
1242  break;
1243  }
1244  }
1245 
1246  if (LiveOutLoop) {
1247  CountInst = Inst;
1248  CountPhi = Phi;
1249  break;
1250  }
1251  }
1252 
1253  if (!CountInst)
1254  return false;
1255  }
1256 
1257  // step 5: check if the precondition is in this form:
1258  // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1259  {
1260  auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1261  Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1262  if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1263  return false;
1264 
1265  CntInst = CountInst;
1266  CntPhi = CountPhi;
1267  Var = T;
1268  }
1269 
1270  return true;
1271 }
1272 
1273 /// Return true if the idiom is detected in the loop.
1274 ///
1275 /// Additionally:
1276 /// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1277 /// or nullptr if there is no such.
1278 /// 2) \p CntPhi is set to the corresponding phi node
1279 /// or nullptr if there is no such.
1280 /// 3) \p Var is set to the value whose CTLZ could be used.
1281 /// 4) \p DefX is set to the instruction calculating Loop exit condition.
1282 ///
1283 /// The core idiom we are trying to detect is:
1284 /// \code
1285 /// if (x0 == 0)
1286 /// goto loop-exit // the precondition of the loop
1287 /// cnt0 = init-val;
1288 /// do {
1289 /// x = phi (x0, x.next); //PhiX
1290 /// cnt = phi(cnt0, cnt.next);
1291 ///
1292 /// cnt.next = cnt + 1;
1293 /// ...
1294 /// x.next = x >> 1; // DefX
1295 /// ...
1296 /// } while(x.next != 0);
1297 ///
1298 /// loop-exit:
1299 /// \endcode
1300 static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
1301  Instruction *&CntInst, PHINode *&CntPhi,
1302  Instruction *&DefX) {
1303  BasicBlock *LoopEntry;
1304  Value *VarX = nullptr;
1305 
1306  DefX = nullptr;
1307  PhiX = nullptr;
1308  CntInst = nullptr;
1309  CntPhi = nullptr;
1310  LoopEntry = *(CurLoop->block_begin());
1311 
1312  // step 1: Check if the loop-back branch is in desirable form.
1313  if (Value *T = matchCondition(
1314  dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1315  DefX = dyn_cast<Instruction>(T);
1316  else
1317  return false;
1318 
1319  // step 2: detect instructions corresponding to "x.next = x >> 1"
1320  if (!DefX || (DefX->getOpcode() != Instruction::AShr &&
1321  DefX->getOpcode() != Instruction::LShr))
1322  return false;
1323  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1324  if (!Shft || !Shft->isOne())
1325  return false;
1326  VarX = DefX->getOperand(0);
1327 
1328  // step 3: Check the recurrence of variable X
1329  PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1330  if (!PhiX)
1331  return false;
1332 
1333  // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
1334  // TODO: We can skip the step. If loop trip count is known (CTLZ),
1335  // then all uses of "cnt.next" could be optimized to the trip count
1336  // plus "cnt0". Currently it is not optimized.
1337  // This step could be used to detect POPCNT instruction:
1338  // cnt.next = cnt + (x.next & 1)
1339  for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1340  IterE = LoopEntry->end();
1341  Iter != IterE; Iter++) {
1342  Instruction *Inst = &*Iter;
1343  if (Inst->getOpcode() != Instruction::Add)
1344  continue;
1345 
1346  ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1347  if (!Inc || !Inc->isOne())
1348  continue;
1349 
1350  PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1351  if (!Phi)
1352  continue;
1353 
1354  CntInst = Inst;
1355  CntPhi = Phi;
1356  break;
1357  }
1358  if (!CntInst)
1359  return false;
1360 
1361  return true;
1362 }
1363 
1364 /// Recognize CTLZ idiom in a non-countable loop and convert the loop
1365 /// to countable (with CTLZ trip count).
1366 /// If CTLZ inserted as a new trip count returns true; otherwise, returns false.
1367 bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
1368  // Give up if the loop has multiple blocks or multiple backedges.
1369  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1370  return false;
1371 
1372  Instruction *CntInst, *DefX;
1373  PHINode *CntPhi, *PhiX;
1374  if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX))
1375  return false;
1376 
1377  bool IsCntPhiUsedOutsideLoop = false;
1378  for (User *U : CntPhi->users())
1379  if (!CurLoop->contains(cast<Instruction>(U))) {
1380  IsCntPhiUsedOutsideLoop = true;
1381  break;
1382  }
1383  bool IsCntInstUsedOutsideLoop = false;
1384  for (User *U : CntInst->users())
1385  if (!CurLoop->contains(cast<Instruction>(U))) {
1386  IsCntInstUsedOutsideLoop = true;
1387  break;
1388  }
1389  // If both CntInst and CntPhi are used outside the loop the profitability
1390  // is questionable.
1391  if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1392  return false;
1393 
1394  // For some CPUs result of CTLZ(X) intrinsic is undefined
1395  // when X is 0. If we can not guarantee X != 0, we need to check this
1396  // when expand.
1397  bool ZeroCheck = false;
1398  // It is safe to assume Preheader exist as it was checked in
1399  // parent function RunOnLoop.
1400  BasicBlock *PH = CurLoop->getLoopPreheader();
1401  Value *InitX = PhiX->getIncomingValueForBlock(PH);
1402 
1403  // Make sure the initial value can't be negative otherwise the ashr in the
1404  // loop might never reach zero which would make the loop infinite.
1405  if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, *DL))
1406  return false;
1407 
1408  // If we are using the count instruction outside the loop, make sure we
1409  // have a zero check as a precondition. Without the check the loop would run
1410  // one iteration for before any check of the input value. This means 0 and 1
1411  // would have identical behavior in the original loop and thus
1412  if (!IsCntPhiUsedOutsideLoop) {
1413  auto *PreCondBB = PH->getSinglePredecessor();
1414  if (!PreCondBB)
1415  return false;
1416  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1417  if (!PreCondBI)
1418  return false;
1419  if (matchCondition(PreCondBI, PH) != InitX)
1420  return false;
1421  ZeroCheck = true;
1422  }
1423 
1424  // Check if CTLZ intrinsic is profitable. Assume it is always profitable
1425  // if we delete the loop (the loop has only 6 instructions):
1426  // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1427  // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1428  // %shr = ashr %n.addr.0, 1
1429  // %tobool = icmp eq %shr, 0
1430  // %inc = add nsw %i.0, 1
1431  // br i1 %tobool
1432 
1433  const Value *Args[] =
1434  {InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
1435  : ConstantInt::getFalse(InitX->getContext())};
1436  if (CurLoop->getHeader()->size() != 6 &&
1437  TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
1439  return false;
1440 
1441  transformLoopToCountable(PH, CntInst, CntPhi, InitX, DefX,
1442  DefX->getDebugLoc(), ZeroCheck,
1443  IsCntPhiUsedOutsideLoop);
1444  return true;
1445 }
1446 
1447 /// Recognizes a population count idiom in a non-countable loop.
1448 ///
1449 /// If detected, transforms the relevant code to issue the popcount intrinsic
1450 /// function call, and returns true; otherwise, returns false.
1451 bool LoopIdiomRecognize::recognizePopcount() {
1453  return false;
1454 
1455  // Counting population are usually conducted by few arithmetic instructions.
1456  // Such instructions can be easily "absorbed" by vacant slots in a
1457  // non-compact loop. Therefore, recognizing popcount idiom only makes sense
1458  // in a compact loop.
1459 
1460  // Give up if the loop has multiple blocks or multiple backedges.
1461  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1462  return false;
1463 
1464  BasicBlock *LoopBody = *(CurLoop->block_begin());
1465  if (LoopBody->size() >= 20) {
1466  // The loop is too big, bail out.
1467  return false;
1468  }
1469 
1470  // It should have a preheader containing nothing but an unconditional branch.
1471  BasicBlock *PH = CurLoop->getLoopPreheader();
1472  if (!PH || &PH->front() != PH->getTerminator())
1473  return false;
1474  auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1475  if (!EntryBI || EntryBI->isConditional())
1476  return false;
1477 
1478  // It should have a precondition block where the generated popcount intrinsic
1479  // function can be inserted.
1480  auto *PreCondBB = PH->getSinglePredecessor();
1481  if (!PreCondBB)
1482  return false;
1483  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1484  if (!PreCondBI || PreCondBI->isUnconditional())
1485  return false;
1486 
1487  Instruction *CntInst;
1488  PHINode *CntPhi;
1489  Value *Val;
1490  if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1491  return false;
1492 
1493  transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1494  return true;
1495 }
1496 
1498  const DebugLoc &DL) {
1499  Value *Ops[] = {Val};
1500  Type *Tys[] = {Val->getType()};
1501 
1502  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1503  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1504  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1505  CI->setDebugLoc(DL);
1506 
1507  return CI;
1508 }
1509 
1511  const DebugLoc &DL, bool ZeroCheck) {
1512  Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
1513  Type *Tys[] = {Val->getType()};
1514 
1515  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1516  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys);
1517  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1518  CI->setDebugLoc(DL);
1519 
1520  return CI;
1521 }
1522 
1523 /// Transform the following loop:
1524 /// loop:
1525 /// CntPhi = PHI [Cnt0, CntInst]
1526 /// PhiX = PHI [InitX, DefX]
1527 /// CntInst = CntPhi + 1
1528 /// DefX = PhiX >> 1
1529 /// LOOP_BODY
1530 /// Br: loop if (DefX != 0)
1531 /// Use(CntPhi) or Use(CntInst)
1532 ///
1533 /// Into:
1534 /// If CntPhi used outside the loop:
1535 /// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
1536 /// Count = CountPrev + 1
1537 /// else
1538 /// Count = BitWidth(InitX) - CTLZ(InitX)
1539 /// loop:
1540 /// CntPhi = PHI [Cnt0, CntInst]
1541 /// PhiX = PHI [InitX, DefX]
1542 /// PhiCount = PHI [Count, Dec]
1543 /// CntInst = CntPhi + 1
1544 /// DefX = PhiX >> 1
1545 /// Dec = PhiCount - 1
1546 /// LOOP_BODY
1547 /// Br: loop if (Dec != 0)
1548 /// Use(CountPrev + Cnt0) // Use(CntPhi)
1549 /// or
1550 /// Use(Count + Cnt0) // Use(CntInst)
1551 ///
1552 /// If LOOP_BODY is empty the loop will be deleted.
1553 /// If CntInst and DefX are not used in LOOP_BODY they will be removed.
1554 void LoopIdiomRecognize::transformLoopToCountable(
1555  BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX,
1556  Instruction *DefX, const DebugLoc &DL, bool ZeroCheck,
1557  bool IsCntPhiUsedOutsideLoop) {
1558  BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
1559 
1560  // Step 1: Insert the CTLZ instruction at the end of the preheader block
1561  // Count = BitWidth - CTLZ(InitX);
1562  // If there are uses of CntPhi create:
1563  // CountPrev = BitWidth - CTLZ(InitX >> 1);
1564  IRBuilder<> Builder(PreheaderBr);
1565  Builder.SetCurrentDebugLocation(DL);
1566  Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext;
1567 
1568  if (IsCntPhiUsedOutsideLoop) {
1569  if (DefX->getOpcode() == Instruction::AShr)
1570  InitXNext =
1571  Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1));
1572  else if (DefX->getOpcode() == Instruction::LShr)
1573  InitXNext =
1574  Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1));
1575  else
1576  llvm_unreachable("Unexpected opcode!");
1577  } else
1578  InitXNext = InitX;
1579  CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck);
1580  Count = Builder.CreateSub(
1581  ConstantInt::get(CTLZ->getType(),
1582  CTLZ->getType()->getIntegerBitWidth()),
1583  CTLZ);
1584  if (IsCntPhiUsedOutsideLoop) {
1585  CountPrev = Count;
1586  Count = Builder.CreateAdd(
1587  CountPrev,
1588  ConstantInt::get(CountPrev->getType(), 1));
1589  }
1590  if (IsCntPhiUsedOutsideLoop)
1591  NewCount = Builder.CreateZExtOrTrunc(CountPrev,
1592  cast<IntegerType>(CntInst->getType()));
1593  else
1594  NewCount = Builder.CreateZExtOrTrunc(Count,
1595  cast<IntegerType>(CntInst->getType()));
1596 
1597  // If the CTLZ counter's initial value is not zero, insert Add Inst.
1598  Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
1599  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1600  if (!InitConst || !InitConst->isZero())
1601  NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1602 
1603  // Step 2: Insert new IV and loop condition:
1604  // loop:
1605  // ...
1606  // PhiCount = PHI [Count, Dec]
1607  // ...
1608  // Dec = PhiCount - 1
1609  // ...
1610  // Br: loop if (Dec != 0)
1611  BasicBlock *Body = *(CurLoop->block_begin());
1612  auto *LbBr = cast<BranchInst>(Body->getTerminator());
1613  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1614  Type *Ty = Count->getType();
1615 
1616  PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1617 
1618  Builder.SetInsertPoint(LbCond);
1619  Instruction *TcDec = cast<Instruction>(
1620  Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1621  "tcdec", false, true));
1622 
1623  TcPhi->addIncoming(Count, Preheader);
1624  TcPhi->addIncoming(TcDec, Body);
1625 
1626  CmpInst::Predicate Pred =
1627  (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
1628  LbCond->setPredicate(Pred);
1629  LbCond->setOperand(0, TcDec);
1630  LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1631 
1632  // Step 3: All the references to the original counter outside
1633  // the loop are replaced with the NewCount -- the value returned from
1634  // __builtin_ctlz(x).
1635  if (IsCntPhiUsedOutsideLoop)
1636  CntPhi->replaceUsesOutsideBlock(NewCount, Body);
1637  else
1638  CntInst->replaceUsesOutsideBlock(NewCount, Body);
1639 
1640  // step 4: Forget the "non-computable" trip-count SCEV associated with the
1641  // loop. The loop would otherwise not be deleted even if it becomes empty.
1642  SE->forgetLoop(CurLoop);
1643 }
1644 
1645 void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
1646  Instruction *CntInst,
1647  PHINode *CntPhi, Value *Var) {
1648  BasicBlock *PreHead = CurLoop->getLoopPreheader();
1649  auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
1650  const DebugLoc &DL = CntInst->getDebugLoc();
1651 
1652  // Assuming before transformation, the loop is following:
1653  // if (x) // the precondition
1654  // do { cnt++; x &= x - 1; } while(x);
1655 
1656  // Step 1: Insert the ctpop instruction at the end of the precondition block
1657  IRBuilder<> Builder(PreCondBr);
1658  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
1659  {
1660  PopCnt = createPopcntIntrinsic(Builder, Var, DL);
1661  NewCount = PopCntZext =
1662  Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
1663 
1664  if (NewCount != PopCnt)
1665  (cast<Instruction>(NewCount))->setDebugLoc(DL);
1666 
1667  // TripCnt is exactly the number of iterations the loop has
1668  TripCnt = NewCount;
1669 
1670  // If the population counter's initial value is not zero, insert Add Inst.
1671  Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
1672  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1673  if (!InitConst || !InitConst->isZero()) {
1674  NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1675  (cast<Instruction>(NewCount))->setDebugLoc(DL);
1676  }
1677  }
1678 
1679  // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
1680  // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
1681  // function would be partial dead code, and downstream passes will drag
1682  // it back from the precondition block to the preheader.
1683  {
1684  ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
1685 
1686  Value *Opnd0 = PopCntZext;
1687  Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
1688  if (PreCond->getOperand(0) != Var)
1689  std::swap(Opnd0, Opnd1);
1690 
1691  ICmpInst *NewPreCond = cast<ICmpInst>(
1692  Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
1693  PreCondBr->setCondition(NewPreCond);
1694 
1696  }
1697 
1698  // Step 3: Note that the population count is exactly the trip count of the
1699  // loop in question, which enable us to convert the loop from noncountable
1700  // loop into a countable one. The benefit is twofold:
1701  //
1702  // - If the loop only counts population, the entire loop becomes dead after
1703  // the transformation. It is a lot easier to prove a countable loop dead
1704  // than to prove a noncountable one. (In some C dialects, an infinite loop
1705  // isn't dead even if it computes nothing useful. In general, DCE needs
1706  // to prove a noncountable loop finite before safely delete it.)
1707  //
1708  // - If the loop also performs something else, it remains alive.
1709  // Since it is transformed to countable form, it can be aggressively
1710  // optimized by some optimizations which are in general not applicable
1711  // to a noncountable loop.
1712  //
1713  // After this step, this loop (conceptually) would look like following:
1714  // newcnt = __builtin_ctpop(x);
1715  // t = newcnt;
1716  // if (x)
1717  // do { cnt++; x &= x-1; t--) } while (t > 0);
1718  BasicBlock *Body = *(CurLoop->block_begin());
1719  {
1720  auto *LbBr = cast<BranchInst>(Body->getTerminator());
1721  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1722  Type *Ty = TripCnt->getType();
1723 
1724  PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1725 
1726  Builder.SetInsertPoint(LbCond);
1727  Instruction *TcDec = cast<Instruction>(
1728  Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1729  "tcdec", false, true));
1730 
1731  TcPhi->addIncoming(TripCnt, PreHead);
1732  TcPhi->addIncoming(TcDec, Body);
1733 
1734  CmpInst::Predicate Pred =
1735  (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
1736  LbCond->setPredicate(Pred);
1737  LbCond->setOperand(0, TcDec);
1738  LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1739  }
1740 
1741  // Step 4: All the references to the original population counter outside
1742  // the loop are replaced with the NewCount -- the value returned from
1743  // __builtin_ctpop().
1744  CntInst->replaceUsesOutsideBlock(NewCount, Body);
1745 
1746  // step 5: Forget the "non-computable" trip-count SCEV associated with the
1747  // loop. The loop would otherwise not be deleted even if it becomes empty.
1748  SE->forgetLoop(CurLoop);
1749 }
The access may reference and may modify the value stored in memory.
Pass interface - Implemented by all &#39;passes&#39;.
Definition: Pass.h:81
uint64_t CallInst * C
Value * getValueOperand()
Definition: Instructions.h:399
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks &#39;this&#39; from the containing basic block and deletes it.
Definition: Instruction.cpp:68
void push_back(const T &Elt)
Definition: SmallVector.h:218
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:111
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:584
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1846
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
static CallInst * createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL)
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:1566
const SCEV * getConstant(ConstantInt *V)
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
BinaryOps getOpcode() const
Definition: InstrTypes.h:555
Value * isBytewiseValue(Value *V)
If the specified value can be set by repeating the same byte in memory, return the i8 value that it i...
bool isAtomic() const
Return true if this instruction has an AtomicOrdering of unordered or higher.
Constant * getOrInsertFunction(StringRef Name, FunctionType *T, AttributeList AttributeList)
Look up the specified function in the module symbol table.
Definition: Module.cpp:142
ArrayRef< BasicBlock *>::const_iterator block_iterator
Definition: LoopInfo.h:153
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:63
bool isUnordered() const
Definition: Instructions.h:393
The main scalar evolution driver.
static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX, Instruction *&CntInst, PHINode *&CntPhi, Instruction *&DefX)
Return true if the idiom is detected in the loop.
This class represents a function call, abstracting a target machine&#39;s calling convention.
static PHINode * getRecurrenceVar(Value *VarX, Instruction *DefX, BasicBlock *LoopEntry)
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:174
Value * getValue() const
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:57
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:714
This class wraps the llvm.memset intrinsic.
BasicBlock * getSuccessor(unsigned i) const
STATISTIC(NumFunctions, "Total number of functions")
A debug info location.
Definition: DebugLoc.h:34
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:503
An instruction for reading from memory.
Definition: Instructions.h:168
Value * getCondition() const
Value * getLength() const
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:93
static Constant * get(ArrayType *T, ArrayRef< Constant *> V)
Definition: Constants.cpp:960
Value * getDest() const
This is just like getRawDest, but it strips off any cast instructions (including addrspacecast) that ...
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:264
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const
Return hardware support for population count.
AnalysisUsage & addRequired()
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:134
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:51
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Definition: Instructions.h:221
const DataLayout & getDataLayout() const
Get the data layout for the module&#39;s target platform.
Definition: Module.cpp:361
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Definition: LoopInfo.h:684
static CallInst * createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL, bool ZeroCheck)
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:731
This file contains the simple types necessary to represent the attributes associated with functions a...
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:962
unsigned getDestAlignment() const
This file implements a class to represent arbitrary precision integral constant values and operations...
const APInt & getAPInt() const
BlockT * getHeader() const
Definition: LoopInfo.h:100
static bool isSimple(Instruction *I)
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:201
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:408
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:245
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:182
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:142
This node represents a polynomial recurrence on the trip count of the specified loop.
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv)
#define T
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:121
Class to represent array types.
Definition: DerivedTypes.h:369
static cl::opt< bool > UseLIRCodeSizeHeurs("use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling" "with -Os/-Oz"), cl::init(true), cl::Hidden)
bool has(LibFunc F) const
Tests whether a library function is available.
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:83
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:200
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:126
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:979
void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry &)
An instruction for storing to memory.
Definition: Instructions.h:310
bool isMinusOne() const
This function will return true iff every bit in this constant is set to true.
Definition: Constants.h:209
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:439
static void deleteDeadInstruction(Instruction *I)
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:145
Function * getDeclaration(Module *M, ID id, ArrayRef< Type *> Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1007
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block...
Definition: IRBuilder.h:127
Value * getOperand(unsigned i) const
Definition: User.h:170
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:211
bool isLoopEntryGuardedByCond(const Loop *L, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test whether entry to the loop is protected by a conditional between LHS and RHS. ...
static Constant * getBitCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:1750
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
bool inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI)
Analyze the name and prototype of the given function and set any applicable attributes.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:410
const SCEV * getOperand(unsigned i) const
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:189
Wrapper pass for TargetTransformInfo.
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:153
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:287
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:235
LLVM Basic Block Representation.
Definition: BasicBlock.h:59
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:46
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:287
Conditional or Unconditional Branch instruction.
bool isKnownNonNegative(const Value *V, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr)
Returns true if the give value is known to be non-negative.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This is an important base class in LLVM.
Definition: Constant.h:42
Value * getIncomingValueForBlock(const BasicBlock *BB) const
This file contains the declarations for the subclasses of Constant, which represent the different fla...
const Instruction & front() const
Definition: BasicBlock.h:276
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:371
const SCEV * getAddExpr(SmallVectorImpl< const SCEV *> &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
size_t size() const
Definition: BasicBlock.h:274
bool isUnordered() const
Definition: Instructions.h:268
Represent the analysis usage information of a pass.
bool optForSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:584
static Value * matchCondition(BranchInst *BI, BasicBlock *LoopEntry)
Check if the given conditional branch is based on the comparison between a variable and zero...
This instruction compares its operands according to the predicate given to the constructor.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:885
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values...
Value * getPointerOperand()
Definition: Instructions.h:274
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:382
self_iterator getIterator()
Definition: ilist_node.h:82
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS. Minus is represented in SCEV as A+B*-1.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:430
static UndefValue * get(Type *T)
Static factory methods - Return an &#39;undef&#39; object of the specified type.
Definition: Constants.cpp:1392
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:159
size_t size() const
Definition: SmallVector.h:53
static wasm::ValType getType(const TargetRegisterClass *RC)
static PointerType * getInt8PtrTy(LLVMContext &C, unsigned AS=0)
Definition: Type.cpp:220
bool isVolatile() const
const SCEV * getMulExpr(SmallVectorImpl< const SCEV *> &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
Value * GetUnderlyingObject(Value *V, const DataLayout &DL, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value...
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool anyBlockMayThrow() const
Returns true iff any block of the loop for which this info is contains an instruction that may throw ...
Definition: MustExecute.cpp:29
Representation for a specific memory location.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
typename vector_type::const_iterator iterator
Definition: SetVector.h:49
Iterator for intrusive lists based on ilist_node.
This is the shared class of boolean and integer constants.
Definition: Constants.h:84
Type * getType() const
Return the LLVM type of this SCEV expression.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
iterator end()
Definition: BasicBlock.h:266
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:847
bool dominates(const Instruction *Def, const Use &U) const
Return true if Def dominates a use in User.
Definition: Dominators.cpp:249
Module.h This file contains the declarations for the Module class.
Provides information about what library functions are available for the current target.
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:621
bool isConditional() const
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static Constant * getMemSetPatternValue(Value *V, const DataLayout *DL)
getMemSetPatternValue - If a strided store of the specified value is safe to turn into a memset_patte...
static const SCEV * getNumBytes(const SCEV *BECount, Type *IntPtr, unsigned StoreSize, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute the number of bytes as a SCEV from the backedge taken count.
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, Instruction *&CntInst, PHINode *&CntPhi, Value *&Var)
Return true iff the idiom is detected in the loop.
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:577
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:133
The access may modify the value stored in memory.
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:172
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:924
signed less or equal
Definition: InstrTypes.h:915
loop Recognize loop idioms
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:56
Class for arbitrary precision integers.
Definition: APInt.h:70
loop idiom
ModRefInfo getModRefInfo(ImmutableCallSite CS, const MemoryLocation &Loc)
getModRefInfo (for call sites) - Return information about whether a particular call site modifies or ...
iterator_range< user_iterator > users()
Definition: Value.h:400
This class uses information about analyze scalars to rewrite expressions in canonical form...
Pass * createLoopIdiomPass()
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:292
uint64_t getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:560
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:959
bool isVolatile() const
Return true if this is a store to a volatile memory location.
Definition: Instructions.h:343
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:290
INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom", "Recognize loop idioms", false, false) INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
void setUnnamedAddr(UnnamedAddr Val)
Definition: GlobalValue.h:215
unsigned getAtomicMemIntrinsicMaxElementSize() const
unsigned getAlignment() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:230
This class represents an analyzed expression in the program.
unsigned getIntegerBitWidth() const
Definition: DerivedTypes.h:97
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:459
This file provides utility analysis objects describing memory locations.
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:224
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:108
LLVM_NODISCARD ModRefInfo intersectModRef(const ModRefInfo MRI1, const ModRefInfo MRI2)
#define I(x, y, z)
Definition: MD5.cpp:58
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:193
void getLoopAnalysisUsage(AnalysisUsage &AU)
Helper to consistently add the set of standard passes to a loop pass&#39;s AnalysisUsage.
Definition: LoopUtils.cpp:1227
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:568
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:323
block_iterator block_end() const
Definition: LoopInfo.h:155
Captures loop safety information.
Definition: MustExecute.h:47
void computeLoopSafetyInfo(Loop *)
Computes safety information for a loop checks loop body & header for the possibility of may throw exc...
Definition: MustExecute.cpp:33
unsigned getAlignment() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:355
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:280
static const SCEV * getStartForNegStride(const SCEV *Start, const SCEV *BECount, Type *IntPtr, unsigned StoreSize, ScalarEvolution *SE)
const SCEV * getBackedgeTakenCount(const Loop *L)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
The cost of a typical &#39;add&#39; instruction.
int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, ArrayRef< Type *> ParamTys) const
Estimate the cost of an intrinsic when lowered.
const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:565
LLVM Value Representation.
Definition: Value.h:73
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
A vector that has set insertion semantics.
Definition: SetVector.h:41
constexpr char Size[]
Key for Kernel::Arg::Metadata::mSize.
static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, unsigned StoreSize, AliasAnalysis &AA, SmallPtrSetImpl< Instruction *> &IgnoredStores)
mayLoopAccessLocation - Return true if the specified loop might access the specified pointer location...
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
unsigned greater than
Definition: InstrTypes.h:908
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:49
A container for analyses that lazily runs them and caches their results.
void replaceUsesOutsideBlock(Value *V, BasicBlock *BB)
replaceUsesOutsideBlock - Go through the uses list for this definition and make each use point to "V"...
Definition: Value.cpp:449
const SCEV * getTruncateOrZeroExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
This pass exposes codegen information to IR-level passes.
LLVM_NODISCARD bool isModOrRefSet(const ModRefInfo MRI)
bool isSimple() const
Definition: Instructions.h:391
const TerminatorInst * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:138
This header defines various interfaces for pass management in LLVM.
bool isBigEndian() const
Definition: DataLayout.h:222
#define LLVM_DEBUG(X)
Definition: Debug.h:123
bool isSafeToExpand(const SCEV *S, ScalarEvolution &SE)
Return true if the given expression is safe to expand in the sense that all materialized values are s...
block_iterator block_begin() const
Definition: LoopInfo.h:154
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count...
Value * getPointerOperand()
Definition: Instructions.h:402
const SCEV * getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
This class represents a constant integer value.
CallInst * CreateCall(Value *Callee, ArrayRef< Value *> Args=None, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1871