LLVM  6.0.0svn
LoopIdiomRecognize.cpp
Go to the documentation of this file.
1 //===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass implements an idiom recognizer that transforms simple loops into a
11 // non-loop form. In cases that this kicks in, it can be a significant
12 // performance win.
13 //
14 // If compiling for code size we avoid idiom recognition if the resulting
15 // code could be larger than the code for the original loop. One way this could
16 // happen is if the loop is not removable after idiom recognition due to the
17 // presence of non-idiom instructions. The initial implementation of the
18 // heuristics applies to idioms in multi-block loops.
19 //
20 //===----------------------------------------------------------------------===//
21 //
22 // TODO List:
23 //
24 // Future loop memory idioms to recognize:
25 // memcmp, memmove, strlen, etc.
26 // Future floating point idioms to recognize in -ffast-math mode:
27 // fpowi
28 // Future integer operation idioms to recognize:
29 // ctpop, ctlz, cttz
30 //
31 // Beware that isel's default lowering for ctpop is highly inefficient for
32 // i64 and larger types when i64 is legal and the value has few bits set. It
33 // would be good to enhance isel to emit a loop for ctpop in this case.
34 //
35 // This could recognize common matrix multiplies and dot product idioms and
36 // replace them with calls to BLAS (if linked in??).
37 //
38 //===----------------------------------------------------------------------===//
39 
41 #include "llvm/ADT/APInt.h"
42 #include "llvm/ADT/ArrayRef.h"
43 #include "llvm/ADT/DenseMap.h"
44 #include "llvm/ADT/MapVector.h"
45 #include "llvm/ADT/SetVector.h"
46 #include "llvm/ADT/SmallPtrSet.h"
47 #include "llvm/ADT/SmallVector.h"
48 #include "llvm/ADT/Statistic.h"
49 #include "llvm/ADT/StringRef.h"
52 #include "llvm/Analysis/LoopInfo.h"
53 #include "llvm/Analysis/LoopPass.h"
61 #include "llvm/IR/Attributes.h"
62 #include "llvm/IR/BasicBlock.h"
63 #include "llvm/IR/Constant.h"
64 #include "llvm/IR/Constants.h"
65 #include "llvm/IR/DataLayout.h"
66 #include "llvm/IR/DebugLoc.h"
67 #include "llvm/IR/DerivedTypes.h"
68 #include "llvm/IR/Dominators.h"
69 #include "llvm/IR/GlobalValue.h"
70 #include "llvm/IR/GlobalVariable.h"
71 #include "llvm/IR/IRBuilder.h"
72 #include "llvm/IR/InstrTypes.h"
73 #include "llvm/IR/Instruction.h"
74 #include "llvm/IR/Instructions.h"
75 #include "llvm/IR/IntrinsicInst.h"
76 #include "llvm/IR/Intrinsics.h"
77 #include "llvm/IR/LLVMContext.h"
78 #include "llvm/IR/Module.h"
79 #include "llvm/IR/PassManager.h"
80 #include "llvm/IR/Type.h"
81 #include "llvm/IR/User.h"
82 #include "llvm/IR/Value.h"
83 #include "llvm/IR/ValueHandle.h"
84 #include "llvm/Pass.h"
85 #include "llvm/Support/Casting.h"
87 #include "llvm/Support/Debug.h"
89 #include "llvm/Transforms/Scalar.h"
93 #include <algorithm>
94 #include <cassert>
95 #include <cstdint>
96 #include <utility>
97 #include <vector>
98 
using namespace llvm;

#define DEBUG_TYPE "loop-idiom"

// Transform counters reported under -stats.
STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");

// Command-line switch gating the code-size heuristics applied under -Os/-Oz.
// NOTE(review): the opening declaration line of this option (the
// "static cl::opt<bool> UseLIRCodeSizeHeurs(" line) is missing from this
// extract; only the argument list below is visible.
// NOTE(review): the two adjacent desc() string literals concatenate with no
// separating space ("...compilingwith -Os/-Oz") — looks like a missing
// trailing space in the first literal; the literals are left untouched here
// since this is a documentation-only pass over an incomplete declaration.
    "use-lir-code-size-heurs",
    cl::desc("Use loop idiom recognition code size heuristics when compiling"
             "with -Os/-Oz"),
    cl::init(true), cl::Hidden);
111 
112 namespace {
113 
/// Core idiom-recognition engine. Scans countable loops for stores/memsets
/// that can be replaced by memset/memset_pattern16/memcpy library calls, and
/// noncountable loops for popcount/ctlz idioms. Analyses are injected via
/// the constructor so both the legacy and new pass managers can drive it.
class LoopIdiomRecognize {
  Loop *CurLoop = nullptr;        // Loop currently being processed.
  AliasAnalysis *AA;
  DominatorTree *DT;
  LoopInfo *LI;
  ScalarEvolution *SE;
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  const DataLayout *DL;
  bool ApplyCodeSizeHeuristics;   // Set per-loop in runOnLoop().

public:
  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
                              LoopInfo *LI, ScalarEvolution *SE,
                              TargetLibraryInfo *TLI,
                              const TargetTransformInfo *TTI,
                              const DataLayout *DL)
      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL) {}

  /// Entry point: returns true if the loop was transformed.
  bool runOnLoop(Loop *L);

private:
  using StoreList = SmallVector<StoreInst *, 8>;
  using StoreListMap = MapVector<Value *, StoreList>;

  // Candidate stores found by collectStores(), bucketed by the rewrite that
  // applies; the memset variants are keyed by underlying base pointer.
  StoreListMap StoreRefsForMemset;
  StoreListMap StoreRefsForMemsetPattern;
  StoreList StoreRefsForMemcpy;
  // Which of the library routines the target library makes available.
  bool HasMemset;
  bool HasMemsetPattern;
  bool HasMemcpy;

  /// Return code for isLegalStore()
  enum LegalStoreKind {
    None = 0,
    Memset,
    MemsetPattern,
    Memcpy,
    UnorderedAtomicMemcpy,
    DontUse // Dummy retval never to be used. Allows catching errors in retval
            // handling.
  };

  /// \name Countable Loop Idiom Handling
  /// @{

  bool runOnCountableLoop();
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock *> &ExitBlocks);

  void collectStores(BasicBlock *BB);
  LegalStoreKind isLegalStore(StoreInst *SI);
  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                         bool ForMemset);
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

  bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
                               unsigned StoreAlignment, Value *StoredVal,
                               Instruction *TheStore,
                               // NOTE(review): one parameter line is missing
                               // from this extract between TheStore and Ev —
                               // presumably a set of stores for the alias
                               // check to ignore; confirm against upstream.
                               const SCEVAddRecExpr *Ev, const SCEV *BECount,
                               bool NegStride, bool IsLoopMemset = false);
  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                 bool IsLoopMemset = false);

  /// @}
  /// \name Noncountable Loop Idiom Handling
  /// @{

  bool runOnNoncountableLoop();

  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
  bool recognizeAndInsertCTLZ();
  void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
                                PHINode *CntPhi, Value *Var, const DebugLoc DL,
                                bool ZeroCheck, bool IsCntPhiUsedOutsideLoop);

  /// @}
};
196 
/// Legacy pass-manager wrapper: fetches the required analyses and forwards
/// to the LoopIdiomRecognize engine for each loop.
class LoopIdiomRecognizeLegacyPass : public LoopPass {
public:
  static char ID;

  explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
    // NOTE(review): the constructor body lines are missing from this
    // extract — presumably the initializeLoopIdiomRecognizeLegacyPassPass
    // registration call.
  }

  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
    // Honor opt-bisect / optnone.
    if (skipLoop(L))
      return false;

    // Pull every analysis the recognizer needs out of the legacy PM.
    AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    TargetLibraryInfo *TLI =
        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
    // TTI is function-specific, so key it off the loop's parent function.
    const TargetTransformInfo *TTI =
        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
            *L->getHeader()->getParent());
    const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();

    LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL);
    return LIR.runOnLoop(L);
  }

  /// This transformation requires natural loop information & requires that
  /// loop preheaders be inserted into the CFG.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // NOTE(review): the body lines (the AU.addRequired<...> /
    // getLoopAnalysisUsage calls) are missing from this extract.
  }
};
233 
234 } // end anonymous namespace
235 

// NOTE(review): the declaration lines of this function are missing from this
// extract — this is the new-pass-manager entry point (presumably
// PreservedAnalyses LoopIdiomRecognizePass::run(Loop &, LoopAnalysisManager &,
// LoopStandardAnalysisResults &, ...)); only the trailing parameter and the
// body are visible.
                                  LPMUpdater &) {
  const auto *DL = &L.getHeader()->getModule()->getDataLayout();

  // Wire the pre-computed standard loop analyses into the engine.
  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL);
  if (!LIR.runOnLoop(&L))
    return PreservedAnalyses::all();

  // NOTE(review): the transformed-case return statement (presumably
  // getLoopPassPreservedAnalyses()) is missing from this extract.
}
249 
// Legacy pass registration.
INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
                      "Recognize loop idioms", false, false)
// NOTE(review): the INITIALIZE_PASS_DEPENDENCY lines between BEGIN and END
// are missing from this extract.
INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
                    "Recognize loop idioms", false, false)

/// Factory used when building the legacy pass pipeline.
Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }

// NOTE(review): the signature and first statement of this helper (named
// deleteDeadInstruction per the surrounding code) are missing from this
// extract; only the final erase of the instruction is visible.
  I->eraseFromParent();
}
264 
265 //===----------------------------------------------------------------------===//
266 //
267 // Implementation of LoopIdiomRecognize
268 //
269 //===----------------------------------------------------------------------===//
270 
/// Shared entry point for both pass-manager wrappers. Performs legality and
/// profitability pre-checks, then dispatches to the countable- or
/// noncountable-loop recognizer. Returns true if the loop was changed.
bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  CurLoop = L;
  // If the loop could not be converted to canonical form, it must have an
  // indirectbr in it, just give up.
  if (!L->getLoopPreheader())
    return false;

  // Disable loop idiom recognition if the function's name is a common idiom.
  // NOTE(review): the line defining `Name` is missing from this extract —
  // presumably the enclosing function's name; otherwise inserting a call to
  // memset from inside memset itself would recurse.
  if (Name == "memset" || Name == "memcpy")
    return false;

  // Determine if code size heuristics need to be applied.
  ApplyCodeSizeHeuristics =
  // NOTE(review): the right-hand side of this assignment is missing from
  // this extract — presumably optForSize() combined with the
  // UseLIRCodeSizeHeurs flag.

  // Cache which rewrites the target library actually provides.
  HasMemset = TLI->has(LibFunc_memset);
  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  HasMemcpy = TLI->has(LibFunc_memcpy);

  if (HasMemset || HasMemsetPattern || HasMemcpy)
    // NOTE(review): a guard line is missing from this extract between the
    // condition above and this return — likely a check that ScalarEvolution
    // can compute a loop-invariant backedge-taken count.
    return runOnCountableLoop();

  return runOnNoncountableLoop();
}
297 
298 bool LoopIdiomRecognize::runOnCountableLoop() {
299  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
300  assert(!isa<SCEVCouldNotCompute>(BECount) &&
301  "runOnCountableLoop() called on a loop without a predictable"
302  "backedge-taken count");
303 
304  // If this loop executes exactly one time, then it should be peeled, not
305  // optimized by this pass.
306  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
307  if (BECst->getAPInt() == 0)
308  return false;
309 
310  SmallVector<BasicBlock *, 8> ExitBlocks;
311  CurLoop->getUniqueExitBlocks(ExitBlocks);
312 
313  DEBUG(dbgs() << "loop-idiom Scanning: F["
314  << CurLoop->getHeader()->getParent()->getName() << "] Loop %"
315  << CurLoop->getHeader()->getName() << "\n");
316 
317  bool MadeChange = false;
318 
319  // The following transforms hoist stores/memsets into the loop pre-header.
320  // Give up if the loop has instructions may throw.
321  LoopSafetyInfo SafetyInfo;
322  computeLoopSafetyInfo(&SafetyInfo, CurLoop);
323  if (SafetyInfo.MayThrow)
324  return MadeChange;
325 
326  // Scan all the blocks in the loop that are not in subloops.
327  for (auto *BB : CurLoop->getBlocks()) {
328  // Ignore blocks in subloops.
329  if (LI->getLoopFor(BB) != CurLoop)
330  continue;
331 
332  MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
333  }
334  return MadeChange;
335 }
336 
337 static unsigned getStoreSizeInBytes(StoreInst *SI, const DataLayout *DL) {
338  uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
339  assert(((SizeInBits & 7) || (SizeInBits >> 32) == 0) &&
340  "Don't overflow unsigned.");
341  return (unsigned)SizeInBits >> 3;
342 }
343 
344 static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
345  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
346  return ConstStride->getAPInt();
347 }
348 
/// getMemSetPatternValue - If a strided store of the specified value is safe to
/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
/// be passed in. Otherwise, return null.
///
/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
/// just replicate their input array and then pass on to memset_pattern16.
// NOTE(review): the signature line of this function is missing from this
// extract — per its uses it takes (Value *V, const DataLayout *DL) and
// returns Constant *.
  // If the value isn't a constant, we can't promote it to being in a constant
  // array. We could theoretically do a store to an alloca or something, but
  // that doesn't seem worthwhile.
  Constant *C = dyn_cast<Constant>(V);
  if (!C)
    return nullptr;

  // Only handle simple values that are a power of two bytes in size.
  uint64_t Size = DL->getTypeSizeInBits(V->getType());
  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
    return nullptr;

  // Don't care enough about darwin/ppc to implement this.
  if (DL->isBigEndian())
    return nullptr;

  // Convert to size in bytes.
  Size /= 8;

  // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
  // if the top and bottom are the same (e.g. for vectors and large integers).
  if (Size > 16)
    return nullptr;

  // If the constant is exactly 16 bytes, just use it.
  if (Size == 16)
    return C;

  // Otherwise, we'll use an array of the constants.
  // Size is a power of two <= 16 here, so 16/Size divides evenly and the
  // replicated array is exactly 16 bytes.
  unsigned ArraySize = 16 / Size;
  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}
389 
/// Classify a store by which library-call rewrite (if any) may legally
/// replace it: memset, memset_pattern16, memcpy, or the unordered-atomic
/// memcpy variant. Returns LegalStoreKind::None when no rewrite applies.
LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  // Don't touch volatile stores.
  if (SI->isVolatile())
    return LegalStoreKind::None;
  // We only want simple or unordered-atomic stores.
  if (!SI->isUnordered())
    return LegalStoreKind::None;

  // Don't convert stores of non-integral pointer types to memsets (which stores
  // integers).
  if (DL->isNonIntegralPointerType(SI->getValueOperand()->getType()))
    return LegalStoreKind::None;

  // Avoid merging nontemporal stores.
  // NOTE(review): the guard condition is missing from this extract here —
  // presumably a check of the store's nontemporal metadata.
    return LegalStoreKind::None;

  Value *StoredVal = SI->getValueOperand();
  Value *StorePtr = SI->getPointerOperand();

  // Reject stores that are so large that they overflow an unsigned.
  uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
    return LegalStoreKind::None;

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *StoreEv =
      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return LegalStoreKind::None;

  // Check to see if we have a constant stride.
  if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
    return LegalStoreKind::None;

  // See if the store can be turned into a memset.

  // If the stored value is a byte-wise value (like i32 -1), then it may be
  // turned into a memset of i8 -1, assuming that all the consecutive bytes
  // are stored. A store of i32 0x01020304 can never be turned into a memset,
  // but it can be turned into memset_pattern if the target supports it.
  Value *SplatValue = isBytewiseValue(StoredVal);
  Constant *PatternValue = nullptr;

  // Note: memset and memset_pattern on unordered-atomic is yet not supported
  // (unordered-but-not-simple means the store is atomic).
  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();

  // If we're allowed to form a memset, and the stored value would be
  // acceptable for memset, use it.
  if (!UnorderedAtomic && HasMemset && SplatValue &&
      // Verify that the stored value is loop invariant. If not, we can't
      // promote the memset.
      CurLoop->isLoopInvariant(SplatValue)) {
    // It looks like we can use SplatValue.
    return LegalStoreKind::Memset;
  } else if (!UnorderedAtomic && HasMemsetPattern &&
             // Don't create memset_pattern16s with address spaces.
             StorePtr->getType()->getPointerAddressSpace() == 0 &&
             (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
    // It looks like we can use PatternValue!
    return LegalStoreKind::MemsetPattern;
  }

  // Otherwise, see if the store can be turned into a memcpy.
  if (HasMemcpy) {
    // Check to see if the stride matches the size of the store. If so, then we
    // know that every byte is touched in the loop.
    APInt Stride = getStoreStride(StoreEv);
    unsigned StoreSize = getStoreSizeInBytes(SI, DL);
    if (StoreSize != Stride && StoreSize != -Stride)
      return LegalStoreKind::None;

    // The store must be feeding a non-volatile load.
    // NOTE(review): the line defining `LI` (which shadows the LoopInfo
    // member below) is missing from this extract — presumably a dyn_cast of
    // the stored value to LoadInst.

    // Only allow non-volatile loads
    if (!LI || LI->isVolatile())
      return LegalStoreKind::None;
    // Only allow simple or unordered-atomic loads
    if (!LI->isUnordered())
      return LegalStoreKind::None;

    // See if the pointer expression is an AddRec like {base,+,1} on the current
    // loop, which indicates a strided load. If we have something else, it's a
    // random load we can't handle.
    const SCEVAddRecExpr *LoadEv =
    // NOTE(review): the initializer line is missing from this extract —
    // presumably a dyn_cast of SE->getSCEV on the load's pointer operand.
    if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
      return LegalStoreKind::None;

    // The store and load must share the same stride.
    if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
      return LegalStoreKind::None;

    // Success. This store can be converted into a memcpy.
    UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
                           : LegalStoreKind::Memcpy;
  }
  // This store can't be transformed into a memset/memcpy.
  return LegalStoreKind::None;
}
495 
/// Scan a basic block and bucket every transformable store by the rewrite
/// that applies to it (memset / memset_pattern16 / memcpy). Results land in
/// the StoreRefsFor* members, which are cleared first.
void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  StoreRefsForMemset.clear();
  StoreRefsForMemsetPattern.clear();
  StoreRefsForMemcpy.clear();
  for (Instruction &I : *BB) {
    StoreInst *SI = dyn_cast<StoreInst>(&I);
    if (!SI)
      continue;

    // Make sure this is a strided store with a constant stride.
    switch (isLegalStore(SI)) {
    // NOTE(review): the `case LegalStoreKind::None:` label appears to be
    // missing from this extract — the break below is otherwise unlabeled.
      // Nothing to do
      break;
    case LegalStoreKind::Memset: {
      // Find the base pointer.
      Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
      StoreRefsForMemset[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::MemsetPattern: {
      // Find the base pointer.
      Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
      StoreRefsForMemsetPattern[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::Memcpy:
    case LegalStoreKind::UnorderedAtomicMemcpy:
      StoreRefsForMemcpy.push_back(SI);
      break;
    default:
      // DontUse (and any future kind) must never escape isLegalStore().
      assert(false && "unhandled return value");
      break;
    }
  }
}
530 
531 /// runOnLoopBlock - Process the specified block, which lives in a counted loop
532 /// with the specified backedge count. This block is known to be in the current
533 /// loop and not in any subloops.
534 bool LoopIdiomRecognize::runOnLoopBlock(
535  BasicBlock *BB, const SCEV *BECount,
536  SmallVectorImpl<BasicBlock *> &ExitBlocks) {
537  // We can only promote stores in this block if they are unconditionally
538  // executed in the loop. For a block to be unconditionally executed, it has
539  // to dominate all the exit blocks of the loop. Verify this now.
540  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
541  if (!DT->dominates(BB, ExitBlocks[i]))
542  return false;
543 
544  bool MadeChange = false;
545  // Look for store instructions, which may be optimized to memset/memcpy.
546  collectStores(BB);
547 
548  // Look for a single store or sets of stores with a common base, which can be
549  // optimized into a memset (memset_pattern). The latter most commonly happens
550  // with structs and handunrolled loops.
551  for (auto &SL : StoreRefsForMemset)
552  MadeChange |= processLoopStores(SL.second, BECount, true);
553 
554  for (auto &SL : StoreRefsForMemsetPattern)
555  MadeChange |= processLoopStores(SL.second, BECount, false);
556 
557  // Optimize the store into a memcpy, if it feeds an similarly strided load.
558  for (auto &SI : StoreRefsForMemcpy)
559  MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
560 
561  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
562  Instruction *Inst = &*I++;
563  // Look for memset instructions, which may be optimized to a larger memset.
564  if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
565  WeakTrackingVH InstPtr(&*I);
566  if (!processLoopMemSet(MSI, BECount))
567  continue;
568  MadeChange = true;
569 
570  // If processing the memset invalidated our iterator, start over from the
571  // top of the block.
572  if (!InstPtr)
573  I = BB->begin();
574  continue;
575  }
576  }
577 
578  return MadeChange;
579 }
580 
/// processLoopStores - See if this store(s) can be promoted to a memset.
/// \p SL holds stores with a common underlying base pointer; \p ForMemset
/// selects between the memset and memset_pattern16 legality rules.
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
                                           const SCEV *BECount,
                                           bool ForMemset) {
  // Try to find consecutive stores that can be transformed into memsets.
  SetVector<StoreInst *> Heads, Tails;
  // NOTE(review): a declaration line is missing from this extract here —
  // `ConsecutiveChain` (used below) is presumably a map from a store to the
  // store immediately following it in memory.

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    assert(SL[i]->isSimple() && "Expected only non-volatile stores.");

    Value *FirstStoredVal = SL[i]->getValueOperand();
    Value *FirstStorePtr = SL[i]->getPointerOperand();
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize = getStoreSizeInBytes(SL[i], DL);

    // See if we can optimize just this store in isolation.
    // (Stride equal to the store size, in either direction, means the store
    // already touches every byte — no pairing needed.)
    if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }

    Value *FirstSplatValue = nullptr;
    Constant *FirstPatternValue = nullptr;

    if (ForMemset)
      FirstSplatValue = isBytewiseValue(FirstStoredVal);
    else
      FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);

    assert((FirstSplatValue || FirstPatternValue) &&
           "Expected either splat value or pattern value.");

    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search Stores
    // array according to the sequence: from i+1 to e, then from i-1 to 0.
    // This is because usually pairing with immediate succeeding or preceding
    // candidate create the best chance to find memset opportunity.
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
      Value *SecondStorePtr = SL[k]->getPointerOperand();
      const SCEVAddRecExpr *SecondStoreEv =
          cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
      APInt SecondStride = getStoreStride(SecondStoreEv);

      // Stores only chain when they advance through memory together.
      if (FirstStride != SecondStride)
        continue;

      Value *SecondStoredVal = SL[k]->getValueOperand();
      Value *SecondSplatValue = nullptr;
      Constant *SecondPatternValue = nullptr;

      if (ForMemset)
        SecondSplatValue = isBytewiseValue(SecondStoredVal);
      else
        SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);

      assert((SecondSplatValue || SecondPatternValue) &&
             "Expected either splat value or pattern value.");

      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        // Both stores must write the same splat/pattern or the merged call
        // would write the wrong bytes.
        if (ForMemset) {
          if (FirstSplatValue != SecondSplatValue)
            continue;
        } else {
          if (FirstPatternValue != SecondPatternValue)
            continue;
        }
        Tails.insert(SL[k]);
        Heads.insert(SL[i]);
        ConsecutiveChain[SL[i]] = SL[k];
        break;
      }
    }
  }

  // We may run into multiple chains that merge into a single chain. We mark the
  // stores that we transformed so that we don't visit the same store twice.
  SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;

  // For stores that start but don't end a link in the chain:
  for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
       it != e; ++it) {
    if (Tails.count(*it))
      continue;

    // We found a store instr that starts a chain. Now follow the chain and try
    // to transform it.
    SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *I = *it;

    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;

    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);

      StoreSize += getStoreSizeInBytes(I, DL);
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    Value *StoredVal = HeadStore->getValueOperand();
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);

    // Check to see if the stride matches the size of the stores. If so, then
    // we know that every byte is touched in the loop.
    if (StoreSize != Stride && StoreSize != -Stride)
      continue;

    bool NegStride = StoreSize == -Stride;

    if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(),
                                StoredVal, HeadStore, AdjacentStores, StoreEv,
                                BECount, NegStride)) {
      TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
      Changed = true;
    }
  }

  return Changed;
}
720 
/// processLoopMemSet - See if this memset can be promoted to a large memset.
/// A memset inside the loop whose destination strides by exactly its own
/// length can be widened to a single memset covering the whole trip count.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memsets with a constant size.
  if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
    return false;

  // If we're not allowed to hack on memset, we fail.
  if (!HasMemset)
    return false;

  Value *Pointer = MSI->getDest();

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
  if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
    return false;

  // Reject memsets that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Check to see if the stride matches the size of the memset. If so, then we
  // know that every byte is touched in the loop.
  const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
  if (!ConstStride)
    return false;

  APInt Stride = ConstStride->getAPInt();
  if (SizeInBytes != Stride && SizeInBytes != -Stride)
    return false;

  // Verify that the memset value is loop invariant. If not, we can't promote
  // the memset.
  Value *SplatValue = MSI->getValue();
  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
    return false;

  // NOTE(review): the declaration of `MSIs` is missing from this extract —
  // presumably a small instruction set passed to the rewrite so the alias
  // check ignores the memset being replaced.
  MSIs.insert(MSI);
  bool NegStride = SizeInBytes == -Stride;
  return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
                                 MSI->getAlignment(), SplatValue, MSI, MSIs, Ev,
                                 BECount, NegStride, /*IsLoopMemset=*/true);
}
769 
/// mayLoopAccessLocation - Return true if the specified loop might access the
/// specified pointer location, which is a loop-strided access. The 'Access'
/// argument specifies what the verboten forms of access are (read or write).
static bool
// NOTE(review): the first line of this signature is missing from this
// extract — the parameters `Ptr`, `Access`, and `L` referenced in the body
// are declared on it.
                      const SCEV *BECount, unsigned StoreSize,
                      AliasAnalysis &AA,
                      SmallPtrSetImpl<Instruction *> &IgnoredStores) {
  // Get the location that may be stored across the loop. Since the access is
  // strided positively through memory, we say that the modified location starts
  // at the pointer and has infinite size.
  uint64_t AccessSize = MemoryLocation::UnknownSize;

  // If the loop iterates a fixed number of times, we can refine the access size
  // to be exactly the size of the memset, which is (BECount+1)*StoreSize
  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
    AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize;

  // TODO: For this to be really effective, we have to dive into the pointer
  // operand in the store. Store to &A[i] of 100 will always return may alias
  // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
  // which will then no-alias a store to &A[100].
  MemoryLocation StoreLoc(Ptr, AccessSize);

  // Walk every instruction in the loop; any instruction not explicitly
  // ignored whose mod/ref behavior intersects `Access` makes the transform
  // unsafe.
  for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
       ++BI)
    for (Instruction &I : **BI)
      if (IgnoredStores.count(&I) == 0 &&
          (AA.getModRefInfo(&I, StoreLoc) & Access))
        return true;

  return false;
}
803 
804 // If we have a negative stride, Start refers to the end of the memory location
805 // we're trying to memset. Therefore, we need to recompute the base pointer,
806 // which is just Start - BECount*Size.
807 static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
808  Type *IntPtr, unsigned StoreSize,
809  ScalarEvolution *SE) {
810  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
811  if (StoreSize != 1)
812  Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
813  SCEV::FlagNUW);
814  return SE->getMinusSCEV(Start, Index);
815 }
816 
/// Compute the number of bytes as a SCEV from the backedge taken count.
///
/// This also maps the SCEV into the provided type and tries to handle the
/// computation in a way that will fold cleanly.
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
                               unsigned StoreSize, Loop *CurLoop,
                               const DataLayout *DL, ScalarEvolution *SE) {
  const SCEV *NumBytesS;
  // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
  // pointer size if it isn't already.
  //
  // If we're going to need to zero extend the BE count, check if we can add
  // one to it prior to zero extending without overflow. Provided this is safe,
  // it allows better simplification of the +1.
  if (DL->getTypeSizeInBits(BECount->getType()) <
          DL->getTypeSizeInBits(IntPtr) &&
      // NOTE(review): a line is missing from this extract here — presumably
      // the ScalarEvolution query proving BECount != -1 on loop entry (the
      // arguments below belong to that call).
          CurLoop, ICmpInst::ICMP_NE, BECount,
          SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
    // Safe to do (BECount + 1) in the narrow type, then zero-extend.
    NumBytesS = SE->getZeroExtendExpr(
        SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
        IntPtr);
  } else {
    // Otherwise widen first and add one afterwards.
    NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
                               SE->getOne(IntPtr), SCEV::FlagNUW);
  }

  // And scale it based on the store size.
  if (StoreSize != 1) {
    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
                               SCEV::FlagNUW);
  }
  return NumBytesS;
}
851 
852 /// processLoopStridedStore - We see a strided store of some value. If we can
853 /// transform this into a memset or memset_pattern in the loop preheader, do so.
854 bool LoopIdiomRecognize::processLoopStridedStore(
855  Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment,
856  Value *StoredVal, Instruction *TheStore,
858  const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
859  Value *SplatValue = isBytewiseValue(StoredVal);
860  Constant *PatternValue = nullptr;
861 
862  if (!SplatValue)
863  PatternValue = getMemSetPatternValue(StoredVal, DL);
864 
865  assert((SplatValue || PatternValue) &&
866  "Expected either splat value or pattern value.");
867 
868  // The trip count of the loop and the base pointer of the addrec SCEV is
869  // guaranteed to be loop invariant, which means that it should dominate the
870  // header. This allows us to insert code for it in the preheader.
871  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
872  BasicBlock *Preheader = CurLoop->getLoopPreheader();
873  IRBuilder<> Builder(Preheader->getTerminator());
874  SCEVExpander Expander(*SE, *DL, "loop-idiom");
875 
876  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
877  Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS);
878 
879  const SCEV *Start = Ev->getStart();
880  // Handle negative strided loops.
881  if (NegStride)
882  Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE);
883 
884  // TODO: ideally we should still be able to generate memset if SCEV expander
885  // is taught to generate the dependencies at the latest point.
886  if (!isSafeToExpand(Start, *SE))
887  return false;
888 
889  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
890  // this into a memset in the loop preheader now if we want. However, this
891  // would be unsafe to do if there is anything else in the loop that may read
892  // or write to the aliased location. Check for any overlap by generating the
893  // base pointer and checking the region.
894  Value *BasePtr =
895  Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
896  if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize,
897  *AA, Stores)) {
898  Expander.clear();
899  // If we generated new code for the base pointer, clean up.
901  return false;
902  }
903 
904  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
905  return false;
906 
907  // Okay, everything looks good, insert the memset.
908 
909  const SCEV *NumBytesS =
910  getNumBytes(BECount, IntPtr, StoreSize, CurLoop, DL, SE);
911 
912  // TODO: ideally we should still be able to generate memset if SCEV expander
913  // is taught to generate the dependencies at the latest point.
914  if (!isSafeToExpand(NumBytesS, *SE))
915  return false;
916 
917  Value *NumBytes =
918  Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
919 
920  CallInst *NewCall;
921  if (SplatValue) {
922  NewCall =
923  Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment);
924  } else {
925  // Everything is emitted in default address space
926  Type *Int8PtrTy = DestInt8PtrTy;
927 
928  Module *M = TheStore->getModule();
929  Value *MSP =
930  M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(),
931  Int8PtrTy, Int8PtrTy, IntPtr);
932  inferLibFuncAttributes(*M->getFunction("memset_pattern16"), *TLI);
933 
934  // Otherwise we should form a memset_pattern16. PatternValue is known to be
935  // an constant array of 16-bytes. Plop the value into a mergable global.
936  GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
938  PatternValue, ".memset_pattern");
939  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
940  GV->setAlignment(16);
941  Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
942  NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
943  }
944 
945  DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
946  << " from store to: " << *Ev << " at: " << *TheStore << "\n");
947  NewCall->setDebugLoc(TheStore->getDebugLoc());
948 
949  // Okay, the memset has been formed. Zap the original store and anything that
950  // feeds into it.
951  for (auto *I : Stores)
953  ++NumMemSet;
954  return true;
955 }
956 
957 /// If the stored value is a strided load in the same loop with the same stride
958 /// this may be transformable into a memcpy. This kicks in for stuff like
959 /// for (i) A[i] = B[i];
960 bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
961  const SCEV *BECount) {
962  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
963 
964  Value *StorePtr = SI->getPointerOperand();
965  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
966  APInt Stride = getStoreStride(StoreEv);
967  unsigned StoreSize = getStoreSizeInBytes(SI, DL);
968  bool NegStride = StoreSize == -Stride;
969 
970  // The store must be feeding a non-volatile load.
971  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
972  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
973 
974  // See if the pointer expression is an AddRec like {base,+,1} on the current
975  // loop, which indicates a strided load. If we have something else, it's a
976  // random load we can't handle.
977  const SCEVAddRecExpr *LoadEv =
978  cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
979 
980  // The trip count of the loop and the base pointer of the addrec SCEV is
981  // guaranteed to be loop invariant, which means that it should dominate the
982  // header. This allows us to insert code for it in the preheader.
983  BasicBlock *Preheader = CurLoop->getLoopPreheader();
984  IRBuilder<> Builder(Preheader->getTerminator());
985  SCEVExpander Expander(*SE, *DL, "loop-idiom");
986 
987  const SCEV *StrStart = StoreEv->getStart();
988  unsigned StrAS = SI->getPointerAddressSpace();
989  Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS);
990 
991  // Handle negative strided loops.
992  if (NegStride)
993  StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE);
994 
995  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
996  // this into a memcpy in the loop preheader now if we want. However, this
997  // would be unsafe to do if there is anything else in the loop that may read
998  // or write the memory region we're storing to. This includes the load that
999  // feeds the stores. Check for an alias by generating the base address and
1000  // checking everything.
1001  Value *StoreBasePtr = Expander.expandCodeFor(
1002  StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
1003 
1005  Stores.insert(SI);
1006  if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
1007  StoreSize, *AA, Stores)) {
1008  Expander.clear();
1009  // If we generated new code for the base pointer, clean up.
1010  RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
1011  return false;
1012  }
1013 
1014  const SCEV *LdStart = LoadEv->getStart();
1015  unsigned LdAS = LI->getPointerAddressSpace();
1016 
1017  // Handle negative strided loops.
1018  if (NegStride)
1019  LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE);
1020 
1021  // For a memcpy, we have to make sure that the input array is not being
1022  // mutated by the loop.
1023  Value *LoadBasePtr = Expander.expandCodeFor(
1024  LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
1025 
1026  if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize,
1027  *AA, Stores)) {
1028  Expander.clear();
1029  // If we generated new code for the base pointer, clean up.
1031  RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
1032  return false;
1033  }
1034 
1035  if (avoidLIRForMultiBlockLoop())
1036  return false;
1037 
1038  // Okay, everything is safe, we can transform this!
1039 
1040  const SCEV *NumBytesS =
1041  getNumBytes(BECount, IntPtrTy, StoreSize, CurLoop, DL, SE);
1042 
1043  Value *NumBytes =
1044  Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
1045 
1046  unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
1047  CallInst *NewCall = nullptr;
1048  // Check whether to generate an unordered atomic memcpy:
1049  // If the load or store are atomic, then they must neccessarily be unordered
1050  // by previous checks.
1051  if (!SI->isAtomic() && !LI->isAtomic())
1052  NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, Align);
1053  else {
1054  // We cannot allow unaligned ops for unordered load/store, so reject
1055  // anything where the alignment isn't at least the element size.
1056  if (Align < StoreSize)
1057  return false;
1058 
1059  // If the element.atomic memcpy is not lowered into explicit
1060  // loads/stores later, then it will be lowered into an element-size
1061  // specific lib call. If the lib call doesn't exist for our store size, then
1062  // we shouldn't generate the memcpy.
1063  if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1064  return false;
1065 
1066  // Create the call.
1067  // Note that unordered atomic loads/stores are *required* by the spec to
1068  // have an alignment but non-atomic loads/stores may not.
1069  NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1070  StoreBasePtr, SI->getAlignment(), LoadBasePtr, LI->getAlignment(),
1071  NumBytes, StoreSize);
1072  }
1073  NewCall->setDebugLoc(SI->getDebugLoc());
1074 
1075  DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
1076  << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
1077  << " from store ptr=" << *StoreEv << " at: " << *SI << "\n");
1078 
1079  // Okay, the memcpy has been formed. Zap the original store and anything that
1080  // feeds into it.
1082  ++NumMemCpy;
1083  return true;
1084 }
1085 
1086 // When compiling for codesize we avoid idiom recognition for a multi-block loop
1087 // unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1088 //
1089 bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1090  bool IsLoopMemset) {
1091  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1092  if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) {
1093  DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
1094  << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
1095  << " avoided: multi-block top-level loop\n");
1096  return true;
1097  }
1098  }
1099 
1100  return false;
1101 }
1102 
1103 bool LoopIdiomRecognize::runOnNoncountableLoop() {
1104  return recognizePopcount() || recognizeAndInsertCTLZ();
1105 }
1106 
1107 /// Check if the given conditional branch is based on the comparison between
1108 /// a variable and zero, and if the variable is non-zero, the control yields to
1109 /// the loop entry. If the branch matches the behavior, the variable involved
1110 /// in the comparison is returned. This function will be called to see if the
1111 /// precondition and postcondition of the loop are in desirable form.
1112 static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) {
1113  if (!BI || !BI->isConditional())
1114  return nullptr;
1115 
1116  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1117  if (!Cond)
1118  return nullptr;
1119 
1120  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1121  if (!CmpZero || !CmpZero->isZero())
1122  return nullptr;
1123 
1124  ICmpInst::Predicate Pred = Cond->getPredicate();
1125  if ((Pred == ICmpInst::ICMP_NE && BI->getSuccessor(0) == LoopEntry) ||
1126  (Pred == ICmpInst::ICMP_EQ && BI->getSuccessor(1) == LoopEntry))
1127  return Cond->getOperand(0);
1128 
1129  return nullptr;
1130 }
1131 
1132 // Check if the recurrence variable `VarX` is in the right form to create
1133 // the idiom. Returns the value coerced to a PHINode if so.
1135  BasicBlock *LoopEntry) {
1136  auto *PhiX = dyn_cast<PHINode>(VarX);
1137  if (PhiX && PhiX->getParent() == LoopEntry &&
1138  (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1139  return PhiX;
1140  return nullptr;
1141 }
1142 
/// Return true iff the idiom is detected in the loop.
///
/// Additionally:
/// 1) \p CntInst is set to the instruction counting the population bit.
/// 2) \p CntPhi is set to the corresponding phi node.
/// 3) \p Var is set to the value whose population bits are being counted.
///
/// The core idiom we are trying to detect is:
/// \code
///    if (x0 != 0)
///      goto loop-exit // the precondition of the loop
///    cnt0 = init-val;
///    do {
///       x1 = phi (x0, x2);
///       cnt1 = phi(cnt0, cnt2);
///
///       cnt2 = cnt1 + 1;
///        ...
///       x2 = x1 & (x1 - 1);
///        ...
///    } while(x != 0);
///
/// loop-exit:
/// \endcode
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
                                Instruction *&CntInst, PHINode *&CntPhi,
                                Value *&Var) {
  // step 1: Check to see if the look-back branch match this pattern:
  //    "if (a!=0) goto loop-entry".
  BasicBlock *LoopEntry;
  Instruction *DefX2, *CountInst;
  Value *VarX1, *VarX0;
  PHINode *PhiX, *CountPhi;

  DefX2 = CountInst = nullptr;
  VarX1 = VarX0 = nullptr;
  PhiX = CountPhi = nullptr;
  // Single-block loop: the header is also the latch.
  LoopEntry = *(CurLoop->block_begin());

  // step 1: Check if the loop-back branch is in desirable form.
  {
    // The value compared against zero in the latch branch must be the
    // "x2 = x1 & (x1 - 1)" definition we look for in step 2.
    if (Value *T = matchCondition(
            dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
      DefX2 = dyn_cast<Instruction>(T);
    else
      return false;
  }

  // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
  {
    if (!DefX2 || DefX2->getOpcode() != Instruction::And)
      return false;

    BinaryOperator *SubOneOp;

    // The "x1 - 1" operand may be on either side of the 'and'; the other
    // operand is the recurrence variable x1.
    if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
      VarX1 = DefX2->getOperand(1);
    else {
      VarX1 = DefX2->getOperand(0);
      SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
    }
    if (!SubOneOp)
      return false;

    // Accept both "x1 - 1" and the equivalent "x1 + (-1)".
    Instruction *SubInst = cast<Instruction>(SubOneOp);
    ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1));
    if (!Dec ||
        !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) ||
          (SubInst->getOpcode() == Instruction::Add &&
           Dec->isMinusOne()))) {
      return false;
    }
  }

  // step 3: Check the recurrence of variable X
  PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
  if (!PhiX)
    return false;

  // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
  {
    CountInst = nullptr;
    // Scan the non-PHI instructions of the body for "cnt2 = cnt1 + 1" where
    // cnt1 is itself a recurrence PHI fed by cnt2.
    for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
                              IterE = LoopEntry->end();
         Iter != IterE; Iter++) {
      Instruction *Inst = &*Iter;
      if (Inst->getOpcode() != Instruction::Add)
        continue;

      ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
      if (!Inc || !Inc->isOne())
        continue;

      PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
      if (!Phi)
        continue;

      // Check if the result of the instruction is live of the loop.
      bool LiveOutLoop = false;
      for (User *U : Inst->users()) {
        if ((cast<Instruction>(U))->getParent() != LoopEntry) {
          LiveOutLoop = true;
          break;
        }
      }

      // Only a counter whose value is consumed outside the loop qualifies.
      if (LiveOutLoop) {
        CountInst = Inst;
        CountPhi = Phi;
        break;
      }
    }

    if (!CountInst)
      return false;
  }

  // step 5: check if the precondition is in this form:
  //   "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
  {
    // The value guarded by the precondition must be one of the incoming
    // values of the x-recurrence PHI (i.e. the initial x0).
    auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
    Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
    if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
      return false;

    CntInst = CountInst;
    CntPhi = CountPhi;
    Var = T;
  }

  return true;
}
1275 
/// Return true if the idiom is detected in the loop.
///
/// Additionally:
/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
///    or nullptr if there is no such.
/// 2) \p CntPhi is set to the corresponding phi node
///    or nullptr if there is no such.
/// 3) \p Var is set to the value whose CTLZ could be used.
/// 4) \p DefX is set to the instruction calculating Loop exit condition.
///
/// The core idiom we are trying to detect is:
/// \code
///    if (x0 == 0)
///      goto loop-exit // the precondition of the loop
///    cnt0 = init-val;
///    do {
///       x = phi (x0, x.next);   //PhiX
///       cnt = phi(cnt0, cnt.next);
///
///       cnt.next = cnt + 1;
///        ...
///       x.next = x >> 1;   // DefX
///        ...
///    } while(x.next != 0);
///
/// loop-exit:
/// \endcode
static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
                            Instruction *&CntInst, PHINode *&CntPhi,
                            Instruction *&DefX) {
  BasicBlock *LoopEntry;
  Value *VarX = nullptr;

  DefX = nullptr;
  PhiX = nullptr;
  CntInst = nullptr;
  CntPhi = nullptr;
  // Single-block loop: the header is also the latch.
  LoopEntry = *(CurLoop->block_begin());

  // step 1: Check if the loop-back branch is in desirable form.
  // The value compared against zero in the latch branch must be the
  // "x.next = x >> 1" definition we check in step 2.
  if (Value *T = matchCondition(
          dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
    DefX = dyn_cast<Instruction>(T);
  else
    return false;

  // step 2: detect instructions corresponding to "x.next = x >> 1"
  // Only an arithmetic shift right by exactly one is accepted here.
  if (!DefX || DefX->getOpcode() != Instruction::AShr)
    return false;
  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
  if (!Shft || !Shft->isOne())
    return false;
  VarX = DefX->getOperand(0);

  // step 3: Check the recurrence of variable X
  PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
  if (!PhiX)
    return false;

  // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
  //         TODO: We can skip the step. If loop trip count is known (CTLZ),
  //               then all uses of "cnt.next" could be optimized to the trip count
  //               plus "cnt0". Currently it is not optimized.
  //               This step could be used to detect POPCNT instruction:
  //               cnt.next = cnt + (x.next & 1)
  for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
                            IterE = LoopEntry->end();
       Iter != IterE; Iter++) {
    Instruction *Inst = &*Iter;
    if (Inst->getOpcode() != Instruction::Add)
      continue;

    ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
    if (!Inc || !Inc->isOne())
      continue;

    // The add must feed (and be fed by) a recurrence PHI in the header.
    PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
    if (!Phi)
      continue;

    CntInst = Inst;
    CntPhi = Phi;
    break;
  }
  if (!CntInst)
    return false;

  return true;
}
1365 
1366 /// Recognize CTLZ idiom in a non-countable loop and convert the loop
1367 /// to countable (with CTLZ trip count).
1368 /// If CTLZ inserted as a new trip count returns true; otherwise, returns false.
1369 bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
1370  // Give up if the loop has multiple blocks or multiple backedges.
1371  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1372  return false;
1373 
1374  Instruction *CntInst, *DefX;
1375  PHINode *CntPhi, *PhiX;
1376  if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX))
1377  return false;
1378 
1379  bool IsCntPhiUsedOutsideLoop = false;
1380  for (User *U : CntPhi->users())
1381  if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
1382  IsCntPhiUsedOutsideLoop = true;
1383  break;
1384  }
1385  bool IsCntInstUsedOutsideLoop = false;
1386  for (User *U : CntInst->users())
1387  if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
1388  IsCntInstUsedOutsideLoop = true;
1389  break;
1390  }
1391  // If both CntInst and CntPhi are used outside the loop the profitability
1392  // is questionable.
1393  if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1394  return false;
1395 
1396  // For some CPUs result of CTLZ(X) intrinsic is undefined
1397  // when X is 0. If we can not guarantee X != 0, we need to check this
1398  // when expand.
1399  bool ZeroCheck = false;
1400  // It is safe to assume Preheader exist as it was checked in
1401  // parent function RunOnLoop.
1402  BasicBlock *PH = CurLoop->getLoopPreheader();
1403  Value *InitX = PhiX->getIncomingValueForBlock(PH);
1404  // If we check X != 0 before entering the loop we don't need a zero
1405  // check in CTLZ intrinsic, but only if Cnt Phi is not used outside of the
1406  // loop (if it is used we count CTLZ(X >> 1)).
1407  if (!IsCntPhiUsedOutsideLoop)
1408  if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
1409  if (BranchInst *PreCondBr =
1410  dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
1411  if (matchCondition(PreCondBr, PH) == InitX)
1412  ZeroCheck = true;
1413  }
1414 
1415  // Check if CTLZ intrinsic is profitable. Assume it is always profitable
1416  // if we delete the loop (the loop has only 6 instructions):
1417  // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1418  // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1419  // %shr = ashr %n.addr.0, 1
1420  // %tobool = icmp eq %shr, 0
1421  // %inc = add nsw %i.0, 1
1422  // br i1 %tobool
1423 
1424  IRBuilder<> Builder(PH->getTerminator());
1426  {InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()};
1428  if (CurLoop->getHeader()->size() != 6 &&
1429  TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
1431  return false;
1432 
1433  const DebugLoc DL = DefX->getDebugLoc();
1434  transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck,
1435  IsCntPhiUsedOutsideLoop);
1436  return true;
1437 }
1438 
1439 /// Recognizes a population count idiom in a non-countable loop.
1440 ///
1441 /// If detected, transforms the relevant code to issue the popcount intrinsic
1442 /// function call, and returns true; otherwise, returns false.
1443 bool LoopIdiomRecognize::recognizePopcount() {
1445  return false;
1446 
1447  // Counting population are usually conducted by few arithmetic instructions.
1448  // Such instructions can be easily "absorbed" by vacant slots in a
1449  // non-compact loop. Therefore, recognizing popcount idiom only makes sense
1450  // in a compact loop.
1451 
1452  // Give up if the loop has multiple blocks or multiple backedges.
1453  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1454  return false;
1455 
1456  BasicBlock *LoopBody = *(CurLoop->block_begin());
1457  if (LoopBody->size() >= 20) {
1458  // The loop is too big, bail out.
1459  return false;
1460  }
1461 
1462  // It should have a preheader containing nothing but an unconditional branch.
1463  BasicBlock *PH = CurLoop->getLoopPreheader();
1464  if (!PH || &PH->front() != PH->getTerminator())
1465  return false;
1466  auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1467  if (!EntryBI || EntryBI->isConditional())
1468  return false;
1469 
1470  // It should have a precondition block where the generated popcount instrinsic
1471  // function can be inserted.
1472  auto *PreCondBB = PH->getSinglePredecessor();
1473  if (!PreCondBB)
1474  return false;
1475  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1476  if (!PreCondBI || PreCondBI->isUnconditional())
1477  return false;
1478 
1479  Instruction *CntInst;
1480  PHINode *CntPhi;
1481  Value *Val;
1482  if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1483  return false;
1484 
1485  transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1486  return true;
1487 }
1488 
1490  const DebugLoc &DL) {
1491  Value *Ops[] = {Val};
1492  Type *Tys[] = {Val->getType()};
1493 
1494  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1495  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1496  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1497  CI->setDebugLoc(DL);
1498 
1499  return CI;
1500 }
1501 
1503  const DebugLoc &DL, bool ZeroCheck) {
1504  Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
1505  Type *Tys[] = {Val->getType()};
1506 
1507  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1508  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys);
1509  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1510  CI->setDebugLoc(DL);
1511 
1512  return CI;
1513 }
1514 
/// Transform the following loop:
/// loop:
///   CntPhi = PHI [Cnt0, CntInst]
///   PhiX = PHI [InitX, DefX]
///   CntInst = CntPhi + 1
///   DefX = PhiX >> 1
///   LOOP_BODY
///   Br: loop if (DefX != 0)
/// Use(CntPhi) or Use(CntInst)
///
/// Into:
/// If CntPhi used outside the loop:
///   CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
///   Count = CountPrev + 1
/// else
///   Count = BitWidth(InitX) - CTLZ(InitX)
/// loop:
///   CntPhi = PHI [Cnt0, CntInst]
///   PhiX = PHI [InitX, DefX]
///   PhiCount = PHI [Count, Dec]
///   CntInst = CntPhi + 1
///   DefX = PhiX >> 1
///   Dec = PhiCount - 1
///   LOOP_BODY
///   Br: loop if (Dec != 0)
/// Use(CountPrev + Cnt0) // Use(CntPhi)
/// or
/// Use(Count + Cnt0) // Use(CntInst)
///
/// If LOOP_BODY is empty the loop will be deleted.
/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
void LoopIdiomRecognize::transformLoopToCountable(
    BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX,
    const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
  BranchInst *PreheaderBr = dyn_cast<BranchInst>(Preheader->getTerminator());

  // Step 1: Insert the CTLZ instruction at the end of the preheader block
  //   Count = BitWidth - CTLZ(InitX);
  // If there are uses of CntPhi create:
  //   CountPrev = BitWidth - CTLZ(InitX >> 1);
  IRBuilder<> Builder(PreheaderBr);
  Builder.SetCurrentDebugLocation(DL);
  Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext;

  // If CntPhi (the pre-increment count) is the live-out value, count the
  // leading zeros of InitX >> 1 instead of InitX, matching one fewer
  // iteration.
  if (IsCntPhiUsedOutsideLoop)
    InitXNext = Builder.CreateAShr(InitX,
                                   ConstantInt::get(InitX->getType(), 1));
  else
    InitXNext = InitX;
  CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck);
  Count = Builder.CreateSub(
      ConstantInt::get(CTLZ->getType(),
                       CTLZ->getType()->getIntegerBitWidth()),
      CTLZ);
  if (IsCntPhiUsedOutsideLoop) {
    CountPrev = Count;
    Count = Builder.CreateAdd(
        CountPrev,
        ConstantInt::get(CountPrev->getType(), 1));
  }
  // NewCount replaces the counter's out-of-loop uses; widen/narrow it to the
  // counter's own type.
  if (IsCntPhiUsedOutsideLoop)
    NewCount = Builder.CreateZExtOrTrunc(CountPrev,
        cast<IntegerType>(CntInst->getType()));
  else
    NewCount = Builder.CreateZExtOrTrunc(Count,
        cast<IntegerType>(CntInst->getType()));

  // If the CTLZ counter's initial value is not zero, insert Add Inst.
  Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
  if (!InitConst || !InitConst->isZero())
    NewCount = Builder.CreateAdd(NewCount, CntInitVal);

  // Step 2: Insert new IV and loop condition:
  // loop:
  //   ...
  //   PhiCount = PHI [Count, Dec]
  //   ...
  //   Dec = PhiCount - 1
  //   ...
  //   Br: loop if (Dec != 0)
  BasicBlock *Body = *(CurLoop->block_begin());
  // NOTE(review): LbBr is used without a null check; presumably the caller
  // guarantees the latch terminator is a conditional branch — confirm.
  auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
  Type *Ty = Count->getType();

  PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());

  Builder.SetInsertPoint(LbCond);
  Instruction *TcDec = cast<Instruction>(
      Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
                        "tcdec", false, true));

  TcPhi->addIncoming(Count, Preheader);
  TcPhi->addIncoming(TcDec, Body);

  // Rewrite the latch compare to test the new down-counting IV instead of
  // the original exit condition.
  CmpInst::Predicate Pred =
      (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
  LbCond->setPredicate(Pred);
  LbCond->setOperand(0, TcDec);
  LbCond->setOperand(1, ConstantInt::get(Ty, 0));

  // Step 3: All the references to the original counter outside
  //  the loop are replaced with the NewCount -- the value returned from
  //  __builtin_ctlz(x).
  if (IsCntPhiUsedOutsideLoop)
    CntPhi->replaceUsesOutsideBlock(NewCount, Body);
  else
    CntInst->replaceUsesOutsideBlock(NewCount, Body);

  // step 4: Forget the "non-computable" trip-count SCEV associated with the
  //   loop. The loop would otherwise not be deleted even if it becomes empty.
  SE->forgetLoop(CurLoop);
}
1629 
1630 void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
1631  Instruction *CntInst,
1632  PHINode *CntPhi, Value *Var) {
1633  BasicBlock *PreHead = CurLoop->getLoopPreheader();
1634  auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1635  const DebugLoc DL = CntInst->getDebugLoc();
1636 
1637  // Assuming before transformation, the loop is following:
1638  // if (x) // the precondition
1639  // do { cnt++; x &= x - 1; } while(x);
1640 
1641  // Step 1: Insert the ctpop instruction at the end of the precondition block
1642  IRBuilder<> Builder(PreCondBr);
1643  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
1644  {
1645  PopCnt = createPopcntIntrinsic(Builder, Var, DL);
1646  NewCount = PopCntZext =
1647  Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
1648 
1649  if (NewCount != PopCnt)
1650  (cast<Instruction>(NewCount))->setDebugLoc(DL);
1651 
1652  // TripCnt is exactly the number of iterations the loop has
1653  TripCnt = NewCount;
1654 
1655  // If the population counter's initial value is not zero, insert Add Inst.
1656  Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
1657  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1658  if (!InitConst || !InitConst->isZero()) {
1659  NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1660  (cast<Instruction>(NewCount))->setDebugLoc(DL);
1661  }
1662  }
1663 
1664  // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
1665  // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
1666  // function would be partial dead code, and downstream passes will drag
1667  // it back from the precondition block to the preheader.
1668  {
1669  ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
1670 
1671  Value *Opnd0 = PopCntZext;
1672  Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
1673  if (PreCond->getOperand(0) != Var)
1674  std::swap(Opnd0, Opnd1);
1675 
1676  ICmpInst *NewPreCond = cast<ICmpInst>(
1677  Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
1678  PreCondBr->setCondition(NewPreCond);
1679 
1681  }
1682 
1683  // Step 3: Note that the population count is exactly the trip count of the
1684  // loop in question, which enable us to to convert the loop from noncountable
1685  // loop into a countable one. The benefit is twofold:
1686  //
1687  // - If the loop only counts population, the entire loop becomes dead after
1688  // the transformation. It is a lot easier to prove a countable loop dead
1689  // than to prove a noncountable one. (In some C dialects, an infinite loop
1690  // isn't dead even if it computes nothing useful. In general, DCE needs
1691  // to prove a noncountable loop finite before safely delete it.)
1692  //
1693  // - If the loop also performs something else, it remains alive.
1694  // Since it is transformed to countable form, it can be aggressively
1695  // optimized by some optimizations which are in general not applicable
1696  // to a noncountable loop.
1697  //
1698  // After this step, this loop (conceptually) would look like following:
1699  // newcnt = __builtin_ctpop(x);
1700  // t = newcnt;
1701  // if (x)
1702  // do { cnt++; x &= x-1; t--) } while (t > 0);
1703  BasicBlock *Body = *(CurLoop->block_begin());
1704  {
1705  auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
1706  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1707  Type *Ty = TripCnt->getType();
1708 
1709  PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1710 
1711  Builder.SetInsertPoint(LbCond);
1712  Instruction *TcDec = cast<Instruction>(
1713  Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1714  "tcdec", false, true));
1715 
1716  TcPhi->addIncoming(TripCnt, PreHead);
1717  TcPhi->addIncoming(TcDec, Body);
1718 
1719  CmpInst::Predicate Pred =
1720  (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
1721  LbCond->setPredicate(Pred);
1722  LbCond->setOperand(0, TcDec);
1723  LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1724  }
1725 
1726  // Step 4: All the references to the original population counter outside
1727  // the loop are replaced with the NewCount -- the value returned from
1728  // __builtin_ctpop().
1729  CntInst->replaceUsesOutsideBlock(NewCount, Body);
1730 
1731  // step 5: Forget the "non-computable" trip-count SCEV associated with the
1732  // loop. The loop would otherwise not be deleted even if it becomes empty.
1733  SE->forgetLoop(CurLoop);
1734 }
Pass interface - Implemented by all &#39;passes&#39;.
Definition: Pass.h:81
uint64_t CallInst * C
Value * getValueOperand()
Definition: Instructions.h:395
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks &#39;this&#39; from the containing basic block and deletes it.
Definition: Instruction.cpp:69
void push_back(const T &Elt)
Definition: SmallVector.h:212
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:109
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1638
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
static CallInst * createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL)
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:1395
const SCEV * getConstant(ConstantInt *V)
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
Value * isBytewiseValue(Value *V)
If the specified value can be set by repeating the same byte in memory, return the i8 value that it i...
bool isAtomic() const
Return true if this instruction has an AtomicOrdering of unordered or higher.
Constant * getOrInsertFunction(StringRef Name, FunctionType *T, AttributeList AttributeList)
Look up the specified function in the module symbol table.
Definition: Module.cpp:142
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:63
LLVM_ATTRIBUTE_ALWAYS_INLINE size_type size() const
Definition: SmallVector.h:136
bool isUnordered() const
Definition: Instructions.h:389
The main scalar evolution driver.
static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX, Instruction *&CntInst, PHINode *&CntPhi, Instruction *&DefX)
Return true if the idiom is detected in the loop.
This class represents a function call, abstracting a target machine's calling convention.
static PHINode * getRecurrenceVar(Value *VarX, Instruction *DefX, BasicBlock *LoopEntry)
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:106
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:57
This class wraps the llvm.memset intrinsic.
BasicBlock * getSuccessor(unsigned i) const
STATISTIC(NumFunctions, "Total number of functions")
A debug info location.
Definition: DebugLoc.h:34
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:503
An instruction for reading from memory.
Definition: Instructions.h:164
Value * getCondition() const
Value * getLength() const
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:93
The access modifies the value stored in memory.
static Constant * get(ArrayType *T, ArrayRef< Constant *> V)
Definition: Constants.cpp:888
Value * getDest() const
This is just like getRawDest, but it strips off any cast instructions (including addrspacecast) that ...
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:252
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const
Return hardware support for population count.
AnalysisUsage & addRequired()
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:116
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:51
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Definition: Instructions.h:217
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
const DataLayout & getDataLayout() const
Get the data layout for the module&#39;s target platform.
Definition: Module.cpp:361
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Definition: LoopInfo.h:659
static CallInst * createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL, bool ZeroCheck)
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:668
This file contains the simple types necessary to represent the attributes associated with functions a...
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:893
This file implements a class to represent arbitrary precision integral constant values and operations...
const APInt & getAPInt() const
BlockT * getHeader() const
Definition: LoopInfo.h:100
void computeLoopSafetyInfo(LoopSafetyInfo *, Loop *)
Computes safety information for a loop checks loop body & header for the possibility of may throw exc...
Definition: LICM.cpp:493
static bool isSimple(Instruction *I)
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:201
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:404
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:245
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:182
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:142
This node represents a polynomial recurrence on the trip count of the specified loop.
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv)
#define T
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:122
Class to represent array types.
Definition: DerivedTypes.h:369
static cl::opt< bool > UseLIRCodeSizeHeurs("use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling" "with -Os/-Oz"), cl::init(true), cl::Hidden)
bool has(LibFunc F) const
Tests whether a library function is available.
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:83
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:194
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory)...
Definition: APInt.h:33
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:125
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:915
void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry &)
An instruction for storing to memory.
Definition: Instructions.h:306
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:430
static void deleteDeadInstruction(Instruction *I)
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:140
Function * getDeclaration(Module *M, ID id, ArrayRef< Type *> Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:980
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block...
Definition: IRBuilder.h:128
Value * getOperand(unsigned i) const
Definition: User.h:154
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:211
bool isLoopEntryGuardedByCond(const Loop *L, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test whether entry to the loop is protected by a conditional between LHS and RHS. ...
static Constant * getBitCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:1678
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
bool inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI)
Analyze the name and prototype of the given function and set any applicable attributes.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:406
const SCEV * getOperand(unsigned i) const
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:171
Wrapper pass for TargetTransformInfo.
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:153
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:281
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:217
LLVM Basic Block Representation.
Definition: BasicBlock.h:59
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:46
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:288
Conditional or Unconditional Branch instruction.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This is an important base class in LLVM.
Definition: Constant.h:42
Value * getIncomingValueForBlock(const BasicBlock *BB) const
This file contains the declarations for the subclasses of Constant, which represent the different fla...
const Instruction & front() const
Definition: BasicBlock.h:264
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:371
const SCEV * getAddExpr(SmallVectorImpl< const SCEV *> &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
size_t size() const
Definition: BasicBlock.h:262
bool isUnordered() const
Definition: Instructions.h:264
Represent the analysis usage information of a pass.
bool optForSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:530
static Value * matchCondition(BranchInst *BI, BasicBlock *LoopEntry)
Check if the given conditional branch is based on the comparison between a variable and zero...
This instruction compares its operands according to the predicate given to the constructor.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:853
std::vector< BasicBlock *>::const_iterator block_iterator
Definition: LoopInfo.h:153
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values...
Value * getPointerOperand()
Definition: Instructions.h:270
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:382
self_iterator getIterator()
Definition: ilist_node.h:82
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS. Minus is represented in SCEV as A+B*-1.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1320
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr)
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:401
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:159
static PointerType * getInt8PtrTy(LLVMContext &C, unsigned AS=0)
Definition: Type.cpp:220
bool isVolatile() const
const SCEV * getMulExpr(SmallVectorImpl< const SCEV *> &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
Value * GetUnderlyingObject(Value *V, const DataLayout &DL, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value...
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
Representation for a specific memory location.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
typename vector_type::const_iterator iterator
Definition: SetVector.h:49
Iterator for intrusive lists based on ilist_node.
This is the shared class of boolean and integer constants.
Definition: Constants.h:84
Type * getType() const
Return the LLVM type of this SCEV expression.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
iterator end()
Definition: BasicBlock.h:254
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:864
bool dominates(const Instruction *Def, const Use &U) const
Return true if Def dominates a use in User.
Definition: Dominators.cpp:239
Module.h This file contains the declarations for the Module class.
Provides information about what library functions are available for the current target.
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:560
bool isConditional() const
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static Constant * getMemSetPatternValue(Value *V, const DataLayout *DL)
getMemSetPatternValue - If a strided store of the specified value is safe to turn into a memset_patte...
static const SCEV * getNumBytes(const SCEV *BECount, Type *IntPtr, unsigned StoreSize, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute the number of bytes as a SCEV from the backedge taken count.
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, Instruction *&CntInst, PHINode *&CntPhi, Value *&Var)
Return true iff the idiom is detected in the loop.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:172
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:923
signed less or equal
Definition: InstrTypes.h:883
loop Recognize loop idioms
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
Definition: Instruction.cpp:57
Class for arbitrary precision integers.
Definition: APInt.h:69
loop idiom
ModRefInfo getModRefInfo(ImmutableCallSite CS, const MemoryLocation &Loc)
getModRefInfo (for call sites) - Return information about whether a particular call site modifies or ...
iterator_range< user_iterator > users()
Definition: Value.h:401
This class uses information about analyze scalars to rewrite expressions in canonical form...
Pass * createLoopIdiomPass()
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:293
uint64_t getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:530
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:927
bool isVolatile() const
Return true if this is a store to a volatile memory location.
Definition: Instructions.h:339
Captures loop safety information.
Definition: LoopUtils.h:51
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:284
INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom", "Recognize loop idioms", false, false) INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
void setUnnamedAddr(UnnamedAddr Val)
Definition: GlobalValue.h:208
unsigned getAtomicMemIntrinsicMaxElementSize() const
unsigned getAlignment() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:226
This class represents an analyzed expression in the program.
unsigned getIntegerBitWidth() const
Definition: DerivedTypes.h:97
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:420
This file provides utility analysis objects describing memory locations.
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:220
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:108
The access both references and modifies the value stored in memory.
#define I(x, y, z)
Definition: MD5.cpp:58
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:193
void getLoopAnalysisUsage(AnalysisUsage &AU)
Helper to consistently add the set of standard passes to a loop pass's AnalysisUsage.
Definition: LoopUtils.cpp:1023
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:568
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:323
block_iterator block_end() const
Definition: LoopInfo.h:155
Value * getValue() const
Return the arguments to the instruction.
unsigned getAlignment() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:351
unsigned getAlignment() const
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:276
static const SCEV * getStartForNegStride(const SCEV *Start, const SCEV *BECount, Type *IntPtr, unsigned StoreSize, ScalarEvolution *SE)
const SCEV * getBackedgeTakenCount(const Loop *L)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
The cost of a typical &#39;add&#39; instruction.
int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, ArrayRef< Type *> ParamTys) const
Estimate the cost of an intrinsic when lowered.
const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:556
LLVM Value Representation.
Definition: Value.h:73
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
A vector that has set insertion semantics.
Definition: SetVector.h:41
constexpr char Size[]
Key for Kernel::Arg::Metadata::mSize.
static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, unsigned StoreSize, AliasAnalysis &AA, SmallPtrSetImpl< Instruction *> &IgnoredStores)
mayLoopAccessLocation - Return true if the specified loop might access the specified pointer location...
#define DEBUG(X)
Definition: Debug.h:118
static unsigned getStoreSizeInBytes(StoreInst *SI, const DataLayout *DL)
unsigned greater than
Definition: InstrTypes.h:876
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:49
A container for analyses that lazily runs them and caches their results.
void replaceUsesOutsideBlock(Value *V, BasicBlock *BB)
replaceUsesOutsideBlock - Go through the uses list for this definition and make each use point to "V"...
Definition: Value.cpp:440
const SCEV * getTruncateOrZeroExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
This pass exposes codegen information to IR-level passes.
bool isSimple() const
Definition: Instructions.h:387
const TerminatorInst * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:120
This header defines various interfaces for pass management in LLVM.
bool isBigEndian() const
Definition: DataLayout.h:217
bool isSafeToExpand(const SCEV *S, ScalarEvolution &SE)
Return true if the given expression is safe to expand in the sense that all materialized values are s...
block_iterator block_begin() const
Definition: LoopInfo.h:154
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count...
Value * getPointerOperand()
Definition: Instructions.h:398
const SCEV * getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
This class represents a constant integer value.
CallInst * CreateCall(Value *Callee, ArrayRef< Value *> Args=None, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1663