LLVM  7.0.0svn
LoopIdiomRecognize.cpp
Go to the documentation of this file.
1 //===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass implements an idiom recognizer that transforms simple loops into a
11 // non-loop form. In cases that this kicks in, it can be a significant
12 // performance win.
13 //
14 // If compiling for code size we avoid idiom recognition if the resulting
15 // code could be larger than the code for the original loop. One way this could
16 // happen is if the loop is not removable after idiom recognition due to the
17 // presence of non-idiom instructions. The initial implementation of the
18 // heuristics applies to idioms in multi-block loops.
19 //
20 //===----------------------------------------------------------------------===//
21 //
22 // TODO List:
23 //
24 // Future loop memory idioms to recognize:
25 // memcmp, memmove, strlen, etc.
26 // Future floating point idioms to recognize in -ffast-math mode:
27 // fpowi
28 // Future integer operation idioms to recognize:
29 // ctpop, ctlz, cttz
30 //
31 // Beware that isel's default lowering for ctpop is highly inefficient for
32 // i64 and larger types when i64 is legal and the value has few bits set. It
33 // would be good to enhance isel to emit a loop for ctpop in this case.
34 //
35 // This could recognize common matrix multiplies and dot product idioms and
36 // replace them with calls to BLAS (if linked in??).
37 //
38 //===----------------------------------------------------------------------===//
39 
40 #include "llvm/ADT/APInt.h"
41 #include "llvm/ADT/ArrayRef.h"
42 #include "llvm/ADT/DenseMap.h"
43 #include "llvm/ADT/MapVector.h"
44 #include "llvm/ADT/SetVector.h"
45 #include "llvm/ADT/SmallPtrSet.h"
46 #include "llvm/ADT/SmallVector.h"
47 #include "llvm/ADT/Statistic.h"
48 #include "llvm/ADT/StringRef.h"
51 #include "llvm/Analysis/LoopInfo.h"
52 #include "llvm/Analysis/LoopPass.h"
61 #include "llvm/IR/Attributes.h"
62 #include "llvm/IR/BasicBlock.h"
63 #include "llvm/IR/Constant.h"
64 #include "llvm/IR/Constants.h"
65 #include "llvm/IR/DataLayout.h"
66 #include "llvm/IR/DebugLoc.h"
67 #include "llvm/IR/DerivedTypes.h"
68 #include "llvm/IR/Dominators.h"
69 #include "llvm/IR/GlobalValue.h"
70 #include "llvm/IR/GlobalVariable.h"
71 #include "llvm/IR/IRBuilder.h"
72 #include "llvm/IR/InstrTypes.h"
73 #include "llvm/IR/Instruction.h"
74 #include "llvm/IR/Instructions.h"
75 #include "llvm/IR/IntrinsicInst.h"
76 #include "llvm/IR/Intrinsics.h"
77 #include "llvm/IR/LLVMContext.h"
78 #include "llvm/IR/Module.h"
79 #include "llvm/IR/PassManager.h"
80 #include "llvm/IR/Type.h"
81 #include "llvm/IR/User.h"
82 #include "llvm/IR/Value.h"
83 #include "llvm/IR/ValueHandle.h"
84 #include "llvm/Pass.h"
85 #include "llvm/Support/Casting.h"
87 #include "llvm/Support/Debug.h"
89 #include "llvm/Transforms/Scalar.h"
93 #include <algorithm>
94 #include <cassert>
95 #include <cstdint>
96 #include <utility>
97 #include <vector>
98 
99 using namespace llvm;
100 
101 #define DEBUG_TYPE "loop-idiom"
102 
103 STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
104 STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
105 
107  "use-lir-code-size-heurs",
108  cl::desc("Use loop idiom recognition code size heuristics when compiling"
109  "with -Os/-Oz"),
110  cl::init(true), cl::Hidden);
111 
112 namespace {
113 
114 class LoopIdiomRecognize {
115  Loop *CurLoop = nullptr;
116  AliasAnalysis *AA;
117  DominatorTree *DT;
118  LoopInfo *LI;
119  ScalarEvolution *SE;
120  TargetLibraryInfo *TLI;
121  const TargetTransformInfo *TTI;
122  const DataLayout *DL;
123  bool ApplyCodeSizeHeuristics;
124 
125 public:
126  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
127  LoopInfo *LI, ScalarEvolution *SE,
128  TargetLibraryInfo *TLI,
129  const TargetTransformInfo *TTI,
130  const DataLayout *DL)
131  : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL) {}
132 
133  bool runOnLoop(Loop *L);
134 
135 private:
136  using StoreList = SmallVector<StoreInst *, 8>;
137  using StoreListMap = MapVector<Value *, StoreList>;
138 
139  StoreListMap StoreRefsForMemset;
140  StoreListMap StoreRefsForMemsetPattern;
141  StoreList StoreRefsForMemcpy;
142  bool HasMemset;
143  bool HasMemsetPattern;
144  bool HasMemcpy;
145 
146  /// Return code for isLegalStore()
147  enum LegalStoreKind {
148  None = 0,
149  Memset,
150  MemsetPattern,
151  Memcpy,
152  UnorderedAtomicMemcpy,
153  DontUse // Dummy retval never to be used. Allows catching errors in retval
154  // handling.
155  };
156 
157  /// \name Countable Loop Idiom Handling
158  /// @{
159 
160  bool runOnCountableLoop();
161  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
162  SmallVectorImpl<BasicBlock *> &ExitBlocks);
163 
164  void collectStores(BasicBlock *BB);
165  LegalStoreKind isLegalStore(StoreInst *SI);
166  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
167  bool ForMemset);
168  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
169 
170  bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
171  unsigned StoreAlignment, Value *StoredVal,
172  Instruction *TheStore,
174  const SCEVAddRecExpr *Ev, const SCEV *BECount,
175  bool NegStride, bool IsLoopMemset = false);
176  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
177  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
178  bool IsLoopMemset = false);
179 
180  /// @}
181  /// \name Noncountable Loop Idiom Handling
182  /// @{
183 
184  bool runOnNoncountableLoop();
185 
186  bool recognizePopcount();
187  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
188  PHINode *CntPhi, Value *Var);
189  bool recognizeAndInsertCTLZ();
190  void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
191  PHINode *CntPhi, Value *Var, const DebugLoc DL,
192  bool ZeroCheck, bool IsCntPhiUsedOutsideLoop);
193 
194  /// @}
195 };
196 
197 class LoopIdiomRecognizeLegacyPass : public LoopPass {
198 public:
199  static char ID;
200 
201  explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
204  }
205 
206  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
207  if (skipLoop(L))
208  return false;
209 
210  AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
211  DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
212  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
213  ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
214  TargetLibraryInfo *TLI =
215  &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
216  const TargetTransformInfo *TTI =
217  &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
218  *L->getHeader()->getParent());
219  const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
220 
221  LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL);
222  return LIR.runOnLoop(L);
223  }
224 
225  /// This transformation requires natural loop information & requires that
226  /// loop preheaders be inserted into the CFG.
227  void getAnalysisUsage(AnalysisUsage &AU) const override {
231  }
232 };
233 
234 } // end anonymous namespace
235 
237 
240  LPMUpdater &) {
241  const auto *DL = &L.getHeader()->getModule()->getDataLayout();
242 
243  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL);
244  if (!LIR.runOnLoop(&L))
245  return PreservedAnalyses::all();
246 
248 }
249 
250 INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
251  "Recognize loop idioms", false, false)
255 INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
256  "Recognize loop idioms", false, false)
257 
258 Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
259 
262  I->eraseFromParent();
263 }
264 
265 //===----------------------------------------------------------------------===//
266 //
267 // Implementation of LoopIdiomRecognize
268 //
269 //===----------------------------------------------------------------------===//
270 
271 bool LoopIdiomRecognize::runOnLoop(Loop *L) {
272  CurLoop = L;
273  // If the loop could not be converted to canonical form, it must have an
274  // indirectbr in it, just give up.
275  if (!L->getLoopPreheader())
276  return false;
277 
278  // Disable loop idiom recognition if the function's name is a common idiom.
280  if (Name == "memset" || Name == "memcpy")
281  return false;
282 
283  // Determine if code size heuristics need to be applied.
284  ApplyCodeSizeHeuristics =
286 
287  HasMemset = TLI->has(LibFunc_memset);
288  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
289  HasMemcpy = TLI->has(LibFunc_memcpy);
290 
291  if (HasMemset || HasMemsetPattern || HasMemcpy)
293  return runOnCountableLoop();
294 
295  return runOnNoncountableLoop();
296 }
297 
298 bool LoopIdiomRecognize::runOnCountableLoop() {
299  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
300  assert(!isa<SCEVCouldNotCompute>(BECount) &&
301  "runOnCountableLoop() called on a loop without a predictable"
302  "backedge-taken count");
303 
304  // If this loop executes exactly one time, then it should be peeled, not
305  // optimized by this pass.
306  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
307  if (BECst->getAPInt() == 0)
308  return false;
309 
310  SmallVector<BasicBlock *, 8> ExitBlocks;
311  CurLoop->getUniqueExitBlocks(ExitBlocks);
312 
313  LLVM_DEBUG(dbgs() << "loop-idiom Scanning: F["
314  << CurLoop->getHeader()->getParent()->getName()
315  << "] Loop %" << CurLoop->getHeader()->getName() << "\n");
316 
317  bool MadeChange = false;
318 
319  // The following transforms hoist stores/memsets into the loop pre-header.
320  // Give up if the loop has instructions may throw.
321  LoopSafetyInfo SafetyInfo;
322  computeLoopSafetyInfo(&SafetyInfo, CurLoop);
323  if (SafetyInfo.MayThrow)
324  return MadeChange;
325 
326  // Scan all the blocks in the loop that are not in subloops.
327  for (auto *BB : CurLoop->getBlocks()) {
328  // Ignore blocks in subloops.
329  if (LI->getLoopFor(BB) != CurLoop)
330  continue;
331 
332  MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
333  }
334  return MadeChange;
335 }
336 
337 static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
338  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
339  return ConstStride->getAPInt();
340 }
341 
342 /// getMemSetPatternValue - If a strided store of the specified value is safe to
343 /// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
344 /// be passed in. Otherwise, return null.
345 ///
346 /// Note that we don't ever attempt to use memset_pattern8 or 4, because these
347 /// just replicate their input array and then pass on to memset_pattern16.
349  // If the value isn't a constant, we can't promote it to being in a constant
350  // array. We could theoretically do a store to an alloca or something, but
351  // that doesn't seem worthwhile.
352  Constant *C = dyn_cast<Constant>(V);
353  if (!C)
354  return nullptr;
355 
356  // Only handle simple values that are a power of two bytes in size.
357  uint64_t Size = DL->getTypeSizeInBits(V->getType());
358  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
359  return nullptr;
360 
361  // Don't care enough about darwin/ppc to implement this.
362  if (DL->isBigEndian())
363  return nullptr;
364 
365  // Convert to size in bytes.
366  Size /= 8;
367 
368  // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
369  // if the top and bottom are the same (e.g. for vectors and large integers).
370  if (Size > 16)
371  return nullptr;
372 
373  // If the constant is exactly 16 bytes, just use it.
374  if (Size == 16)
375  return C;
376 
377  // Otherwise, we'll use an array of the constants.
378  unsigned ArraySize = 16 / Size;
379  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
380  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
381 }
382 
383 LoopIdiomRecognize::LegalStoreKind
384 LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
385  // Don't touch volatile stores.
386  if (SI->isVolatile())
387  return LegalStoreKind::None;
388  // We only want simple or unordered-atomic stores.
389  if (!SI->isUnordered())
390  return LegalStoreKind::None;
391 
392  // Don't convert stores of non-integral pointer types to memsets (which stores
393  // integers).
394  if (DL->isNonIntegralPointerType(SI->getValueOperand()->getType()))
395  return LegalStoreKind::None;
396 
397  // Avoid merging nontemporal stores.
399  return LegalStoreKind::None;
400 
401  Value *StoredVal = SI->getValueOperand();
402  Value *StorePtr = SI->getPointerOperand();
403 
404  // Reject stores that are so large that they overflow an unsigned.
405  uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
406  if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
407  return LegalStoreKind::None;
408 
409  // See if the pointer expression is an AddRec like {base,+,1} on the current
410  // loop, which indicates a strided store. If we have something else, it's a
411  // random store we can't handle.
412  const SCEVAddRecExpr *StoreEv =
413  dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
414  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
415  return LegalStoreKind::None;
416 
417  // Check to see if we have a constant stride.
418  if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
419  return LegalStoreKind::None;
420 
421  // See if the store can be turned into a memset.
422 
423  // If the stored value is a byte-wise value (like i32 -1), then it may be
424  // turned into a memset of i8 -1, assuming that all the consecutive bytes
425  // are stored. A store of i32 0x01020304 can never be turned into a memset,
426  // but it can be turned into memset_pattern if the target supports it.
427  Value *SplatValue = isBytewiseValue(StoredVal);
428  Constant *PatternValue = nullptr;
429 
430  // Note: memset and memset_pattern on unordered-atomic is yet not supported
431  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
432 
433  // If we're allowed to form a memset, and the stored value would be
434  // acceptable for memset, use it.
435  if (!UnorderedAtomic && HasMemset && SplatValue &&
436  // Verify that the stored value is loop invariant. If not, we can't
437  // promote the memset.
438  CurLoop->isLoopInvariant(SplatValue)) {
439  // It looks like we can use SplatValue.
440  return LegalStoreKind::Memset;
441  } else if (!UnorderedAtomic && HasMemsetPattern &&
442  // Don't create memset_pattern16s with address spaces.
443  StorePtr->getType()->getPointerAddressSpace() == 0 &&
444  (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
445  // It looks like we can use PatternValue!
446  return LegalStoreKind::MemsetPattern;
447  }
448 
449  // Otherwise, see if the store can be turned into a memcpy.
450  if (HasMemcpy) {
451  // Check to see if the stride matches the size of the store. If so, then we
452  // know that every byte is touched in the loop.
453  APInt Stride = getStoreStride(StoreEv);
454  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
455  if (StoreSize != Stride && StoreSize != -Stride)
456  return LegalStoreKind::None;
457 
458  // The store must be feeding a non-volatile load.
460 
461  // Only allow non-volatile loads
462  if (!LI || LI->isVolatile())
463  return LegalStoreKind::None;
464  // Only allow simple or unordered-atomic loads
465  if (!LI->isUnordered())
466  return LegalStoreKind::None;
467 
468  // See if the pointer expression is an AddRec like {base,+,1} on the current
469  // loop, which indicates a strided load. If we have something else, it's a
470  // random load we can't handle.
471  const SCEVAddRecExpr *LoadEv =
473  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
474  return LegalStoreKind::None;
475 
476  // The store and load must share the same stride.
477  if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
478  return LegalStoreKind::None;
479 
480  // Success. This store can be converted into a memcpy.
481  UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
482  return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
483  : LegalStoreKind::Memcpy;
484  }
485  // This store can't be transformed into a memset/memcpy.
486  return LegalStoreKind::None;
487 }
488 
489 void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
490  StoreRefsForMemset.clear();
491  StoreRefsForMemsetPattern.clear();
492  StoreRefsForMemcpy.clear();
493  for (Instruction &I : *BB) {
494  StoreInst *SI = dyn_cast<StoreInst>(&I);
495  if (!SI)
496  continue;
497 
498  // Make sure this is a strided store with a constant stride.
499  switch (isLegalStore(SI)) {
501  // Nothing to do
502  break;
503  case LegalStoreKind::Memset: {
504  // Find the base pointer.
505  Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
506  StoreRefsForMemset[Ptr].push_back(SI);
507  } break;
508  case LegalStoreKind::MemsetPattern: {
509  // Find the base pointer.
510  Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
511  StoreRefsForMemsetPattern[Ptr].push_back(SI);
512  } break;
513  case LegalStoreKind::Memcpy:
514  case LegalStoreKind::UnorderedAtomicMemcpy:
515  StoreRefsForMemcpy.push_back(SI);
516  break;
517  default:
518  assert(false && "unhandled return value");
519  break;
520  }
521  }
522 }
523 
524 /// runOnLoopBlock - Process the specified block, which lives in a counted loop
525 /// with the specified backedge count. This block is known to be in the current
526 /// loop and not in any subloops.
527 bool LoopIdiomRecognize::runOnLoopBlock(
528  BasicBlock *BB, const SCEV *BECount,
529  SmallVectorImpl<BasicBlock *> &ExitBlocks) {
530  // We can only promote stores in this block if they are unconditionally
531  // executed in the loop. For a block to be unconditionally executed, it has
532  // to dominate all the exit blocks of the loop. Verify this now.
533  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
534  if (!DT->dominates(BB, ExitBlocks[i]))
535  return false;
536 
537  bool MadeChange = false;
538  // Look for store instructions, which may be optimized to memset/memcpy.
539  collectStores(BB);
540 
541  // Look for a single store or sets of stores with a common base, which can be
542  // optimized into a memset (memset_pattern). The latter most commonly happens
543  // with structs and handunrolled loops.
544  for (auto &SL : StoreRefsForMemset)
545  MadeChange |= processLoopStores(SL.second, BECount, true);
546 
547  for (auto &SL : StoreRefsForMemsetPattern)
548  MadeChange |= processLoopStores(SL.second, BECount, false);
549 
550  // Optimize the store into a memcpy, if it feeds an similarly strided load.
551  for (auto &SI : StoreRefsForMemcpy)
552  MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
553 
554  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
555  Instruction *Inst = &*I++;
556  // Look for memset instructions, which may be optimized to a larger memset.
557  if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
558  WeakTrackingVH InstPtr(&*I);
559  if (!processLoopMemSet(MSI, BECount))
560  continue;
561  MadeChange = true;
562 
563  // If processing the memset invalidated our iterator, start over from the
564  // top of the block.
565  if (!InstPtr)
566  I = BB->begin();
567  continue;
568  }
569  }
570 
571  return MadeChange;
572 }
573 
574 /// processLoopStores - See if this store(s) can be promoted to a memset.
575 bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
576  const SCEV *BECount,
577  bool ForMemset) {
578  // Try to find consecutive stores that can be transformed into memsets.
579  SetVector<StoreInst *> Heads, Tails;
581 
582  // Do a quadratic search on all of the given stores and find
583  // all of the pairs of stores that follow each other.
584  SmallVector<unsigned, 16> IndexQueue;
585  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
586  assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
587 
588  Value *FirstStoredVal = SL[i]->getValueOperand();
589  Value *FirstStorePtr = SL[i]->getPointerOperand();
590  const SCEVAddRecExpr *FirstStoreEv =
591  cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
592  APInt FirstStride = getStoreStride(FirstStoreEv);
593  unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
594 
595  // See if we can optimize just this store in isolation.
596  if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
597  Heads.insert(SL[i]);
598  continue;
599  }
600 
601  Value *FirstSplatValue = nullptr;
602  Constant *FirstPatternValue = nullptr;
603 
604  if (ForMemset)
605  FirstSplatValue = isBytewiseValue(FirstStoredVal);
606  else
607  FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
608 
609  assert((FirstSplatValue || FirstPatternValue) &&
610  "Expected either splat value or pattern value.");
611 
612  IndexQueue.clear();
613  // If a store has multiple consecutive store candidates, search Stores
614  // array according to the sequence: from i+1 to e, then from i-1 to 0.
615  // This is because usually pairing with immediate succeeding or preceding
616  // candidate create the best chance to find memset opportunity.
617  unsigned j = 0;
618  for (j = i + 1; j < e; ++j)
619  IndexQueue.push_back(j);
620  for (j = i; j > 0; --j)
621  IndexQueue.push_back(j - 1);
622 
623  for (auto &k : IndexQueue) {
624  assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
625  Value *SecondStorePtr = SL[k]->getPointerOperand();
626  const SCEVAddRecExpr *SecondStoreEv =
627  cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
628  APInt SecondStride = getStoreStride(SecondStoreEv);
629 
630  if (FirstStride != SecondStride)
631  continue;
632 
633  Value *SecondStoredVal = SL[k]->getValueOperand();
634  Value *SecondSplatValue = nullptr;
635  Constant *SecondPatternValue = nullptr;
636 
637  if (ForMemset)
638  SecondSplatValue = isBytewiseValue(SecondStoredVal);
639  else
640  SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
641 
642  assert((SecondSplatValue || SecondPatternValue) &&
643  "Expected either splat value or pattern value.");
644 
645  if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
646  if (ForMemset) {
647  if (FirstSplatValue != SecondSplatValue)
648  continue;
649  } else {
650  if (FirstPatternValue != SecondPatternValue)
651  continue;
652  }
653  Tails.insert(SL[k]);
654  Heads.insert(SL[i]);
655  ConsecutiveChain[SL[i]] = SL[k];
656  break;
657  }
658  }
659  }
660 
661  // We may run into multiple chains that merge into a single chain. We mark the
662  // stores that we transformed so that we don't visit the same store twice.
663  SmallPtrSet<Value *, 16> TransformedStores;
664  bool Changed = false;
665 
666  // For stores that start but don't end a link in the chain:
667  for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
668  it != e; ++it) {
669  if (Tails.count(*it))
670  continue;
671 
672  // We found a store instr that starts a chain. Now follow the chain and try
673  // to transform it.
674  SmallPtrSet<Instruction *, 8> AdjacentStores;
675  StoreInst *I = *it;
676 
677  StoreInst *HeadStore = I;
678  unsigned StoreSize = 0;
679 
680  // Collect the chain into a list.
681  while (Tails.count(I) || Heads.count(I)) {
682  if (TransformedStores.count(I))
683  break;
684  AdjacentStores.insert(I);
685 
686  StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
687  // Move to the next value in the chain.
688  I = ConsecutiveChain[I];
689  }
690 
691  Value *StoredVal = HeadStore->getValueOperand();
692  Value *StorePtr = HeadStore->getPointerOperand();
693  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
694  APInt Stride = getStoreStride(StoreEv);
695 
696  // Check to see if the stride matches the size of the stores. If so, then
697  // we know that every byte is touched in the loop.
698  if (StoreSize != Stride && StoreSize != -Stride)
699  continue;
700 
701  bool NegStride = StoreSize == -Stride;
702 
703  if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(),
704  StoredVal, HeadStore, AdjacentStores, StoreEv,
705  BECount, NegStride)) {
706  TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
707  Changed = true;
708  }
709  }
710 
711  return Changed;
712 }
713 
714 /// processLoopMemSet - See if this memset can be promoted to a large memset.
715 bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
716  const SCEV *BECount) {
717  // We can only handle non-volatile memsets with a constant size.
718  if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
719  return false;
720 
721  // If we're not allowed to hack on memset, we fail.
722  if (!HasMemset)
723  return false;
724 
725  Value *Pointer = MSI->getDest();
726 
727  // See if the pointer expression is an AddRec like {base,+,1} on the current
728  // loop, which indicates a strided store. If we have something else, it's a
729  // random store we can't handle.
730  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
731  if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
732  return false;
733 
734  // Reject memsets that are so large that they overflow an unsigned.
735  uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
736  if ((SizeInBytes >> 32) != 0)
737  return false;
738 
739  // Check to see if the stride matches the size of the memset. If so, then we
740  // know that every byte is touched in the loop.
741  const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
742  if (!ConstStride)
743  return false;
744 
745  APInt Stride = ConstStride->getAPInt();
746  if (SizeInBytes != Stride && SizeInBytes != -Stride)
747  return false;
748 
749  // Verify that the memset value is loop invariant. If not, we can't promote
750  // the memset.
751  Value *SplatValue = MSI->getValue();
752  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
753  return false;
754 
756  MSIs.insert(MSI);
757  bool NegStride = SizeInBytes == -Stride;
758  return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
759  MSI->getDestAlignment(), SplatValue, MSI, MSIs,
760  Ev, BECount, NegStride, /*IsLoopMemset=*/true);
761 }
762 
763 /// mayLoopAccessLocation - Return true if the specified loop might access the
764 /// specified pointer location, which is a loop-strided access. The 'Access'
765 /// argument specifies what the verboten forms of access are (read or write).
766 static bool
768  const SCEV *BECount, unsigned StoreSize,
769  AliasAnalysis &AA,
770  SmallPtrSetImpl<Instruction *> &IgnoredStores) {
771  // Get the location that may be stored across the loop. Since the access is
772  // strided positively through memory, we say that the modified location starts
773  // at the pointer and has infinite size.
774  uint64_t AccessSize = MemoryLocation::UnknownSize;
775 
776  // If the loop iterates a fixed number of times, we can refine the access size
777  // to be exactly the size of the memset, which is (BECount+1)*StoreSize
778  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
779  AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize;
780 
781  // TODO: For this to be really effective, we have to dive into the pointer
782  // operand in the store. Store to &A[i] of 100 will always return may alias
783  // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
784  // which will then no-alias a store to &A[100].
785  MemoryLocation StoreLoc(Ptr, AccessSize);
786 
787  for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
788  ++BI)
789  for (Instruction &I : **BI)
790  if (IgnoredStores.count(&I) == 0 &&
792  intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
793  return true;
794 
795  return false;
796 }
797 
798 // If we have a negative stride, Start refers to the end of the memory location
799 // we're trying to memset. Therefore, we need to recompute the base pointer,
800 // which is just Start - BECount*Size.
801 static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
802  Type *IntPtr, unsigned StoreSize,
803  ScalarEvolution *SE) {
804  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
805  if (StoreSize != 1)
806  Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
807  SCEV::FlagNUW);
808  return SE->getMinusSCEV(Start, Index);
809 }
810 
811 /// Compute the number of bytes as a SCEV from the backedge taken count.
812 ///
813 /// This also maps the SCEV into the provided type and tries to handle the
814 /// computation in a way that will fold cleanly.
815 static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
816  unsigned StoreSize, Loop *CurLoop,
817  const DataLayout *DL, ScalarEvolution *SE) {
818  const SCEV *NumBytesS;
819  // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
820  // pointer size if it isn't already.
821  //
822  // If we're going to need to zero extend the BE count, check if we can add
823  // one to it prior to zero extending without overflow. Provided this is safe,
824  // it allows better simplification of the +1.
825  if (DL->getTypeSizeInBits(BECount->getType()) <
826  DL->getTypeSizeInBits(IntPtr) &&
828  CurLoop, ICmpInst::ICMP_NE, BECount,
829  SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
830  NumBytesS = SE->getZeroExtendExpr(
831  SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
832  IntPtr);
833  } else {
834  NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
835  SE->getOne(IntPtr), SCEV::FlagNUW);
836  }
837 
838  // And scale it based on the store size.
839  if (StoreSize != 1) {
840  NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
841  SCEV::FlagNUW);
842  }
843  return NumBytesS;
844 }
845 
846 /// processLoopStridedStore - We see a strided store of some value. If we can
847 /// transform this into a memset or memset_pattern in the loop preheader, do so.
848 bool LoopIdiomRecognize::processLoopStridedStore(
849  Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment,
850  Value *StoredVal, Instruction *TheStore,
852  const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
853  Value *SplatValue = isBytewiseValue(StoredVal);
854  Constant *PatternValue = nullptr;
855 
856  if (!SplatValue)
857  PatternValue = getMemSetPatternValue(StoredVal, DL);
858 
859  assert((SplatValue || PatternValue) &&
860  "Expected either splat value or pattern value.");
861 
862  // The trip count of the loop and the base pointer of the addrec SCEV is
863  // guaranteed to be loop invariant, which means that it should dominate the
864  // header. This allows us to insert code for it in the preheader.
865  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
866  BasicBlock *Preheader = CurLoop->getLoopPreheader();
867  IRBuilder<> Builder(Preheader->getTerminator());
868  SCEVExpander Expander(*SE, *DL, "loop-idiom");
869 
870  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
871  Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS);
872 
873  const SCEV *Start = Ev->getStart();
874  // Handle negative strided loops.
875  if (NegStride)
876  Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE);
877 
878  // TODO: ideally we should still be able to generate memset if SCEV expander
879  // is taught to generate the dependencies at the latest point.
880  if (!isSafeToExpand(Start, *SE))
881  return false;
882 
883  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
884  // this into a memset in the loop preheader now if we want. However, this
885  // would be unsafe to do if there is anything else in the loop that may read
886  // or write to the aliased location. Check for any overlap by generating the
887  // base pointer and checking the region.
888  Value *BasePtr =
889  Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
890  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
891  StoreSize, *AA, Stores)) {
892  Expander.clear();
893  // If we generated new code for the base pointer, clean up.
895  return false;
896  }
897 
898  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
899  return false;
900 
901  // Okay, everything looks good, insert the memset.
902 
903  const SCEV *NumBytesS =
904  getNumBytes(BECount, IntPtr, StoreSize, CurLoop, DL, SE);
905 
906  // TODO: ideally we should still be able to generate memset if SCEV expander
907  // is taught to generate the dependencies at the latest point.
908  if (!isSafeToExpand(NumBytesS, *SE))
909  return false;
910 
911  Value *NumBytes =
912  Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
913 
914  CallInst *NewCall;
915  if (SplatValue) {
916  NewCall =
917  Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment);
918  } else {
919  // Everything is emitted in default address space
920  Type *Int8PtrTy = DestInt8PtrTy;
921 
922  Module *M = TheStore->getModule();
923  Value *MSP =
924  M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(),
925  Int8PtrTy, Int8PtrTy, IntPtr);
926  inferLibFuncAttributes(*M->getFunction("memset_pattern16"), *TLI);
927 
928  // Otherwise we should form a memset_pattern16. PatternValue is known to be
929  // an constant array of 16-bytes. Plop the value into a mergable global.
930  GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
932  PatternValue, ".memset_pattern");
933  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
934  GV->setAlignment(16);
935  Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
936  NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
937  }
938 
939  LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
940  << " from store to: " << *Ev << " at: " << *TheStore
941  << "\n");
942  NewCall->setDebugLoc(TheStore->getDebugLoc());
943 
944  // Okay, the memset has been formed. Zap the original store and anything that
945  // feeds into it.
946  for (auto *I : Stores)
948  ++NumMemSet;
949  return true;
950 }
951 
952 /// If the stored value is a strided load in the same loop with the same stride
953 /// this may be transformable into a memcpy. This kicks in for stuff like
954 /// for (i) A[i] = B[i];
955 bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
956  const SCEV *BECount) {
957  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
958 
959  Value *StorePtr = SI->getPointerOperand();
960  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
961  APInt Stride = getStoreStride(StoreEv);
962  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
963  bool NegStride = StoreSize == -Stride;
964 
965  // The store must be feeding a non-volatile load.
966  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
967  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
968 
969  // See if the pointer expression is an AddRec like {base,+,1} on the current
970  // loop, which indicates a strided load. If we have something else, it's a
971  // random load we can't handle.
972  const SCEVAddRecExpr *LoadEv =
973  cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
974 
975  // The trip count of the loop and the base pointer of the addrec SCEV is
976  // guaranteed to be loop invariant, which means that it should dominate the
977  // header. This allows us to insert code for it in the preheader.
978  BasicBlock *Preheader = CurLoop->getLoopPreheader();
979  IRBuilder<> Builder(Preheader->getTerminator());
980  SCEVExpander Expander(*SE, *DL, "loop-idiom");
981 
982  const SCEV *StrStart = StoreEv->getStart();
983  unsigned StrAS = SI->getPointerAddressSpace();
984  Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS);
985 
986  // Handle negative strided loops.
987  if (NegStride)
988  StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE);
989 
990  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
991  // this into a memcpy in the loop preheader now if we want. However, this
992  // would be unsafe to do if there is anything else in the loop that may read
993  // or write the memory region we're storing to. This includes the load that
994  // feeds the stores. Check for an alias by generating the base address and
995  // checking everything.
996  Value *StoreBasePtr = Expander.expandCodeFor(
997  StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
998 
1000  Stores.insert(SI);
1001  if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1002  StoreSize, *AA, Stores)) {
1003  Expander.clear();
1004  // If we generated new code for the base pointer, clean up.
1005  RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
1006  return false;
1007  }
1008 
1009  const SCEV *LdStart = LoadEv->getStart();
1010  unsigned LdAS = LI->getPointerAddressSpace();
1011 
1012  // Handle negative strided loops.
1013  if (NegStride)
1014  LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE);
1015 
1016  // For a memcpy, we have to make sure that the input array is not being
1017  // mutated by the loop.
1018  Value *LoadBasePtr = Expander.expandCodeFor(
1019  LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
1020 
1021  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
1022  StoreSize, *AA, Stores)) {
1023  Expander.clear();
1024  // If we generated new code for the base pointer, clean up.
1026  RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
1027  return false;
1028  }
1029 
1030  if (avoidLIRForMultiBlockLoop())
1031  return false;
1032 
1033  // Okay, everything is safe, we can transform this!
1034 
1035  const SCEV *NumBytesS =
1036  getNumBytes(BECount, IntPtrTy, StoreSize, CurLoop, DL, SE);
1037 
1038  Value *NumBytes =
1039  Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
1040 
1041  CallInst *NewCall = nullptr;
1042  // Check whether to generate an unordered atomic memcpy:
1043  // If the load or store are atomic, then they must necessarily be unordered
1044  // by previous checks.
1045  if (!SI->isAtomic() && !LI->isAtomic())
1046  NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlignment(),
1047  LoadBasePtr, LI->getAlignment(), NumBytes);
1048  else {
1049  // We cannot allow unaligned ops for unordered load/store, so reject
1050  // anything where the alignment isn't at least the element size.
1051  unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
1052  if (Align < StoreSize)
1053  return false;
1054 
1055  // If the element.atomic memcpy is not lowered into explicit
1056  // loads/stores later, then it will be lowered into an element-size
1057  // specific lib call. If the lib call doesn't exist for our store size, then
1058  // we shouldn't generate the memcpy.
1059  if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1060  return false;
1061 
1062  // Create the call.
1063  // Note that unordered atomic loads/stores are *required* by the spec to
1064  // have an alignment but non-atomic loads/stores may not.
1065  NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1066  StoreBasePtr, SI->getAlignment(), LoadBasePtr, LI->getAlignment(),
1067  NumBytes, StoreSize);
1068  }
1069  NewCall->setDebugLoc(SI->getDebugLoc());
1070 
1071  LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
1072  << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
1073  << " from store ptr=" << *StoreEv << " at: " << *SI
1074  << "\n");
1075 
1076  // Okay, the memcpy has been formed. Zap the original store and anything that
1077  // feeds into it.
1079  ++NumMemCpy;
1080  return true;
1081 }
1082 
1083 // When compiling for codesize we avoid idiom recognition for a multi-block loop
1084 // unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1085 //
1086 bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1087  bool IsLoopMemset) {
1088  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1089  if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) {
1090  LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
1091  << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
1092  << " avoided: multi-block top-level loop\n");
1093  return true;
1094  }
1095  }
1096 
1097  return false;
1098 }
1099 
1100 bool LoopIdiomRecognize::runOnNoncountableLoop() {
1101  return recognizePopcount() || recognizeAndInsertCTLZ();
1102 }
1103 
1104 /// Check if the given conditional branch is based on the comparison between
1105 /// a variable and zero, and if the variable is non-zero, the control yields to
1106 /// the loop entry. If the branch matches the behavior, the variable involved
1107 /// in the comparison is returned. This function will be called to see if the
1108 /// precondition and postcondition of the loop are in desirable form.
1109 static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) {
1110  if (!BI || !BI->isConditional())
1111  return nullptr;
1112 
1113  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1114  if (!Cond)
1115  return nullptr;
1116 
1117  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1118  if (!CmpZero || !CmpZero->isZero())
1119  return nullptr;
1120 
1121  ICmpInst::Predicate Pred = Cond->getPredicate();
1122  if ((Pred == ICmpInst::ICMP_NE && BI->getSuccessor(0) == LoopEntry) ||
1123  (Pred == ICmpInst::ICMP_EQ && BI->getSuccessor(1) == LoopEntry))
1124  return Cond->getOperand(0);
1125 
1126  return nullptr;
1127 }
1128 
1129 // Check if the recurrence variable `VarX` is in the right form to create
1130 // the idiom. Returns the value coerced to a PHINode if so.
1132  BasicBlock *LoopEntry) {
1133  auto *PhiX = dyn_cast<PHINode>(VarX);
1134  if (PhiX && PhiX->getParent() == LoopEntry &&
1135  (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1136  return PhiX;
1137  return nullptr;
1138 }
1139 
1140 /// Return true iff the idiom is detected in the loop.
1141 ///
1142 /// Additionally:
1143 /// 1) \p CntInst is set to the instruction counting the population bit.
1144 /// 2) \p CntPhi is set to the corresponding phi node.
1145 /// 3) \p Var is set to the value whose population bits are being counted.
1146 ///
1147 /// The core idiom we are trying to detect is:
1148 /// \code
1149 /// if (x0 != 0)
1150 /// goto loop-exit // the precondition of the loop
1151 /// cnt0 = init-val;
1152 /// do {
1153 /// x1 = phi (x0, x2);
1154 /// cnt1 = phi(cnt0, cnt2);
1155 ///
1156 /// cnt2 = cnt1 + 1;
1157 /// ...
1158 /// x2 = x1 & (x1 - 1);
1159 /// ...
1160 /// } while(x != 0);
1161 ///
1162 /// loop-exit:
1163 /// \endcode
1164 static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1165  Instruction *&CntInst, PHINode *&CntPhi,
1166  Value *&Var) {
1167  // step 1: Check to see if the look-back branch match this pattern:
1168  // "if (a!=0) goto loop-entry".
1169  BasicBlock *LoopEntry;
1170  Instruction *DefX2, *CountInst;
1171  Value *VarX1, *VarX0;
1172  PHINode *PhiX, *CountPhi;
1173 
1174  DefX2 = CountInst = nullptr;
1175  VarX1 = VarX0 = nullptr;
1176  PhiX = CountPhi = nullptr;
1177  LoopEntry = *(CurLoop->block_begin());
1178 
1179  // step 1: Check if the loop-back branch is in desirable form.
1180  {
1181  if (Value *T = matchCondition(
1182  dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1183  DefX2 = dyn_cast<Instruction>(T);
1184  else
1185  return false;
1186  }
1187 
1188  // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1189  {
1190  if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1191  return false;
1192 
1193  BinaryOperator *SubOneOp;
1194 
1195  if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1196  VarX1 = DefX2->getOperand(1);
1197  else {
1198  VarX1 = DefX2->getOperand(0);
1199  SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1200  }
1201  if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1202  return false;
1203 
1204  ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1205  if (!Dec ||
1206  !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1207  (SubOneOp->getOpcode() == Instruction::Add &&
1208  Dec->isMinusOne()))) {
1209  return false;
1210  }
1211  }
1212 
1213  // step 3: Check the recurrence of variable X
1214  PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1215  if (!PhiX)
1216  return false;
1217 
1218  // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
1219  {
1220  CountInst = nullptr;
1221  for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1222  IterE = LoopEntry->end();
1223  Iter != IterE; Iter++) {
1224  Instruction *Inst = &*Iter;
1225  if (Inst->getOpcode() != Instruction::Add)
1226  continue;
1227 
1228  ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1229  if (!Inc || !Inc->isOne())
1230  continue;
1231 
1232  PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1233  if (!Phi)
1234  continue;
1235 
1236  // Check if the result of the instruction is live of the loop.
1237  bool LiveOutLoop = false;
1238  for (User *U : Inst->users()) {
1239  if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1240  LiveOutLoop = true;
1241  break;
1242  }
1243  }
1244 
1245  if (LiveOutLoop) {
1246  CountInst = Inst;
1247  CountPhi = Phi;
1248  break;
1249  }
1250  }
1251 
1252  if (!CountInst)
1253  return false;
1254  }
1255 
1256  // step 5: check if the precondition is in this form:
1257  // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1258  {
1259  auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1260  Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1261  if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1262  return false;
1263 
1264  CntInst = CountInst;
1265  CntPhi = CountPhi;
1266  Var = T;
1267  }
1268 
1269  return true;
1270 }
1271 
1272 /// Return true if the idiom is detected in the loop.
1273 ///
1274 /// Additionally:
1275 /// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1276 /// or nullptr if there is no such.
1277 /// 2) \p CntPhi is set to the corresponding phi node
1278 /// or nullptr if there is no such.
1279 /// 3) \p Var is set to the value whose CTLZ could be used.
1280 /// 4) \p DefX is set to the instruction calculating Loop exit condition.
1281 ///
1282 /// The core idiom we are trying to detect is:
1283 /// \code
1284 /// if (x0 == 0)
1285 /// goto loop-exit // the precondition of the loop
1286 /// cnt0 = init-val;
1287 /// do {
1288 /// x = phi (x0, x.next); //PhiX
1289 /// cnt = phi(cnt0, cnt.next);
1290 ///
1291 /// cnt.next = cnt + 1;
1292 /// ...
1293 /// x.next = x >> 1; // DefX
1294 /// ...
1295 /// } while(x.next != 0);
1296 ///
1297 /// loop-exit:
1298 /// \endcode
1299 static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
1300  Instruction *&CntInst, PHINode *&CntPhi,
1301  Instruction *&DefX) {
1302  BasicBlock *LoopEntry;
1303  Value *VarX = nullptr;
1304 
1305  DefX = nullptr;
1306  PhiX = nullptr;
1307  CntInst = nullptr;
1308  CntPhi = nullptr;
1309  LoopEntry = *(CurLoop->block_begin());
1310 
1311  // step 1: Check if the loop-back branch is in desirable form.
1312  if (Value *T = matchCondition(
1313  dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1314  DefX = dyn_cast<Instruction>(T);
1315  else
1316  return false;
1317 
1318  // step 2: detect instructions corresponding to "x.next = x >> 1"
1319  // TODO: Support loops that use LShr.
1320  if (!DefX || DefX->getOpcode() != Instruction::AShr)
1321  return false;
1322  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1323  if (!Shft || !Shft->isOne())
1324  return false;
1325  VarX = DefX->getOperand(0);
1326 
1327  // step 3: Check the recurrence of variable X
1328  PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1329  if (!PhiX)
1330  return false;
1331 
1332  // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
1333  // TODO: We can skip the step. If loop trip count is known (CTLZ),
1334  // then all uses of "cnt.next" could be optimized to the trip count
1335  // plus "cnt0". Currently it is not optimized.
1336  // This step could be used to detect POPCNT instruction:
1337  // cnt.next = cnt + (x.next & 1)
1338  for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1339  IterE = LoopEntry->end();
1340  Iter != IterE; Iter++) {
1341  Instruction *Inst = &*Iter;
1342  if (Inst->getOpcode() != Instruction::Add)
1343  continue;
1344 
1345  ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1346  if (!Inc || !Inc->isOne())
1347  continue;
1348 
1349  PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1350  if (!Phi)
1351  continue;
1352 
1353  CntInst = Inst;
1354  CntPhi = Phi;
1355  break;
1356  }
1357  if (!CntInst)
1358  return false;
1359 
1360  return true;
1361 }
1362 
1363 /// Recognize CTLZ idiom in a non-countable loop and convert the loop
1364 /// to countable (with CTLZ trip count).
1365 /// If CTLZ inserted as a new trip count returns true; otherwise, returns false.
1366 bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
1367  // Give up if the loop has multiple blocks or multiple backedges.
1368  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1369  return false;
1370 
1371  Instruction *CntInst, *DefX;
1372  PHINode *CntPhi, *PhiX;
1373  if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX))
1374  return false;
1375 
1376  bool IsCntPhiUsedOutsideLoop = false;
1377  for (User *U : CntPhi->users())
1378  if (!CurLoop->contains(cast<Instruction>(U))) {
1379  IsCntPhiUsedOutsideLoop = true;
1380  break;
1381  }
1382  bool IsCntInstUsedOutsideLoop = false;
1383  for (User *U : CntInst->users())
1384  if (!CurLoop->contains(cast<Instruction>(U))) {
1385  IsCntInstUsedOutsideLoop = true;
1386  break;
1387  }
1388  // If both CntInst and CntPhi are used outside the loop the profitability
1389  // is questionable.
1390  if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1391  return false;
1392 
1393  // For some CPUs result of CTLZ(X) intrinsic is undefined
1394  // when X is 0. If we can not guarantee X != 0, we need to check this
1395  // when expand.
1396  bool ZeroCheck = false;
1397  // It is safe to assume Preheader exist as it was checked in
1398  // parent function RunOnLoop.
1399  BasicBlock *PH = CurLoop->getLoopPreheader();
1400  Value *InitX = PhiX->getIncomingValueForBlock(PH);
1401 
1402  // Make sure the initial value can't be negative otherwise the ashr in the
1403  // loop might never reach zero which would make the loop infinite.
1404  // TODO: Support loops that use lshr and wouldn't need this check.
1405  if (!isKnownNonNegative(InitX, *DL))
1406  return false;
1407 
1408  // If we check X != 0 before entering the loop we don't need a zero
1409  // check in CTLZ intrinsic, but only if Cnt Phi is not used outside of the
1410  // loop (if it is used we count CTLZ(X >> 1)).
1411  if (!IsCntPhiUsedOutsideLoop)
1412  if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
1413  if (BranchInst *PreCondBr =
1414  dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
1415  if (matchCondition(PreCondBr, PH) == InitX)
1416  ZeroCheck = true;
1417  }
1418 
1419  // Check if CTLZ intrinsic is profitable. Assume it is always profitable
1420  // if we delete the loop (the loop has only 6 instructions):
1421  // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1422  // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1423  // %shr = ashr %n.addr.0, 1
1424  // %tobool = icmp eq %shr, 0
1425  // %inc = add nsw %i.0, 1
1426  // br i1 %tobool
1427 
1428  const Value *Args[] =
1429  {InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
1430  : ConstantInt::getFalse(InitX->getContext())};
1431  if (CurLoop->getHeader()->size() != 6 &&
1432  TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
1434  return false;
1435 
1436  const DebugLoc DL = DefX->getDebugLoc();
1437  transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck,
1438  IsCntPhiUsedOutsideLoop);
1439  return true;
1440 }
1441 
1442 /// Recognizes a population count idiom in a non-countable loop.
1443 ///
1444 /// If detected, transforms the relevant code to issue the popcount intrinsic
1445 /// function call, and returns true; otherwise, returns false.
1446 bool LoopIdiomRecognize::recognizePopcount() {
1448  return false;
1449 
1450  // Counting population are usually conducted by few arithmetic instructions.
1451  // Such instructions can be easily "absorbed" by vacant slots in a
1452  // non-compact loop. Therefore, recognizing popcount idiom only makes sense
1453  // in a compact loop.
1454 
1455  // Give up if the loop has multiple blocks or multiple backedges.
1456  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1457  return false;
1458 
1459  BasicBlock *LoopBody = *(CurLoop->block_begin());
1460  if (LoopBody->size() >= 20) {
1461  // The loop is too big, bail out.
1462  return false;
1463  }
1464 
1465  // It should have a preheader containing nothing but an unconditional branch.
1466  BasicBlock *PH = CurLoop->getLoopPreheader();
1467  if (!PH || &PH->front() != PH->getTerminator())
1468  return false;
1469  auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1470  if (!EntryBI || EntryBI->isConditional())
1471  return false;
1472 
1473  // It should have a precondition block where the generated popcount intrinsic
1474  // function can be inserted.
1475  auto *PreCondBB = PH->getSinglePredecessor();
1476  if (!PreCondBB)
1477  return false;
1478  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1479  if (!PreCondBI || PreCondBI->isUnconditional())
1480  return false;
1481 
1482  Instruction *CntInst;
1483  PHINode *CntPhi;
1484  Value *Val;
1485  if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1486  return false;
1487 
1488  transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1489  return true;
1490 }
1491 
1493  const DebugLoc &DL) {
1494  Value *Ops[] = {Val};
1495  Type *Tys[] = {Val->getType()};
1496 
1497  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1498  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1499  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1500  CI->setDebugLoc(DL);
1501 
1502  return CI;
1503 }
1504 
1506  const DebugLoc &DL, bool ZeroCheck) {
1507  Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
1508  Type *Tys[] = {Val->getType()};
1509 
1510  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1511  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys);
1512  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1513  CI->setDebugLoc(DL);
1514 
1515  return CI;
1516 }
1517 
1518 /// Transform the following loop:
1519 /// loop:
1520 /// CntPhi = PHI [Cnt0, CntInst]
1521 /// PhiX = PHI [InitX, DefX]
1522 /// CntInst = CntPhi + 1
1523 /// DefX = PhiX >> 1
1524 /// LOOP_BODY
1525 /// Br: loop if (DefX != 0)
1526 /// Use(CntPhi) or Use(CntInst)
1527 ///
1528 /// Into:
1529 /// If CntPhi used outside the loop:
1530 /// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
1531 /// Count = CountPrev + 1
1532 /// else
1533 /// Count = BitWidth(InitX) - CTLZ(InitX)
1534 /// loop:
1535 /// CntPhi = PHI [Cnt0, CntInst]
1536 /// PhiX = PHI [InitX, DefX]
1537 /// PhiCount = PHI [Count, Dec]
1538 /// CntInst = CntPhi + 1
1539 /// DefX = PhiX >> 1
1540 /// Dec = PhiCount - 1
1541 /// LOOP_BODY
1542 /// Br: loop if (Dec != 0)
1543 /// Use(CountPrev + Cnt0) // Use(CntPhi)
1544 /// or
1545 /// Use(Count + Cnt0) // Use(CntInst)
1546 ///
1547 /// If LOOP_BODY is empty the loop will be deleted.
1548 /// If CntInst and DefX are not used in LOOP_BODY they will be removed.
1549 void LoopIdiomRecognize::transformLoopToCountable(
1550  BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX,
1551  const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
1552  BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
1553 
1554  // Step 1: Insert the CTLZ instruction at the end of the preheader block
1555  // Count = BitWidth - CTLZ(InitX);
1556  // If there are uses of CntPhi create:
1557  // CountPrev = BitWidth - CTLZ(InitX >> 1);
1558  IRBuilder<> Builder(PreheaderBr);
1559  Builder.SetCurrentDebugLocation(DL);
1560  Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext;
1561 
1562  if (IsCntPhiUsedOutsideLoop)
1563  InitXNext = Builder.CreateAShr(InitX,
1564  ConstantInt::get(InitX->getType(), 1));
1565  else
1566  InitXNext = InitX;
1567  CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck);
1568  Count = Builder.CreateSub(
1569  ConstantInt::get(CTLZ->getType(),
1570  CTLZ->getType()->getIntegerBitWidth()),
1571  CTLZ);
1572  if (IsCntPhiUsedOutsideLoop) {
1573  CountPrev = Count;
1574  Count = Builder.CreateAdd(
1575  CountPrev,
1576  ConstantInt::get(CountPrev->getType(), 1));
1577  }
1578  if (IsCntPhiUsedOutsideLoop)
1579  NewCount = Builder.CreateZExtOrTrunc(CountPrev,
1580  cast<IntegerType>(CntInst->getType()));
1581  else
1582  NewCount = Builder.CreateZExtOrTrunc(Count,
1583  cast<IntegerType>(CntInst->getType()));
1584 
1585  // If the CTLZ counter's initial value is not zero, insert Add Inst.
1586  Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
1587  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1588  if (!InitConst || !InitConst->isZero())
1589  NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1590 
1591  // Step 2: Insert new IV and loop condition:
1592  // loop:
1593  // ...
1594  // PhiCount = PHI [Count, Dec]
1595  // ...
1596  // Dec = PhiCount - 1
1597  // ...
1598  // Br: loop if (Dec != 0)
1599  BasicBlock *Body = *(CurLoop->block_begin());
1600  auto *LbBr = cast<BranchInst>(Body->getTerminator());
1601  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1602  Type *Ty = Count->getType();
1603 
1604  PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1605 
1606  Builder.SetInsertPoint(LbCond);
1607  Instruction *TcDec = cast<Instruction>(
1608  Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1609  "tcdec", false, true));
1610 
1611  TcPhi->addIncoming(Count, Preheader);
1612  TcPhi->addIncoming(TcDec, Body);
1613 
1614  CmpInst::Predicate Pred =
1615  (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
1616  LbCond->setPredicate(Pred);
1617  LbCond->setOperand(0, TcDec);
1618  LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1619 
1620  // Step 3: All the references to the original counter outside
1621  // the loop are replaced with the NewCount -- the value returned from
1622  // __builtin_ctlz(x).
1623  if (IsCntPhiUsedOutsideLoop)
1624  CntPhi->replaceUsesOutsideBlock(NewCount, Body);
1625  else
1626  CntInst->replaceUsesOutsideBlock(NewCount, Body);
1627 
1628  // step 4: Forget the "non-computable" trip-count SCEV associated with the
1629  // loop. The loop would otherwise not be deleted even if it becomes empty.
1630  SE->forgetLoop(CurLoop);
1631 }
1632 
1633 void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
1634  Instruction *CntInst,
1635  PHINode *CntPhi, Value *Var) {
1636  BasicBlock *PreHead = CurLoop->getLoopPreheader();
1637  auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
1638  const DebugLoc DL = CntInst->getDebugLoc();
1639 
1640  // Assuming before transformation, the loop is following:
1641  // if (x) // the precondition
1642  // do { cnt++; x &= x - 1; } while(x);
1643 
1644  // Step 1: Insert the ctpop instruction at the end of the precondition block
1645  IRBuilder<> Builder(PreCondBr);
1646  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
1647  {
1648  PopCnt = createPopcntIntrinsic(Builder, Var, DL);
1649  NewCount = PopCntZext =
1650  Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
1651 
1652  if (NewCount != PopCnt)
1653  (cast<Instruction>(NewCount))->setDebugLoc(DL);
1654 
1655  // TripCnt is exactly the number of iterations the loop has
1656  TripCnt = NewCount;
1657 
1658  // If the population counter's initial value is not zero, insert Add Inst.
1659  Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
1660  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1661  if (!InitConst || !InitConst->isZero()) {
1662  NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1663  (cast<Instruction>(NewCount))->setDebugLoc(DL);
1664  }
1665  }
1666 
1667  // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
1668  // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
1669  // function would be partial dead code, and downstream passes will drag
1670  // it back from the precondition block to the preheader.
1671  {
1672  ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
1673 
1674  Value *Opnd0 = PopCntZext;
1675  Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
1676  if (PreCond->getOperand(0) != Var)
1677  std::swap(Opnd0, Opnd1);
1678 
1679  ICmpInst *NewPreCond = cast<ICmpInst>(
1680  Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
1681  PreCondBr->setCondition(NewPreCond);
1682 
1684  }
1685 
1686  // Step 3: Note that the population count is exactly the trip count of the
1687  // loop in question, which enable us to convert the loop from noncountable
1688  // loop into a countable one. The benefit is twofold:
1689  //
1690  // - If the loop only counts population, the entire loop becomes dead after
1691  // the transformation. It is a lot easier to prove a countable loop dead
1692  // than to prove a noncountable one. (In some C dialects, an infinite loop
1693  // isn't dead even if it computes nothing useful. In general, DCE needs
1694  // to prove a noncountable loop finite before safely delete it.)
1695  //
1696  // - If the loop also performs something else, it remains alive.
1697  // Since it is transformed to countable form, it can be aggressively
1698  // optimized by some optimizations which are in general not applicable
1699  // to a noncountable loop.
1700  //
1701  // After this step, this loop (conceptually) would look like following:
1702  // newcnt = __builtin_ctpop(x);
1703  // t = newcnt;
1704  // if (x)
1705  // do { cnt++; x &= x-1; t--) } while (t > 0);
1706  BasicBlock *Body = *(CurLoop->block_begin());
1707  {
1708  auto *LbBr = cast<BranchInst>(Body->getTerminator());
1709  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1710  Type *Ty = TripCnt->getType();
1711 
1712  PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1713 
1714  Builder.SetInsertPoint(LbCond);
1715  Instruction *TcDec = cast<Instruction>(
1716  Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1717  "tcdec", false, true));
1718 
1719  TcPhi->addIncoming(TripCnt, PreHead);
1720  TcPhi->addIncoming(TcDec, Body);
1721 
1722  CmpInst::Predicate Pred =
1723  (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
1724  LbCond->setPredicate(Pred);
1725  LbCond->setOperand(0, TcDec);
1726  LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1727  }
1728 
1729  // Step 4: All the references to the original population counter outside
1730  // the loop are replaced with the NewCount -- the value returned from
1731  // __builtin_ctpop().
1732  CntInst->replaceUsesOutsideBlock(NewCount, Body);
1733 
1734  // step 5: Forget the "non-computable" trip-count SCEV associated with the
1735  // loop. The loop would otherwise not be deleted even if it becomes empty.
1736  SE->forgetLoop(CurLoop);
1737 }
The access may reference and may modify the value stored in memory.
Pass interface - Implemented by all &#39;passes&#39;.
Definition: Pass.h:81
uint64_t CallInst * C
Value * getValueOperand()
Definition: Instructions.h:395
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks &#39;this&#39; from the containing basic block and deletes it.
Definition: Instruction.cpp:68
void push_back(const T &Elt)
Definition: SmallVector.h:213
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:111
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:574
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1846
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
static CallInst * createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL)
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:1566
const SCEV * getConstant(ConstantInt *V)
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
BinaryOps getOpcode() const
Definition: InstrTypes.h:555
Value * isBytewiseValue(Value *V)
If the specified value can be set by repeating the same byte in memory, return the i8 value that it i...
bool isAtomic() const
Return true if this instruction has an AtomicOrdering of unordered or higher.
Constant * getOrInsertFunction(StringRef Name, FunctionType *T, AttributeList AttributeList)
Look up the specified function in the module symbol table.
Definition: Module.cpp:142
ArrayRef< BasicBlock *>::const_iterator block_iterator
Definition: LoopInfo.h:153
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:63
LLVM_ATTRIBUTE_ALWAYS_INLINE size_type size() const
Definition: SmallVector.h:137
bool isUnordered() const
Definition: Instructions.h:389
The main scalar evolution driver.
static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX, Instruction *&CntInst, PHINode *&CntPhi, Instruction *&DefX)
Return true if the idiom is detected in the loop.
This class represents a function call, abstracting a target machine&#39;s calling convention.
static PHINode * getRecurrenceVar(Value *VarX, Instruction *DefX, BasicBlock *LoopEntry)
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:106
Value * getValue() const
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:57
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:713
This class wraps the llvm.memset intrinsic.
BasicBlock * getSuccessor(unsigned i) const
STATISTIC(NumFunctions, "Total number of functions")
A debug info location.
Definition: DebugLoc.h:34
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:503
An instruction for reading from memory.
Definition: Instructions.h:164
Value * getCondition() const
Value * getLength() const
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:93
static Constant * get(ArrayType *T, ArrayRef< Constant *> V)
Definition: Constants.cpp:950
Value * getDest() const
This is just like getRawDest, but it strips off any cast instructions (including addrspacecast) that ...
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:264
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const
Return hardware support for population count.
AnalysisUsage & addRequired()
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:134
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:51
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Definition: Instructions.h:217
const DataLayout & getDataLayout() const
Get the data layout for the module&#39;s target platform.
Definition: Module.cpp:361
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Definition: LoopInfo.h:684
static CallInst * createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL, bool ZeroCheck)
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:731
This file contains the simple types necessary to represent the attributes associated with functions a...
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:962
unsigned getDestAlignment() const
This file implements a class to represent arbitrary precision integral constant values and operations...
const APInt & getAPInt() const
BlockT * getHeader() const
Definition: LoopInfo.h:100
void computeLoopSafetyInfo(LoopSafetyInfo *, Loop *)
Computes safety information for a loop checks loop body & header for the possibility of may throw exc...
Definition: MustExecute.cpp:28
static bool isSimple(Instruction *I)
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:201
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:404
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:245
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:182
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:142
This node represents a polynomial recurrence on the trip count of the specified loop.
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv)
#define T
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:121
Class to represent array types.
Definition: DerivedTypes.h:369
static cl::opt< bool > UseLIRCodeSizeHeurs("use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling" "with -Os/-Oz"), cl::init(true), cl::Hidden)
bool has(LibFunc F) const
Tests whether a library function is available.
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:83
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:200
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:126
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:979
void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry &)
An instruction for storing to memory.
Definition: Instructions.h:306
bool isMinusOne() const
This function will return true iff every bit in this constant is set to true.
Definition: Constants.h:209
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:439
static void deleteDeadInstruction(Instruction *I)
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:142
Function * getDeclaration(Module *M, ID id, ArrayRef< Type *> Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1001
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block...
Definition: IRBuilder.h:127
Value * getOperand(unsigned i) const
Definition: User.h:170
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:211
bool isLoopEntryGuardedByCond(const Loop *L, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test whether entry to the loop is protected by a conditional between LHS and RHS. ...
static Constant * getBitCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:1740
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
bool inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI)
Analyze the name and prototype of the given function and set any applicable attributes.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:410
const SCEV * getOperand(unsigned i) const
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:189
Wrapper pass for TargetTransformInfo.
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:153
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:287
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:235
LLVM Basic Block Representation.
Definition: BasicBlock.h:59
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:46
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:287
Conditional or Unconditional Branch instruction.
bool isKnownNonNegative(const Value *V, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr)
Returns true if the give value is known to be non-negative.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This is an important base class in LLVM.
Definition: Constant.h:42
Value * getIncomingValueForBlock(const BasicBlock *BB) const
This file contains the declarations for the subclasses of Constant, which represent the different fla...
const Instruction & front() const
Definition: BasicBlock.h:276
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:371
const SCEV * getAddExpr(SmallVectorImpl< const SCEV *> &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
size_t size() const
Definition: BasicBlock.h:274
bool isUnordered() const
Definition: Instructions.h:264
Represent the analysis usage information of a pass.
bool optForSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:584
static Value * matchCondition(BranchInst *BI, BasicBlock *LoopEntry)
Check if the given conditional branch is based on the comparison between a variable and zero...
This instruction compares its operands according to the predicate given to the constructor.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:885
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values...
Value * getPointerOperand()
Definition: Instructions.h:270
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:382
self_iterator getIterator()
Definition: ilist_node.h:82
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS. Minus is represented in SCEV as A+B*-1.
static UndefValue * get(Type *T)
Static factory methods - Return an &#39;undef&#39; object of the specified type.
Definition: Constants.cpp:1382
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr)
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:429
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:159
static wasm::ValType getType(const TargetRegisterClass *RC)
static PointerType * getInt8PtrTy(LLVMContext &C, unsigned AS=0)
Definition: Type.cpp:220
bool isVolatile() const
const SCEV * getMulExpr(SmallVectorImpl< const SCEV *> &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
Value * GetUnderlyingObject(Value *V, const DataLayout &DL, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value...
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
Representation for a specific memory location.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
typename vector_type::const_iterator iterator
Definition: SetVector.h:49
Iterator for intrusive lists based on ilist_node.
This is the shared class of boolean and integer constants.
Definition: Constants.h:84
Type * getType() const
Return the LLVM type of this SCEV expression.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
iterator end()
Definition: BasicBlock.h:266
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:861
bool dominates(const Instruction *Def, const Use &U) const
Return true if Def dominates a use in User.
Definition: Dominators.cpp:244
Module.h This file contains the declarations for the Module class.
Provides information about what library functions are available for the current target.
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:611
bool isConditional() const
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static Constant * getMemSetPatternValue(Value *V, const DataLayout *DL)
getMemSetPatternValue - If a strided store of the specified value is safe to turn into a memset_patte...
static const SCEV * getNumBytes(const SCEV *BECount, Type *IntPtr, unsigned StoreSize, Loop *CurLoop, const DataLayout *DL, ScalarEvolution *SE)
Compute the number of bytes as a SCEV from the backedge taken count.
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, Instruction *&CntInst, PHINode *&CntPhi, Value *&Var)
Return true iff the idiom is detected in the loop.
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:567
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:133
The access may modify the value stored in memory.
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:172
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:924
signed less or equal
Definition: InstrTypes.h:915
loop Recognize loop idioms
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:56
Class for arbitrary precision integers.
Definition: APInt.h:69
loop idiom
ModRefInfo getModRefInfo(ImmutableCallSite CS, const MemoryLocation &Loc)
getModRefInfo (for call sites) - Return information about whether a particular call site modifies or ...
iterator_range< user_iterator > users()
Definition: Value.h:399
This class uses information about analyze scalars to rewrite expressions in canonical form...
Pass * createLoopIdiomPass()
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:292
uint64_t getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:560
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:959
bool isVolatile() const
Return true if this is a store to a volatile memory location.
Definition: Instructions.h:339
Captures loop safety information.
Definition: MustExecute.h:39
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:290
INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom", "Recognize loop idioms", false, false) INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
void setUnnamedAddr(UnnamedAddr Val)
Definition: GlobalValue.h:215
unsigned getAtomicMemIntrinsicMaxElementSize() const
unsigned getAlignment() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:226
This class represents an analyzed expression in the program.
unsigned getIntegerBitWidth() const
Definition: DerivedTypes.h:97
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:445
This file provides utility analysis objects describing memory locations.
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:224
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:108
LLVM_NODISCARD ModRefInfo intersectModRef(const ModRefInfo MRI1, const ModRefInfo MRI2)
#define I(x, y, z)
Definition: MD5.cpp:58
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:193
void getLoopAnalysisUsage(AnalysisUsage &AU)
Helper to consistently add the set of standard passes to a loop pass&#39;s AnalysisUsage.
Definition: LoopUtils.cpp:1226
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:568
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:323
block_iterator block_end() const
Definition: LoopInfo.h:155
unsigned getAlignment() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:351
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:276
static const SCEV * getStartForNegStride(const SCEV *Start, const SCEV *BECount, Type *IntPtr, unsigned StoreSize, ScalarEvolution *SE)
const SCEV * getBackedgeTakenCount(const Loop *L)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
The cost of a typical &#39;add&#39; instruction.
int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, ArrayRef< Type *> ParamTys) const
Estimate the cost of an intrinsic when lowered.
const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:565
LLVM Value Representation.
Definition: Value.h:73
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
A vector that has set insertion semantics.
Definition: SetVector.h:41
constexpr char Size[]
Key for Kernel::Arg::Metadata::mSize.
static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, unsigned StoreSize, AliasAnalysis &AA, SmallPtrSetImpl< Instruction *> &IgnoredStores)
mayLoopAccessLocation - Return true if the specified loop might access the specified pointer location...
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
unsigned greater than
Definition: InstrTypes.h:908
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:49
A container for analyses that lazily runs them and caches their results.
void replaceUsesOutsideBlock(Value *V, BasicBlock *BB)
replaceUsesOutsideBlock - Go through the uses list for this definition and make each use point to "V"...
Definition: Value.cpp:449
const SCEV * getTruncateOrZeroExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
This pass exposes codegen information to IR-level passes.
LLVM_NODISCARD bool isModOrRefSet(const ModRefInfo MRI)
bool isSimple() const
Definition: Instructions.h:387
const TerminatorInst * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:138
This header defines various interfaces for pass management in LLVM.
bool isBigEndian() const
Definition: DataLayout.h:222
#define LLVM_DEBUG(X)
Definition: Debug.h:119
bool isSafeToExpand(const SCEV *S, ScalarEvolution &SE)
Return true if the given expression is safe to expand in the sense that all materialized values are s...
block_iterator block_begin() const
Definition: LoopInfo.h:154
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count...
Value * getPointerOperand()
Definition: Instructions.h:398
const SCEV * getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
This class represents a constant integer value.
CallInst * CreateCall(Value *Callee, ArrayRef< Value *> Args=None, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1871