LLVM  4.0.0
LoopIdiomRecognize.cpp
Go to the documentation of this file.
1 //===-- LoopIdiomRecognize.cpp - Loop idiom recognition -------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass implements an idiom recognizer that transforms simple loops into a
11 // non-loop form. In cases that this kicks in, it can be a significant
12 // performance win.
13 //
14 // If compiling for code size we avoid idiom recognition if the resulting
15 // code could be larger than the code for the original loop. One way this could
16 // happen is if the loop is not removable after idiom recognition due to the
17 // presence of non-idiom instructions. The initial implementation of the
18 // heuristics applies to idioms in multi-block loops.
19 //
20 //===----------------------------------------------------------------------===//
21 //
22 // TODO List:
23 //
24 // Future loop memory idioms to recognize:
25 // memcmp, memmove, strlen, etc.
26 // Future floating point idioms to recognize in -ffast-math mode:
27 // fpowi
28 // Future integer operation idioms to recognize:
29 // ctpop, ctlz, cttz
30 //
31 // Beware that isel's default lowering for ctpop is highly inefficient for
32 // i64 and larger types when i64 is legal and the value has few bits set. It
33 // would be good to enhance isel to emit a loop for ctpop in this case.
34 //
35 // This could recognize common matrix multiplies and dot product idioms and
36 // replace them with calls to BLAS (if linked in??).
37 //
38 //===----------------------------------------------------------------------===//
39 
41 #include "llvm/ADT/MapVector.h"
42 #include "llvm/ADT/SetVector.h"
43 #include "llvm/ADT/Statistic.h"
48 #include "llvm/Analysis/LoopPass.h"
55 #include "llvm/IR/DataLayout.h"
56 #include "llvm/IR/Dominators.h"
57 #include "llvm/IR/IRBuilder.h"
58 #include "llvm/IR/IntrinsicInst.h"
59 #include "llvm/IR/Module.h"
60 #include "llvm/Support/Debug.h"
62 #include "llvm/Transforms/Scalar.h"
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "loop-idiom"
70 
71 STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
72 STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
73 
75  "use-lir-code-size-heurs",
76  cl::desc("Use loop idiom recognition code size heuristics when compiling"
77  "with -Os/-Oz"),
78  cl::init(true), cl::Hidden);
79 
80 namespace {
81 
82 class LoopIdiomRecognize {
83  Loop *CurLoop;
84  AliasAnalysis *AA;
85  DominatorTree *DT;
86  LoopInfo *LI;
87  ScalarEvolution *SE;
88  TargetLibraryInfo *TLI;
89  const TargetTransformInfo *TTI;
90  const DataLayout *DL;
91  bool ApplyCodeSizeHeuristics;
92 
93 public:
94  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
95  LoopInfo *LI, ScalarEvolution *SE,
96  TargetLibraryInfo *TLI,
97  const TargetTransformInfo *TTI,
98  const DataLayout *DL)
99  : CurLoop(nullptr), AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI),
100  DL(DL) {}
101 
102  bool runOnLoop(Loop *L);
103 
104 private:
105  typedef SmallVector<StoreInst *, 8> StoreList;
106  typedef MapVector<Value *, StoreList> StoreListMap;
107  StoreListMap StoreRefsForMemset;
108  StoreListMap StoreRefsForMemsetPattern;
109  StoreList StoreRefsForMemcpy;
110  bool HasMemset;
111  bool HasMemsetPattern;
112  bool HasMemcpy;
113 
114  /// \name Countable Loop Idiom Handling
115  /// @{
116 
117  bool runOnCountableLoop();
118  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
119  SmallVectorImpl<BasicBlock *> &ExitBlocks);
120 
121  void collectStores(BasicBlock *BB);
122  bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemsetPattern,
123  bool &ForMemcpy);
124  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
125  bool ForMemset);
126  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
127 
128  bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
129  unsigned StoreAlignment, Value *StoredVal,
130  Instruction *TheStore,
132  const SCEVAddRecExpr *Ev, const SCEV *BECount,
133  bool NegStride, bool IsLoopMemset = false);
134  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
135  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
136  bool IsLoopMemset = false);
137 
138  /// @}
139  /// \name Noncountable Loop Idiom Handling
140  /// @{
141 
142  bool runOnNoncountableLoop();
143 
144  bool recognizePopcount();
145  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
146  PHINode *CntPhi, Value *Var);
147 
148  /// @}
149 };
150 
151 class LoopIdiomRecognizeLegacyPass : public LoopPass {
152 public:
153  static char ID;
154  explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
157  }
158 
159  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
160  if (skipLoop(L))
161  return false;
162 
163  AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
164  DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
165  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
166  ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
167  TargetLibraryInfo *TLI =
168  &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
169  const TargetTransformInfo *TTI =
170  &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
171  *L->getHeader()->getParent());
172  const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
173 
174  LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL);
175  return LIR.runOnLoop(L);
176  }
177 
178  /// This transformation requires natural loop information & requires that
179  /// loop preheaders be inserted into the CFG.
180  ///
181  void getAnalysisUsage(AnalysisUsage &AU) const override {
185  }
186 };
187 } // End anonymous namespace.
188 
191  LPMUpdater &) {
192  const auto *DL = &L.getHeader()->getModule()->getDataLayout();
193 
194  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL);
195  if (!LIR.runOnLoop(&L))
196  return PreservedAnalyses::all();
197 
199 }
200 
202 INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
203  "Recognize loop idioms", false, false)
207 INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
208  "Recognize loop idioms", false, false)
209 
210 Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
211 
214  I->eraseFromParent();
215 }
216 
217 //===----------------------------------------------------------------------===//
218 //
219 // Implementation of LoopIdiomRecognize
220 //
221 //===----------------------------------------------------------------------===//
222 
223 bool LoopIdiomRecognize::runOnLoop(Loop *L) {
224  CurLoop = L;
225  // If the loop could not be converted to canonical form, it must have an
226  // indirectbr in it, just give up.
227  if (!L->getLoopPreheader())
228  return false;
229 
230  // Disable loop idiom recognition if the function's name is a common idiom.
231  StringRef Name = L->getHeader()->getParent()->getName();
232  if (Name == "memset" || Name == "memcpy")
233  return false;
234 
235  // Determine if code size heuristics need to be applied.
236  ApplyCodeSizeHeuristics =
237  L->getHeader()->getParent()->optForSize() && UseLIRCodeSizeHeurs;
238 
239  HasMemset = TLI->has(LibFunc::memset);
240  HasMemsetPattern = TLI->has(LibFunc::memset_pattern16);
241  HasMemcpy = TLI->has(LibFunc::memcpy);
242 
243  if (HasMemset || HasMemsetPattern || HasMemcpy)
245  return runOnCountableLoop();
246 
247  return runOnNoncountableLoop();
248 }
249 
250 bool LoopIdiomRecognize::runOnCountableLoop() {
251  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
252  assert(!isa<SCEVCouldNotCompute>(BECount) &&
253  "runOnCountableLoop() called on a loop without a predictable"
254  "backedge-taken count");
255 
256  // If this loop executes exactly one time, then it should be peeled, not
257  // optimized by this pass.
258  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
259  if (BECst->getAPInt() == 0)
260  return false;
261 
262  SmallVector<BasicBlock *, 8> ExitBlocks;
263  CurLoop->getUniqueExitBlocks(ExitBlocks);
264 
265  DEBUG(dbgs() << "loop-idiom Scanning: F["
266  << CurLoop->getHeader()->getParent()->getName() << "] Loop %"
267  << CurLoop->getHeader()->getName() << "\n");
268 
269  bool MadeChange = false;
270 
271  // The following transforms hoist stores/memsets into the loop pre-header.
272  // Give up if the loop has instructions may throw.
273  LoopSafetyInfo SafetyInfo;
274  computeLoopSafetyInfo(&SafetyInfo, CurLoop);
275  if (SafetyInfo.MayThrow)
276  return MadeChange;
277 
278  // Scan all the blocks in the loop that are not in subloops.
279  for (auto *BB : CurLoop->getBlocks()) {
280  // Ignore blocks in subloops.
281  if (LI->getLoopFor(BB) != CurLoop)
282  continue;
283 
284  MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
285  }
286  return MadeChange;
287 }
288 
289 static unsigned getStoreSizeInBytes(StoreInst *SI, const DataLayout *DL) {
290  uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
291  assert(((SizeInBits & 7) || (SizeInBits >> 32) == 0) &&
292  "Don't overflow unsigned.");
293  return (unsigned)SizeInBits >> 3;
294 }
295 
296 static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
297  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
298  return ConstStride->getAPInt();
299 }
300 
301 /// getMemSetPatternValue - If a strided store of the specified value is safe to
302 /// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
303 /// be passed in. Otherwise, return null.
304 ///
305 /// Note that we don't ever attempt to use memset_pattern8 or 4, because these
306 /// just replicate their input array and then pass on to memset_pattern16.
308  // If the value isn't a constant, we can't promote it to being in a constant
309  // array. We could theoretically do a store to an alloca or something, but
310  // that doesn't seem worthwhile.
311  Constant *C = dyn_cast<Constant>(V);
312  if (!C)
313  return nullptr;
314 
315  // Only handle simple values that are a power of two bytes in size.
316  uint64_t Size = DL->getTypeSizeInBits(V->getType());
317  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
318  return nullptr;
319 
320  // Don't care enough about darwin/ppc to implement this.
321  if (DL->isBigEndian())
322  return nullptr;
323 
324  // Convert to size in bytes.
325  Size /= 8;
326 
327  // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
328  // if the top and bottom are the same (e.g. for vectors and large integers).
329  if (Size > 16)
330  return nullptr;
331 
332  // If the constant is exactly 16 bytes, just use it.
333  if (Size == 16)
334  return C;
335 
336  // Otherwise, we'll use an array of the constants.
337  unsigned ArraySize = 16 / Size;
338  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
339  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
340 }
341 
342 bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
343  bool &ForMemsetPattern, bool &ForMemcpy) {
344  // Don't touch volatile stores.
345  if (!SI->isSimple())
346  return false;
347 
348  // Avoid merging nontemporal stores.
350  return false;
351 
352  Value *StoredVal = SI->getValueOperand();
353  Value *StorePtr = SI->getPointerOperand();
354 
355  // Reject stores that are so large that they overflow an unsigned.
356  uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
357  if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
358  return false;
359 
360  // See if the pointer expression is an AddRec like {base,+,1} on the current
361  // loop, which indicates a strided store. If we have something else, it's a
362  // random store we can't handle.
363  const SCEVAddRecExpr *StoreEv =
364  dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
365  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
366  return false;
367 
368  // Check to see if we have a constant stride.
369  if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
370  return false;
371 
372  // See if the store can be turned into a memset.
373 
374  // If the stored value is a byte-wise value (like i32 -1), then it may be
375  // turned into a memset of i8 -1, assuming that all the consecutive bytes
376  // are stored. A store of i32 0x01020304 can never be turned into a memset,
377  // but it can be turned into memset_pattern if the target supports it.
378  Value *SplatValue = isBytewiseValue(StoredVal);
379  Constant *PatternValue = nullptr;
380 
381  // If we're allowed to form a memset, and the stored value would be
382  // acceptable for memset, use it.
383  if (HasMemset && SplatValue &&
384  // Verify that the stored value is loop invariant. If not, we can't
385  // promote the memset.
386  CurLoop->isLoopInvariant(SplatValue)) {
387  // It looks like we can use SplatValue.
388  ForMemset = true;
389  return true;
390  } else if (HasMemsetPattern &&
391  // Don't create memset_pattern16s with address spaces.
392  StorePtr->getType()->getPointerAddressSpace() == 0 &&
393  (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
394  // It looks like we can use PatternValue!
395  ForMemsetPattern = true;
396  return true;
397  }
398 
399  // Otherwise, see if the store can be turned into a memcpy.
400  if (HasMemcpy) {
401  // Check to see if the stride matches the size of the store. If so, then we
402  // know that every byte is touched in the loop.
403  APInt Stride = getStoreStride(StoreEv);
404  unsigned StoreSize = getStoreSizeInBytes(SI, DL);
405  if (StoreSize != Stride && StoreSize != -Stride)
406  return false;
407 
408  // The store must be feeding a non-volatile load.
410  if (!LI || !LI->isSimple())
411  return false;
412 
413  // See if the pointer expression is an AddRec like {base,+,1} on the current
414  // loop, which indicates a strided load. If we have something else, it's a
415  // random load we can't handle.
416  const SCEVAddRecExpr *LoadEv =
418  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
419  return false;
420 
421  // The store and load must share the same stride.
422  if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
423  return false;
424 
425  // Success. This store can be converted into a memcpy.
426  ForMemcpy = true;
427  return true;
428  }
429  // This store can't be transformed into a memset/memcpy.
430  return false;
431 }
432 
433 void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
434  StoreRefsForMemset.clear();
435  StoreRefsForMemsetPattern.clear();
436  StoreRefsForMemcpy.clear();
437  for (Instruction &I : *BB) {
438  StoreInst *SI = dyn_cast<StoreInst>(&I);
439  if (!SI)
440  continue;
441 
442  bool ForMemset = false;
443  bool ForMemsetPattern = false;
444  bool ForMemcpy = false;
445  // Make sure this is a strided store with a constant stride.
446  if (!isLegalStore(SI, ForMemset, ForMemsetPattern, ForMemcpy))
447  continue;
448 
449  // Save the store locations.
450  if (ForMemset) {
451  // Find the base pointer.
453  StoreRefsForMemset[Ptr].push_back(SI);
454  } else if (ForMemsetPattern) {
455  // Find the base pointer.
456  Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
457  StoreRefsForMemsetPattern[Ptr].push_back(SI);
458  } else if (ForMemcpy)
459  StoreRefsForMemcpy.push_back(SI);
460  }
461 }
462 
463 /// runOnLoopBlock - Process the specified block, which lives in a counted loop
464 /// with the specified backedge count. This block is known to be in the current
465 /// loop and not in any subloops.
466 bool LoopIdiomRecognize::runOnLoopBlock(
467  BasicBlock *BB, const SCEV *BECount,
468  SmallVectorImpl<BasicBlock *> &ExitBlocks) {
469  // We can only promote stores in this block if they are unconditionally
470  // executed in the loop. For a block to be unconditionally executed, it has
471  // to dominate all the exit blocks of the loop. Verify this now.
472  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
473  if (!DT->dominates(BB, ExitBlocks[i]))
474  return false;
475 
476  bool MadeChange = false;
477  // Look for store instructions, which may be optimized to memset/memcpy.
478  collectStores(BB);
479 
480  // Look for a single store or sets of stores with a common base, which can be
481  // optimized into a memset (memset_pattern). The latter most commonly happens
482  // with structs and handunrolled loops.
483  for (auto &SL : StoreRefsForMemset)
484  MadeChange |= processLoopStores(SL.second, BECount, true);
485 
486  for (auto &SL : StoreRefsForMemsetPattern)
487  MadeChange |= processLoopStores(SL.second, BECount, false);
488 
489  // Optimize the store into a memcpy, if it feeds an similarly strided load.
490  for (auto &SI : StoreRefsForMemcpy)
491  MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
492 
493  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
494  Instruction *Inst = &*I++;
495  // Look for memset instructions, which may be optimized to a larger memset.
496  if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
497  WeakVH InstPtr(&*I);
498  if (!processLoopMemSet(MSI, BECount))
499  continue;
500  MadeChange = true;
501 
502  // If processing the memset invalidated our iterator, start over from the
503  // top of the block.
504  if (!InstPtr)
505  I = BB->begin();
506  continue;
507  }
508  }
509 
510  return MadeChange;
511 }
512 
513 /// processLoopStores - See if this store(s) can be promoted to a memset.
514 bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
515  const SCEV *BECount,
516  bool ForMemset) {
517  // Try to find consecutive stores that can be transformed into memsets.
518  SetVector<StoreInst *> Heads, Tails;
520 
521  // Do a quadratic search on all of the given stores and find
522  // all of the pairs of stores that follow each other.
523  SmallVector<unsigned, 16> IndexQueue;
524  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
525  assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
526 
527  Value *FirstStoredVal = SL[i]->getValueOperand();
528  Value *FirstStorePtr = SL[i]->getPointerOperand();
529  const SCEVAddRecExpr *FirstStoreEv =
530  cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
531  APInt FirstStride = getStoreStride(FirstStoreEv);
532  unsigned FirstStoreSize = getStoreSizeInBytes(SL[i], DL);
533 
534  // See if we can optimize just this store in isolation.
535  if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
536  Heads.insert(SL[i]);
537  continue;
538  }
539 
540  Value *FirstSplatValue = nullptr;
541  Constant *FirstPatternValue = nullptr;
542 
543  if (ForMemset)
544  FirstSplatValue = isBytewiseValue(FirstStoredVal);
545  else
546  FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
547 
548  assert((FirstSplatValue || FirstPatternValue) &&
549  "Expected either splat value or pattern value.");
550 
551  IndexQueue.clear();
552 // If a store has multiple consecutive store candidates, search the stores
553 // array in this order: from i+1 to e, then from i-1 down to 0.
554 // This is because pairing with the immediately succeeding or preceding
555 // candidate usually gives the best chance of finding a memset opportunity.
556  unsigned j = 0;
557  for (j = i + 1; j < e; ++j)
558  IndexQueue.push_back(j);
559  for (j = i; j > 0; --j)
560  IndexQueue.push_back(j - 1);
561 
562  for (auto &k : IndexQueue) {
563  assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
564  Value *SecondStorePtr = SL[k]->getPointerOperand();
565  const SCEVAddRecExpr *SecondStoreEv =
566  cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
567  APInt SecondStride = getStoreStride(SecondStoreEv);
568 
569  if (FirstStride != SecondStride)
570  continue;
571 
572  Value *SecondStoredVal = SL[k]->getValueOperand();
573  Value *SecondSplatValue = nullptr;
574  Constant *SecondPatternValue = nullptr;
575 
576  if (ForMemset)
577  SecondSplatValue = isBytewiseValue(SecondStoredVal);
578  else
579  SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
580 
581  assert((SecondSplatValue || SecondPatternValue) &&
582  "Expected either splat value or pattern value.");
583 
584  if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
585  if (ForMemset) {
586  if (FirstSplatValue != SecondSplatValue)
587  continue;
588  } else {
589  if (FirstPatternValue != SecondPatternValue)
590  continue;
591  }
592  Tails.insert(SL[k]);
593  Heads.insert(SL[i]);
594  ConsecutiveChain[SL[i]] = SL[k];
595  break;
596  }
597  }
598  }
599 
600  // We may run into multiple chains that merge into a single chain. We mark the
601  // stores that we transformed so that we don't visit the same store twice.
602  SmallPtrSet<Value *, 16> TransformedStores;
603  bool Changed = false;
604 
605  // For stores that start but don't end a link in the chain:
606  for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
607  it != e; ++it) {
608  if (Tails.count(*it))
609  continue;
610 
611  // We found a store instr that starts a chain. Now follow the chain and try
612  // to transform it.
613  SmallPtrSet<Instruction *, 8> AdjacentStores;
614  StoreInst *I = *it;
615 
616  StoreInst *HeadStore = I;
617  unsigned StoreSize = 0;
618 
619  // Collect the chain into a list.
620  while (Tails.count(I) || Heads.count(I)) {
621  if (TransformedStores.count(I))
622  break;
623  AdjacentStores.insert(I);
624 
625  StoreSize += getStoreSizeInBytes(I, DL);
626  // Move to the next value in the chain.
627  I = ConsecutiveChain[I];
628  }
629 
630  Value *StoredVal = HeadStore->getValueOperand();
631  Value *StorePtr = HeadStore->getPointerOperand();
632  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
633  APInt Stride = getStoreStride(StoreEv);
634 
635  // Check to see if the stride matches the size of the stores. If so, then
636  // we know that every byte is touched in the loop.
637  if (StoreSize != Stride && StoreSize != -Stride)
638  continue;
639 
640  bool NegStride = StoreSize == -Stride;
641 
642  if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(),
643  StoredVal, HeadStore, AdjacentStores, StoreEv,
644  BECount, NegStride)) {
645  TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
646  Changed = true;
647  }
648  }
649 
650  return Changed;
651 }
652 
653 /// processLoopMemSet - See if this memset can be promoted to a large memset.
654 bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
655  const SCEV *BECount) {
656  // We can only handle non-volatile memsets with a constant size.
657  if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
658  return false;
659 
660  // If we're not allowed to hack on memset, we fail.
661  if (!HasMemset)
662  return false;
663 
664  Value *Pointer = MSI->getDest();
665 
666  // See if the pointer expression is an AddRec like {base,+,1} on the current
667  // loop, which indicates a strided store. If we have something else, it's a
668  // random store we can't handle.
669  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
670  if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
671  return false;
672 
673  // Reject memsets that are so large that they overflow an unsigned.
674  uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
675  if ((SizeInBytes >> 32) != 0)
676  return false;
677 
678  // Check to see if the stride matches the size of the memset. If so, then we
679  // know that every byte is touched in the loop.
680  const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
681  if (!ConstStride)
682  return false;
683 
684  APInt Stride = ConstStride->getAPInt();
685  if (SizeInBytes != Stride && SizeInBytes != -Stride)
686  return false;
687 
688  // Verify that the memset value is loop invariant. If not, we can't promote
689  // the memset.
690  Value *SplatValue = MSI->getValue();
691  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
692  return false;
693 
695  MSIs.insert(MSI);
696  bool NegStride = SizeInBytes == -Stride;
697  return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
698  MSI->getAlignment(), SplatValue, MSI, MSIs, Ev,
699  BECount, NegStride, /*IsLoopMemset=*/true);
700 }
701 
702 /// mayLoopAccessLocation - Return true if the specified loop might access the
703 /// specified pointer location, which is a loop-strided access. The 'Access'
704 /// argument specifies what the verboten forms of access are (read or write).
705 static bool
707  const SCEV *BECount, unsigned StoreSize,
708  AliasAnalysis &AA,
709  SmallPtrSetImpl<Instruction *> &IgnoredStores) {
710  // Get the location that may be stored across the loop. Since the access is
711  // strided positively through memory, we say that the modified location starts
712  // at the pointer and has infinite size.
713  uint64_t AccessSize = MemoryLocation::UnknownSize;
714 
715  // If the loop iterates a fixed number of times, we can refine the access size
716  // to be exactly the size of the memset, which is (BECount+1)*StoreSize
717  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
718  AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize;
719 
720 // TODO: For this to be really effective, we have to dive into the pointer
721 // operand in the store. A store to &A[i] of 100 will always return may-alias
722 // with a store of &A[100]; we need StoreLoc to be "A" with a size of 100,
723 // which will then no-alias a store to &A[100].
724  MemoryLocation StoreLoc(Ptr, AccessSize);
725 
726  for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
727  ++BI)
728  for (Instruction &I : **BI)
729  if (IgnoredStores.count(&I) == 0 &&
730  (AA.getModRefInfo(&I, StoreLoc) & Access))
731  return true;
732 
733  return false;
734 }
735 
736 // If we have a negative stride, Start refers to the end of the memory location
737 // we're trying to memset. Therefore, we need to recompute the base pointer,
738 // which is just Start - BECount*Size.
739 static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
740  Type *IntPtr, unsigned StoreSize,
741  ScalarEvolution *SE) {
742  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
743  if (StoreSize != 1)
744  Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
745  SCEV::FlagNUW);
746  return SE->getMinusSCEV(Start, Index);
747 }
748 
749 /// processLoopStridedStore - We see a strided store of some value. If we can
750 /// transform this into a memset or memset_pattern in the loop preheader, do so.
751 bool LoopIdiomRecognize::processLoopStridedStore(
752  Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment,
753  Value *StoredVal, Instruction *TheStore,
755  const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
756  Value *SplatValue = isBytewiseValue(StoredVal);
757  Constant *PatternValue = nullptr;
758 
759  if (!SplatValue)
760  PatternValue = getMemSetPatternValue(StoredVal, DL);
761 
762  assert((SplatValue || PatternValue) &&
763  "Expected either splat value or pattern value.");
764 
765  // The trip count of the loop and the base pointer of the addrec SCEV is
766  // guaranteed to be loop invariant, which means that it should dominate the
767  // header. This allows us to insert code for it in the preheader.
768  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
769  BasicBlock *Preheader = CurLoop->getLoopPreheader();
770  IRBuilder<> Builder(Preheader->getTerminator());
771  SCEVExpander Expander(*SE, *DL, "loop-idiom");
772 
773  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
774  Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS);
775 
776  const SCEV *Start = Ev->getStart();
777  // Handle negative strided loops.
778  if (NegStride)
779  Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE);
780 
781  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
782  // this into a memset in the loop preheader now if we want. However, this
783  // would be unsafe to do if there is anything else in the loop that may read
784  // or write to the aliased location. Check for any overlap by generating the
785  // base pointer and checking the region.
786  Value *BasePtr =
787  Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
788  if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize,
789  *AA, Stores)) {
790  Expander.clear();
791  // If we generated new code for the base pointer, clean up.
793  return false;
794  }
795 
796  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
797  return false;
798 
799  // Okay, everything looks good, insert the memset.
800 
801  // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
802  // pointer size if it isn't already.
803  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
804 
805  const SCEV *NumBytesS =
806  SE->getAddExpr(BECount, SE->getOne(IntPtr), SCEV::FlagNUW);
807  if (StoreSize != 1) {
808  NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
809  SCEV::FlagNUW);
810  }
811 
812  Value *NumBytes =
813  Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
814 
815  CallInst *NewCall;
816  if (SplatValue) {
817  NewCall =
818  Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment);
819  } else {
820  // Everything is emitted in default address space
821  Type *Int8PtrTy = DestInt8PtrTy;
822 
823  Module *M = TheStore->getModule();
824  Value *MSP =
825  M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(),
826  Int8PtrTy, Int8PtrTy, IntPtr, (void *)nullptr);
827  inferLibFuncAttributes(*M->getFunction("memset_pattern16"), *TLI);
828 
829 // Otherwise we should form a memset_pattern16. PatternValue is known to be
830 // a constant array of 16 bytes. Plop the value into a mergeable global.
831  GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
833  PatternValue, ".memset_pattern");
834  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
835  GV->setAlignment(16);
836  Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
837  NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
838  }
839 
840  DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
841  << " from store to: " << *Ev << " at: " << *TheStore << "\n");
842  NewCall->setDebugLoc(TheStore->getDebugLoc());
843 
844  // Okay, the memset has been formed. Zap the original store and anything that
845  // feeds into it.
846  for (auto *I : Stores)
848  ++NumMemSet;
849  return true;
850 }
851 
852 /// If the stored value is a strided load in the same loop with the same stride
853 /// this may be transformable into a memcpy. This kicks in for stuff like
854 /// for (i) A[i] = B[i];
855 bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
856  const SCEV *BECount) {
857  assert(SI->isSimple() && "Expected only non-volatile stores.");
858 
859  Value *StorePtr = SI->getPointerOperand();
860  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
861  APInt Stride = getStoreStride(StoreEv);
862  unsigned StoreSize = getStoreSizeInBytes(SI, DL);
863  bool NegStride = StoreSize == -Stride;
864 
865  // The store must be feeding a non-volatile load.
866  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
867  assert(LI->isSimple() && "Expected only non-volatile stores.");
868 
869  // See if the pointer expression is an AddRec like {base,+,1} on the current
870  // loop, which indicates a strided load. If we have something else, it's a
871  // random load we can't handle.
872  const SCEVAddRecExpr *LoadEv =
873  cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
874 
875  // The trip count of the loop and the base pointer of the addrec SCEV is
876  // guaranteed to be loop invariant, which means that it should dominate the
877  // header. This allows us to insert code for it in the preheader.
878  BasicBlock *Preheader = CurLoop->getLoopPreheader();
879  IRBuilder<> Builder(Preheader->getTerminator());
880  SCEVExpander Expander(*SE, *DL, "loop-idiom");
881 
882  const SCEV *StrStart = StoreEv->getStart();
883  unsigned StrAS = SI->getPointerAddressSpace();
884  Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS);
885 
886  // Handle negative strided loops.
887  if (NegStride)
888  StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE);
889 
890  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
891  // this into a memcpy in the loop preheader now if we want. However, this
892  // would be unsafe to do if there is anything else in the loop that may read
893  // or write the memory region we're storing to. This includes the load that
894  // feeds the stores. Check for an alias by generating the base address and
895  // checking everything.
896  Value *StoreBasePtr = Expander.expandCodeFor(
897  StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
898 
900  Stores.insert(SI);
901  if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
902  StoreSize, *AA, Stores)) {
903  Expander.clear();
904  // If we generated new code for the base pointer, clean up.
906  return false;
907  }
908 
909  const SCEV *LdStart = LoadEv->getStart();
910  unsigned LdAS = LI->getPointerAddressSpace();
911 
912  // Handle negative strided loops.
913  if (NegStride)
914  LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE);
915 
916  // For a memcpy, we have to make sure that the input array is not being
917  // mutated by the loop.
918  Value *LoadBasePtr = Expander.expandCodeFor(
919  LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
920 
921  if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize,
922  *AA, Stores)) {
923  Expander.clear();
924  // If we generated new code for the base pointer, clean up.
927  return false;
928  }
929 
930  if (avoidLIRForMultiBlockLoop())
931  return false;
932 
933  // Okay, everything is safe, we can transform this!
934 
935  // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
936  // pointer size if it isn't already.
937  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
938 
939  const SCEV *NumBytesS =
940  SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
941  if (StoreSize != 1)
942  NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
943  SCEV::FlagNUW);
944 
945  Value *NumBytes =
946  Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
947 
948  CallInst *NewCall =
949  Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes,
950  std::min(SI->getAlignment(), LI->getAlignment()));
951  NewCall->setDebugLoc(SI->getDebugLoc());
952 
953  DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
954  << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
955  << " from store ptr=" << *StoreEv << " at: " << *SI << "\n");
956 
957  // Okay, the memcpy has been formed. Zap the original store and anything that
958  // feeds into it.
960  ++NumMemCpy;
961  return true;
962 }
963 
964 // When compiling for codesize we avoid idiom recognition for a multi-block loop
965 // unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
966 //
967 bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
968  bool IsLoopMemset) {
969  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
970  if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) {
971  DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
972  << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
973  << " avoided: multi-block top-level loop\n");
974  return true;
975  }
976  }
977 
978  return false;
979 }
980 
981 bool LoopIdiomRecognize::runOnNoncountableLoop() {
982  return recognizePopcount();
983 }
984 
985 /// Check if the given conditional branch is based on the comparison between
986 /// a variable and zero, and if the variable is non-zero, the control yields to
987 /// the loop entry. If the branch matches the behavior, the variable involved
988 /// in the comparison is returned. This function will be called to see if the
989 /// precondition and postcondition of the loop are in desirable form.
990 static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) {
991  if (!BI || !BI->isConditional())
992  return nullptr;
993 
994  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
995  if (!Cond)
996  return nullptr;
997 
998  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
999  if (!CmpZero || !CmpZero->isZero())
1000  return nullptr;
1001 
1002  ICmpInst::Predicate Pred = Cond->getPredicate();
1003  if ((Pred == ICmpInst::ICMP_NE && BI->getSuccessor(0) == LoopEntry) ||
1004  (Pred == ICmpInst::ICMP_EQ && BI->getSuccessor(1) == LoopEntry))
1005  return Cond->getOperand(0);
1006 
1007  return nullptr;
1008 }
1009 
1010 /// Return true iff the idiom is detected in the loop.
1011 ///
1012 /// Additionally:
1013 /// 1) \p CntInst is set to the instruction counting the population bit.
1014 /// 2) \p CntPhi is set to the corresponding phi node.
1015 /// 3) \p Var is set to the value whose population bits are being counted.
1016 ///
1017 /// The core idiom we are trying to detect is:
1018 /// \code
1019 /// if (x0 != 0)
1020 /// goto loop-exit // the precondition of the loop
1021 /// cnt0 = init-val;
1022 /// do {
1023 /// x1 = phi (x0, x2);
1024 /// cnt1 = phi(cnt0, cnt2);
1025 ///
1026 /// cnt2 = cnt1 + 1;
1027 /// ...
1028 /// x2 = x1 & (x1 - 1);
1029 /// ...
1030 /// } while(x != 0);
1031 ///
1032 /// loop-exit:
1033 /// \endcode
1034 static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1035  Instruction *&CntInst, PHINode *&CntPhi,
1036  Value *&Var) {
1037  // step 1: Check to see if the look-back branch match this pattern:
1038  // "if (a!=0) goto loop-entry".
1039  BasicBlock *LoopEntry;
1040  Instruction *DefX2, *CountInst;
1041  Value *VarX1, *VarX0;
1042  PHINode *PhiX, *CountPhi;
1043 
1044  DefX2 = CountInst = nullptr;
1045  VarX1 = VarX0 = nullptr;
1046  PhiX = CountPhi = nullptr;
1047  LoopEntry = *(CurLoop->block_begin());
1048 
1049  // step 1: Check if the loop-back branch is in desirable form.
1050  {
1051  if (Value *T = matchCondition(
1052  dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1053  DefX2 = dyn_cast<Instruction>(T);
1054  else
1055  return false;
1056  }
1057 
1058  // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1059  {
1060  if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1061  return false;
1062 
1063  BinaryOperator *SubOneOp;
1064 
1065  if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1066  VarX1 = DefX2->getOperand(1);
1067  else {
1068  VarX1 = DefX2->getOperand(0);
1069  SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1070  }
1071  if (!SubOneOp)
1072  return false;
1073 
1074  Instruction *SubInst = cast<Instruction>(SubOneOp);
1075  ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1));
1076  if (!Dec ||
1077  !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1078  (SubInst->getOpcode() == Instruction::Add &&
1079  Dec->isAllOnesValue()))) {
1080  return false;
1081  }
1082  }
1083 
1084  // step 3: Check the recurrence of variable X
1085  {
1086  PhiX = dyn_cast<PHINode>(VarX1);
1087  if (!PhiX ||
1088  (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) {
1089  return false;
1090  }
1091  }
1092 
1093  // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
1094  {
1095  CountInst = nullptr;
1096  for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1097  IterE = LoopEntry->end();
1098  Iter != IterE; Iter++) {
1099  Instruction *Inst = &*Iter;
1100  if (Inst->getOpcode() != Instruction::Add)
1101  continue;
1102 
1103  ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1104  if (!Inc || !Inc->isOne())
1105  continue;
1106 
1107  PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
1108  if (!Phi || Phi->getParent() != LoopEntry)
1109  continue;
1110 
1111  // Check if the result of the instruction is live of the loop.
1112  bool LiveOutLoop = false;
1113  for (User *U : Inst->users()) {
1114  if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1115  LiveOutLoop = true;
1116  break;
1117  }
1118  }
1119 
1120  if (LiveOutLoop) {
1121  CountInst = Inst;
1122  CountPhi = Phi;
1123  break;
1124  }
1125  }
1126 
1127  if (!CountInst)
1128  return false;
1129  }
1130 
1131  // step 5: check if the precondition is in this form:
1132  // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1133  {
1134  auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1135  Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1136  if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1137  return false;
1138 
1139  CntInst = CountInst;
1140  CntPhi = CountPhi;
1141  Var = T;
1142  }
1143 
1144  return true;
1145 }
1146 
1147 /// Recognizes a population count idiom in a non-countable loop.
1148 ///
1149 /// If detected, transforms the relevant code to issue the popcount intrinsic
1150 /// function call, and returns true; otherwise, returns false.
1151 bool LoopIdiomRecognize::recognizePopcount() {
1153  return false;
1154 
1155  // Counting population are usually conducted by few arithmetic instructions.
1156  // Such instructions can be easily "absorbed" by vacant slots in a
1157  // non-compact loop. Therefore, recognizing popcount idiom only makes sense
1158  // in a compact loop.
1159 
1160  // Give up if the loop has multiple blocks or multiple backedges.
1161  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1162  return false;
1163 
1164  BasicBlock *LoopBody = *(CurLoop->block_begin());
1165  if (LoopBody->size() >= 20) {
1166  // The loop is too big, bail out.
1167  return false;
1168  }
1169 
1170  // It should have a preheader containing nothing but an unconditional branch.
1171  BasicBlock *PH = CurLoop->getLoopPreheader();
1172  if (!PH || &PH->front() != PH->getTerminator())
1173  return false;
1174  auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1175  if (!EntryBI || EntryBI->isConditional())
1176  return false;
1177 
1178  // It should have a precondition block where the generated popcount instrinsic
1179  // function can be inserted.
1180  auto *PreCondBB = PH->getSinglePredecessor();
1181  if (!PreCondBB)
1182  return false;
1183  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1184  if (!PreCondBI || PreCondBI->isUnconditional())
1185  return false;
1186 
1187  Instruction *CntInst;
1188  PHINode *CntPhi;
1189  Value *Val;
1190  if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1191  return false;
1192 
1193  transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1194  return true;
1195 }
1196 
1198  const DebugLoc &DL) {
1199  Value *Ops[] = {Val};
1200  Type *Tys[] = {Val->getType()};
1201 
1202  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1203  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1204  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1205  CI->setDebugLoc(DL);
1206 
1207  return CI;
1208 }
1209 
1210 void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
1211  Instruction *CntInst,
1212  PHINode *CntPhi, Value *Var) {
1213  BasicBlock *PreHead = CurLoop->getLoopPreheader();
1214  auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1215  const DebugLoc DL = CntInst->getDebugLoc();
1216 
1217  // Assuming before transformation, the loop is following:
1218  // if (x) // the precondition
1219  // do { cnt++; x &= x - 1; } while(x);
1220 
1221  // Step 1: Insert the ctpop instruction at the end of the precondition block
1222  IRBuilder<> Builder(PreCondBr);
1223  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
1224  {
1225  PopCnt = createPopcntIntrinsic(Builder, Var, DL);
1226  NewCount = PopCntZext =
1227  Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
1228 
1229  if (NewCount != PopCnt)
1230  (cast<Instruction>(NewCount))->setDebugLoc(DL);
1231 
1232  // TripCnt is exactly the number of iterations the loop has
1233  TripCnt = NewCount;
1234 
1235  // If the population counter's initial value is not zero, insert Add Inst.
1236  Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
1237  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1238  if (!InitConst || !InitConst->isZero()) {
1239  NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1240  (cast<Instruction>(NewCount))->setDebugLoc(DL);
1241  }
1242  }
1243 
1244  // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
1245  // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
1246  // function would be partial dead code, and downstream passes will drag
1247  // it back from the precondition block to the preheader.
1248  {
1249  ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
1250 
1251  Value *Opnd0 = PopCntZext;
1252  Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
1253  if (PreCond->getOperand(0) != Var)
1254  std::swap(Opnd0, Opnd1);
1255 
1256  ICmpInst *NewPreCond = cast<ICmpInst>(
1257  Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
1258  PreCondBr->setCondition(NewPreCond);
1259 
1261  }
1262 
1263  // Step 3: Note that the population count is exactly the trip count of the
1264  // loop in question, which enable us to to convert the loop from noncountable
1265  // loop into a countable one. The benefit is twofold:
1266  //
1267  // - If the loop only counts population, the entire loop becomes dead after
1268  // the transformation. It is a lot easier to prove a countable loop dead
1269  // than to prove a noncountable one. (In some C dialects, an infinite loop
1270  // isn't dead even if it computes nothing useful. In general, DCE needs
1271  // to prove a noncountable loop finite before safely delete it.)
1272  //
1273  // - If the loop also performs something else, it remains alive.
1274  // Since it is transformed to countable form, it can be aggressively
1275  // optimized by some optimizations which are in general not applicable
1276  // to a noncountable loop.
1277  //
1278  // After this step, this loop (conceptually) would look like following:
1279  // newcnt = __builtin_ctpop(x);
1280  // t = newcnt;
1281  // if (x)
1282  // do { cnt++; x &= x-1; t--) } while (t > 0);
1283  BasicBlock *Body = *(CurLoop->block_begin());
1284  {
1285  auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
1286  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1287  Type *Ty = TripCnt->getType();
1288 
1289  PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1290 
1291  Builder.SetInsertPoint(LbCond);
1292  Instruction *TcDec = cast<Instruction>(
1293  Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1294  "tcdec", false, true));
1295 
1296  TcPhi->addIncoming(TripCnt, PreHead);
1297  TcPhi->addIncoming(TcDec, Body);
1298 
1299  CmpInst::Predicate Pred =
1300  (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
1301  LbCond->setPredicate(Pred);
1302  LbCond->setOperand(0, TcDec);
1303  LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1304  }
1305 
1306  // Step 4: All the references to the original population counter outside
1307  // the loop are replaced with the NewCount -- the value returned from
1308  // __builtin_ctpop().
1309  CntInst->replaceUsesOutsideBlock(NewCount, Body);
1310 
1311  // step 5: Forget the "non-computable" trip-count SCEV associated with the
1312  // loop. The loop would otherwise not be deleted even if it becomes empty.
1313  SE->forgetLoop(CurLoop);
1314 }
unsigned getAlignment() const
MachineLoop * L
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:81
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const
Return hardware support for population count.
Value * getValueOperand()
Definition: Instructions.h:391
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:76
void push_back(const T &Elt)
Definition: SmallVector.h:211
A parsed version of the target data layout string in and methods for querying it. ...
Definition: DataLayout.h:102
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
static CallInst * createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, const DebugLoc &DL)
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
const SCEV * getConstant(ConstantInt *V)
STATISTIC(NumFunctions,"Total number of functions")
size_t i
Value * isBytewiseValue(Value *V)
If the specified value can be set by repeating the same byte in memory, return the i8 value that it i...
This header provides classes for managing a pipeline of passes over loops in LLVM IR...
bool isVolatile() const
This is the interface for a simple mod/ref and alias analysis over globals.
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:52
Value * getValue() const
Return the arguments to the instruction.
The main scalar evolution driver.
bool isSimple() const
Definition: Instructions.h:384
This class represents a function call, abstracting a target machine's calling convention.
size_type count(PtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:380
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:57
This class wraps the llvm.memset intrinsic.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:100
A debug info location.
Definition: DebugLoc.h:34
const Instruction & front() const
Definition: BasicBlock.h:240
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
An instruction for reading from memory.
Definition: Instructions.h:164
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Definition: LoopInfo.h:575
bool isSimple() const
Definition: Instructions.h:263
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:93
The access modifies the value stored in memory.
BlockT * getHeader() const
Definition: LoopInfo.h:102
const SCEV * getStart() const
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:345
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:228
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:53
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: AliasAnalysis.h:94
This is the interface for a SCEV-based alias analysis.
bool has(LibFunc::Func F) const
Tests whether a library function is available.
INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass,"loop-idiom","Recognize loop idioms", false, false) INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:32
Instruction * getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:180
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:588
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:399
void computeLoopSafetyInfo(LoopSafetyInfo *, Loop *)
Computes safety information for a loop checks loop body & header for the possibility of may throw exc...
Definition: LICM.cpp:450
static bool isSimple(Instruction *I)
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:136
This node represents a polynomial recurrence on the trip count of the specified loop.
static APInt getStoreStride(const SCEVAddRecExpr *StoreEv)
#define T
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:134
Class to represent array types.
Definition: DerivedTypes.h:345
BasicBlock * getSuccessor(unsigned i) const
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:83
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:949
void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry &)
An instruction for storing to memory.
Definition: Instructions.h:300
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:401
static void deleteDeadInstruction(Instruction *I)
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:96
Maximum length of the test input libFuzzer tries to guess a good value based on the corpus and reports it always prefer smaller inputs during the corpus shuffle When libFuzzer itself reports a bug this exit code will be used If indicates the maximal total time in seconds to run the fuzzer minimizes the provided crash input Use with etc Experimental Use value profile to guide fuzzing Number of simultaneous worker processes to run the jobs If min(jobs, NumberOfCpuCores()/2)\" is used.") FUZZER_FLAG_INT(reload
static GCRegistry::Add< CoreCLRGC > E("coreclr","CoreCLR-compatible GC")
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:196
static Constant * getBitCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:1695
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
bool inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI)
Analyze the name and prototype of the given function and set any applicable attributes.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:395
static cl::opt< bool > UseLIRCodeSizeHeurs("use-lir-code-size-heurs", cl::desc("Use loop idiom recognition code size heuristics when compiling""with -Os/-Oz"), cl::init(true), cl::Hidden)
unsigned getAlignment() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:348
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values...
Wrapper pass for TargetTransformInfo.
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:107
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:256
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:109
Constant * getOrInsertFunction(StringRef Name, FunctionType *T, AttributeSet AttributeList)
Look up the specified function in the module symbol table.
Definition: Module.cpp:123
LLVM Basic Block Representation.
Definition: BasicBlock.h:51
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:45
Conditional or Unconditional Branch instruction.
This is an important base class in LLVM.
Definition: Constant.h:42
const SCEV * getOperand(unsigned i) const
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:888
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:368
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:259
Represent the analysis usage information of a pass.
static Value * matchCondition(BranchInst *BI, BasicBlock *LoopEntry)
Check if the given conditional branch is based on the comparison between a variable and zero...
This instruction compares its operands according to the predicate given to the constructor.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return LHS-RHS. Minus is represented in SCEV as A+B*-1.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:880
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE,"Assign register bank of generic virtual registers", false, false) RegBankSelect
Value * getOperand(unsigned i) const
Definition: User.h:145
Value * getPointerOperand()
Definition: Instructions.h:270
self_iterator getIterator()
Definition: ilist_node.h:81
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:960
const APInt & getAPInt() const
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1337
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr)
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:355
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:113
static PointerType * getInt8PtrTy(LLVMContext &C, unsigned AS=0)
Definition: Type.cpp:213
Value * GetUnderlyingObject(Value *V, const DataLayout &DL, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value...
static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, unsigned StoreSize, AliasAnalysis &AA, SmallPtrSetImpl< Instruction * > &IgnoredStores)
mayLoopAccessLocation - Return true if the specified loop might access the specified pointer location...
bool dominates(const Instruction *Def, const Use &U) const
Return true if Def dominates a use in User.
Definition: Dominators.cpp:218
bool isConditional() const
Representation for a specific memory location.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
Iterator for intrusive lists based on ilist_node.
loop Recognize loop false
This is the shared class of boolean and integer constants.
Definition: Constants.h:88
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Value * getDest() const
This is just like getRawDest, but it strips off any cast instructions that feed it, giving the original input.
iterator end()
Definition: BasicBlock.h:230
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:58
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:843
Module.h This file contains the declarations for the Module class.
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:230
Provides information about what library functions are available for the current target.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:175
Value * getLength() const
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:121
static Constant * get(Type *Ty, uint64_t V, bool isSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:558
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:198
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static Constant * getMemSetPatternValue(Value *V, const DataLayout *DL)
getMemSetPatternValue - If a strided store of the specified value is safe to turn into a memset_patte...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:275
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, Instruction *&CntInst, PHINode *&CntPhi, Value *&Var)
Return true iff the idiom is detected in the loop.
static GCRegistry::Add< ShadowStackGC > C("shadow-stack","Very portable GC for uncooperative code generators")
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:586
signed less or equal
Definition: InstrTypes.h:910
loop Recognize loop idioms
Class for arbitrary precision integers.
Definition: APInt.h:77
loop idiom
ModRefInfo getModRefInfo(ImmutableCallSite CS, const MemoryLocation &Loc)
getModRefInfo (for call sites) - Return information about whether a particular call site modifies or ...
Value * getIncomingValueForBlock(const BasicBlock *BB) const
iterator_range< user_iterator > users()
Definition: Value.h:370
BasicBlock * getSinglePredecessor()
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:226
This class uses information about analyze scalars to rewrite expressions in canonical form...
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Get a canonical add expression, or something simpler if possible.
std::vector< BlockT * >::const_iterator block_iterator
Definition: LoopInfo.h:140
Pass * createLoopIdiomPass()
APInt And(const APInt &LHS, const APInt &RHS)
Bitwise AND function for APInt.
Definition: APInt.h:1942
block_iterator block_end() const
Definition: LoopInfo.h:142
Value * getCondition() const
Captures loop safety information.
Definition: LoopUtils.h:42
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:205
void setUnnamedAddr(UnnamedAddr Val)
Definition: GlobalValue.h:203
This class represents an analyzed expression in the program.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:368
unsigned getAlignment() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:227
The access both references and modifies the value stored in memory.
#define I(x, y, z)
Definition: MD5.cpp:54
TerminatorInst * getTerminator()
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:124
LLVM_ATTRIBUTE_ALWAYS_INLINE size_type size() const
Definition: SmallVector.h:135
CallInst * CreateCall(Value *Callee, ArrayRef< Value * > Args=None, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1579
void getLoopAnalysisUsage(AnalysisUsage &AU)
Helper to consistently add the set of standard passes to a loop pass's AnalysisUsage.
Definition: LoopUtils.cpp:938
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:606
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:287
const Loop * getLoop() const
size_t size() const
Definition: BasicBlock.h:238
static const SCEV * getStartForNegStride(const SCEV *Start, const SCEV *BECount, Type *IntPtr, unsigned StoreSize, ScalarEvolution *SE)
const SCEV * getBackedgeTakenCount(const Loop *L)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCouldNotCompute object.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:537
LLVM Value Representation.
Definition: Value.h:71
vector_type::const_iterator iterator
Definition: SetVector.h:49
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:111
A vector that has set insertion semantics.
Definition: SetVector.h:41
uint64_t getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:533
#define DEBUG(X)
Definition: Debug.h:100
static unsigned getStoreSizeInBytes(StoreInst *SI, const DataLayout *DL)
block_iterator block_begin() const
Definition: LoopInfo.h:141
unsigned greater than
Definition: InstrTypes.h:903
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:47
This is the interface for LLVM's primary stateless and local alias analysis.
A container for analyses that lazily runs them and caches their results.
void replaceUsesOutsideBlock(Value *V, BasicBlock *BB)
replaceUsesOutsideBlock - Go through the uses list for this definition and make each use point to "V"...
Definition: Value.cpp:411
const SCEV * getTruncateOrZeroExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
This pass exposes codegen information to IR-level passes.
int * Ptr
bool isBigEndian() const
Definition: DataLayout.h:221
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:479
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Get a canonical multiply expression, or something simpler if possible.
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count...
Value * getPointerOperand()
Definition: Instructions.h:394
const BasicBlock * getParent() const
Definition: Instruction.h:62
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:206
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
This class represents a constant integer value.