//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements an idiom recognizer that transforms simple loops into a
// non-loop form. In cases where this kicks in, it can be a significant
// performance win.
//
// If compiling for code size we avoid idiom recognition if the resulting
// code could be larger than the code for the original loop. One way this could
// happen is if the loop is not removable after idiom recognition due to the
// presence of non-idiom instructions. The initial implementation of the
// heuristics applies to idioms in multi-block loops.
//
//===----------------------------------------------------------------------===//
//
// TODO List:
//
// Future loop memory idioms to recognize:
//   memcmp, strlen, etc.
// Future floating point idioms to recognize in -ffast-math mode:
//   fpowi
// Future integer operation idioms to recognize:
//   ctpop
//
// Beware that isel's default lowering for ctpop is highly inefficient for
// i64 and larger types when i64 is legal and the value has few bits set. It
// would be good to enhance isel to emit a loop for ctpop in this case.
//
// This could recognize common matrix multiplies and dot product idioms and
// replace them with calls to BLAS (if linked in??).
//
//===----------------------------------------------------------------------===//
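//
// For illustration, the simplest loops this pass rewrites look like
//
//   for (i = 0; i < n; ++i) A[i] = 0;     // becomes memset(A, 0, n)
//   for (i = 0; i < n; ++i) A[i] = B[i];  // becomes memcpy(A, B, n)
//
// subject to the legality checks implemented below (constant stride matching
// the store size, a computable trip count, no other readers or writers of
// the region, and so on).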

#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "loop-idiom"

STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
STATISTIC(
    NumShiftUntilBitTest,
    "Number of uncountable loops recognized as 'shift until bit test' idiom");
STATISTIC(NumShiftUntilZero,
          "Number of uncountable loops recognized as 'shift until zero' idiom");

bool DisableLIRP::All;
static cl::opt<bool, true>
    DisableLIRPAll("disable-" DEBUG_TYPE "-all",
                   cl::desc("Options to disable Loop Idiom Recognize Pass."),
                   cl::location(DisableLIRP::All), cl::init(false),
                   cl::ReallyHidden);

bool DisableLIRP::Memset;
static cl::opt<bool, true>
    DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memset."),
                      cl::location(DisableLIRP::Memset), cl::init(false),
                      cl::ReallyHidden);

bool DisableLIRP::Memcpy;
static cl::opt<bool, true>
    DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
                      cl::desc("Proceed with loop idiom recognize pass, but do "
                               "not convert loop(s) to memcpy."),
                      cl::location(DisableLIRP::Memcpy), cl::init(false),
                      cl::ReallyHidden);

static cl::opt<bool> UseLIRCodeSizeHeurs(
    "use-lir-code-size-heurs",
    cl::desc("Use loop idiom recognition code size heuristics when compiling "
             "with -Os/-Oz"),
    cl::init(true), cl::Hidden);
namespace {

class LoopIdiomRecognize {
  Loop *CurLoop = nullptr;
  AliasAnalysis *AA;
  DominatorTree *DT;
  LoopInfo *LI;
  ScalarEvolution *SE;
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  const DataLayout *DL;
  OptimizationRemarkEmitter &ORE;
  bool ApplyCodeSizeHeuristics;
  std::unique_ptr<MemorySSAUpdater> MSSAU;

public:
  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
                              LoopInfo *LI, ScalarEvolution *SE,
                              TargetLibraryInfo *TLI,
                              const TargetTransformInfo *TTI, MemorySSA *MSSA,
                              const DataLayout *DL,
                              OptimizationRemarkEmitter &ORE)
      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
    if (MSSA)
      MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  }

  bool runOnLoop(Loop *L);

private:
  using StoreList = SmallVector<StoreInst *, 8>;
  using StoreListMap = MapVector<Value *, StoreList>;

  StoreListMap StoreRefsForMemset;
  StoreListMap StoreRefsForMemsetPattern;
  StoreList StoreRefsForMemcpy;
  bool HasMemset;
  bool HasMemsetPattern;
  bool HasMemcpy;

  /// Return code for isLegalStore()
  enum LegalStoreKind {
    None = 0,
    Memset,
    MemsetPattern,
    Memcpy,
    UnorderedAtomicMemcpy,
    DontUse // Dummy retval never to be used. Allows catching errors in retval
            // handling.
  };

  /// \name Countable Loop Idiom Handling
  /// @{

  bool runOnCountableLoop();
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock *> &ExitBlocks);

  void collectStores(BasicBlock *BB);
  LegalStoreKind isLegalStore(StoreInst *SI);
  enum class ForMemset { No, Yes };
  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                         ForMemset For);

  template <typename MemInst>
  bool processLoopMemIntrinsic(
      BasicBlock *BB,
      bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
      const SCEV *BECount);
  bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

  bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
                               MaybeAlign StoreAlignment, Value *StoredVal,
                               Instruction *TheStore,
                               SmallPtrSetImpl<Instruction *> &Stores,
                               const SCEVAddRecExpr *Ev, const SCEV *BECount,
                               bool IsNegStride, bool IsLoopMemset = false);
  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
                                  const SCEV *StoreSize, MaybeAlign StoreAlign,
                                  MaybeAlign LoadAlign, Instruction *TheStore,
                                  Instruction *TheLoad,
                                  const SCEVAddRecExpr *StoreEv,
                                  const SCEVAddRecExpr *LoadEv,
                                  const SCEV *BECount);
  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                 bool IsLoopMemset = false);

  /// @}
  /// \name Noncountable Loop Idiom Handling
  /// @{

  bool runOnNoncountableLoop();

  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
  void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                Instruction *CntInst, PHINode *CntPhi,
                                Value *Var, Instruction *DefX,
                                const DebugLoc &DL, bool ZeroCheck,
                                bool IsCntPhiUsedOutsideLoop);

  bool recognizeShiftUntilBitTest();
  bool recognizeShiftUntilZero();

  /// @}
};

class LoopIdiomRecognizeLegacyPass : public LoopPass {
public:
  static char ID;

  explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
    initializeLoopIdiomRecognizeLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }

  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
    if (DisableLIRP::All)
      return false;

    if (skipLoop(L))
      return false;

    AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    TargetLibraryInfo *TLI =
        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
            *L->getHeader()->getParent());
    const TargetTransformInfo *TTI =
        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
            *L->getHeader()->getParent());
    const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
    auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
    MemorySSA *MSSA = nullptr;
    if (MSSAAnalysis)
      MSSA = &MSSAAnalysis->getMSSA();

    // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
    // pass. Function analyses need to be preserved across loop transformations
    // but ORE cannot be preserved (see comment before the pass definition).
    OptimizationRemarkEmitter ORE(L->getHeader()->getParent());

    LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
    return LIR.runOnLoop(L);
  }

  /// This transformation requires natural loop information & requires that
  /// loop preheaders be inserted into the CFG.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addPreserved<MemorySSAWrapperPass>();
    getLoopAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char LoopIdiomRecognizeLegacyPass::ID = 0;

PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
                                              LoopStandardAnalysisResults &AR,
                                              LPMUpdater &) {
  if (DisableLIRP::All)
    return PreservedAnalyses::all();

  const auto *DL = &L.getHeader()->getModule()->getDataLayout();

  // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
  // pass. Function analyses need to be preserved across loop transformations
  // but ORE cannot be preserved (see comment before the pass definition).
  OptimizationRemarkEmitter ORE(L.getHeader()->getParent());

  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
                         AR.MSSA, DL, ORE);
  if (!LIR.runOnLoop(&L))
    return PreservedAnalyses::all();

  auto PA = getLoopPassPreservedAnalyses();
  if (AR.MSSA)
    PA.preserve<MemorySSAAnalysis>();
  return PA;
}

INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
                      "Recognize loop idioms", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
                    "Recognize loop idioms", false, false)

Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }

static void deleteDeadInstruction(Instruction *I) {
  I->replaceAllUsesWith(PoisonValue::get(I->getType()));
  I->eraseFromParent();
}

//===----------------------------------------------------------------------===//
//
//          Implementation of LoopIdiomRecognize
//
//===----------------------------------------------------------------------===//

bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  CurLoop = L;
  // If the loop could not be converted to canonical form, it must have an
  // indirectbr in it, just give up.
  if (!L->getLoopPreheader())
    return false;

  // Disable loop idiom recognition if the function's name is a common idiom.
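  // (Otherwise, when building a C library, the loop inside memset() itself
  // could be "optimized" into a recursive call to memset(), and likewise
  // for memcpy().)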
  StringRef Name = L->getHeader()->getParent()->getName();
  if (Name == "memset" || Name == "memcpy")
    return false;

  // Determine if code size heuristics need to be applied.
  ApplyCodeSizeHeuristics =
      L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;

  HasMemset = TLI->has(LibFunc_memset);
  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  HasMemcpy = TLI->has(LibFunc_memcpy);

  if (HasMemset || HasMemsetPattern || HasMemcpy)
    if (SE->hasLoopInvariantBackedgeTakenCount(L))
      return runOnCountableLoop();

  return runOnNoncountableLoop();
}

bool LoopIdiomRecognize::runOnCountableLoop() {
  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  assert(!isa<SCEVCouldNotCompute>(BECount) &&
         "runOnCountableLoop() called on a loop without a predictable "
         "backedge-taken count");

  // If this loop executes exactly one time, then it should be peeled, not
  // optimized by this pass.
  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
    if (BECst->getAPInt() == 0)
      return false;

  SmallVector<BasicBlock *, 8> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Countable Loop %" << CurLoop->getHeader()->getName()
                    << "\n");

  // The following transforms hoist stores/memsets into the loop pre-header.
  // Give up if the loop has instructions that may throw.
  SimpleLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(CurLoop);
  if (SafetyInfo.anyBlockMayThrow())
    return false;

  bool MadeChange = false;

  // Scan all the blocks in the loop that are not in subloops.
  for (auto *BB : CurLoop->getBlocks()) {
    // Ignore blocks in subloops.
    if (LI->getLoopFor(BB) != CurLoop)
      continue;

    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
  }
  return MadeChange;
}

static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
  return ConstStride->getAPInt();
}
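// For example, for a store whose pointer SCEV is {%A,+,4}, getStoreStride()
// returns 4: one 4-byte element is stored per iteration, 4 bytes apart.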

/// getMemSetPatternValue - If a strided store of the specified value is safe to
/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
/// be passed in. Otherwise, return null.
///
/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
/// just replicate their input array and then pass on to memset_pattern16.
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
  // FIXME: This could check for UndefValue because it can be merged into any
  // other valid pattern.

  // If the value isn't a constant, we can't promote it to being in a constant
  // array. We could theoretically do a store to an alloca or something, but
  // that doesn't seem worthwhile.
  Constant *C = dyn_cast<Constant>(V);
  if (!C || isa<ConstantExpr>(C))
    return nullptr;

  // Only handle simple values that are a power of two bytes in size.
  uint64_t Size = DL->getTypeSizeInBits(V->getType());
  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
    return nullptr;

  // Don't care enough about darwin/ppc to implement this.
  if (DL->isBigEndian())
    return nullptr;

  // Convert to size in bytes.
  Size /= 8;

  // TODO: If CI is larger than 16 bytes, we can try slicing it in half to see
  // if the top and bottom are the same (e.g. for vectors and large integers).
  if (Size > 16)
    return nullptr;

  // If the constant is exactly 16 bytes, just use it.
  if (Size == 16)
    return C;

  // Otherwise, we'll use an array of the constants.
  unsigned ArraySize = 16 / Size;
  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}
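// For example (illustrative values): a loop storing the i32 constant
// 0x01020304 each iteration is not a bytewise splat, so it cannot become a
// plain memset, but this routine returns a 16-byte array holding four copies
// of the constant, allowing a memset_pattern16 call on targets that have it.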

LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  // Don't touch volatile stores.
  if (SI->isVolatile())
    return LegalStoreKind::None;
  // We only want simple or unordered-atomic stores.
  if (!SI->isUnordered())
    return LegalStoreKind::None;

  // Avoid merging nontemporal stores.
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return LegalStoreKind::None;

  Value *StoredVal = SI->getValueOperand();
  Value *StorePtr = SI->getPointerOperand();

  // Don't convert stores of non-integral pointer types to memsets (which
  // store integers).
  if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
    return LegalStoreKind::None;

  // Reject stores that are so large that they overflow an unsigned.
  // When storing out scalable vectors we bail out for now, since the code
  // below currently only works for constant strides.
  TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  if (SizeInBits.isScalable() || (SizeInBits.getFixedValue() & 7) ||
      (SizeInBits.getFixedValue() >> 32) != 0)
    return LegalStoreKind::None;

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *StoreEv =
      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return LegalStoreKind::None;

  // Check to see if we have a constant stride.
  if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
    return LegalStoreKind::None;

  // See if the store can be turned into a memset.

  // If the stored value is a byte-wise value (like i32 -1), then it may be
  // turned into a memset of i8 -1, assuming that all the consecutive bytes
  // are stored. A store of i32 0x01020304 can never be turned into a memset,
  // but it can be turned into memset_pattern if the target supports it.
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);

  // Note: memset and memset_pattern on unordered-atomic stores are not yet
  // supported.
  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();

  // If we're allowed to form a memset, and the stored value would be
  // acceptable for memset, use it.
  if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
      // Verify that the stored value is loop invariant. If not, we can't
      // promote the memset.
      CurLoop->isLoopInvariant(SplatValue)) {
    // It looks like we can use SplatValue.
    return LegalStoreKind::Memset;
  }
  if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
      // Don't create memset_pattern16s with address spaces.
      StorePtr->getType()->getPointerAddressSpace() == 0 &&
      getMemSetPatternValue(StoredVal, DL)) {
    // It looks like we can use PatternValue!
    return LegalStoreKind::MemsetPattern;
  }

  // Otherwise, see if the store can be turned into a memcpy.
  if (HasMemcpy && !DisableLIRP::Memcpy) {
    // Check to see if the stride matches the size of the store. If so, then we
    // know that every byte is touched in the loop.
    APInt Stride = getStoreStride(StoreEv);
    unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
    if (StoreSize != Stride && StoreSize != -Stride)
      return LegalStoreKind::None;

    // The store must be feeding a non-volatile load.
    LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());

    // Only allow non-volatile loads.
    if (!LI || LI->isVolatile())
      return LegalStoreKind::None;
    // Only allow simple or unordered-atomic loads.
    if (!LI->isUnordered())
      return LegalStoreKind::None;

    // See if the pointer expression is an AddRec like {base,+,1} on the current
    // loop, which indicates a strided load. If we have something else, it's a
    // random load we can't handle.
    const SCEVAddRecExpr *LoadEv =
        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
    if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
      return LegalStoreKind::None;

    // The store and load must share the same stride.
    if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
      return LegalStoreKind::None;

    // Success. This store can be converted into a memcpy.
    UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
                           : LegalStoreKind::Memcpy;
  }
  // This store can't be transformed into a memset/memcpy.
  return LegalStoreKind::None;
}
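// Illustrative classification, assuming the required library functions are
// available: "A[i] = 0" -> Memset, "A[i] = 0x01020304" -> MemsetPattern
// (little-endian targets only, see getMemSetPatternValue), and
// "A[i] = B[i]" -> Memcpy.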

void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  StoreRefsForMemset.clear();
  StoreRefsForMemsetPattern.clear();
  StoreRefsForMemcpy.clear();
  for (Instruction &I : *BB) {
    StoreInst *SI = dyn_cast<StoreInst>(&I);
    if (!SI)
      continue;

    // Make sure this is a strided store with a constant stride.
    switch (isLegalStore(SI)) {
    case LegalStoreKind::None:
      // Nothing to do
      break;
    case LegalStoreKind::Memset: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemset[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::MemsetPattern: {
      // Find the base pointer.
      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      StoreRefsForMemsetPattern[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::Memcpy:
    case LegalStoreKind::UnorderedAtomicMemcpy:
      StoreRefsForMemcpy.push_back(SI);
      break;
    default:
      assert(false && "unhandled return value");
      break;
    }
  }
}

/// runOnLoopBlock - Process the specified block, which lives in a counted loop
/// with the specified backedge count. This block is known to be in the current
/// loop and not in any subloops.
bool LoopIdiomRecognize::runOnLoopBlock(
    BasicBlock *BB, const SCEV *BECount,
    SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  // We can only promote stores in this block if they are unconditionally
  // executed in the loop. For a block to be unconditionally executed, it has
  // to dominate all the exit blocks of the loop. Verify this now.
  for (BasicBlock *ExitBlock : ExitBlocks)
    if (!DT->dominates(BB, ExitBlock))
      return false;

  bool MadeChange = false;
  // Look for store instructions, which may be optimized to memset/memcpy.
  collectStores(BB);

  // Look for a single store or sets of stores with a common base, which can be
  // optimized into a memset (memset_pattern). The latter most commonly happens
  // with structs and hand-unrolled loops.
  for (auto &SL : StoreRefsForMemset)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);

  for (auto &SL : StoreRefsForMemsetPattern)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);

  // Optimize the store into a memcpy, if it feeds a similarly strided load.
  for (auto &SI : StoreRefsForMemcpy)
    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);

  MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
      BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
  MadeChange |= processLoopMemIntrinsic<MemSetInst>(
      BB, &LoopIdiomRecognize::processLoopMemSet, BECount);

  return MadeChange;
}

/// See if this store(s) can be promoted to a memset.
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
                                           const SCEV *BECount, ForMemset For) {
  // Try to find consecutive stores that can be transformed into memsets.
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    assert(SL[i]->isSimple() && "Expected only non-volatile stores.");

    Value *FirstStoredVal = SL[i]->getValueOperand();
    Value *FirstStorePtr = SL[i]->getPointerOperand();
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize =
        DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());

    // See if we can optimize just this store in isolation.
    if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }

    Value *FirstSplatValue = nullptr;
    Constant *FirstPatternValue = nullptr;

    if (For == ForMemset::Yes)
      FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
    else
      FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);

    assert((FirstSplatValue || FirstPatternValue) &&
           "Expected either splat value or pattern value.");

    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search Stores
    // array according to the sequence: from i+1 to e, then from i-1 to 0.
    // This is because pairing with the immediately succeeding or preceding
    // candidate usually creates the best chance to find a memset opportunity.
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
      Value *SecondStorePtr = SL[k]->getPointerOperand();
      const SCEVAddRecExpr *SecondStoreEv =
          cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
      APInt SecondStride = getStoreStride(SecondStoreEv);

      if (FirstStride != SecondStride)
        continue;

      Value *SecondStoredVal = SL[k]->getValueOperand();
      Value *SecondSplatValue = nullptr;
      Constant *SecondPatternValue = nullptr;

      if (For == ForMemset::Yes)
        SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
      else
        SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);

      assert((SecondSplatValue || SecondPatternValue) &&
             "Expected either splat value or pattern value.");

      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        if (For == ForMemset::Yes) {
          if (isa<UndefValue>(FirstSplatValue))
            FirstSplatValue = SecondSplatValue;
          if (FirstSplatValue != SecondSplatValue)
            continue;
        } else {
          if (isa<UndefValue>(FirstPatternValue))
            FirstPatternValue = SecondPatternValue;
          if (FirstPatternValue != SecondPatternValue)
            continue;
        }
        Tails.insert(SL[k]);
        Heads.insert(SL[i]);
        ConsecutiveChain[SL[i]] = SL[k];
        break;
      }
    }
  }

  // We may run into multiple chains that merge into a single chain. We mark the
  // stores that we transformed so that we don't visit the same store twice.
  SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;

  // For stores that start but don't end a link in the chain:
  for (StoreInst *I : Heads) {
    if (Tails.count(I))
      continue;

    // We found a store instr that starts a chain. Now follow the chain and try
    // to transform it.
    SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;

    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);

      StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    Value *StoredVal = HeadStore->getValueOperand();
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);

    // Check to see if the stride matches the size of the stores. If so, then
    // we know that every byte is touched in the loop.
    if (StoreSize != Stride && StoreSize != -Stride)
      continue;

    bool IsNegStride = StoreSize == -Stride;

    Type *IntIdxTy = DL->getIndexType(StorePtr->getType());
    const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize);
    if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
                                MaybeAlign(HeadStore->getAlign()), StoredVal,
                                HeadStore, AdjacentStores, StoreEv, BECount,
                                IsNegStride)) {
      TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
      Changed = true;
    }
  }

  return Changed;
}
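// For example (illustrative), in a hand-unrolled loop such as
//
//   for (i = 0; i < n; i += 2) { A[i] = 0; A[i+1] = 0; }
//
// neither store's size matches the two-element stride in isolation, but the
// quadratic search above pairs the consecutive stores into one chain whose
// combined StoreSize equals the stride, so the whole range becomes a single
// memset.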

/// processLoopMemIntrinsic - Template function for calling different processor
/// functions based on mem intrinsic type.
template <typename MemInst>
bool LoopIdiomRecognize::processLoopMemIntrinsic(
    BasicBlock *BB,
    bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
    const SCEV *BECount) {
  bool MadeChange = false;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *Inst = &*I++;
    // Look for memory instructions, which may be optimized to a larger one.
    if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
      WeakTrackingVH InstPtr(&*I);
      if (!(this->*Processor)(MI, BECount))
        continue;
      MadeChange = true;

      // If processing the instruction invalidated our iterator, start over from
      // the top of the block.
      if (!InstPtr)
        I = BB->begin();
    }
  }
  return MadeChange;
}

/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy.
bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memcpys with a constant size.
  if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
    return false;

  // If we're not allowed to hack on memcpy, we fail.
  if ((!HasMemcpy && !isa<MemCpyInlineInst>(MCI)) || DisableLIRP::Memcpy)
    return false;

  Value *Dest = MCI->getDest();
  Value *Source = MCI->getSource();
  if (!Dest || !Source)
    return false;

  // See if the load and store pointer expressions are AddRec like {base,+,1} on
  // the current loop, which indicates a strided load and store. If we have
  // something else, it's a random load or store we can't handle.
  const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return false;
  const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
    return false;

  // Reject memcpys that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Check if the stride matches the size of the memcpy. If so, then we know
  // that every byte is touched in the loop.
  const SCEVConstant *ConstStoreStride =
      dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
  const SCEVConstant *ConstLoadStride =
      dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
  if (!ConstStoreStride || !ConstLoadStride)
    return false;

  APInt StoreStrideValue = ConstStoreStride->getAPInt();
  APInt LoadStrideValue = ConstLoadStride->getAPInt();
  // Huge stride value - give up.
  if (StoreStrideValue.getBitWidth() > 64 || LoadStrideValue.getBitWidth() > 64)
    return false;

  if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
             << ore::NV("Inst", "memcpy") << " in "
             << ore::NV("Function", MCI->getFunction())
             << " function will not be hoisted: "
             << ore::NV("Reason", "memcpy size is not equal to stride");
    });
    return false;
  }

  int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
  int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
  // Check if the load stride matches the store stride.
  if (StoreStrideInt != LoadStrideInt)
    return false;

  return processLoopStoreOfLoopLoad(
      Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
      MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI, StoreEv, LoadEv,
      BECount);
}

/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memsets.
  if (MSI->isVolatile())
    return false;

  // If we're not allowed to hack on memset, we fail.
  if (!HasMemset || DisableLIRP::Memset)
    return false;

  Value *Pointer = MSI->getDest();

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
  if (!Ev || Ev->getLoop() != CurLoop)
    return false;
  if (!Ev->isAffine()) {
    LLVM_DEBUG(dbgs() << "  Pointer is not affine, abort\n");
    return false;
  }

  const SCEV *PointerStrideSCEV = Ev->getOperand(1);
  const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());
  if (!PointerStrideSCEV || !MemsetSizeSCEV)
    return false;

  bool IsNegStride = false;
  const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());

  if (IsConstantSize) {
    // Memset size is constant.
    // Check if the pointer stride matches the memset size. If so, then
    // we know that every byte is touched in the loop.
    LLVM_DEBUG(dbgs() << "  memset size is constant\n");
    uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
    const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
    if (!ConstStride)
      return false;

    APInt Stride = ConstStride->getAPInt();
    if (SizeInBytes != Stride && SizeInBytes != -Stride)
      return false;

    IsNegStride = SizeInBytes == -Stride;
  } else {
    // Memset size is non-constant.
    // Check if the pointer stride matches the memset size.
    // To be conservative, the pass would not promote pointers that aren't in
    // address space zero. Also, the pass only handles memset length and stride
    // that are invariant for the top level loop.
    LLVM_DEBUG(dbgs() << "  memset size is non-constant\n");
    if (Pointer->getType()->getPointerAddressSpace() != 0) {
      LLVM_DEBUG(dbgs() << "  pointer is not in address space zero, "
                        << "abort\n");
      return false;
    }
    if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
      LLVM_DEBUG(dbgs() << "  memset size is not loop-invariant, "
                        << "abort\n");
      return false;
    }

    // Compare positive direction PointerStrideSCEV with MemsetSizeSCEV.
    IsNegStride = PointerStrideSCEV->isNonConstantNegative();
    const SCEV *PositiveStrideSCEV =
        IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
                    : PointerStrideSCEV;
    LLVM_DEBUG(dbgs() << "  MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
                      << "  PositiveStrideSCEV: " << *PositiveStrideSCEV
                      << "\n");

    if (PositiveStrideSCEV != MemsetSizeSCEV) {
      // If an expression is covered by the loop guard, compare again and
      // proceed with optimization if equal.
      const SCEV *FoldedPositiveStride =
          SE->applyLoopGuards(PositiveStrideSCEV, CurLoop);
      const SCEV *FoldedMemsetSize =
          SE->applyLoopGuards(MemsetSizeSCEV, CurLoop);

      LLVM_DEBUG(dbgs() << "  Try to fold SCEV based on loop guard\n"
                        << "    FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
                        << "    FoldedPositiveStride: " << *FoldedPositiveStride
                        << "\n");

      if (FoldedPositiveStride != FoldedMemsetSize) {
        LLVM_DEBUG(dbgs() << "  SCEVs don't match, abort\n");
        return false;
      }
    }
  }

  // Verify that the memset value is loop invariant. If not, we can't promote
  // the memset.
  Value *SplatValue = MSI->getValue();
  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
    return false;

  SmallPtrSet<Instruction *, 1> MSIs;
  MSIs.insert(MSI);
  return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
                                 MSI->getDestAlign(), SplatValue, MSI, MSIs, Ev,
                                 BECount, IsNegStride, /*IsLoopMemset=*/true);
}

/// mayLoopAccessLocation - Return true if the specified loop might access the
/// specified pointer location, which is a loop-strided access. The 'Access'
/// argument specifies what the verboten forms of access are (read or write).
static bool
mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
                      const SCEV *BECount, const SCEV *StoreSizeSCEV,
                      AliasAnalysis &AA,
                      SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
  // Get the location that may be stored across the loop. Since the access is
  // strided positively through memory, we say that the modified location starts
  // at the pointer and has infinite size.
  LocationSize AccessSize = LocationSize::afterPointer();

  // If the loop iterates a fixed number of times, we can refine the access size
  // to be exactly the size of the memset, which is (BECount+1)*StoreSize.
  const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount);
  const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
  if (BECst && ConstSize)
    AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
                                       ConstSize->getValue()->getZExtValue());

  // TODO: For this to be really effective, we have to dive into the pointer
  // operand in the store. A store to &A[i] over 100 iterations will always
  // return may-alias with a store to &A[100]; we need StoreLoc to be "A" with
  // size 100, which will then no-alias a store to &A[100].
  MemoryLocation StoreLoc(Ptr, AccessSize);

  for (BasicBlock *B : L->blocks())
    for (Instruction &I : *B)
      if (!IgnoredInsts.contains(&I) &&
          isModOrRefSet(AA.getModRefInfo(&I, StoreLoc) & Access))
        return true;
  return false;
}

// If we have a negative stride, Start refers to the end of the memory location
// we're trying to memset. Therefore, we need to recompute the base pointer,
// which is just Start - BECount*Size.
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
                                        Type *IntPtr, const SCEV *StoreSizeSCEV,
                                        ScalarEvolution *SE) {
  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  if (!StoreSizeSCEV->isOne()) {
    // index = back edge count * store size
    Index = SE->getMulExpr(Index,
                           SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                           SCEV::FlagNUW);
  }
  // base pointer = start - index * store size
  return SE->getMinusSCEV(Start, Index);
}
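// For example (illustrative), for a loop storing i32s downward from &A[n-1],
// Start is &A[n-1] and the stride is -4; with BECount == n-1 this computes
// &A[n-1] - (n-1)*4 == &A[0], the lowest address the loop writes.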

/// Compute trip count from the backedge taken count.
static const SCEV *getTripCount(const SCEV *BECount, Type *IntPtr,
                                Loop *CurLoop, const DataLayout *DL,
                                ScalarEvolution *SE) {
  const SCEV *TripCountS = nullptr;
  // The # stored bytes is (BECount+1). Expand the trip count out to
  // pointer size if it isn't already.
  //
  // If we're going to need to zero extend the BE count, check if we can add
  // one to it prior to zero extending without overflow. Provided this is safe,
  // it allows better simplification of the +1.
  if (DL->getTypeSizeInBits(BECount->getType()) <
          DL->getTypeSizeInBits(IntPtr) &&
      SE->isLoopEntryGuardedByCond(
          CurLoop, ICmpInst::ICMP_NE, BECount,
          SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
    TripCountS = SE->getZeroExtendExpr(
        SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
        IntPtr);
  } else {
    TripCountS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
                                SE->getOne(IntPtr), SCEV::FlagNUW);
  }

  return TripCountS;
}
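// For example (illustrative), when an i32 backedge count %n is widened to a
// 64-bit index type, the guarded path produces zext(%n + 1), which folds more
// cleanly than the unguarded zext(%n) + 1.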

/// Compute the number of bytes as a SCEV from the backedge taken count.
///
/// This also maps the SCEV into the provided type and tries to handle the
/// computation in a way that will fold cleanly.
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
                               const SCEV *StoreSizeSCEV, Loop *CurLoop,
                               const DataLayout *DL, ScalarEvolution *SE) {
  const SCEV *TripCountSCEV = getTripCount(BECount, IntPtr, CurLoop, DL, SE);

  return SE->getMulExpr(TripCountSCEV,
                        SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
                        SCEV::FlagNUW);
}
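// E.g. a loop running 8 iterations (BECount == 7) and storing 4 bytes per
// iteration yields (7 + 1) * 4 == 32 bytes.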

/// processLoopStridedStore - We see a strided store of some value. If we can
/// transform this into a memset or memset_pattern in the loop preheader, do so.
bool LoopIdiomRecognize::processLoopStridedStore(
    Value *DestPtr, const SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
    Value *StoredVal, Instruction *TheStore,
    SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
    const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
  Module *M = TheStore->getModule();
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  Constant *PatternValue = nullptr;

  if (!SplatValue)
    PatternValue = getMemSetPatternValue(StoredVal, DL);

  assert((SplatValue || PatternValue) &&
         "Expected either splat value or pattern value.");

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");
  SCEVExpanderCleaner ExpCleaner(Expander);

  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
  Type *IntIdxTy = DL->getIndexType(DestPtr->getType());

  bool Changed = false;
  const SCEV *Start = Ev->getStart();
  // Handle negative strided loops.
  if (IsNegStride)
    Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!Expander.isSafeToExpand(Start))
    return Changed;

  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
  // this into a memset in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write to the aliased location. Check for any overlap by generating the
  // base pointer and checking the region.
  Value *BasePtr =
      Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());

  // From here on out, conservatively report to the pass manager that we've
  // changed the IR, even if we later clean up these added instructions. There
  // may be structural differences e.g. in the order of use lists not accounted
  // for in just a textual dump of the IR. This is written as a variable, even
  // though statically all the places this dominates could be replaced with
  // 'true', with the hope that anyone trying to be clever / "more precise" with
  // the return value will read this comment, and leave them alone.
  Changed = true;

  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSizeSCEV, *AA, Stores))
    return Changed;

  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
    return Changed;

  // Okay, everything looks good, insert the memset.

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!Expander.isSafeToExpand(NumBytesS))
    return Changed;

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

  CallInst *NewCall;
  if (SplatValue) {
    AAMDNodes AATags = TheStore->getAAMetadata();
    for (Instruction *Store : Stores)
      AATags = AATags.merge(Store->getAAMetadata());
    if (auto CI = dyn_cast<ConstantInt>(NumBytes))
      AATags = AATags.extendTo(CI->getZExtValue());
    else
      AATags = AATags.extendTo(-1);

    NewCall = Builder.CreateMemSet(
        BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
        /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
  } else if (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)) {
    // Everything is emitted in default address space.
    Type *Int8PtrTy = DestInt8PtrTy;

    StringRef FuncName = "memset_pattern16";
    FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
                                            Builder.getVoidTy(), Int8PtrTy,
                                            Int8PtrTy, IntIdxTy);
    inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);

    // Otherwise we should form a memset_pattern16. PatternValue is known to be
    // a constant array of 16 bytes. Plop the value into a mergeable global.
    GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
                                            GlobalValue::PrivateLinkage,
                                            PatternValue, ".memset_pattern");
    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
    GV->setAlignment(Align(16));
    Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
    NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
  } else
    return Changed;

  NewCall->setDebugLoc(TheStore->getDebugLoc());

  if (MSSAU) {
    MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
        NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
    MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
  }

  LLVM_DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
                    << "    from store to: " << *Ev << " at: " << *TheStore
                    << "\n");

  ORE.emit([&]() {
    OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
                         NewCall->getDebugLoc(), Preheader);
    R << "Transformed loop-strided store in "
      << ore::NV("Function", TheStore->getFunction())
      << " function into a call to "
      << ore::NV("NewFunction", NewCall->getCalledFunction())
      << "() intrinsic";
    if (!Stores.empty())
      R << ore::setExtraArgs();
    for (auto *I : Stores) {
      R << ore::NV("FromBlock", I->getParent()->getName())
        << ore::NV("ToBlock", Preheader->getName());
    }
    return R;
  });

  // Okay, the memset has been formed. Zap the original store and anything that
  // feeds into it.
  for (auto *I : Stores) {
    if (MSSAU)
      MSSAU->removeMemoryAccess(I, true);
    deleteDeadInstruction(I);
  }
  if (MSSAU && VerifyMemorySSA)
    MSSAU->getMemorySSA()->verifyMemorySSA();
  ++NumMemSet;
  ExpCleaner.markResultUsed();
  return true;
}

/// If the stored value is a strided load in the same loop with the same stride
/// this may be transformable into a memcpy. This kicks in for stuff like
///   for (i) A[i] = B[i];
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
                                                    const SCEV *BECount) {
  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");

  Value *StorePtr = SI->getPointerOperand();
  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());

  // The store must be feeding a non-volatile load.
  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided load. If we have something else, it's a
  // random load we can't handle.
  Value *LoadPtr = LI->getPointerOperand();
  const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));

  const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
  return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
                                    SI->getAlign(), LI->getAlign(), SI, LI,
                                    StoreEv, LoadEv, BECount);
}
namespace {
class MemmoveVerifier {
public:
  explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
                           const DataLayout &DL)
      : DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset(
                    LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
        BP2(llvm::GetPointerBaseWithConstantOffset(
            StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
        IsSameObject(BP1 == BP2) {}

  bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
                                  const Instruction &TheLoad,
                                  bool IsMemCpy) const {
    if (IsMemCpy) {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride.
      if ((!IsNegStride && LoadOff <= StoreOff) ||
          (IsNegStride && LoadOff >= StoreOff))
        return false;
    } else {
      // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
      // for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
      int64_t LoadSize =
          DL.getTypeSizeInBits(TheLoad.getType()).getFixedValue() / 8;
      if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
        return false;
      if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
          (IsNegStride && LoadOff + LoadSize > StoreOff))
        return false;
    }
    return true;
  }
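  // Illustrative example: with a positive stride and 4-byte accesses,
  // "A[i] = A[i+1]" has LoadOff == StoreOff + 4 and may form a memmove,
  // whereas in "A[i+1] = A[i]" each load reads a byte range that an earlier
  // iteration already overwrote, so a single memmove would not be equivalent
  // and the transform is rejected.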

private:
  const DataLayout &DL;
  int64_t LoadOff = 0;
  int64_t StoreOff = 0;
  const Value *BP1;
  const Value *BP2;

public:
  const bool IsSameObject;
};
} // namespace
1320
1321bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
1322 Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
1323 MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
1324 Instruction *TheLoad, const SCEVAddRecExpr *StoreEv,
1325 const SCEVAddRecExpr *LoadEv, const SCEV *BECount) {
1326
1327 // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
1328 // conservatively bail here, since otherwise we may have to transform
1329 // llvm.memcpy.inline into llvm.memcpy which is illegal.
1330 if (isa<MemCpyInlineInst>(TheStore))
1331 return false;
1332
1333 // The trip count of the loop and the base pointer of the addrec SCEV is
1334 // guaranteed to be loop invariant, which means that it should dominate the
1335 // header. This allows us to insert code for it in the preheader.
1336 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1337 IRBuilder<> Builder(Preheader->getTerminator());
1338 SCEVExpander Expander(*SE, *DL, "loop-idiom");
1339
1340 SCEVExpanderCleaner ExpCleaner(Expander);
1341
1342 bool Changed = false;
1343 const SCEV *StrStart = StoreEv->getStart();
1344 unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
1345 Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
1346
1347 APInt Stride = getStoreStride(StoreEv);
1348 const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
1349
1350 // TODO: Deal with non-constant size; Currently expect constant store size
1351 assert(ConstStoreSize && "store size is expected to be a constant");
1352
1353 int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
1354 bool IsNegStride = StoreSize == -Stride;
1355
1356 // Handle negative strided loops.
1357 if (IsNegStride)
1358 StrStart =
1359 getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
1360
1361 // Okay, we have a strided store "p[i]" of a loaded value. We can turn
1362 // this into a memcpy in the loop preheader now if we want. However, this
1363 // would be unsafe to do if there is anything else in the loop that may read
1364 // or write the memory region we're storing to. This includes the load that
1365 // feeds the stores. Check for an alias by generating the base address and
1366 // checking everything.
1367 Value *StoreBasePtr = Expander.expandCodeFor(
1368 StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
1369
1370 // From here on out, conservatively report to the pass manager that we've
1371 // changed the IR, even if we later clean up these added instructions. There
1372 // may be structural differences e.g. in the order of use lists not accounted
1373 // for in just a textual dump of the IR. This is written as a variable, even
1374 // though statically all the places this dominates could be replaced with
1375 // 'true', with the hope that anyone trying to be clever / "more precise" with
1376 // the return value will read this comment, and leave them alone.
1377 Changed = true;
1378
1379 SmallPtrSet<Instruction *, 2> IgnoredInsts;
1380 IgnoredInsts.insert(TheStore);
1381
1382 bool IsMemCpy = isa<MemCpyInst>(TheStore);
1383 const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";
1384
1385 bool LoopAccessStore =
1386 mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1387 StoreSizeSCEV, *AA, IgnoredInsts);
1388 if (LoopAccessStore) {
1389 // For memmove case it's not enough to guarantee that loop doesn't access
1390 // TheStore and TheLoad. Additionally we need to make sure that TheStore is
1391 // the only user of TheLoad.
1392 if (!TheLoad->hasOneUse())
1393 return Changed;
1394 IgnoredInsts.insert(TheLoad);
1395 if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
1396 BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
1397 ORE.emit([&]() {
1398 return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
1399 TheStore)
1400 << ore::NV("Inst", InstRemark) << " in "
1401 << ore::NV("Function", TheStore->getFunction())
1402 << " function will not be hoisted: "
1403 << ore::NV("Reason", "The loop may access store location");
1404 });
1405 return Changed;
1406 }
1407 IgnoredInsts.erase(TheLoad);
1408 }
1409
1410 const SCEV *LdStart = LoadEv->getStart();
1411 unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
1412
1413 // Handle negative strided loops.
1414 if (IsNegStride)
1415 LdStart =
1416 getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
1417
1418 // For a memcpy, we have to make sure that the input array is not being
1419 // mutated by the loop.
1420 Value *LoadBasePtr = Expander.expandCodeFor(
1421 LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
1422
1423 // If the store is a memcpy instruction, we must check if it will write to
1424 // the load memory locations. So remove it from the ignored stores.
1425 MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
1426 if (IsMemCpy && !Verifier.IsSameObject)
1427 IgnoredInsts.erase(TheStore);
1428 if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
1429 StoreSizeSCEV, *AA, IgnoredInsts)) {
1430 ORE.emit([&]() {
1431 return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
1432 << ore::NV("Inst", InstRemark) << " in "
1433 << ore::NV("Function", TheStore->getFunction())
1434 << " function will not be hoisted: "
1435 << ore::NV("Reason", "The loop may access load location");
1436 });
1437 return Changed;
1438 }
1439
1440 bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
1441 if (UseMemMove)
1442 if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, *TheLoad,
1443 IsMemCpy))
1444 return Changed;
1445
1446 if (avoidLIRForMultiBlockLoop())
1447 return Changed;
1448
1449 // Okay, everything is safe, we can transform this!
1450
1451 const SCEV *NumBytesS =
1452 getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
1453
1454 Value *NumBytes =
1455 Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1456
1457 AAMDNodes AATags = TheLoad->getAAMetadata();
1458 AAMDNodes StoreAATags = TheStore->getAAMetadata();
1459 AATags = AATags.merge(StoreAATags);
1460 if (auto CI = dyn_cast<ConstantInt>(NumBytes))
1461 AATags = AATags.extendTo(CI->getZExtValue());
1462 else
1463 AATags = AATags.extendTo(-1);
1464
1465 CallInst *NewCall = nullptr;
1466 // Check whether to generate an unordered atomic memcpy:
1467 // If the load or store are atomic, then they must necessarily be unordered
1468 // by previous checks.
1469 if (!TheStore->isAtomic() && !TheLoad->isAtomic()) {
1470 if (UseMemMove)
1471 NewCall = Builder.CreateMemMove(
1472 StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
1473 /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
1474 else
1475 NewCall =
1476 Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign,
1477 NumBytes, /*isVolatile=*/false, AATags.TBAA,
1478 AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
1479 } else {
1480 // For now don't support unordered atomic memmove.
1481 if (UseMemMove)
1482 return Changed;
1483 // We cannot allow unaligned ops for unordered load/store, so reject
1484 // anything where the alignment isn't at least the element size.
1485 assert((StoreAlign && LoadAlign) &&
1486 "Expect unordered load/store to have align.");
1487 if (*StoreAlign < StoreSize || *LoadAlign < StoreSize)
1488 return Changed;
1489
1490 // If the element.atomic memcpy is not lowered into explicit
1491 // loads/stores later, then it will be lowered into an element-size
1492 // specific lib call. If the lib call doesn't exist for our store size, then
1493 // we shouldn't generate the memcpy.
1494 if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1495 return Changed;
1496
1497 // Create the call.
1498 // Note that unordered atomic loads/stores are *required* by the spec to
1499 // have an alignment but non-atomic loads/stores may not.
1500 NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1501 StoreBasePtr, *StoreAlign, LoadBasePtr, *LoadAlign, NumBytes, StoreSize,
1502 AATags.TBAA, AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
1503 }
1504 NewCall->setDebugLoc(TheStore->getDebugLoc());
1505
1506 if (MSSAU) {
1507 MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1508 NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1509 MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1510 }
1511
1512 LLVM_DEBUG(dbgs() << " Formed new call: " << *NewCall << "\n"
1513 << " from load ptr=" << *LoadEv << " at: " << *TheLoad
1514 << "\n"
1515 << " from store ptr=" << *StoreEv << " at: " << *TheStore
1516 << "\n");
1517
1518 ORE.emit([&]() {
1519 return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
1520 NewCall->getDebugLoc(), Preheader)
1521 << "Formed a call to "
1522 << ore::NV("NewFunction", NewCall->getCalledFunction())
1523 << "() intrinsic from " << ore::NV("Inst", InstRemark)
1524 << " instruction in " << ore::NV("Function", TheStore->getFunction())
1525 << " function"
1527 << ore::NV("FromBlock", TheStore->getParent()->getName())
1528 << ore::NV("ToBlock", Preheader->getName());
1529 });
1530
1531 // Okay, a new call to memcpy/memmove has been formed. Zap the original store
1532 // and anything that feeds into it.
1533 if (MSSAU)
1534 MSSAU->removeMemoryAccess(TheStore, true);
1535 deleteDeadInstruction(TheStore);
1536 if (MSSAU && VerifyMemorySSA)
1537 MSSAU->getMemorySSA()->verifyMemorySSA();
1538 if (UseMemMove)
1539 ++NumMemMove;
1540 else
1541 ++NumMemCpy;
1542 ExpCleaner.markResultUsed();
1543 return true;
1544}
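// Illustrative sketch (not part of the original source): the kind of C-level
// loop that the code above rewrites, assuming a unit-stride copy whose
// buffers are known not to overlap:
//
//   void copy(int *dst, const int *src, unsigned n) {
//     for (unsigned i = 0; i != n; ++i)
//       dst[i] = src[i]; // collapses to memcpy(dst, src, n * sizeof(int))
//   }
//
// When the buffers may overlap but the copy direction is provably safe, the
// same loop is emitted as a memmove instead (see UseMemMove above).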
1545
1546 // When compiling for code size we avoid idiom recognition for a multi-block loop
1547// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1548//
1549bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1550 bool IsLoopMemset) {
1551 if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1552 if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
1553 LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
1554 << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
1555 << " avoided: multi-block top-level loop\n");
1556 return true;
1557 }
1558 }
1559
1560 return false;
1561}
1562
1563bool LoopIdiomRecognize::runOnNoncountableLoop() {
1564 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
1565 << CurLoop->getHeader()->getParent()->getName()
1566 << "] Noncountable Loop %"
1567 << CurLoop->getHeader()->getName() << "\n");
1568
1569 return recognizePopcount() || recognizeAndInsertFFS() ||
1570 recognizeShiftUntilBitTest() || recognizeShiftUntilZero();
1571}
1572
1573 /// Check if the given conditional branch is based on a comparison between
1574 /// a variable and zero: if the variable is non-zero (or zero, when JmpOnZero
1575 /// is true), control branches to the loop entry. If the branch matches this
1576 /// behavior, the variable involved in the comparison is returned. This
1577 /// function is called to check whether the precondition and postcondition of
1578 /// the loop are in desirable form.
1579 static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
1580 bool JmpOnZero = false) {
1581 if (!BI || !BI->isConditional())
1582 return nullptr;
1583
1584 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1585 if (!Cond)
1586 return nullptr;
1587
1588 ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1589 if (!CmpZero || !CmpZero->isZero())
1590 return nullptr;
1591
1592 BasicBlock *TrueSucc = BI->getSuccessor(0);
1593 BasicBlock *FalseSucc = BI->getSuccessor(1);
1594 if (JmpOnZero)
1595 std::swap(TrueSucc, FalseSucc);
1596
1597 ICmpInst::Predicate Pred = Cond->getPredicate();
1598 if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1599 (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1600 return Cond->getOperand(0);
1601
1602 return nullptr;
1603}
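// A minimal example (illustrative) of a branch matchCondition accepts with
// JmpOnZero == false, where %loop is the LoopEntry block; the returned value
// is %x:
//
//   %cmp = icmp ne i32 %x, 0
//   br i1 %cmp, label %loop, label %exit
//
// With JmpOnZero == true the successors are interpreted the other way around,
// so "br i1 (icmp eq i32 %x, 0), label %loop, label %exit" matches instead.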
1604
1605// Check if the recurrence variable `VarX` is in the right form to create
1606// the idiom. Returns the value coerced to a PHINode if so.
1607 static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
1608 BasicBlock *LoopEntry) {
1609 auto *PhiX = dyn_cast<PHINode>(VarX);
1610 if (PhiX && PhiX->getParent() == LoopEntry &&
1611 (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1612 return PhiX;
1613 return nullptr;
1614}
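// For example (illustrative), given the header PHI
//   %x1 = phi i32 [ %x0, %preheader ], [ %x2, %loop ]
// getRecurrenceVar(%x1, %x2, %loop) returns the PHI, because one of its
// incoming values is the recurrence update %x2.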
1615
1616/// Return true iff the idiom is detected in the loop.
1617///
1618/// Additionally:
1619/// 1) \p CntInst is set to the instruction counting the population bit.
1620/// 2) \p CntPhi is set to the corresponding phi node.
1621/// 3) \p Var is set to the value whose population bits are being counted.
1622///
1623/// The core idiom we are trying to detect is:
1624/// \code
1625/// if (x0 != 0)
1626/// goto loop-exit // the precondition of the loop
1627/// cnt0 = init-val;
1628/// do {
1629/// x1 = phi (x0, x2);
1630/// cnt1 = phi(cnt0, cnt2);
1631///
1632/// cnt2 = cnt1 + 1;
1633/// ...
1634/// x2 = x1 & (x1 - 1);
1635/// ...
1636 /// } while (x2 != 0);
1637///
1638/// loop-exit:
1639/// \endcode
1640static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1641 Instruction *&CntInst, PHINode *&CntPhi,
1642 Value *&Var) {
1643 // Check to see if the loop-back branch matches the pattern
1644 // "if (a != 0) goto loop-entry".
1645 BasicBlock *LoopEntry;
1646 Instruction *DefX2, *CountInst;
1647 Value *VarX1, *VarX0;
1648 PHINode *PhiX, *CountPhi;
1649
1650 DefX2 = CountInst = nullptr;
1651 VarX1 = VarX0 = nullptr;
1652 PhiX = CountPhi = nullptr;
1653 LoopEntry = *(CurLoop->block_begin());
1654
1655 // step 1: Check if the loop-back branch is in desirable form.
1656 {
1657 if (Value *T = matchCondition(
1658 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1659 DefX2 = dyn_cast<Instruction>(T);
1660 else
1661 return false;
1662 }
1663
1664 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1665 {
1666 if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1667 return false;
1668
1669 BinaryOperator *SubOneOp;
1670
1671 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1672 VarX1 = DefX2->getOperand(1);
1673 else {
1674 VarX1 = DefX2->getOperand(0);
1675 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1676 }
1677 if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1678 return false;
1679
1680 ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1681 if (!Dec ||
1682 !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1683 (SubOneOp->getOpcode() == Instruction::Add &&
1684 Dec->isMinusOne()))) {
1685 return false;
1686 }
1687 }
1688
1689 // step 3: Check the recurrence of variable X
1690 PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1691 if (!PhiX)
1692 return false;
1693
1694 // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
1695 {
1696 CountInst = nullptr;
1697 for (Instruction &Inst : llvm::make_range(
1698 LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
1699 if (Inst.getOpcode() != Instruction::Add)
1700 continue;
1701
1702 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1703 if (!Inc || !Inc->isOne())
1704 continue;
1705
1706 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1707 if (!Phi)
1708 continue;
1709
1710 // Check if the result of the instruction is live out of the loop.
1711 bool LiveOutLoop = false;
1712 for (User *U : Inst.users()) {
1713 if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1714 LiveOutLoop = true;
1715 break;
1716 }
1717 }
1718
1719 if (LiveOutLoop) {
1720 CountInst = &Inst;
1721 CountPhi = Phi;
1722 break;
1723 }
1724 }
1725
1726 if (!CountInst)
1727 return false;
1728 }
1729
1730 // step 5: check if the precondition is in this form:
1731 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1732 {
1733 auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1734 Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1735 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1736 return false;
1737
1738 CntInst = CountInst;
1739 CntPhi = CountPhi;
1740 Var = T;
1741 }
1742
1743 return true;
1744}
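// A C-level example (illustrative) of the idiom detected above, i.e. the
// classic "clear the lowest set bit" counting loop:
//
//   int popcount(unsigned x) {
//     int cnt = 0;
//     if (x)
//       do { cnt++; x &= x - 1; } while (x);
//     return cnt;
//   }
//
// Each iteration clears the lowest set bit, so the final count equals the
// population count and the loop can be replaced by @llvm.ctpop.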
1745
1746/// Return true if the idiom is detected in the loop.
1747///
1748/// Additionally:
1749/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1750/// or nullptr if there is no such.
1751/// 2) \p CntPhi is set to the corresponding phi node
1752/// or nullptr if there is no such.
1753/// 3) \p Var is set to the value whose CTLZ could be used.
1754/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1755///
1756/// The core idiom we are trying to detect is:
1757/// \code
1758/// if (x0 == 0)
1759/// goto loop-exit // the precondition of the loop
1760/// cnt0 = init-val;
1761/// do {
1762/// x = phi (x0, x.next); //PhiX
1763/// cnt = phi(cnt0, cnt.next);
1764///
1765/// cnt.next = cnt + 1;
1766/// ...
1767/// x.next = x >> 1; // DefX
1768/// ...
1769/// } while(x.next != 0);
1770///
1771/// loop-exit:
1772/// \endcode
1773static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1774 Intrinsic::ID &IntrinID, Value *&InitX,
1775 Instruction *&CntInst, PHINode *&CntPhi,
1776 Instruction *&DefX) {
1777 BasicBlock *LoopEntry;
1778 Value *VarX = nullptr;
1779
1780 DefX = nullptr;
1781 CntInst = nullptr;
1782 CntPhi = nullptr;
1783 LoopEntry = *(CurLoop->block_begin());
1784
1785 // step 1: Check if the loop-back branch is in desirable form.
1786 if (Value *T = matchCondition(
1787 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1788 DefX = dyn_cast<Instruction>(T);
1789 else
1790 return false;
1791
1792 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1793 if (!DefX || !DefX->isShift())
1794 return false;
1795 IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1796 Intrinsic::ctlz;
1797 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1798 if (!Shft || !Shft->isOne())
1799 return false;
1800 VarX = DefX->getOperand(0);
1801
1802 // step 3: Check the recurrence of variable X
1803 PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1804 if (!PhiX)
1805 return false;
1806
1807 InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1808
1809 // Make sure the initial value can't be negative otherwise the ashr in the
1810 // loop might never reach zero which would make the loop infinite.
1811 if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1812 return false;
1813
1814 // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
1815 // or cnt.next = cnt + -1.
1816 // TODO: We can skip the step. If loop trip count is known (CTLZ),
1817 // then all uses of "cnt.next" could be optimized to the trip count
1818 // plus "cnt0". Currently it is not optimized.
1819 // This step could be used to detect POPCNT instruction:
1820 // cnt.next = cnt + (x.next & 1)
1821 for (Instruction &Inst : llvm::make_range(
1822 LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
1823 if (Inst.getOpcode() != Instruction::Add)
1824 continue;
1825
1826 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1827 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
1828 continue;
1829
1830 PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1831 if (!Phi)
1832 continue;
1833
1834 CntInst = &Inst;
1835 CntPhi = Phi;
1836 break;
1837 }
1838 if (!CntInst)
1839 return false;
1840
1841 return true;
1842}
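// A C-level example (illustrative) of the shift-until-zero idiom, assuming
// the caller has already guarded against x == 0:
//
//   int highbit(unsigned x) {
//     int i = 0;
//     do { i++; x >>= 1; } while (x);
//     return i; // == bitwidth(x0) - ctlz(x0)
//   }
//
// A left-shifting variant ("x <<= 1") maps to cttz instead of ctlz.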
1843
1844/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1845 /// to countable (with CTLZ / CTTZ trip count). Returns true if CTLZ / CTTZ
1846 /// was inserted as the new trip count; otherwise, returns false.
1847bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1848 // Give up if the loop has multiple blocks or multiple backedges.
1849 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1850 return false;
1851
1852 Intrinsic::ID IntrinID;
1853 Value *InitX;
1854 Instruction *DefX = nullptr;
1855 PHINode *CntPhi = nullptr;
1856 Instruction *CntInst = nullptr;
1857 // Help decide if transformation is profitable. For ShiftUntilZero idiom,
1858 // this is always 6.
1859 size_t IdiomCanonicalSize = 6;
1860
1861 if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1862 CntInst, CntPhi, DefX))
1863 return false;
1864
1865 bool IsCntPhiUsedOutsideLoop = false;
1866 for (User *U : CntPhi->users())
1867 if (!CurLoop->contains(cast<Instruction>(U))) {
1868 IsCntPhiUsedOutsideLoop = true;
1869 break;
1870 }
1871 bool IsCntInstUsedOutsideLoop = false;
1872 for (User *U : CntInst->users())
1873 if (!CurLoop->contains(cast<Instruction>(U))) {
1874 IsCntInstUsedOutsideLoop = true;
1875 break;
1876 }
1877 // If both CntInst and CntPhi are used outside the loop the profitability
1878 // is questionable.
1879 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1880 return false;
1881
1882 // For some CPUs the result of the CTLZ(X) intrinsic is undefined when X is
1883 // 0. If we cannot guarantee X != 0, we need to check for this when
1884 // expanding the intrinsic.
1885 bool ZeroCheck = false;
1886 // It is safe to assume the Preheader exists, as it was checked in the
1887 // parent function runOnLoop.
1888 BasicBlock *PH = CurLoop->getLoopPreheader();
1889
1890 // If we are using the count instruction outside the loop, make sure we
1891 // have a zero check as a precondition. Without the check the loop would run
1892 // one iteration before any check of the input value, so the inputs 0 and 1
1893 // would have identical behavior and yield the same final count.
1894 if (!IsCntPhiUsedOutsideLoop) {
1895 auto *PreCondBB = PH->getSinglePredecessor();
1896 if (!PreCondBB)
1897 return false;
1898 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1899 if (!PreCondBI)
1900 return false;
1901 if (matchCondition(PreCondBI, PH) != InitX)
1902 return false;
1903 ZeroCheck = true;
1904 }
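// Example (illustrative): for an input of 0 the original loop still runs one
// iteration (0 >> 1 == 0), so inputs 0 and 1 both produce a count of 1. The
// precondition matched above guarantees the value is non-zero, which is what
// lets us emit the intrinsic with its second ("undef on zero") argument set
// to true (ZeroCheck).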
1905
1906 // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1907 // profitable if we delete the loop.
1908
1909 // the loop has only 6 instructions:
1910 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1911 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1912 // %shr = ashr %n.addr.0, 1
1913 // %tobool = icmp eq %shr, 0
1914 // %inc = add nsw %i.0, 1
1915 // br i1 %tobool
1916
1917 const Value *Args[] = {InitX,
1918 ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
1919
1920 // @llvm.dbg intrinsics don't count, as they have no semantic effect.
1921 auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1922 uint32_t HeaderSize =
1923 std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1924
1925 IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1926 InstructionCost Cost = TTI->getIntrinsicInstrCost(
1927 Attrs, TargetTransformInfo::TCK_SizeAndLatency);
1928 if (HeaderSize != IdiomCanonicalSize &&
1929 Cost > TargetTransformInfo::TCC_Basic)
1930 return false;
1931
1932 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1933 DefX->getDebugLoc(), ZeroCheck,
1934 IsCntPhiUsedOutsideLoop);
1935 return true;
1936}
1937
1938/// Recognizes a population count idiom in a non-countable loop.
1939///
1940/// If detected, transforms the relevant code to issue the popcount intrinsic
1941/// function call, and returns true; otherwise, returns false.
1942bool LoopIdiomRecognize::recognizePopcount() {
1943 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1944 return false;
1945
1946 // Counting population is usually done with just a few arithmetic
1947 // instructions. Such instructions can be easily "absorbed" by vacant slots
1948 // in a non-compact loop. Therefore, recognizing the popcount idiom only
1949 // makes sense in a compact loop.
1950
1951 // Give up if the loop has multiple blocks or multiple backedges.
1952 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1953 return false;
1954
1955 BasicBlock *LoopBody = *(CurLoop->block_begin());
1956 if (LoopBody->size() >= 20) {
1957 // The loop is too big, bail out.
1958 return false;
1959 }
1960
1961 // It should have a preheader containing nothing but an unconditional branch.
1962 BasicBlock *PH = CurLoop->getLoopPreheader();
1963 if (!PH || &PH->front() != PH->getTerminator())
1964 return false;
1965 auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1966 if (!EntryBI || EntryBI->isConditional())
1967 return false;
1968
1969 // It should have a precondition block where the generated popcount intrinsic
1970 // function can be inserted.
1971 auto *PreCondBB = PH->getSinglePredecessor();
1972 if (!PreCondBB)
1973 return false;
1974 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1975 if (!PreCondBI || PreCondBI->isUnconditional())
1976 return false;
1977
1978 Instruction *CntInst;
1979 PHINode *CntPhi;
1980 Value *Val;
1981 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1982 return false;
1983
1984 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1985 return true;
1986}
1987
1988 static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1989 const DebugLoc &DL) {
1990 Value *Ops[] = {Val};
1991 Type *Tys[] = {Val->getType()};
1992
1993 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1994 Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1995 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1996 CI->setDebugLoc(DL);
1997
1998 return CI;
1999}
2000
2001 static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
2002 const DebugLoc &DL, bool ZeroCheck,
2003 Intrinsic::ID IID) {
2004 Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
2005 Type *Tys[] = {Val->getType()};
2006
2007 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
2008 Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
2009 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
2010 CI->setDebugLoc(DL);
2011
2012 return CI;
2013}
2014
2015/// Transform the following loop (Using CTLZ, CTTZ is similar):
2016/// loop:
2017/// CntPhi = PHI [Cnt0, CntInst]
2018/// PhiX = PHI [InitX, DefX]
2019/// CntInst = CntPhi + 1
2020/// DefX = PhiX >> 1
2021/// LOOP_BODY
2022/// Br: loop if (DefX != 0)
2023/// Use(CntPhi) or Use(CntInst)
2024///
2025/// Into:
2026/// If CntPhi used outside the loop:
2027/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
2028/// Count = CountPrev + 1
2029/// else
2030/// Count = BitWidth(InitX) - CTLZ(InitX)
2031/// loop:
2032/// CntPhi = PHI [Cnt0, CntInst]
2033/// PhiX = PHI [InitX, DefX]
2034/// PhiCount = PHI [Count, Dec]
2035/// CntInst = CntPhi + 1
2036/// DefX = PhiX >> 1
2037/// Dec = PhiCount - 1
2038/// LOOP_BODY
2039/// Br: loop if (Dec != 0)
2040/// Use(CountPrev + Cnt0) // Use(CntPhi)
2041/// or
2042/// Use(Count + Cnt0) // Use(CntInst)
2043///
2044/// If LOOP_BODY is empty the loop will be deleted.
2045/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
2046void LoopIdiomRecognize::transformLoopToCountable(
2047 Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
2048 PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
2049 bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
2050 BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
2051
2052 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
2053 IRBuilder<> Builder(PreheaderBr);
2054 Builder.SetCurrentDebugLocation(DL);
2055
2056 // If there are no uses of CntPhi, create:
2057 // Count = BitWidth - CTLZ(InitX);
2058 // NewCount = Count;
2059 // If there are uses of CntPhi create:
2060 // NewCount = BitWidth - CTLZ(InitX >> 1);
2061 // Count = NewCount + 1;
2062 Value *InitXNext;
2063 if (IsCntPhiUsedOutsideLoop) {
2064 if (DefX->getOpcode() == Instruction::AShr)
2065 InitXNext = Builder.CreateAShr(InitX, 1);
2066 else if (DefX->getOpcode() == Instruction::LShr)
2067 InitXNext = Builder.CreateLShr(InitX, 1);
2068 else if (DefX->getOpcode() == Instruction::Shl) // cttz
2069 InitXNext = Builder.CreateShl(InitX, 1);
2070 else
2071 llvm_unreachable("Unexpected opcode!");
2072 } else
2073 InitXNext = InitX;
2074 Value *Count =
2075 createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
2076 Type *CountTy = Count->getType();
2077 Count = Builder.CreateSub(
2078 ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
2079 Value *NewCount = Count;
2080 if (IsCntPhiUsedOutsideLoop)
2081 Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
2082
2083 NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
2084
2085 Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
2086 if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
2087 // If the counter was being incremented in the loop, add NewCount to the
2088 // counter's initial value, but only if the initial value is not zero.
2089 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2090 if (!InitConst || !InitConst->isZero())
2091 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2092 } else {
2093 // If the count was being decremented in the loop, subtract NewCount from
2094 // the counter's initial value.
2095 NewCount = Builder.CreateSub(CntInitVal, NewCount);
2096 }
2097
2098 // Step 2: Insert new IV and loop condition:
2099 // loop:
2100 // ...
2101 // PhiCount = PHI [Count, Dec]
2102 // ...
2103 // Dec = PhiCount - 1
2104 // ...
2105 // Br: loop if (Dec != 0)
2106 BasicBlock *Body = *(CurLoop->block_begin());
2107 auto *LbBr = cast<BranchInst>(Body->getTerminator());
2108 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2109
2110 PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi", &Body->front());
2111
2112 Builder.SetInsertPoint(LbCond);
2113 Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
2114 TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
2115
2116 TcPhi->addIncoming(Count, Preheader);
2117 TcPhi->addIncoming(TcDec, Body);
2118
2119 CmpInst::Predicate Pred =
2120 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
2121 LbCond->setPredicate(Pred);
2122 LbCond->setOperand(0, TcDec);
2123 LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
2124
2125 // Step 3: All the references to the original counter outside
2126 // the loop are replaced with the NewCount
2127 if (IsCntPhiUsedOutsideLoop)
2128 CntPhi->replaceUsesOutsideBlock(NewCount, Body);
2129 else
2130 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2131
2132 // Step 4: Forget the "non-computable" trip-count SCEV associated with the
2133 // loop. The loop would otherwise not be deleted even if it becomes empty.
2134 SE->forgetLoop(CurLoop);
2135}
2136
2137void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
2138 Instruction *CntInst,
2139 PHINode *CntPhi, Value *Var) {
2140 BasicBlock *PreHead = CurLoop->getLoopPreheader();
2141 auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
2142 const DebugLoc &DL = CntInst->getDebugLoc();
2143
2144 // Assuming before transformation, the loop is following:
2145 // if (x) // the precondition
2146 // do { cnt++; x &= x - 1; } while(x);
2147
2148 // Step 1: Insert the ctpop instruction at the end of the precondition block
2149 IRBuilder<> Builder(PreCondBr);
2150 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
2151 {
2152 PopCnt = createPopcntIntrinsic(Builder, Var, DL);
2153 NewCount = PopCntZext =
2154 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
2155
2156 if (NewCount != PopCnt)
2157 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2158
2159 // TripCnt is exactly the number of iterations the loop has
2160 TripCnt = NewCount;
2161
2162 // If the population counter's initial value is not zero, insert Add Inst.
2163 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
2164 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2165 if (!InitConst || !InitConst->isZero()) {
2166 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2167 (cast<Instruction>(NewCount))->setDebugLoc(DL);
2168 }
2169 }
2170
2171 // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
2172 // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
2173 // function would be partial dead code, and downstream passes will drag
2174 // it back from the precondition block to the preheader.
2175 {
2176 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
2177
2178 Value *Opnd0 = PopCntZext;
2179 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
2180 if (PreCond->getOperand(0) != Var)
2181 std::swap(Opnd0, Opnd1);
2182
2183 ICmpInst *NewPreCond = cast<ICmpInst>(
2184 Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
2185 PreCondBr->setCondition(NewPreCond);
2186
2187 RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
2188 }
2189
2190 // Step 3: Note that the population count is exactly the trip count of the
2191 // loop in question, which enables us to convert the loop from a noncountable
2192 // loop into a countable one. The benefit is twofold:
2193 //
2194 // - If the loop only counts population, the entire loop becomes dead after
2195 // the transformation. It is a lot easier to prove a countable loop dead
2196 // than to prove a noncountable one. (In some C dialects, an infinite loop
2197 // isn't dead even if it computes nothing useful. In general, DCE needs
2198 // to prove a noncountable loop finite before safely deleting it.)
2199 //
2200 // - If the loop also performs something else, it remains alive.
2201 // Since it is transformed to countable form, it can be aggressively
2202 // optimized by some optimizations which are in general not applicable
2203 // to a noncountable loop.
2204 //
2205 // After this step, this loop (conceptually) would look like following:
2206 // newcnt = __builtin_ctpop(x);
2207 // t = newcnt;
2208 // if (x)
2209 // do { cnt++; x &= x-1; t--; } while (t > 0);
2210 BasicBlock *Body = *(CurLoop->block_begin());
2211 {
2212 auto *LbBr = cast<BranchInst>(Body->getTerminator());
2213 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2214 Type *Ty = TripCnt->getType();
2215
2216 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
2217
2218 Builder.SetInsertPoint(LbCond);
2219 Instruction *TcDec = cast<Instruction>(
2220 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
2221 "tcdec", false, true));
2222
2223 TcPhi->addIncoming(TripCnt, PreHead);
2224 TcPhi->addIncoming(TcDec, Body);
2225
2226 CmpInst::Predicate Pred =
2227 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
2228 LbCond->setPredicate(Pred);
2229 LbCond->setOperand(0, TcDec);
2230 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
2231 }
2232
2233 // Step 4: All the references to the original population counter outside
2234 // the loop are replaced with the NewCount -- the value returned from
2235 // __builtin_ctpop().
2236 CntInst->replaceUsesOutsideBlock(NewCount, Body);
2237
2238 // Step 5: Forget the "non-computable" trip-count SCEV associated with the
2239 // loop. The loop would otherwise not be deleted even if it becomes empty.
2240 SE->forgetLoop(CurLoop);
2241}
2242
2243/// Match loop-invariant value.
2244template <typename SubPattern_t> struct match_LoopInvariant {
2245 SubPattern_t SubPattern;
2246 const Loop *L;
2247
2248 match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
2249 : SubPattern(SP), L(L) {}
2250
2251 template <typename ITy> bool match(ITy *V) {
2252 return L->isLoopInvariant(V) && SubPattern.match(V);
2253 }
2254};
2255
2256/// Matches if the value is loop-invariant.
2257template <typename Ty>
2258inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
2259 return match_LoopInvariant<Ty>(M, L);
2260}
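// Example usage (illustrative): capture a loop-invariant "1 << bitpos" mask
// while binding the shift amount, as the bit-test detection below does:
//
//   Value *BitPos;
//   if (match(V, m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)), CurLoop)))
//     ...; // V is a single-bit mask that does not vary within CurLoop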
2261
2262/// Return true if the idiom is detected in the loop.
2263///
2264/// The core idiom we are trying to detect is:
2265/// \code
2266/// entry:
2267/// <...>
2268/// %bitmask = shl i32 1, %bitpos
2269/// br label %loop
2270///
2271/// loop:
2272/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2273/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2274/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2275/// %x.next = shl i32 %x.curr, 1
2276/// <...>
2277/// br i1 %x.curr.isbitunset, label %loop, label %end
2278///
2279/// end:
2280/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2281/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2282/// <...>
2283/// \endcode
2284static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
2285 Value *&BitMask, Value *&BitPos,
2286 Value *&CurrX, Instruction *&NextX) {
2287 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2288 " Performing shift-until-bittest idiom detection.\n");
2289
2290 // Give up if the loop has multiple blocks or multiple backedges.
2291 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2292 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2293 return false;
2294 }
2295
2296 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2297 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2298 assert(LoopPreheaderBB && "There is always a loop preheader.");
2299
2300 using namespace PatternMatch;
2301
2302 // Step 1: Check if the loop backedge is in desirable form.
2303 
2304 ICmpInst::Predicate Pred;
2305 Value *CmpLHS, *CmpRHS;
2306 BasicBlock *TrueBB, *FalseBB;
2307 if (!match(LoopHeaderBB->getTerminator(),
2308 m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
2309 m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
2310 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2311 return false;
2312 }
2313
2314 // Step 2: Check if the backedge's condition is in desirable form.
2315
2316 auto MatchVariableBitMask = [&]() {
2317 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2318 match(CmpLHS,
2319 m_c_And(m_Value(CurrX),
2320 m_CombineAnd(
2321 m_Value(BitMask),
2322 m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
2323 CurLoop))));
2324 };
2325 auto MatchConstantBitMask = [&]() {
2326 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2327 match(CmpLHS, m_And(m_Value(CurrX),
2328 m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
2329 (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
2330 };
2331 auto MatchDecomposableConstantBitMask = [&]() {
2332 APInt Mask;
2333 return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
2334 ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
2335 (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
2336 (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
2337 };
2338
2339 if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
2340 !MatchDecomposableConstantBitMask()) {
2341 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2342 return false;
2343 }
2344
2345 // Step 3: Check if the recurrence is in desirable form.
2346 auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2347 if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
2348 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2349 return false;
2350 }
2351
2352 BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2353 NextX =
2354 dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2355
2356 assert(CurLoop->isLoopInvariant(BaseX) &&
2357 "Expected BaseX to be available in the preheader!");
2358
2359 if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2360 // FIXME: support right-shift?
2361 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2362 return false;
2363 }
2364
2365 // Step 4: Check if the backedge's destinations are in desirable form.
2366
2367 assert(ICmpInst::isEquality(Pred) &&
2368 "Should only get equality predicates here.");
2369
2370 // cmp-br is commutative, so canonicalize to a single variant.
2371 if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2372 Pred = ICmpInst::getInversePredicate(Pred);
2373 std::swap(TrueBB, FalseBB);
2374 }
2375
2376 // We expect to exit the loop when the comparison yields false,
2377 // so when it yields true we should branch back to the loop header.
2378 if (TrueBB != LoopHeaderBB) {
2379 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2380 return false;
2381 }
2382
2383 // Okay, idiom checks out.
2384 return true;
2385}
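// A C-level example (illustrative) of the shift-until-bittest idiom; after
// loop rotation it takes the single-block shape matched above:
//
//   unsigned x = base;                 // BaseX
//   while (!(x & (1u << bitpos)))      // %x.curr.isbitunset
//     x <<= 1;                         // %x.next = shl %x.curr, 1
//
// The recognizer extracts BaseX, BitMask/BitPos, the header PHI (CurrX) and
// the shl (NextX) from this shape.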
2386
2387/// Look for the following loop:
2388/// \code
2389/// entry:
2390/// <...>
2391/// %bitmask = shl i32 1, %bitpos
2392/// br label %loop
2393///
2394/// loop:
2395/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2396/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2397/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2398/// %x.next = shl i32 %x.curr, 1
2399/// <...>
2400/// br i1 %x.curr.isbitunset, label %loop, label %end
2401///
2402/// end:
2403/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2404/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2405/// <...>
2406/// \endcode
2407///
2408/// And transform it into:
2409/// \code
2410/// entry:
2411/// %bitmask = shl i32 1, %bitpos
2412/// %lowbitmask = add i32 %bitmask, -1
2413/// %mask = or i32 %lowbitmask, %bitmask
2414/// %x.masked = and i32 %x, %mask
2415/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
2416/// i1 true)
2417/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
2418/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
2419/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
2420/// %tripcount = add i32 %backedgetakencount, 1
2421/// %x.curr = shl i32 %x, %backedgetakencount
2422/// %x.next = shl i32 %x, %tripcount
2423/// br label %loop
2424///
2425/// loop:
2426/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
2427/// %loop.iv.next = add nuw i32 %loop.iv, 1
2428/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
2429/// <...>
2430/// br i1 %loop.ivcheck, label %end, label %loop
2431///
2432/// end:
2433/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2434/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2435/// <...>
2436/// \endcode
2437bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
2438 bool MadeChange = false;
2439
2440 Value *X, *BitMask, *BitPos, *XCurr;
2441 Instruction *XNext;
2442 if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
2443 XNext)) {
2444 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2445 " shift-until-bittest idiom detection failed.\n");
2446 return MadeChange;
2447 }
2448 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
2449
2450 // Ok, it is the idiom we were looking for, we *could* transform this loop,
2451 // but is it profitable to transform?
2452
2453 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2454 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2455 assert(LoopPreheaderBB && "There is always a loop preheader.");
2456
2457 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2458 assert(SuccessorBB && "There is only a single successor.");
2459
2460 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2461 Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
2462
2463 Intrinsic::ID IntrID = Intrinsic::ctlz;
2464 Type *Ty = X->getType();
2465 unsigned Bitwidth = Ty->getScalarSizeInBits();
2466 
2467 TargetTransformInfo::TargetCostKind CostKind =
2468 TargetTransformInfo::TCK_SizeAndLatency;
2469 
2470 // The rewrite is considered to be unprofitable if and only if the
2471 // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
2472 // making the loop countable, even if nothing else changes.
2473 IntrinsicCostAttributes Attrs(
2474 IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()});
2475 InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2476 if (Cost > TargetTransformInfo::TCC_Basic) {
2477 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2478 " Intrinsic is too costly, not beneficial\n");
2479 return MadeChange;
2480 }
2481 if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
2482 TargetTransformInfo::TCC_Basic) {
2483 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
2484 return MadeChange;
2485 }
2486
2487 // Ok, transform appears worthwhile.
2488 MadeChange = true;
2489
2490 // Step 1: Compute the loop trip count.
2491
2492 Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
2493 BitPos->getName() + ".lowbitmask");
2494 Value *Mask =
2495 Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
2496 Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
2497 CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
2498 IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()},
2499 /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
2500 Value *XMaskedNumActiveBits = Builder.CreateSub(
2501 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
2502 XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
2503 /*HasNSW=*/Bitwidth != 2);
2504 Value *XMaskedLeadingOnePos =
2505 Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
2506 XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
2507 /*HasNSW=*/Bitwidth > 2);
2508
2509 Value *LoopBackedgeTakenCount = Builder.CreateSub(
2510 BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
2511 /*HasNUW=*/true, /*HasNSW=*/true);
2512 // We know the loop's backedge-taken count, but what's its trip count?
2513 // Note that NUW is always safe, while NSW is only safe for bitwidths != 2.
2514 Value *LoopTripCount =
2515 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2516 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2517 /*HasNSW=*/Bitwidth != 2);
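// Worked example (illustrative) on i8 with %x = 0b00000100 and %bitpos = 5:
//   mask = 0b00111111, x.masked = 0b00000100
//   numleadingzeros = 5, numactivebits = 3, leadingonepos = 2
//   backedgetakencount = 5 - 2 = 3, tripcount = 4
// and indeed shifting 0b00000100 left three times sets bit 5.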
2518
2519 // Step 2: Compute the recurrence's final value without a loop.
2520
2521 // NewX is always safe to compute, because `LoopBackedgeTakenCount`
2522 // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
2523 Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
2524 NewX->takeName(XCurr);
2525 if (auto *I = dyn_cast<Instruction>(NewX))
2526 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2527
2528 Value *NewXNext;
2529 // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
2530 // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
2531 // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
2532 // that isn't the case, we'll need to emit an alternative, safe IR.
2533 if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
2534 PatternMatch::match(
2535 BitPos, PatternMatch::m_SpecificInt_ICMP(
2536 ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
2537 Ty->getScalarSizeInBits() - 1))))
2538 NewXNext = Builder.CreateShl(X, LoopTripCount);
2539 else {
2540 // Otherwise, just additionally shift by one. It's the smallest solution;
2541 // alternatively, we could check that NewX is INT_MIN (or that BitPos is
2542 // bitwidth - 1) and select 0 instead.
2543 NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
2544 }
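// Example (illustrative) of the edge case handled above: on i8 with
// %bitpos = 7 and %x = 1, tripcount == 8 == bitwidth, so "x << tripcount"
// would be poison; shifting NewX (= 1 << 7 = 0x80) left once more instead
// yields the well-defined value 0, matching the original %x.next.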
2545
2546 NewXNext->takeName(XNext);
2547 if (auto *I = dyn_cast<Instruction>(NewXNext))
2548 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2549
2550 // Step 3: Adjust the successor basic block to receive the computed
2551 // recurrence's final value instead of the recurrence itself.
2552
2553 XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
2554 XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
2555
2556 // Step 4: Rewrite the loop into a countable form, with canonical IV.
2557
2558 // The new canonical induction variable.
2559 Builder.SetInsertPoint(&LoopHeaderBB->front());
2560 auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2561
2562 // The induction itself.
2563 // Note that NUW is always safe, while NSW is only safe for bitwidths != 2.
2564 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2565 auto *IVNext =
2566 Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
2567 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2568
2569 // The loop trip count check.
2570 auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
2571 CurLoop->getName() + ".ivcheck");
2572 Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
2573 LoopHeaderBB->getTerminator()->eraseFromParent();
2574
2575 // Populate the IV PHI.
2576 IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2577 IV->addIncoming(IVNext, LoopHeaderBB);
2578
2579 // Step 5: Forget the "non-computable" trip-count SCEV associated with the
2580 // loop. The loop would otherwise not be deleted even if it becomes empty.
2581
2582 SE->forgetLoop(CurLoop);
2583
2584 // Other passes will take care of actually deleting the loop if possible.
2585
2586 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
2587
2588 ++NumShiftUntilBitTest;
2589 return MadeChange;
2590}
2591
2592/// Return true if the idiom is detected in the loop.
2593///
2594/// The core idiom we are trying to detect is:
2595/// \code
2596/// entry:
2597/// <...>
2598/// %start = <...>
2599/// %extraoffset = <...>
2600/// <...>
2601/// br label %for.cond
2602///
2603/// loop:
2604/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2605/// %nbits = add nsw i8 %iv, %extraoffset
2606/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2607/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2608/// %iv.next = add i8 %iv, 1
2609/// <...>
2610/// br i1 %val.shifted.iszero, label %end, label %loop
2611///
2612/// end:
2613/// %iv.res = phi i8 [ %iv, %loop ] <...>
2614/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2615/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2616/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2617/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2618/// <...>
2619/// \endcode
2620 static bool detectShiftUntilZeroIdiom(Loop *CurLoop, ScalarEvolution *SE,
2621 Instruction *&ValShiftedIsZero,
2622 Intrinsic::ID &IntrinID, Instruction *&IV,
2623 Value *&Start, Value *&Val,
2624 const SCEV *&ExtraOffsetExpr,
2625 bool &InvertedCond) {
2626 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2627 " Performing shift-until-zero idiom detection.\n");
2628
2629 // Give up if the loop has multiple blocks or multiple backedges.
2630 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
2631 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2632 return false;
2633 }
2634
2635 Instruction *ValShifted, *NBits, *IVNext;
2636 Value *ExtraOffset;
2637
2638 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2639 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2640 assert(LoopPreheaderBB && "There is always a loop preheader.");
2641
2642 using namespace PatternMatch;
2643
2644 // Step 1: Check if the loop backedge and its condition are in desirable form.
2645 
2646 ICmpInst::Predicate Pred;
2647 BasicBlock *TrueBB, *FalseBB;
2648 if (!match(LoopHeaderBB->getTerminator(),
2649 m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
2650 m_BasicBlock(FalseBB))) ||
2651 !match(ValShiftedIsZero,
2652 m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) ||
2653 !ICmpInst::isEquality(Pred)) {
2654 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2655 return false;
2656 }
2657
2658 // Step 2: Check if the comparison's operand is in desirable form.
2659 // FIXME: Val could be a one-input PHI node, which we should look past.
2660 if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
2661 m_Instruction(NBits)))) {
2662 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparison value computation.\n");
2663 return false;
2664 }
2665 IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
2666 : Intrinsic::ctlz;
2667
2668 // Step 3: Check if the shift amount is in desirable form.
2669
2670 if (match(NBits, m_c_Add(m_Instruction(IV),
2671 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2672 (NBits->hasNoSignedWrap() || NBits->hasNoUnsignedWrap()))
2673 ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
2674 else if (match(NBits,
2675 m_Sub(m_Instruction(IV),
2676 m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2677 NBits->hasNoSignedWrap())
2678 ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
2679 else {
2680 IV = NBits;
2681 ExtraOffsetExpr = SE->getZero(NBits->getType());
2682 }
2683
2684 // Step 4: Check if the recurrence is in desirable form.
2685 auto *IVPN = dyn_cast<PHINode>(IV);
2686 if (!IVPN || IVPN->getParent() != LoopHeaderBB) {
2687 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2688 return false;
2689 }
2690
2691 Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
2692 IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));
2693
2694 if (!IVNext || !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
2695 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2696 return false;
2697 }
2698
2699 // Step 5: Check if the backedge's destinations are in desirable form.
2700 
2701 assert(ICmpInst::isEquality(Pred) &&
2702 "Should only get equality predicates here.");
2703
2704 // cmp-br is commutative, so canonicalize to a single variant.
2705 InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
2706 if (InvertedCond) {
2707 Pred = ICmpInst::getInversePredicate(Pred);
2708 std::swap(TrueBB, FalseBB);
2709 }
2710
2711 // We expect to exit the loop when the comparison yields true,
2712 // so when it yields false we should branch back to the loop header.
2713 if (FalseBB != LoopHeaderBB) {
2714 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2715 return false;
2716 }
2717
2718 // The new, countable, loop will certainly only run a known number of
2719 // iterations; it won't be infinite. But the old loop might be infinite
2720 // under certain conditions. For logical shifts, the value will become zero
2721 // after at most bitwidth(%Val) loop iterations. However, for arithmetic
2722 // right-shift, iff the sign bit was set, the value will never become zero,
2723 // and the loop may never finish.
2724 if (ValShifted->getOpcode() == Instruction::AShr &&
2725 !isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
2726 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
2727 return false;
2728 }
2729
2730 // Okay, idiom checks out.
2731 return true;
2732}
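// A C-level example (illustrative) of the variant detected above, where the
// shift amount rather than the shifted value is the induction variable
// (shown for a logical right shift, which maps to ctlz; shl maps to cttz):
//
//   int iv = start;
//   while ((val >> (iv + extraoffset)) != 0) // exits when %val.shifted.iszero
//     iv++;
//   // final iv == max(bitwidth(val) - ctlz(val) - extraoffset, start)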
2733
2734/// Look for the following loop:
2735/// \code
2736/// entry:
2737/// <...>
2738/// %start = <...>
2739/// %extraoffset = <...>
2740/// <...>
2741/// br label %for.cond
2742///
2743/// loop:
2744/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2745/// %nbits = add nsw i8 %iv, %extraoffset
2746/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2747/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2748/// %iv.next = add i8 %iv, 1
2749/// <...>
2750/// br i1 %val.shifted.iszero, label %end, label %loop
2751///
2752/// end:
2753/// %iv.res = phi i8 [ %iv, %loop ] <...>
2754/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2755/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2756/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2757/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2758/// <...>
2759/// \endcode
2760///
2761/// And transform it into:
2762/// \code
2763/// entry:
2764/// <...>
2765/// %start = <...>
2766/// %extraoffset = <...>
2767/// <...>
2768/// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
2769/// %val.numactivebits = sub i8 8, %val.numleadingzeros
2770/// %extraoffset.neg = sub i8 0, %extraoffset
2771/// %tmp = add i8 %val.numactivebits, %extraoffset.neg
2772/// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
2773/// %loop.tripcount = sub i8 %iv.final, %start
2774/// br label %loop
2775///
2776/// loop:
2777/// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
2778/// %loop.iv.next = add i8 %loop.iv, 1
2779/// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
2780/// %iv = add i8 %loop.iv, %start
2781/// <...>
2782/// br i1 %loop.ivcheck, label %end, label %loop
2783///
2784/// end:
2785/// %iv.res = phi i8 [ %iv.final, %loop ] <...>
2786/// <...>
2787/// \endcode
2788bool LoopIdiomRecognize::recognizeShiftUntilZero() {
2789 bool MadeChange = false;
2790
2791 Instruction *ValShiftedIsZero;
2792 Intrinsic::ID IntrID;
2793 Instruction *IV;
2794 Value *Start, *Val;
2795 const SCEV *ExtraOffsetExpr;
2796 bool InvertedCond;
2797 if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
2798 Start, Val, ExtraOffsetExpr, InvertedCond)) {
2799 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2800 " shift-until-zero idiom detection failed.\n");
2801 return MadeChange;
2802 }
2803 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");
2804
2805 // Ok, it is the idiom we were looking for, we *could* transform this loop,
2806 // but is it profitable to transform?
2807
2808 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2809 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2810 assert(LoopPreheaderBB && "There is always a loop preheader.");
2811
2812 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2813 assert(SuccessorBB && "There is only a single successor.");
2814
2815 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2816 Builder.SetCurrentDebugLocation(IV->getDebugLoc());
2817
2818 Type *Ty = Val->getType();
2819 unsigned Bitwidth = Ty->getScalarSizeInBits();
2820 
2821 TargetTransformInfo::TargetCostKind CostKind =
2822 TargetTransformInfo::TCK_SizeAndLatency;
2823 
2824 // The rewrite is considered to be unprofitable if and only if the
2825 // intrinsic we'll use is not cheap. Note that we are okay with *just*
2826 // making the loop countable, even if nothing else changes.
2827 IntrinsicCostAttributes Attrs(
2828 IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getFalse()});
2829 InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2830 if (Cost > TargetTransformInfo::TCC_Basic) {
2831 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2832 " Intrinsic is too costly, not beneficial\n");
2833 return MadeChange;
2834 }
2835
2836 // Ok, transform appears worthwhile.
2837 MadeChange = true;
2838
2839 bool OffsetIsZero = false;
2840 if (auto *ExtraOffsetExprC = dyn_cast<SCEVConstant>(ExtraOffsetExpr))
2841 OffsetIsZero = ExtraOffsetExprC->isZero();
2842
2843 // Step 1: Compute the loop's final IV value / trip count.
2844
2845 CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
2846 IntrID, Ty, {Val, /*is_zero_undef=*/Builder.getFalse()},
2847 /*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
2848 Value *ValNumActiveBits = Builder.CreateSub(
2849 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
2850 Val->getName() + ".numactivebits", /*HasNUW=*/true,
2851 /*HasNSW=*/Bitwidth != 2);
2852
2853 SCEVExpander Expander(*SE, *DL, "loop-idiom");
2854 Expander.setInsertPoint(&*Builder.GetInsertPoint());
2855 Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);
2856
2857 Value *ValNumActiveBitsOffset = Builder.CreateAdd(
2858 ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
2859 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true);
2860 Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
2861 {ValNumActiveBitsOffset, Start},
2862 /*FMFSource=*/nullptr, "iv.final");
2863
2864 auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
2865 IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
2866 /*HasNUW=*/OffsetIsZero, /*HasNSW=*/true));
2867 // FIXME: or when the offset was `add nuw`
2868
2869 // We know the loop's backedge-taken count, but what's its trip count?
2870 Value *LoopTripCount =
2871 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2872 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2873 /*HasNSW=*/Bitwidth != 2);
2874
2875 // Step 2: Adjust the successor basic block to receive the original
2876 // induction variable's final value instead of the orig. IV itself.
2877
2878 IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);
2879
2880 // Step 3: Rewrite the loop into a countable form, with canonical IV.
2881
2882 // The new canonical induction variable.
2883 Builder.SetInsertPoint(&LoopHeaderBB->front());
2884 auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2885
2886 // The induction itself.
2887 Builder.SetInsertPoint(LoopHeaderBB->getFirstNonPHI());
2888 auto *CIVNext =
2889 Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
2890 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2891
2892 // The loop trip count check.
2893 auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
2894 CurLoop->getName() + ".ivcheck");
2895 auto *NewIVCheck = CIVCheck;
2896 if (InvertedCond) {
2897 NewIVCheck = Builder.CreateNot(CIVCheck);
2898 NewIVCheck->takeName(ValShiftedIsZero);
2899 }
2900
2901 // The original IV, but rebased to be an offset to the CIV.
2902 auto *IVDePHId = Builder.CreateAdd(CIV, Start, "", /*HasNUW=*/false,
2903 /*HasNSW=*/true); // FIXME: what about NUW?
2904 IVDePHId->takeName(IV);
2905
2906 // The loop terminator.
2907 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2908 Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
2909 LoopHeaderBB->getTerminator()->eraseFromParent();
2910
2911 // Populate the IV PHI.
2912 CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2913 CIV->addIncoming(CIVNext, LoopHeaderBB);
2914
2915 // Step 4: Forget the "non-computable" trip-count SCEV associated with the
2916 // loop. The loop would otherwise not be deleted even if it becomes empty.
2917
2918 SE->forgetLoop(CurLoop);
2919
2920 // Step 5: Try to cleanup the loop's body somewhat.
2921 IV->replaceAllUsesWith(IVDePHId);
2922 IV->eraseFromParent();
2923
2924 ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
2925 ValShiftedIsZero->eraseFromParent();
2926
2927 // Other passes will take care of actually deleting the loop if possible.
2928
2929 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");
2930
2931 ++NumShiftUntilZero;
2932 return MadeChange;
2933}
signed less or equal
Definition: InstrTypes.h:748
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:741
@ ICMP_EQ
equal
Definition: InstrTypes.h:739
@ ICMP_NE
not equal
Definition: InstrTypes.h:740
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:832
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:808
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1249
static Constant * getBitCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2229
static Constant * getExactLogBase2(Constant *C)
If C is a scalar/fixed width vector of known powers of 2, then this function returns a new scalar/fix...
Definition: Constants.cpp:2712
This is the shared class of boolean and integer constants.
Definition: Constants.h:78
bool isMinusOne() const
This function will return true iff every bit in this constant is set to true.
Definition: Constants.h:205
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:199
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:193
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:887
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:141
static ConstantInt * getBool(LLVMContext &Context, bool V)
Definition: Constants.cpp:849
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:403
A parsed version of the target data layout string, with methods for querying it.
Definition: DataLayout.h:114
A debug info location.
Definition: DebugLoc.h:33
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:165
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:644
void setAlignment(Align Align)
Sets the alignment attribute of the GlobalObject.
Definition: Globals.cpp:130
void setUnnamedAddr(UnnamedAddr Val)
Definition: GlobalValue.h:227
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:652
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:56
This instruction compares its operands according to the predicate given to the constructor.
bool isEquality() const
Return true if this predicate is either EQ or NE.
ConstantInt * getInt1(bool V)
Get a constant value representing either true or false.
Definition: IRBuilder.h:447
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2293
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2550
bool hasNoUnsignedWrap() const LLVM_READONLY
Determine whether the no unsigned wrap flag is set.
bool hasNoSignedWrap() const LLVM_READONLY
Determine whether the no signed wrap flag is set.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:358
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:70
bool isAtomic() const LLVM_READONLY
Return true if this instruction has an AtomicOrdering of unordered or higher.
const BasicBlock * getParent() const
Definition: Instruction.h:90
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:74
AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
Definition: Metadata.cpp:1499
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:168
bool isShift() const
Definition: Instruction.h:175
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:82
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:355
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
Definition: Instructions.h:177
Value * getPointerOperand()
Definition: Instructions.h:264
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Definition: Instructions.h:214
bool isUnordered() const
Definition: Instructions.h:258
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:220
static LocationSize precise(uint64_t Value)
static constexpr LocationSize afterPointer()
Any location after the base pointer (but still within the underlying object).
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
Definition: LoopInfo.h:139
bool isOutermost() const
Return true if the loop does not have a parent (natural) loop.
Definition: LoopInfo.h:185
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:202
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
Definition: LoopInfo.h:267
BlockT * getHeader() const
Definition: LoopInfo.h:105
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:195
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:107
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:183
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:188
void getUniqueExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all unique successor blocks of this loop.
Definition: LoopInfoImpl.h:142
block_iterator block_begin() const
Definition: LoopInfo.h:193
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
Definition: LoopInfo.h:992
virtual bool runOnLoop(Loop *L, LPPassManager &LPM)=0
bool skipLoop(const Loop *L) const
Optional passes call this function to check whether the pass should be skipped.
Definition: LoopPass.cpp:370
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:547
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:60
StringRef getName() const
Definition: LoopInfo.h:891
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:37
This class wraps the llvm.memcpy intrinsic.
Value * getLength() const
Value * getDest() const
This is just like getRawDest, but it strips off any cast instructions (including addrspacecast) that ...
MaybeAlign getDestAlign() const
bool isVolatile() const
Value * getValue() const
This class wraps the llvm.memset and llvm.memset.inline intrinsics.
MaybeAlign getSourceAlign() const
Value * getSource() const
This is just like getRawSource, but it strips off any cast instructions that feed it,...
Representation for a specific memory location.
An analysis that produces MemorySSA for a function.
Definition: MemorySSA.h:936
Legacy analysis pass which computes MemorySSA.
Definition: MemorySSA.h:986
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition: MemorySSA.h:700
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:398
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
Value * getIncomingValueForBlock(const BasicBlock *BB) const
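A brief sketch of the typical construction pattern for these members; the block and value names are illustrative:

  // Build a two-input PHI at the top of LoopBB: InitV flows in from
  // the preheader, NextV comes back around the loop's backedge.
  PHINode *Phi = PHINode::Create(InitV->getType(), /*NumReservedValues=*/2,
                                 "count", &LoopBB->front());
  Phi->addIncoming(InitV, PreheaderBB);
  Phi->addIncoming(NextV, LoopBB);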
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:91
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
Definition: Pass.cpp:98
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1759
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
const SCEV * getOperand(unsigned i) const
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
bool isKnownNonNegative(const SCEV *S)
Test if the given expression is known to be non-negative.
const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may affect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getTruncateOrZeroExtend(const SCEV *V, Type *Ty, unsigned Depth=0)
Return a SCEV corresponding to a conversion of the input value to the specified type.
bool isLoopEntryGuardedByCond(const Loop *L, ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS)
Test whether entry to the loop is protected by a conditional between LHS and RHS.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
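As a usage sketch, idiom recognition typically feeds a store's pointer through these entry points to check for an affine recurrence; SI, SE and CurLoop are names used throughout this file:

  // Classify the store address: memset/memcpy formation requires an
  // affine AddRec ({Start,+,Stride}) on the current loop.
  const SCEVAddRecExpr *PtrEv =
      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(SI->getPointerOperand()));
  if (PtrEv && PtrEv->getLoop() == CurLoop && PtrEv->isAffine()) {
    const SCEV *Stride = PtrEv->getOperand(1); // per-iteration step
    // A constant Stride equal to the store size means the stores are
    // consecutive and may be expressible as one memory intrinsic.
  }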
A vector that has set insertion semantics.
Definition: SetVector.h:40
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:208
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:141
Simple and conservative implementation of LoopSafetyInfo that can give false-positive answers to its ...
Definition: MustExecute.h:110
void computeLoopSafetyInfo(const Loop *CurLoop) override
Computes safety information for a loop; checks the loop body & header for the possibility of may-throw exc...
Definition: MustExecute.cpp:51
bool anyBlockMayThrow() const override
Returns true iff any block of the loop for which this info is computed contains an instruction that may throw ...
Definition: MustExecute.cpp:47
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:344
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:379
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
iterator end() const
Definition: SmallPtrSet.h:408
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
iterator begin() const
Definition: SmallPtrSet.h:403
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:389
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:577
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
An instruction for storing to memory.
Definition: Instructions.h:301
Align getAlign() const
Definition: Instructions.h:345
Value * getValueOperand()
Definition: Instructions.h:390
Value * getPointerOperand()
Definition: Instructions.h:393
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Provides information about what library functions are available for the current target.
bool has(LibFunc F) const
Tests whether a library function is available.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
unsigned getAtomicMemIntrinsicMaxElementSize() const
PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const
Return hardware support for population count.
TargetCostKind
The kind of cost model.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
@ TCC_Basic
The cost of a typical 'add' instruction.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:341
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1740
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:532
iterator_range< user_iterator > users()
Definition: Value.h:421
void replaceUsesOutsideBlock(Value *V, BasicBlock *BB)
replaceUsesOutsideBlock - Go through the uses list for this definition and make each use point to "V"...
Definition: Value.cpp:584
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:994
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:308
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:381
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:182
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:166
self_iterator getIterator()
Definition: ilist_node.h:82
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
@ HeaderSize
Definition: BTF.h:60
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1502
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
Definition: PatternMatch.h:979
cst_pred_ty< is_power2 > m_Power2()
Match an integer or vector power-of-2.
Definition: PatternMatch.h:544
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:716
specificval_ty m_Specific(const Value *V)
Match only the specified value.
Definition: PatternMatch.h:772
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:517
match_combine_and< LTy, RTy > m_CombineAnd(const LTy &L, const RTy &R)
Combine two pattern matchers matching L && R.
Definition: PatternMatch.h:224
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate > m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
BinOpPred_match< LHS, RHS, is_shift_op > m_Shift(const LHS &L, const RHS &R)
Matches shift operations.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
class_match< BasicBlock > m_BasicBlock()
Match an arbitrary basic block value and ignore it.
Definition: PatternMatch.h:168
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:537
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
Definition: PatternMatch.h:991
cst_pred_ty< icmp_pred_with_threshold > m_SpecificInt_ICMP(ICmpInst::Predicate Predicate, const APInt &Threshold)
Match an integer or vector with every element comparing 'pred' (eq/ne/...) to Threshold.
Definition: PatternMatch.h:598
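Taken together, these matchers let detection code describe whole control-flow shapes declaratively. A hedged sketch of a typical guard check; BI and the other names are illustrative:

  // Match: br i1 (icmp pred %x, 0), %TrueBB, %FalseBB. Assumes the
  // PatternMatch namespace is in scope, as it is in this file.
  ICmpInst::Predicate Pred;
  Value *X;
  BasicBlock *TrueBB, *FalseBB;
  if (match(BI, m_Br(m_ICmp(Pred, m_Value(X), m_Zero()), TrueBB, FalseBB)) &&
      Pred == ICmpInst::ICMP_EQ) {
    // The branch tests (X == 0); TrueBB is taken when X is zero.
  }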
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:465
constexpr double e
Definition: MathExtras.h:31
DiagnosticInfoOptimizationBase::setExtraArgs setExtraArgs
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:537
Pass * createLoopIdiomPass()
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
bool inferNonMandatoryLibFuncAttrs(Module *M, StringRef Name, const TargetLibraryInfo &TLI)
Analyze the name and prototype of the given function and set any applicable attributes.
bool isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, LibFunc TheLibFunc)
Check whether the library function is available on the target and also that, in the current Module, it is a...
bool isMustProgress(const Loop *L)
Return true if this loop can be assumed to make progress.
Definition: LoopInfo.cpp:1118
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry &)
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
FunctionCallee getOrInsertLibFunc(Module *M, const TargetLibraryInfo &TLI, LibFunc TheLibFunc, FunctionType *T, AttributeList AttributeList)
Calls getOrInsertFunction() and then makes sure to add mandatory argument attributes.
void getLoopAnalysisUsage(AnalysisUsage &AU)
Helper to consistently add the set of standard passes to a loop pass's AnalysisUsage.
Definition: LoopUtils.cpp:141
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: ModRef.h:27
@ ModRef
The access may reference and may modify the value stored in memory.
@ Mod
The access may modify the value stored in memory.
bool VerifyMemorySSA
Enables verification of MemorySSA.
Definition: MemorySSA.cpp:89
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
bool isKnownNonNegative(const Value *V, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Returns true if the given value is known to be non-negative.
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
Value * isBytewiseValue(Value *V, const DataLayout &DL)
If the specified value can be set by repeating the same byte in memory, return the i8 value that it i...
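For example, an i32 store of 0x01010101 is bytewise the i8 value 1, while 0x01020304 is not bytewise at all. A sketch of the check as used for memset formation; StoredVal and DL are illustrative names:

  // StoredVal can become a memset only if it is a splat of one byte.
  // isBytewiseValue returns that i8 splat value, or nullptr if none.
  Value *SplatByte = isBytewiseValue(StoredVal, DL);
  if (SplatByte) {
    // Eligible for memset with this fill byte.
  }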
bool decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate &Pred, Value *&X, APInt &Mask, bool LookThroughTrunc=true)
Decompose an icmp into the form ((X & Mask) pred 0) if possible.
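For instance, a signed test against zero decomposes into a sign-bit test: (X s< 0) becomes ((X & SignMask) != 0). A usage sketch, where Cmp is an illustrative ICmpInst*:

  // On success, Cmp is equivalent to: icmp Pred (and X, Mask), 0.
  CmpInst::Predicate Pred;
  Value *X;
  APInt Mask;
  if (decomposeBitTestICmp(Cmp->getOperand(0), Cmp->getOperand(1), Pred, X,
                           Mask)) {
    // Pred is an equality predicate (ICMP_EQ or ICMP_NE) here.
  }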
constexpr std::nullopt_t None
Definition: None.h:28
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition: Metadata.h:651
MDNode * TBAAStruct
The tag for type-based alias analysis (tbaa struct).
Definition: Metadata.h:671
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition: Metadata.h:674
MDNode * TBAA
The tag for type-based alias analysis.
Definition: Metadata.h:668
AAMDNodes merge(const AAMDNodes &Other) const
Given two sets of AAMDNodes applying to potentially different locations, determine the best AAMDNodes...
MDNode * NoAlias
The tag specifying the noalias scope.
Definition: Metadata.h:677
AAMDNodes extendTo(ssize_t Len) const
Create a new AAMDNode that describes this AAMDNode after extending it to apply to a series of bytes o...
Definition: Metadata.h:718
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static bool Memset
When true, Memset is disabled.
static bool All
When true, the entire pass is disabled.
static bool Memcpy
When true, Memcpy is disabled.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Match loop-invariant value.
match_LoopInvariant(const SubPattern_t &SP, const Loop *L)