Bug Summary

File:llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
Warning:line 471, column 15
Although the value stored to 'PatternValue' is used in the enclosing expression, the value is never actually read from 'PatternValue'

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopIdiomRecognize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -mframe-pointer=none -fmath-errno -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-10/lib/clang/10.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/build-llvm/lib/Transforms/Scalar -I /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Transforms/Scalar -I /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/build-llvm/include -I /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-10/lib/clang/10.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/build-llvm/lib/Transforms/Scalar -fdebug-prefix-map=/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809=. -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2019-12-11-181444-25759-1 -x c++ /build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
1//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements an idiom recognizer that transforms simple loops into a
10// non-loop form. In cases that this kicks in, it can be a significant
11// performance win.
12//
13// If compiling for code size we avoid idiom recognition if the resulting
14// code could be larger than the code for the original loop. One way this could
15// happen is if the loop is not removable after idiom recognition due to the
16// presence of non-idiom instructions. The initial implementation of the
17// heuristics applies to idioms in multi-block loops.
18//
19//===----------------------------------------------------------------------===//
20//
21// TODO List:
22//
23// Future loop memory idioms to recognize:
24// memcmp, memmove, strlen, etc.
25// Future floating point idioms to recognize in -ffast-math mode:
26// fpowi
27// Future integer operation idioms to recognize:
28// ctpop
29//
30// Beware that isel's default lowering for ctpop is highly inefficient for
31// i64 and larger types when i64 is legal and the value has few bits set. It
32// would be good to enhance isel to emit a loop for ctpop in this case.
33//
34// This could recognize common matrix multiplies and dot product idioms and
35// replace them with calls to BLAS (if linked in??).
36//
37//===----------------------------------------------------------------------===//
38
39#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
40#include "llvm/ADT/APInt.h"
41#include "llvm/ADT/ArrayRef.h"
42#include "llvm/ADT/DenseMap.h"
43#include "llvm/ADT/MapVector.h"
44#include "llvm/ADT/SetVector.h"
45#include "llvm/ADT/SmallPtrSet.h"
46#include "llvm/ADT/SmallVector.h"
47#include "llvm/ADT/Statistic.h"
48#include "llvm/ADT/StringRef.h"
49#include "llvm/Analysis/AliasAnalysis.h"
50#include "llvm/Analysis/LoopAccessAnalysis.h"
51#include "llvm/Analysis/LoopInfo.h"
52#include "llvm/Analysis/LoopPass.h"
53#include "llvm/Analysis/MemoryLocation.h"
54#include "llvm/Analysis/OptimizationRemarkEmitter.h"
55#include "llvm/Analysis/ScalarEvolution.h"
56#include "llvm/Analysis/ScalarEvolutionExpander.h"
57#include "llvm/Analysis/ScalarEvolutionExpressions.h"
58#include "llvm/Analysis/TargetLibraryInfo.h"
59#include "llvm/Analysis/TargetTransformInfo.h"
60#include "llvm/Analysis/ValueTracking.h"
61#include "llvm/IR/Attributes.h"
62#include "llvm/IR/BasicBlock.h"
63#include "llvm/IR/Constant.h"
64#include "llvm/IR/Constants.h"
65#include "llvm/IR/DataLayout.h"
66#include "llvm/IR/DebugLoc.h"
67#include "llvm/IR/DerivedTypes.h"
68#include "llvm/IR/Dominators.h"
69#include "llvm/IR/GlobalValue.h"
70#include "llvm/IR/GlobalVariable.h"
71#include "llvm/IR/IRBuilder.h"
72#include "llvm/IR/InstrTypes.h"
73#include "llvm/IR/Instruction.h"
74#include "llvm/IR/Instructions.h"
75#include "llvm/IR/IntrinsicInst.h"
76#include "llvm/IR/Intrinsics.h"
77#include "llvm/IR/LLVMContext.h"
78#include "llvm/IR/Module.h"
79#include "llvm/IR/PassManager.h"
80#include "llvm/IR/Type.h"
81#include "llvm/IR/User.h"
82#include "llvm/IR/Value.h"
83#include "llvm/IR/ValueHandle.h"
84#include "llvm/Pass.h"
85#include "llvm/Support/Casting.h"
86#include "llvm/Support/CommandLine.h"
87#include "llvm/Support/Debug.h"
88#include "llvm/Support/raw_ostream.h"
89#include "llvm/Transforms/Scalar.h"
90#include "llvm/Transforms/Utils/BuildLibCalls.h"
91#include "llvm/Transforms/Utils/Local.h"
92#include "llvm/Transforms/Utils/LoopUtils.h"
93#include <algorithm>
94#include <cassert>
95#include <cstdint>
96#include <utility>
97#include <vector>
98
99using namespace llvm;
100
101#define DEBUG_TYPE"loop-idiom" "loop-idiom"
102
103STATISTIC(NumMemSet, "Number of memset's formed from loop stores")static llvm::Statistic NumMemSet = {"loop-idiom", "NumMemSet"
, "Number of memset's formed from loop stores"}
;
104STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores")static llvm::Statistic NumMemCpy = {"loop-idiom", "NumMemCpy"
, "Number of memcpy's formed from loop load+stores"}
;
105
106static cl::opt<bool> UseLIRCodeSizeHeurs(
107 "use-lir-code-size-heurs",
108 cl::desc("Use loop idiom recognition code size heuristics when compiling"
109 "with -Os/-Oz"),
110 cl::init(true), cl::Hidden);
111
112namespace {
113
114class LoopIdiomRecognize {
115 Loop *CurLoop = nullptr;
116 AliasAnalysis *AA;
117 DominatorTree *DT;
118 LoopInfo *LI;
119 ScalarEvolution *SE;
120 TargetLibraryInfo *TLI;
121 const TargetTransformInfo *TTI;
122 const DataLayout *DL;
123 OptimizationRemarkEmitter &ORE;
124 bool ApplyCodeSizeHeuristics;
125
126public:
127 explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
128 LoopInfo *LI, ScalarEvolution *SE,
129 TargetLibraryInfo *TLI,
130 const TargetTransformInfo *TTI,
131 const DataLayout *DL,
132 OptimizationRemarkEmitter &ORE)
133 : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {}
134
135 bool runOnLoop(Loop *L);
136
137private:
138 using StoreList = SmallVector<StoreInst *, 8>;
139 using StoreListMap = MapVector<Value *, StoreList>;
140
141 StoreListMap StoreRefsForMemset;
142 StoreListMap StoreRefsForMemsetPattern;
143 StoreList StoreRefsForMemcpy;
144 bool HasMemset;
145 bool HasMemsetPattern;
146 bool HasMemcpy;
147
148 /// Return code for isLegalStore()
149 enum LegalStoreKind {
150 None = 0,
151 Memset,
152 MemsetPattern,
153 Memcpy,
154 UnorderedAtomicMemcpy,
155 DontUse // Dummy retval never to be used. Allows catching errors in retval
156 // handling.
157 };
158
159 /// \name Countable Loop Idiom Handling
160 /// @{
161
162 bool runOnCountableLoop();
163 bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
164 SmallVectorImpl<BasicBlock *> &ExitBlocks);
165
166 void collectStores(BasicBlock *BB);
167 LegalStoreKind isLegalStore(StoreInst *SI);
168 enum class ForMemset { No, Yes };
169 bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
170 ForMemset For);
171 bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
172
173 bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
174 unsigned StoreAlignment, Value *StoredVal,
175 Instruction *TheStore,
176 SmallPtrSetImpl<Instruction *> &Stores,
177 const SCEVAddRecExpr *Ev, const SCEV *BECount,
178 bool NegStride, bool IsLoopMemset = false);
179 bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
180 bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
181 bool IsLoopMemset = false);
182
183 /// @}
184 /// \name Noncountable Loop Idiom Handling
185 /// @{
186
187 bool runOnNoncountableLoop();
188
189 bool recognizePopcount();
190 void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
191 PHINode *CntPhi, Value *Var);
192 bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
193 void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
194 Instruction *CntInst, PHINode *CntPhi,
195 Value *Var, Instruction *DefX,
196 const DebugLoc &DL, bool ZeroCheck,
197 bool IsCntPhiUsedOutsideLoop);
198
199 /// @}
200};
201
202class LoopIdiomRecognizeLegacyPass : public LoopPass {
203public:
204 static char ID;
205
206 explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
207 initializeLoopIdiomRecognizeLegacyPassPass(
208 *PassRegistry::getPassRegistry());
209 }
210
211 bool runOnLoop(Loop *L, LPPassManager &LPM) override {
212 if (skipLoop(L))
213 return false;
214
215 AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
216 DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
217 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
218 ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
219 TargetLibraryInfo *TLI =
220 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
221 *L->getHeader()->getParent());
222 const TargetTransformInfo *TTI =
223 &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
224 *L->getHeader()->getParent());
225 const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
226
227 // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
228 // pass. Function analyses need to be preserved across loop transformations
229 // but ORE cannot be preserved (see comment before the pass definition).
230 OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
231
232 LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, ORE);
233 return LIR.runOnLoop(L);
234 }
235
236 /// This transformation requires natural loop information & requires that
237 /// loop preheaders be inserted into the CFG.
238 void getAnalysisUsage(AnalysisUsage &AU) const override {
239 AU.addRequired<TargetLibraryInfoWrapperPass>();
240 AU.addRequired<TargetTransformInfoWrapperPass>();
241 getLoopAnalysisUsage(AU);
242 }
243};
244
245} // end anonymous namespace
246
247char LoopIdiomRecognizeLegacyPass::ID = 0;
248
249PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
250 LoopStandardAnalysisResults &AR,
251 LPMUpdater &) {
252 const auto *DL = &L.getHeader()->getModule()->getDataLayout();
253
254 const auto &FAM =
255 AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
256 Function *F = L.getHeader()->getParent();
257
258 auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
259 // FIXME: This should probably be optional rather than required.
260 if (!ORE)
261 report_fatal_error(
262 "LoopIdiomRecognizePass: OptimizationRemarkEmitterAnalysis not cached "
263 "at a higher level");
264
265 LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL,
266 *ORE);
267 if (!LIR.runOnLoop(&L))
268 return PreservedAnalyses::all();
269
270 return getLoopPassPreservedAnalyses();
271}
272
273INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",static void *initializeLoopIdiomRecognizeLegacyPassPassOnce(PassRegistry
&Registry) {
274 "Recognize loop idioms", false, false)static void *initializeLoopIdiomRecognizeLegacyPassPassOnce(PassRegistry
&Registry) {
275INITIALIZE_PASS_DEPENDENCY(LoopPass)initializeLoopPassPass(Registry);
276INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)initializeTargetLibraryInfoWrapperPassPass(Registry);
277INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)initializeTargetTransformInfoWrapperPassPass(Registry);
278INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",PassInfo *PI = new PassInfo( "Recognize loop idioms", "loop-idiom"
, &LoopIdiomRecognizeLegacyPass::ID, PassInfo::NormalCtor_t
(callDefaultCtor<LoopIdiomRecognizeLegacyPass>), false,
false); Registry.registerPass(*PI, true); return PI; } static
llvm::once_flag InitializeLoopIdiomRecognizeLegacyPassPassFlag
; void llvm::initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry
&Registry) { llvm::call_once(InitializeLoopIdiomRecognizeLegacyPassPassFlag
, initializeLoopIdiomRecognizeLegacyPassPassOnce, std::ref(Registry
)); }
279 "Recognize loop idioms", false, false)PassInfo *PI = new PassInfo( "Recognize loop idioms", "loop-idiom"
, &LoopIdiomRecognizeLegacyPass::ID, PassInfo::NormalCtor_t
(callDefaultCtor<LoopIdiomRecognizeLegacyPass>), false,
false); Registry.registerPass(*PI, true); return PI; } static
llvm::once_flag InitializeLoopIdiomRecognizeLegacyPassPassFlag
; void llvm::initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry
&Registry) { llvm::call_once(InitializeLoopIdiomRecognizeLegacyPassPassFlag
, initializeLoopIdiomRecognizeLegacyPassPassOnce, std::ref(Registry
)); }
280
281Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
282
283static void deleteDeadInstruction(Instruction *I) {
284 I->replaceAllUsesWith(UndefValue::get(I->getType()));
285 I->eraseFromParent();
286}
287
288//===----------------------------------------------------------------------===//
289//
290// Implementation of LoopIdiomRecognize
291//
292//===----------------------------------------------------------------------===//
293
294bool LoopIdiomRecognize::runOnLoop(Loop *L) {
295 CurLoop = L;
296 // If the loop could not be converted to canonical form, it must have an
297 // indirectbr in it, just give up.
298 if (!L->getLoopPreheader())
299 return false;
300
301 // Disable loop idiom recognition if the function's name is a common idiom.
302 StringRef Name = L->getHeader()->getParent()->getName();
303 if (Name == "memset" || Name == "memcpy")
304 return false;
305
306 // Determine if code size heuristics need to be applied.
307 ApplyCodeSizeHeuristics =
308 L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
309
310 HasMemset = TLI->has(LibFunc_memset);
311 HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
312 HasMemcpy = TLI->has(LibFunc_memcpy);
313
314 if (HasMemset || HasMemsetPattern || HasMemcpy)
315 if (SE->hasLoopInvariantBackedgeTakenCount(L))
316 return runOnCountableLoop();
317
318 return runOnNoncountableLoop();
319}
320
321bool LoopIdiomRecognize::runOnCountableLoop() {
322 const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
323 assert(!isa<SCEVCouldNotCompute>(BECount) &&((!isa<SCEVCouldNotCompute>(BECount) && "runOnCountableLoop() called on a loop without a predictable"
"backedge-taken count") ? static_cast<void> (0) : __assert_fail
("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\""
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 325, __PRETTY_FUNCTION__))
324 "runOnCountableLoop() called on a loop without a predictable"((!isa<SCEVCouldNotCompute>(BECount) && "runOnCountableLoop() called on a loop without a predictable"
"backedge-taken count") ? static_cast<void> (0) : __assert_fail
("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\""
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 325, __PRETTY_FUNCTION__))
325 "backedge-taken count")((!isa<SCEVCouldNotCompute>(BECount) && "runOnCountableLoop() called on a loop without a predictable"
"backedge-taken count") ? static_cast<void> (0) : __assert_fail
("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\""
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 325, __PRETTY_FUNCTION__))
;
326
327 // If this loop executes exactly one time, then it should be peeled, not
328 // optimized by this pass.
329 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
330 if (BECst->getAPInt() == 0)
331 return false;
332
333 SmallVector<BasicBlock *, 8> ExitBlocks;
334 CurLoop->getUniqueExitBlocks(ExitBlocks);
335
336 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
337 << CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
338 << "] Countable Loop %" << CurLoop->getHeader()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
339 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
;
340
341 bool MadeChange = false;
342
343 // The following transforms hoist stores/memsets into the loop pre-header.
344 // Give up if the loop has instructions may throw.
345 SimpleLoopSafetyInfo SafetyInfo;
346 SafetyInfo.computeLoopSafetyInfo(CurLoop);
347 if (SafetyInfo.anyBlockMayThrow())
348 return MadeChange;
349
350 // Scan all the blocks in the loop that are not in subloops.
351 for (auto *BB : CurLoop->getBlocks()) {
352 // Ignore blocks in subloops.
353 if (LI->getLoopFor(BB) != CurLoop)
354 continue;
355
356 MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
357 }
358 return MadeChange;
359}
360
361static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
362 const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
363 return ConstStride->getAPInt();
364}
365
366/// getMemSetPatternValue - If a strided store of the specified value is safe to
367/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
368/// be passed in. Otherwise, return null.
369///
370/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
371/// just replicate their input array and then pass on to memset_pattern16.
372static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
373 // FIXME: This could check for UndefValue because it can be merged into any
374 // other valid pattern.
375
376 // If the value isn't a constant, we can't promote it to being in a constant
377 // array. We could theoretically do a store to an alloca or something, but
378 // that doesn't seem worthwhile.
379 Constant *C = dyn_cast<Constant>(V);
380 if (!C)
381 return nullptr;
382
383 // Only handle simple values that are a power of two bytes in size.
384 uint64_t Size = DL->getTypeSizeInBits(V->getType());
385 if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
386 return nullptr;
387
388 // Don't care enough about darwin/ppc to implement this.
389 if (DL->isBigEndian())
390 return nullptr;
391
392 // Convert to size in bytes.
393 Size /= 8;
394
395 // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
396 // if the top and bottom are the same (e.g. for vectors and large integers).
397 if (Size > 16)
398 return nullptr;
399
400 // If the constant is exactly 16 bytes, just use it.
401 if (Size == 16)
402 return C;
403
404 // Otherwise, we'll use an array of the constants.
405 unsigned ArraySize = 16 / Size;
406 ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
407 return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
408}
409
410LoopIdiomRecognize::LegalStoreKind
411LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
412 // Don't touch volatile stores.
413 if (SI->isVolatile())
414 return LegalStoreKind::None;
415 // We only want simple or unordered-atomic stores.
416 if (!SI->isUnordered())
417 return LegalStoreKind::None;
418
419 // Don't convert stores of non-integral pointer types to memsets (which stores
420 // integers).
421 if (DL->isNonIntegralPointerType(SI->getValueOperand()->getType()))
422 return LegalStoreKind::None;
423
424 // Avoid merging nontemporal stores.
425 if (SI->getMetadata(LLVMContext::MD_nontemporal))
426 return LegalStoreKind::None;
427
428 Value *StoredVal = SI->getValueOperand();
429 Value *StorePtr = SI->getPointerOperand();
430
431 // Reject stores that are so large that they overflow an unsigned.
432 uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
433 if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
434 return LegalStoreKind::None;
435
436 // See if the pointer expression is an AddRec like {base,+,1} on the current
437 // loop, which indicates a strided store. If we have something else, it's a
438 // random store we can't handle.
439 const SCEVAddRecExpr *StoreEv =
440 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
441 if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
442 return LegalStoreKind::None;
443
444 // Check to see if we have a constant stride.
445 if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
446 return LegalStoreKind::None;
447
448 // See if the store can be turned into a memset.
449
450 // If the stored value is a byte-wise value (like i32 -1), then it may be
451 // turned into a memset of i8 -1, assuming that all the consecutive bytes
452 // are stored. A store of i32 0x01020304 can never be turned into a memset,
453 // but it can be turned into memset_pattern if the target supports it.
454 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
455 Constant *PatternValue = nullptr;
456
457 // Note: memset and memset_pattern on unordered-atomic is yet not supported
458 bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
459
460 // If we're allowed to form a memset, and the stored value would be
461 // acceptable for memset, use it.
462 if (!UnorderedAtomic && HasMemset && SplatValue &&
463 // Verify that the stored value is loop invariant. If not, we can't
464 // promote the memset.
465 CurLoop->isLoopInvariant(SplatValue)) {
466 // It looks like we can use SplatValue.
467 return LegalStoreKind::Memset;
468 } else if (!UnorderedAtomic && HasMemsetPattern &&
469 // Don't create memset_pattern16s with address spaces.
470 StorePtr->getType()->getPointerAddressSpace() == 0 &&
471 (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
Although the value stored to 'PatternValue' is used in the enclosing expression, the value is never actually read from 'PatternValue'
472 // It looks like we can use PatternValue!
473 return LegalStoreKind::MemsetPattern;
474 }
475
476 // Otherwise, see if the store can be turned into a memcpy.
477 if (HasMemcpy) {
478 // Check to see if the stride matches the size of the store. If so, then we
479 // know that every byte is touched in the loop.
480 APInt Stride = getStoreStride(StoreEv);
481 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
482 if (StoreSize != Stride && StoreSize != -Stride)
483 return LegalStoreKind::None;
484
485 // The store must be feeding a non-volatile load.
486 LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
487
488 // Only allow non-volatile loads
489 if (!LI || LI->isVolatile())
490 return LegalStoreKind::None;
491 // Only allow simple or unordered-atomic loads
492 if (!LI->isUnordered())
493 return LegalStoreKind::None;
494
495 // See if the pointer expression is an AddRec like {base,+,1} on the current
496 // loop, which indicates a strided load. If we have something else, it's a
497 // random load we can't handle.
498 const SCEVAddRecExpr *LoadEv =
499 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
500 if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
501 return LegalStoreKind::None;
502
503 // The store and load must share the same stride.
504 if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
505 return LegalStoreKind::None;
506
507 // Success. This store can be converted into a memcpy.
508 UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
509 return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
510 : LegalStoreKind::Memcpy;
511 }
512 // This store can't be transformed into a memset/memcpy.
513 return LegalStoreKind::None;
514}
515
516void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
517 StoreRefsForMemset.clear();
518 StoreRefsForMemsetPattern.clear();
519 StoreRefsForMemcpy.clear();
520 for (Instruction &I : *BB) {
521 StoreInst *SI = dyn_cast<StoreInst>(&I);
522 if (!SI)
523 continue;
524
525 // Make sure this is a strided store with a constant stride.
526 switch (isLegalStore(SI)) {
527 case LegalStoreKind::None:
528 // Nothing to do
529 break;
530 case LegalStoreKind::Memset: {
531 // Find the base pointer.
532 Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
533 StoreRefsForMemset[Ptr].push_back(SI);
534 } break;
535 case LegalStoreKind::MemsetPattern: {
536 // Find the base pointer.
537 Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
538 StoreRefsForMemsetPattern[Ptr].push_back(SI);
539 } break;
540 case LegalStoreKind::Memcpy:
541 case LegalStoreKind::UnorderedAtomicMemcpy:
542 StoreRefsForMemcpy.push_back(SI);
543 break;
544 default:
545 assert(false && "unhandled return value")((false && "unhandled return value") ? static_cast<
void> (0) : __assert_fail ("false && \"unhandled return value\""
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 545, __PRETTY_FUNCTION__))
;
546 break;
547 }
548 }
549}
550
551/// runOnLoopBlock - Process the specified block, which lives in a counted loop
552/// with the specified backedge count. This block is known to be in the current
553/// loop and not in any subloops.
554bool LoopIdiomRecognize::runOnLoopBlock(
555 BasicBlock *BB, const SCEV *BECount,
556 SmallVectorImpl<BasicBlock *> &ExitBlocks) {
557 // We can only promote stores in this block if they are unconditionally
558 // executed in the loop. For a block to be unconditionally executed, it has
559 // to dominate all the exit blocks of the loop. Verify this now.
560 for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
561 if (!DT->dominates(BB, ExitBlocks[i]))
562 return false;
563
564 bool MadeChange = false;
565 // Look for store instructions, which may be optimized to memset/memcpy.
566 collectStores(BB);
567
568 // Look for a single store or sets of stores with a common base, which can be
569 // optimized into a memset (memset_pattern). The latter most commonly happens
570 // with structs and handunrolled loops.
571 for (auto &SL : StoreRefsForMemset)
572 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
573
574 for (auto &SL : StoreRefsForMemsetPattern)
575 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
576
577 // Optimize the store into a memcpy, if it feeds an similarly strided load.
578 for (auto &SI : StoreRefsForMemcpy)
579 MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
580
581 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
582 Instruction *Inst = &*I++;
583 // Look for memset instructions, which may be optimized to a larger memset.
584 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
585 WeakTrackingVH InstPtr(&*I);
586 if (!processLoopMemSet(MSI, BECount))
587 continue;
588 MadeChange = true;
589
590 // If processing the memset invalidated our iterator, start over from the
591 // top of the block.
592 if (!InstPtr)
593 I = BB->begin();
594 continue;
595 }
596 }
597
598 return MadeChange;
599}
600
601/// See if this store(s) can be promoted to a memset.
602bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
603 const SCEV *BECount, ForMemset For) {
604 // Try to find consecutive stores that can be transformed into memsets.
605 SetVector<StoreInst *> Heads, Tails;
606 SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
607
608 // Do a quadratic search on all of the given stores and find
609 // all of the pairs of stores that follow each other.
610 SmallVector<unsigned, 16> IndexQueue;
611 for (unsigned i = 0, e = SL.size(); i < e; ++i) {
612 assert(SL[i]->isSimple() && "Expected only non-volatile stores.")((SL[i]->isSimple() && "Expected only non-volatile stores."
) ? static_cast<void> (0) : __assert_fail ("SL[i]->isSimple() && \"Expected only non-volatile stores.\""
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 612, __PRETTY_FUNCTION__))
;
613
614 Value *FirstStoredVal = SL[i]->getValueOperand();
615 Value *FirstStorePtr = SL[i]->getPointerOperand();
616 const SCEVAddRecExpr *FirstStoreEv =
617 cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
618 APInt FirstStride = getStoreStride(FirstStoreEv);
619 unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
620
621 // See if we can optimize just this store in isolation.
622 if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
623 Heads.insert(SL[i]);
624 continue;
625 }
626
627 Value *FirstSplatValue = nullptr;
628 Constant *FirstPatternValue = nullptr;
629
630 if (For == ForMemset::Yes)
631 FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
632 else
633 FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
634
635 assert((FirstSplatValue || FirstPatternValue) &&(((FirstSplatValue || FirstPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(FirstSplatValue || FirstPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 636, __PRETTY_FUNCTION__))
636 "Expected either splat value or pattern value.")(((FirstSplatValue || FirstPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(FirstSplatValue || FirstPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 636, __PRETTY_FUNCTION__))
;
637
638 IndexQueue.clear();
639 // If a store has multiple consecutive store candidates, search Stores
640 // array according to the sequence: from i+1 to e, then from i-1 to 0.
641 // This is because usually pairing with immediate succeeding or preceding
642 // candidate create the best chance to find memset opportunity.
643 unsigned j = 0;
644 for (j = i + 1; j < e; ++j)
645 IndexQueue.push_back(j);
646 for (j = i; j > 0; --j)
647 IndexQueue.push_back(j - 1);
648
649 for (auto &k : IndexQueue) {
650 assert(SL[k]->isSimple() && "Expected only non-volatile stores.")((SL[k]->isSimple() && "Expected only non-volatile stores."
) ? static_cast<void> (0) : __assert_fail ("SL[k]->isSimple() && \"Expected only non-volatile stores.\""
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 650, __PRETTY_FUNCTION__))
;
651 Value *SecondStorePtr = SL[k]->getPointerOperand();
652 const SCEVAddRecExpr *SecondStoreEv =
653 cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
654 APInt SecondStride = getStoreStride(SecondStoreEv);
655
656 if (FirstStride != SecondStride)
657 continue;
658
659 Value *SecondStoredVal = SL[k]->getValueOperand();
660 Value *SecondSplatValue = nullptr;
661 Constant *SecondPatternValue = nullptr;
662
663 if (For == ForMemset::Yes)
664 SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
665 else
666 SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
667
668 assert((SecondSplatValue || SecondPatternValue) &&(((SecondSplatValue || SecondPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SecondSplatValue || SecondPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 669, __PRETTY_FUNCTION__))
669 "Expected either splat value or pattern value.")(((SecondSplatValue || SecondPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SecondSplatValue || SecondPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 669, __PRETTY_FUNCTION__))
;
670
671 if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
672 if (For == ForMemset::Yes) {
673 if (isa<UndefValue>(FirstSplatValue))
674 FirstSplatValue = SecondSplatValue;
675 if (FirstSplatValue != SecondSplatValue)
676 continue;
677 } else {
678 if (isa<UndefValue>(FirstPatternValue))
679 FirstPatternValue = SecondPatternValue;
680 if (FirstPatternValue != SecondPatternValue)
681 continue;
682 }
683 Tails.insert(SL[k]);
684 Heads.insert(SL[i]);
685 ConsecutiveChain[SL[i]] = SL[k];
686 break;
687 }
688 }
689 }
690
691 // We may run into multiple chains that merge into a single chain. We mark the
692 // stores that we transformed so that we don't visit the same store twice.
693 SmallPtrSet<Value *, 16> TransformedStores;
694 bool Changed = false;
695
696 // For stores that start but don't end a link in the chain:
697 for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
698 it != e; ++it) {
699 if (Tails.count(*it))
700 continue;
701
702 // We found a store instr that starts a chain. Now follow the chain and try
703 // to transform it.
704 SmallPtrSet<Instruction *, 8> AdjacentStores;
705 StoreInst *I = *it;
706
707 StoreInst *HeadStore = I;
708 unsigned StoreSize = 0;
709
710 // Collect the chain into a list.
711 while (Tails.count(I) || Heads.count(I)) {
712 if (TransformedStores.count(I))
713 break;
714 AdjacentStores.insert(I);
715
716 StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
717 // Move to the next value in the chain.
718 I = ConsecutiveChain[I];
719 }
720
721 Value *StoredVal = HeadStore->getValueOperand();
722 Value *StorePtr = HeadStore->getPointerOperand();
723 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
724 APInt Stride = getStoreStride(StoreEv);
725
726 // Check to see if the stride matches the size of the stores. If so, then
727 // we know that every byte is touched in the loop.
728 if (StoreSize != Stride && StoreSize != -Stride)
729 continue;
730
731 bool NegStride = StoreSize == -Stride;
732
733 if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(),
734 StoredVal, HeadStore, AdjacentStores, StoreEv,
735 BECount, NegStride)) {
736 TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
737 Changed = true;
738 }
739 }
740
741 return Changed;
742}
743
744/// processLoopMemSet - See if this memset can be promoted to a large memset.
745bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
746 const SCEV *BECount) {
747 // We can only handle non-volatile memsets with a constant size.
748 if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
749 return false;
750
751 // If we're not allowed to hack on memset, we fail.
752 if (!HasMemset)
753 return false;
754
755 Value *Pointer = MSI->getDest();
756
757 // See if the pointer expression is an AddRec like {base,+,1} on the current
758 // loop, which indicates a strided store. If we have something else, it's a
759 // random store we can't handle.
760 const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
761 if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
762 return false;
763
764 // Reject memsets that are so large that they overflow an unsigned.
765 uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
766 if ((SizeInBytes >> 32) != 0)
767 return false;
768
769 // Check to see if the stride matches the size of the memset. If so, then we
770 // know that every byte is touched in the loop.
771 const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
772 if (!ConstStride)
773 return false;
774
775 APInt Stride = ConstStride->getAPInt();
776 if (SizeInBytes != Stride && SizeInBytes != -Stride)
777 return false;
778
779 // Verify that the memset value is loop invariant. If not, we can't promote
780 // the memset.
781 Value *SplatValue = MSI->getValue();
782 if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
783 return false;
784
785 SmallPtrSet<Instruction *, 1> MSIs;
786 MSIs.insert(MSI);
787 bool NegStride = SizeInBytes == -Stride;
788 return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
789 MSI->getDestAlignment(), SplatValue, MSI, MSIs,
790 Ev, BECount, NegStride, /*IsLoopMemset=*/true);
791}
792
793/// mayLoopAccessLocation - Return true if the specified loop might access the
794/// specified pointer location, which is a loop-strided access. The 'Access'
795/// argument specifies what the verboten forms of access are (read or write).
796static bool
797mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
798 const SCEV *BECount, unsigned StoreSize,
799 AliasAnalysis &AA,
800 SmallPtrSetImpl<Instruction *> &IgnoredStores) {
801 // Get the location that may be stored across the loop. Since the access is
802 // strided positively through memory, we say that the modified location starts
803 // at the pointer and has infinite size.
804 LocationSize AccessSize = LocationSize::unknown();
805
806 // If the loop iterates a fixed number of times, we can refine the access size
807 // to be exactly the size of the memset, which is (BECount+1)*StoreSize
808 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
809 AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
810 StoreSize);
811
812 // TODO: For this to be really effective, we have to dive into the pointer
813 // operand in the store. Store to &A[i] of 100 will always return may alias
814 // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
815 // which will then no-alias a store to &A[100].
816 MemoryLocation StoreLoc(Ptr, AccessSize);
817
818 for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
819 ++BI)
820 for (Instruction &I : **BI)
821 if (IgnoredStores.count(&I) == 0 &&
822 isModOrRefSet(
823 intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
824 return true;
825
826 return false;
827}
828
829// If we have a negative stride, Start refers to the end of the memory location
830// we're trying to memset. Therefore, we need to recompute the base pointer,
831// which is just Start - BECount*Size.
832static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
833 Type *IntPtr, unsigned StoreSize,
834 ScalarEvolution *SE) {
835 const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
836 if (StoreSize != 1)
837 Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
838 SCEV::FlagNUW);
839 return SE->getMinusSCEV(Start, Index);
840}
841
842/// Compute the number of bytes as a SCEV from the backedge taken count.
843///
844/// This also maps the SCEV into the provided type and tries to handle the
845/// computation in a way that will fold cleanly.
846static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
847 unsigned StoreSize, Loop *CurLoop,
848 const DataLayout *DL, ScalarEvolution *SE) {
849 const SCEV *NumBytesS;
850 // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
851 // pointer size if it isn't already.
852 //
853 // If we're going to need to zero extend the BE count, check if we can add
854 // one to it prior to zero extending without overflow. Provided this is safe,
855 // it allows better simplification of the +1.
856 if (DL->getTypeSizeInBits(BECount->getType()) <
857 DL->getTypeSizeInBits(IntPtr) &&
858 SE->isLoopEntryGuardedByCond(
859 CurLoop, ICmpInst::ICMP_NE, BECount,
860 SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
861 NumBytesS = SE->getZeroExtendExpr(
862 SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
863 IntPtr);
864 } else {
865 NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
866 SE->getOne(IntPtr), SCEV::FlagNUW);
867 }
868
869 // And scale it based on the store size.
870 if (StoreSize != 1) {
871 NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
872 SCEV::FlagNUW);
873 }
874 return NumBytesS;
875}
876
877/// processLoopStridedStore - We see a strided store of some value. If we can
878/// transform this into a memset or memset_pattern in the loop preheader, do so.
879bool LoopIdiomRecognize::processLoopStridedStore(
880 Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment,
881 Value *StoredVal, Instruction *TheStore,
882 SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
883 const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
884 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
885 Constant *PatternValue = nullptr;
886
887 if (!SplatValue)
888 PatternValue = getMemSetPatternValue(StoredVal, DL);
889
890 assert((SplatValue || PatternValue) &&(((SplatValue || PatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SplatValue || PatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 891, __PRETTY_FUNCTION__))
891 "Expected either splat value or pattern value.")(((SplatValue || PatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SplatValue || PatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 891, __PRETTY_FUNCTION__))
;
892
893 // The trip count of the loop and the base pointer of the addrec SCEV is
894 // guaranteed to be loop invariant, which means that it should dominate the
895 // header. This allows us to insert code for it in the preheader.
896 unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
897 BasicBlock *Preheader = CurLoop->getLoopPreheader();
898 IRBuilder<> Builder(Preheader->getTerminator());
899 SCEVExpander Expander(*SE, *DL, "loop-idiom");
900
901 Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
902 Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS);
903
904 const SCEV *Start = Ev->getStart();
905 // Handle negative strided loops.
906 if (NegStride)
907 Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE);
908
909 // TODO: ideally we should still be able to generate memset if SCEV expander
910 // is taught to generate the dependencies at the latest point.
911 if (!isSafeToExpand(Start, *SE))
912 return false;
913
914 // Okay, we have a strided store "p[i]" of a splattable value. We can turn
915 // this into a memset in the loop preheader now if we want. However, this
916 // would be unsafe to do if there is anything else in the loop that may read
917 // or write to the aliased location. Check for any overlap by generating the
918 // base pointer and checking the region.
919 Value *BasePtr =
920 Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
921 if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
922 StoreSize, *AA, Stores)) {
923 Expander.clear();
924 // If we generated new code for the base pointer, clean up.
925 RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI);
926 return false;
927 }
928
929 if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
930 return false;
931
932 // Okay, everything looks good, insert the memset.
933
934 const SCEV *NumBytesS =
935 getNumBytes(BECount, IntPtr, StoreSize, CurLoop, DL, SE);
936
937 // TODO: ideally we should still be able to generate memset if SCEV expander
938 // is taught to generate the dependencies at the latest point.
939 if (!isSafeToExpand(NumBytesS, *SE))
940 return false;
941
942 Value *NumBytes =
943 Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
944
945 CallInst *NewCall;
946 if (SplatValue) {
947 NewCall =
948 Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment);
949 } else {
950 // Everything is emitted in default address space
951 Type *Int8PtrTy = DestInt8PtrTy;
952
953 Module *M = TheStore->getModule();
954 StringRef FuncName = "memset_pattern16";
955 FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
956 Int8PtrTy, Int8PtrTy, IntPtr);
957 inferLibFuncAttributes(M, FuncName, *TLI);
958
959 // Otherwise we should form a memset_pattern16. PatternValue is known to be
960 // an constant array of 16-bytes. Plop the value into a mergable global.
961 GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
962 GlobalValue::PrivateLinkage,
963 PatternValue, ".memset_pattern");
964 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
965 GV->setAlignment(Align(16));
966 Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
967 NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
968 }
969
970 LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memset: " <<
*NewCall << "\n" << " from store to: " <<
*Ev << " at: " << *TheStore << "\n"; } } while
(false)
971 << " from store to: " << *Ev << " at: " << *TheStoredo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memset: " <<
*NewCall << "\n" << " from store to: " <<
*Ev << " at: " << *TheStore << "\n"; } } while
(false)
972 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memset: " <<
*NewCall << "\n" << " from store to: " <<
*Ev << " at: " << *TheStore << "\n"; } } while
(false)
;
973 NewCall->setDebugLoc(TheStore->getDebugLoc());
974
975 ORE.emit([&]() {
976 return OptimizationRemark(DEBUG_TYPE"loop-idiom", "ProcessLoopStridedStore",
977 NewCall->getDebugLoc(), Preheader)
978 << "Transformed loop-strided store into a call to "
979 << ore::NV("NewFunction", NewCall->getCalledFunction())
980 << "() function";
981 });
982
983 // Okay, the memset has been formed. Zap the original store and anything that
984 // feeds into it.
985 for (auto *I : Stores)
986 deleteDeadInstruction(I);
987 ++NumMemSet;
988 return true;
989}
990
991/// If the stored value is a strided load in the same loop with the same stride
992/// this may be transformable into a memcpy. This kicks in for stuff like
993/// for (i) A[i] = B[i];
994bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
995 const SCEV *BECount) {
996 assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.")((SI->isUnordered() && "Expected only non-volatile non-ordered stores."
) ? static_cast<void> (0) : __assert_fail ("SI->isUnordered() && \"Expected only non-volatile non-ordered stores.\""
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 996, __PRETTY_FUNCTION__))
;
997
998 Value *StorePtr = SI->getPointerOperand();
999 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1000 APInt Stride = getStoreStride(StoreEv);
1001 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
1002 bool NegStride = StoreSize == -Stride;
1003
1004 // The store must be feeding a non-volatile load.
1005 LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
1006 assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.")((LI->isUnordered() && "Expected only non-volatile non-ordered loads."
) ? static_cast<void> (0) : __assert_fail ("LI->isUnordered() && \"Expected only non-volatile non-ordered loads.\""
, "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 1006, __PRETTY_FUNCTION__))
;
1007
1008 // See if the pointer expression is an AddRec like {base,+,1} on the current
1009 // loop, which indicates a strided load. If we have something else, it's a
1010 // random load we can't handle.
1011 const SCEVAddRecExpr *LoadEv =
1012 cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
1013
1014 // The trip count of the loop and the base pointer of the addrec SCEV is
1015 // guaranteed to be loop invariant, which means that it should dominate the
1016 // header. This allows us to insert code for it in the preheader.
1017 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1018 IRBuilder<> Builder(Preheader->getTerminator());
1019 SCEVExpander Expander(*SE, *DL, "loop-idiom");
1020
1021 const SCEV *StrStart = StoreEv->getStart();
1022 unsigned StrAS = SI->getPointerAddressSpace();
1023 Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS);
1024
1025 // Handle negative strided loops.
1026 if (NegStride)
1027 StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE);
1028
1029 // Okay, we have a strided store "p[i]" of a loaded value. We can turn
1030 // this into a memcpy in the loop preheader now if we want. However, this
1031 // would be unsafe to do if there is anything else in the loop that may read
1032 // or write the memory region we're storing to. This includes the load that
1033 // feeds the stores. Check for an alias by generating the base address and
1034 // checking everything.
1035 Value *StoreBasePtr = Expander.expandCodeFor(
1036 StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
1037
1038 SmallPtrSet<Instruction *, 1> Stores;
1039 Stores.insert(SI);
1040 if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1041 StoreSize, *AA, Stores)) {
1042 Expander.clear();
1043 // If we generated new code for the base pointer, clean up.
1044 RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
1045 return false;
1046 }
1047
1048 const SCEV *LdStart = LoadEv->getStart();
1049 unsigned LdAS = LI->getPointerAddressSpace();
1050
1051 // Handle negative strided loops.
1052 if (NegStride)
1053 LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE);
1054
1055 // For a memcpy, we have to make sure that the input array is not being
1056 // mutated by the loop.
1057 Value *LoadBasePtr = Expander.expandCodeFor(
1058 LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
1059
1060 if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
1061 StoreSize, *AA, Stores)) {
1062 Expander.clear();
1063 // If we generated new code for the base pointer, clean up.
1064 RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
1065 RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
1066 return false;
1067 }
1068
1069 if (avoidLIRForMultiBlockLoop())
1070 return false;
1071
1072 // Okay, everything is safe, we can transform this!
1073
1074 const SCEV *NumBytesS =
1075 getNumBytes(BECount, IntPtrTy, StoreSize, CurLoop, DL, SE);
1076
1077 Value *NumBytes =
1078 Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
1079
1080 CallInst *NewCall = nullptr;
1081 // Check whether to generate an unordered atomic memcpy:
1082 // If the load or store are atomic, then they must necessarily be unordered
1083 // by previous checks.
1084 if (!SI->isAtomic() && !LI->isAtomic())
1085 NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlignment(),
1086 LoadBasePtr, LI->getAlignment(), NumBytes);
1087 else {
1088 // We cannot allow unaligned ops for unordered load/store, so reject
1089 // anything where the alignment isn't at least the element size.
1090 unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
1091 if (Align < StoreSize)
1092 return false;
1093
1094 // If the element.atomic memcpy is not lowered into explicit
1095 // loads/stores later, then it will be lowered into an element-size
1096 // specific lib call. If the lib call doesn't exist for our store size, then
1097 // we shouldn't generate the memcpy.
1098 if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1099 return false;
1100
1101 // Create the call.
1102 // Note that unordered atomic loads/stores are *required* by the spec to
1103 // have an alignment but non-atomic loads/stores may not.
1104 NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1105 StoreBasePtr, SI->getAlignment(), LoadBasePtr, LI->getAlignment(),
1106 NumBytes, StoreSize);
1107 }
1108 NewCall->setDebugLoc(SI->getDebugLoc());
1109
1110 LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
1111 << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
1112 << " from store ptr=" << *StoreEv << " at: " << *SIdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
1113 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
;
1114
1115 ORE.emit([&]() {
1116 return OptimizationRemark(DEBUG_TYPE"loop-idiom", "ProcessLoopStoreOfLoopLoad",
1117 NewCall->getDebugLoc(), Preheader)
1118 << "Formed a call to "
1119 << ore::NV("NewFunction", NewCall->getCalledFunction())
1120 << "() function";
1121 });
1122
1123 // Okay, the memcpy has been formed. Zap the original store and anything that
1124 // feeds into it.
1125 deleteDeadInstruction(SI);
1126 ++NumMemCpy;
1127 return true;
1128}
1129
1130// When compiling for codesize we avoid idiom recognition for a multi-block loop
1131// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1132//
1133bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1134 bool IsLoopMemset) {
1135 if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1136 if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) {
1137 LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " " << CurLoop->getHeader
()->getParent()->getName() << " : LIR " << (
IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n"
; } } while (false)
1138 << " : LIR " << (IsMemset ? "Memset" : "Memcpy")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " " << CurLoop->getHeader
()->getParent()->getName() << " : LIR " << (
IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n"
; } } while (false)
1139 << " avoided: multi-block top-level loop\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " " << CurLoop->getHeader
()->getParent()->getName() << " : LIR " << (
IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n"
; } } while (false)
;
1140 return true;
1141 }
1142 }
1143
1144 return false;
1145}
1146
1147bool LoopIdiomRecognize::runOnNoncountableLoop() {
1148 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
1149 << CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
1150 << "] Noncountable Loop %"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
1151 << CurLoop->getHeader()->getName() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
;
1152
1153 return recognizePopcount() || recognizeAndInsertFFS();
1154}
1155
1156/// Check if the given conditional branch is based on the comparison between
1157/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
1158/// true), the control yields to the loop entry. If the branch matches the
1159/// behavior, the variable involved in the comparison is returned. This function
1160/// will be called to see if the precondition and postcondition of the loop are
1161/// in desirable form.
1162static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
1163 bool JmpOnZero = false) {
1164 if (!BI || !BI->isConditional())
1165 return nullptr;
1166
1167 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1168 if (!Cond)
1169 return nullptr;
1170
1171 ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1172 if (!CmpZero || !CmpZero->isZero())
1173 return nullptr;
1174
1175 BasicBlock *TrueSucc = BI->getSuccessor(0);
1176 BasicBlock *FalseSucc = BI->getSuccessor(1);
1177 if (JmpOnZero)
1178 std::swap(TrueSucc, FalseSucc);
1179
1180 ICmpInst::Predicate Pred = Cond->getPredicate();
1181 if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1182 (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1183 return Cond->getOperand(0);
1184
1185 return nullptr;
1186}
1187
1188// Check if the recurrence variable `VarX` is in the right form to create
1189// the idiom. Returns the value coerced to a PHINode if so.
1190static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
1191 BasicBlock *LoopEntry) {
1192 auto *PhiX = dyn_cast<PHINode>(VarX);
1193 if (PhiX && PhiX->getParent() == LoopEntry &&
1194 (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1195 return PhiX;
1196 return nullptr;
1197}
1198
1199/// Return true iff the idiom is detected in the loop.
1200///
1201/// Additionally:
1202/// 1) \p CntInst is set to the instruction counting the population bit.
1203/// 2) \p CntPhi is set to the corresponding phi node.
1204/// 3) \p Var is set to the value whose population bits are being counted.
1205///
1206/// The core idiom we are trying to detect is:
1207/// \code
1208/// if (x0 != 0)
1209/// goto loop-exit // the precondition of the loop
1210/// cnt0 = init-val;
1211/// do {
1212/// x1 = phi (x0, x2);
1213/// cnt1 = phi(cnt0, cnt2);
1214///
1215/// cnt2 = cnt1 + 1;
1216/// ...
1217/// x2 = x1 & (x1 - 1);
1218/// ...
1219/// } while(x != 0);
1220///
1221/// loop-exit:
1222/// \endcode
1223static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1224 Instruction *&CntInst, PHINode *&CntPhi,
1225 Value *&Var) {
1226 // step 1: Check to see if the look-back branch match this pattern:
1227 // "if (a!=0) goto loop-entry".
1228 BasicBlock *LoopEntry;
1229 Instruction *DefX2, *CountInst;
1230 Value *VarX1, *VarX0;
1231 PHINode *PhiX, *CountPhi;
1232
1233 DefX2 = CountInst = nullptr;
1234 VarX1 = VarX0 = nullptr;
1235 PhiX = CountPhi = nullptr;
1236 LoopEntry = *(CurLoop->block_begin());
1237
1238 // step 1: Check if the loop-back branch is in desirable form.
1239 {
1240 if (Value *T = matchCondition(
1241 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1242 DefX2 = dyn_cast<Instruction>(T);
1243 else
1244 return false;
1245 }
1246
1247 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1248 {
1249 if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1250 return false;
1251
1252 BinaryOperator *SubOneOp;
1253
1254 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1255 VarX1 = DefX2->getOperand(1);
1256 else {
1257 VarX1 = DefX2->getOperand(0);
1258 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1259 }
1260 if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1261 return false;
1262
1263 ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1264 if (!Dec ||
1265 !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1266 (SubOneOp->getOpcode() == Instruction::Add &&
1267 Dec->isMinusOne()))) {
1268 return false;
1269 }
1270 }
1271
1272 // step 3: Check the recurrence of variable X
1273 PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1274 if (!PhiX)
1275 return false;
1276
1277 // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
1278 {
1279 CountInst = nullptr;
1280 for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1281 IterE = LoopEntry->end();
1282 Iter != IterE; Iter++) {
1283 Instruction *Inst = &*Iter;
1284 if (Inst->getOpcode() != Instruction::Add)
1285 continue;
1286
1287 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1288 if (!Inc || !Inc->isOne())
1289 continue;
1290
1291 PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1292 if (!Phi)
1293 continue;
1294
1295 // Check if the result of the instruction is live of the loop.
1296 bool LiveOutLoop = false;
1297 for (User *U : Inst->users()) {
1298 if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1299 LiveOutLoop = true;
1300 break;
1301 }
1302 }
1303
1304 if (LiveOutLoop) {
1305 CountInst = Inst;
1306 CountPhi = Phi;
1307 break;
1308 }
1309 }
1310
1311 if (!CountInst)
1312 return false;
1313 }
1314
1315 // step 5: check if the precondition is in this form:
1316 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1317 {
1318 auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1319 Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1320 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1321 return false;
1322
1323 CntInst = CountInst;
1324 CntPhi = CountPhi;
1325 Var = T;
1326 }
1327
1328 return true;
1329}
1330
1331/// Return true if the idiom is detected in the loop.
1332///
1333/// Additionally:
1334/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1335/// or nullptr if there is no such.
1336/// 2) \p CntPhi is set to the corresponding phi node
1337/// or nullptr if there is no such.
1338/// 3) \p Var is set to the value whose CTLZ could be used.
1339/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1340///
1341/// The core idiom we are trying to detect is:
1342/// \code
1343/// if (x0 == 0)
1344/// goto loop-exit // the precondition of the loop
1345/// cnt0 = init-val;
1346/// do {
1347/// x = phi (x0, x.next); //PhiX
1348/// cnt = phi(cnt0, cnt.next);
1349///
1350/// cnt.next = cnt + 1;
1351/// ...
1352/// x.next = x >> 1; // DefX
1353/// ...
1354/// } while(x.next != 0);
1355///
1356/// loop-exit:
1357/// \endcode
1358static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1359 Intrinsic::ID &IntrinID, Value *&InitX,
1360 Instruction *&CntInst, PHINode *&CntPhi,
1361 Instruction *&DefX) {
1362 BasicBlock *LoopEntry;
1363 Value *VarX = nullptr;
1364
1365 DefX = nullptr;
1366 CntInst = nullptr;
1367 CntPhi = nullptr;
1368 LoopEntry = *(CurLoop->block_begin());
1369
1370 // step 1: Check if the loop-back branch is in desirable form.
1371 if (Value *T = matchCondition(
1372 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1373 DefX = dyn_cast<Instruction>(T);
1374 else
1375 return false;
1376
1377 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1378 if (!DefX || !DefX->isShift())
1379 return false;
1380 IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1381 Intrinsic::ctlz;
1382 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1383 if (!Shft || !Shft->isOne())
1384 return false;
1385 VarX = DefX->getOperand(0);
1386
1387 // step 3: Check the recurrence of variable X
1388 PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1389 if (!PhiX)
1390 return false;
1391
1392 InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1393
1394 // Make sure the initial value can't be negative otherwise the ashr in the
1395 // loop might never reach zero which would make the loop infinite.
1396 if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1397 return false;
1398
1399 // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
1400 // TODO: We can skip the step. If loop trip count is known (CTLZ),
1401 // then all uses of "cnt.next" could be optimized to the trip count
1402 // plus "cnt0". Currently it is not optimized.
1403 // This step could be used to detect POPCNT instruction:
1404 // cnt.next = cnt + (x.next & 1)
1405 for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1406 IterE = LoopEntry->end();
1407 Iter != IterE; Iter++) {
1408 Instruction *Inst = &*Iter;
1409 if (Inst->getOpcode() != Instruction::Add)
1410 continue;
1411
1412 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1413 if (!Inc || !Inc->isOne())
1414 continue;
1415
1416 PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1417 if (!Phi)
1418 continue;
1419
1420 CntInst = Inst;
1421 CntPhi = Phi;
1422 break;
1423 }
1424 if (!CntInst)
1425 return false;
1426
1427 return true;
1428}
1429
1430/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1431/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
1432/// trip count returns true; otherwise, returns false.
1433bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1434 // Give up if the loop has multiple blocks or multiple backedges.
1435 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1436 return false;
1437
1438 Intrinsic::ID IntrinID;
1439 Value *InitX;
1440 Instruction *DefX = nullptr;
1441 PHINode *CntPhi = nullptr;
1442 Instruction *CntInst = nullptr;
1443 // Help decide if transformation is profitable. For ShiftUntilZero idiom,
1444 // this is always 6.
1445 size_t IdiomCanonicalSize = 6;
1446
1447 if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1448 CntInst, CntPhi, DefX))
1449 return false;
1450
1451 bool IsCntPhiUsedOutsideLoop = false;
1452 for (User *U : CntPhi->users())
1453 if (!CurLoop->contains(cast<Instruction>(U))) {
1454 IsCntPhiUsedOutsideLoop = true;
1455 break;
1456 }
1457 bool IsCntInstUsedOutsideLoop = false;
1458 for (User *U : CntInst->users())
1459 if (!CurLoop->contains(cast<Instruction>(U))) {
1460 IsCntInstUsedOutsideLoop = true;
1461 break;
1462 }
1463 // If both CntInst and CntPhi are used outside the loop the profitability
1464 // is questionable.
1465 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1466 return false;
1467
1468 // For some CPUs result of CTLZ(X) intrinsic is undefined
1469 // when X is 0. If we can not guarantee X != 0, we need to check this
1470 // when expand.
1471 bool ZeroCheck = false;
1472 // It is safe to assume Preheader exist as it was checked in
1473 // parent function RunOnLoop.
1474 BasicBlock *PH = CurLoop->getLoopPreheader();
1475
1476 // If we are using the count instruction outside the loop, make sure we
1477 // have a zero check as a precondition. Without the check the loop would run
1478 // one iteration for before any check of the input value. This means 0 and 1
1479 // would have identical behavior in the original loop and thus
1480 if (!IsCntPhiUsedOutsideLoop) {
1481 auto *PreCondBB = PH->getSinglePredecessor();
1482 if (!PreCondBB)
1483 return false;
1484 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1485 if (!PreCondBI)
1486 return false;
1487 if (matchCondition(PreCondBI, PH) != InitX)
1488 return false;
1489 ZeroCheck = true;
1490 }
1491
1492 // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1493 // profitable if we delete the loop.
1494
1495 // the loop has only 6 instructions:
1496 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1497 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1498 // %shr = ashr %n.addr.0, 1
1499 // %tobool = icmp eq %shr, 0
1500 // %inc = add nsw %i.0, 1
1501 // br i1 %tobool
1502
1503 const Value *Args[] =
1504 {InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
1505 : ConstantInt::getFalse(InitX->getContext())};
1506
1507 // @llvm.dbg doesn't count as they have no semantic effect.
1508 auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1509 uint32_t HeaderSize =
1510 std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1511
1512 if (HeaderSize != IdiomCanonicalSize &&
1513 TTI->getIntrinsicCost(IntrinID, InitX->getType(), Args) >
1514 TargetTransformInfo::TCC_Basic)
1515 return false;
1516
1517 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1518 DefX->getDebugLoc(), ZeroCheck,
1519 IsCntPhiUsedOutsideLoop);
1520 return true;
1521}
1522
1523/// Recognizes a population count idiom in a non-countable loop.
1524///
1525/// If detected, transforms the relevant code to issue the popcount intrinsic
1526/// function call, and returns true; otherwise, returns false.
1527bool LoopIdiomRecognize::recognizePopcount() {
1528 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1529 return false;
1530
1531 // Counting population are usually conducted by few arithmetic instructions.
1532 // Such instructions can be easily "absorbed" by vacant slots in a
1533 // non-compact loop. Therefore, recognizing popcount idiom only makes sense
1534 // in a compact loop.
1535
1536 // Give up if the loop has multiple blocks or multiple backedges.
1537 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1538 return false;
1539
1540 BasicBlock *LoopBody = *(CurLoop->block_begin());
1541 if (LoopBody->size() >= 20) {
1542 // The loop is too big, bail out.
1543 return false;
1544 }
1545
1546 // It should have a preheader containing nothing but an unconditional branch.
1547 BasicBlock *PH = CurLoop->getLoopPreheader();
1548 if (!PH || &PH->front() != PH->getTerminator())
1549 return false;
1550 auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1551 if (!EntryBI || EntryBI->isConditional())
1552 return false;
1553
1554 // It should have a precondition block where the generated popcount intrinsic
1555 // function can be inserted.
1556 auto *PreCondBB = PH->getSinglePredecessor();
1557 if (!PreCondBB)
1558 return false;
1559 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1560 if (!PreCondBI || PreCondBI->isUnconditional())
1561 return false;
1562
1563 Instruction *CntInst;
1564 PHINode *CntPhi;
1565 Value *Val;
1566 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1567 return false;
1568
1569 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1570 return true;
1571}
1572
1573static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1574 const DebugLoc &DL) {
1575 Value *Ops[] = {Val};
1576 Type *Tys[] = {Val->getType()};
1577
1578 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1579 Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1580 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1581 CI->setDebugLoc(DL);
1582
1583 return CI;
1584}
1585
1586static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1587 const DebugLoc &DL, bool ZeroCheck,
1588 Intrinsic::ID IID) {
1589 Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
1590 Type *Tys[] = {Val->getType()};
1591
1592 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1593 Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
1594 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1595 CI->setDebugLoc(DL);
1596
1597 return CI;
1598}
1599
1600/// Transform the following loop (Using CTLZ, CTTZ is similar):
1601/// loop:
1602/// CntPhi = PHI [Cnt0, CntInst]
1603/// PhiX = PHI [InitX, DefX]
1604/// CntInst = CntPhi + 1
1605/// DefX = PhiX >> 1
1606/// LOOP_BODY
1607/// Br: loop if (DefX != 0)
1608/// Use(CntPhi) or Use(CntInst)
1609///
1610/// Into:
1611/// If CntPhi used outside the loop:
1612/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
1613/// Count = CountPrev + 1
1614/// else
1615/// Count = BitWidth(InitX) - CTLZ(InitX)
1616/// loop:
1617/// CntPhi = PHI [Cnt0, CntInst]
1618/// PhiX = PHI [InitX, DefX]
1619/// PhiCount = PHI [Count, Dec]
1620/// CntInst = CntPhi + 1
1621/// DefX = PhiX >> 1
1622/// Dec = PhiCount - 1
1623/// LOOP_BODY
1624/// Br: loop if (Dec != 0)
1625/// Use(CountPrev + Cnt0) // Use(CntPhi)
1626/// or
1627/// Use(Count + Cnt0) // Use(CntInst)
1628///
1629/// If LOOP_BODY is empty the loop will be deleted.
1630/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
1631void LoopIdiomRecognize::transformLoopToCountable(
1632 Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
1633 PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
1634 bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
1635 BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
1636
1637 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
1638 IRBuilder<> Builder(PreheaderBr);
1639 Builder.SetCurrentDebugLocation(DL);
1640 Value *FFS, *Count, *CountPrev, *NewCount, *InitXNext;
1641
1642 // Count = BitWidth - CTLZ(InitX);
1643 // If there are uses of CntPhi create:
1644 // CountPrev = BitWidth - CTLZ(InitX >> 1);
1645 if (IsCntPhiUsedOutsideLoop) {
1646 if (DefX->getOpcode() == Instruction::AShr)
1647 InitXNext =
1648 Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1));
1649 else if (DefX->getOpcode() == Instruction::LShr)
1650 InitXNext =
1651 Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1));
1652 else if (DefX->getOpcode() == Instruction::Shl) // cttz
1653 InitXNext =
1654 Builder.CreateShl(InitX, ConstantInt::get(InitX->getType(), 1));
1655 else
1656 llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "/build/llvm-toolchain-snapshot-10~+201911111502510600c19528f1809/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 1656)
;
1657 } else
1658 InitXNext = InitX;
1659 FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
1660 Count = Builder.CreateSub(
1661 ConstantInt::get(FFS->getType(),
1662 FFS->getType()->getIntegerBitWidth()),
1663 FFS);
1664 if (IsCntPhiUsedOutsideLoop) {
1665 CountPrev = Count;
1666 Count = Builder.CreateAdd(
1667 CountPrev,
1668 ConstantInt::get(CountPrev->getType(), 1));
1669 }
1670
1671 NewCount = Builder.CreateZExtOrTrunc(
1672 IsCntPhiUsedOutsideLoop ? CountPrev : Count,
1673 cast<IntegerType>(CntInst->getType()));
1674
1675 // If the counter's initial value is not zero, insert Add Inst.
1676 Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
1677 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1678 if (!InitConst || !InitConst->isZero())
1679 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1680
1681 // Step 2: Insert new IV and loop condition:
1682 // loop:
1683 // ...
1684 // PhiCount = PHI [Count, Dec]
1685 // ...
1686 // Dec = PhiCount - 1
1687 // ...
1688 // Br: loop if (Dec != 0)
1689 BasicBlock *Body = *(CurLoop->block_begin());
1690 auto *LbBr = cast<BranchInst>(Body->getTerminator());
1691 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1692 Type *Ty = Count->getType();
1693
1694 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1695
1696 Builder.SetInsertPoint(LbCond);
1697 Instruction *TcDec = cast<Instruction>(
1698 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1699 "tcdec", false, true));
1700
1701 TcPhi->addIncoming(Count, Preheader);
1702 TcPhi->addIncoming(TcDec, Body);
1703
1704 CmpInst::Predicate Pred =
1705 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
1706 LbCond->setPredicate(Pred);
1707 LbCond->setOperand(0, TcDec);
1708 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1709
1710 // Step 3: All the references to the original counter outside
1711 // the loop are replaced with the NewCount
1712 if (IsCntPhiUsedOutsideLoop)
1713 CntPhi->replaceUsesOutsideBlock(NewCount, Body);
1714 else
1715 CntInst->replaceUsesOutsideBlock(NewCount, Body);
1716
1717 // step 4: Forget the "non-computable" trip-count SCEV associated with the
1718 // loop. The loop would otherwise not be deleted even if it becomes empty.
1719 SE->forgetLoop(CurLoop);
1720}
1721
1722void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
1723 Instruction *CntInst,
1724 PHINode *CntPhi, Value *Var) {
1725 BasicBlock *PreHead = CurLoop->getLoopPreheader();
1726 auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
1727 const DebugLoc &DL = CntInst->getDebugLoc();
1728
1729 // Assuming before transformation, the loop is following:
1730 // if (x) // the precondition
1731 // do { cnt++; x &= x - 1; } while(x);
1732
1733 // Step 1: Insert the ctpop instruction at the end of the precondition block
1734 IRBuilder<> Builder(PreCondBr);
1735 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
1736 {
1737 PopCnt = createPopcntIntrinsic(Builder, Var, DL);
1738 NewCount = PopCntZext =
1739 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
1740
1741 if (NewCount != PopCnt)
1742 (cast<Instruction>(NewCount))->setDebugLoc(DL);
1743
1744 // TripCnt is exactly the number of iterations the loop has
1745 TripCnt = NewCount;
1746
1747 // If the population counter's initial value is not zero, insert Add Inst.
1748 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
1749 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1750 if (!InitConst || !InitConst->isZero()) {
1751 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1752 (cast<Instruction>(NewCount))->setDebugLoc(DL);
1753 }
1754 }
1755
1756 // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
1757 // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
1758 // function would be partial dead code, and downstream passes will drag
1759 // it back from the precondition block to the preheader.
1760 {
1761 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
1762
1763 Value *Opnd0 = PopCntZext;
1764 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
1765 if (PreCond->getOperand(0) != Var)
1766 std::swap(Opnd0, Opnd1);
1767
1768 ICmpInst *NewPreCond = cast<ICmpInst>(
1769 Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
1770 PreCondBr->setCondition(NewPreCond);
1771
1772 RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
1773 }
1774
1775 // Step 3: Note that the population count is exactly the trip count of the
1776 // loop in question, which enable us to convert the loop from noncountable
1777 // loop into a countable one. The benefit is twofold:
1778 //
1779 // - If the loop only counts population, the entire loop becomes dead after
1780 // the transformation. It is a lot easier to prove a countable loop dead
1781 // than to prove a noncountable one. (In some C dialects, an infinite loop
1782 // isn't dead even if it computes nothing useful. In general, DCE needs
1783 // to prove a noncountable loop finite before safely delete it.)
1784 //
1785 // - If the loop also performs something else, it remains alive.
1786 // Since it is transformed to countable form, it can be aggressively
1787 // optimized by some optimizations which are in general not applicable
1788 // to a noncountable loop.
1789 //
1790 // After this step, this loop (conceptually) would look like following:
1791 // newcnt = __builtin_ctpop(x);
1792 // t = newcnt;
1793 // if (x)
1794 // do { cnt++; x &= x-1; t--) } while (t > 0);
1795 BasicBlock *Body = *(CurLoop->block_begin());
1796 {
1797 auto *LbBr = cast<BranchInst>(Body->getTerminator());
1798 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1799 Type *Ty = TripCnt->getType();
1800
1801 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1802
1803 Builder.SetInsertPoint(LbCond);
1804 Instruction *TcDec = cast<Instruction>(
1805 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1806 "tcdec", false, true));
1807
1808 TcPhi->addIncoming(TripCnt, PreHead);
1809 TcPhi->addIncoming(TcDec, Body);
1810
1811 CmpInst::Predicate Pred =
1812 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
1813 LbCond->setPredicate(Pred);
1814 LbCond->setOperand(0, TcDec);
1815 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1816 }
1817
1818 // Step 4: All the references to the original population counter outside
1819 // the loop are replaced with the NewCount -- the value returned from
1820 // __builtin_ctpop().
1821 CntInst->replaceUsesOutsideBlock(NewCount, Body);
1822
1823 // step 5: Forget the "non-computable" trip-count SCEV associated with the
1824 // loop. The loop would otherwise not be deleted even if it becomes empty.
1825 SE->forgetLoop(CurLoop);
1826}