Bug Summary

File:llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
Warning:line 472, column 15
Although the value stored to 'PatternValue' is used in the enclosing expression, the value is never actually read from 'PatternValue'

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopIdiomRecognize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -mframe-pointer=none -fmath-errno -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -target-cpu x86-64 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-10/lib/clang/10.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/build-llvm/lib/Transforms/Scalar -I /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Transforms/Scalar -I /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/build-llvm/include -I /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-10/lib/clang/10.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/build-llvm/lib/Transforms/Scalar -fdebug-prefix-map=/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd=. -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2020-01-13-084841-49055-1 -x c++ /build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
1//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements an idiom recognizer that transforms simple loops into a
10// non-loop form. In cases that this kicks in, it can be a significant
11// performance win.
12//
13// If compiling for code size we avoid idiom recognition if the resulting
14// code could be larger than the code for the original loop. One way this could
15// happen is if the loop is not removable after idiom recognition due to the
16// presence of non-idiom instructions. The initial implementation of the
17// heuristics applies to idioms in multi-block loops.
18//
19//===----------------------------------------------------------------------===//
20//
21// TODO List:
22//
23// Future loop memory idioms to recognize:
24// memcmp, memmove, strlen, etc.
25// Future floating point idioms to recognize in -ffast-math mode:
26// fpowi
27// Future integer operation idioms to recognize:
28// ctpop
29//
30// Beware that isel's default lowering for ctpop is highly inefficient for
31// i64 and larger types when i64 is legal and the value has few bits set. It
32// would be good to enhance isel to emit a loop for ctpop in this case.
33//
34// This could recognize common matrix multiplies and dot product idioms and
35// replace them with calls to BLAS (if linked in??).
36//
37//===----------------------------------------------------------------------===//
38
39#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
40#include "llvm/ADT/APInt.h"
41#include "llvm/ADT/ArrayRef.h"
42#include "llvm/ADT/DenseMap.h"
43#include "llvm/ADT/MapVector.h"
44#include "llvm/ADT/SetVector.h"
45#include "llvm/ADT/SmallPtrSet.h"
46#include "llvm/ADT/SmallVector.h"
47#include "llvm/ADT/Statistic.h"
48#include "llvm/ADT/StringRef.h"
49#include "llvm/Analysis/AliasAnalysis.h"
50#include "llvm/Analysis/LoopAccessAnalysis.h"
51#include "llvm/Analysis/LoopInfo.h"
52#include "llvm/Analysis/LoopPass.h"
53#include "llvm/Analysis/MemoryLocation.h"
54#include "llvm/Analysis/OptimizationRemarkEmitter.h"
55#include "llvm/Analysis/ScalarEvolution.h"
56#include "llvm/Analysis/ScalarEvolutionExpander.h"
57#include "llvm/Analysis/ScalarEvolutionExpressions.h"
58#include "llvm/Analysis/TargetLibraryInfo.h"
59#include "llvm/Analysis/TargetTransformInfo.h"
60#include "llvm/Analysis/ValueTracking.h"
61#include "llvm/IR/Attributes.h"
62#include "llvm/IR/BasicBlock.h"
63#include "llvm/IR/Constant.h"
64#include "llvm/IR/Constants.h"
65#include "llvm/IR/DataLayout.h"
66#include "llvm/IR/DebugLoc.h"
67#include "llvm/IR/DerivedTypes.h"
68#include "llvm/IR/Dominators.h"
69#include "llvm/IR/GlobalValue.h"
70#include "llvm/IR/GlobalVariable.h"
71#include "llvm/IR/IRBuilder.h"
72#include "llvm/IR/InstrTypes.h"
73#include "llvm/IR/Instruction.h"
74#include "llvm/IR/Instructions.h"
75#include "llvm/IR/IntrinsicInst.h"
76#include "llvm/IR/Intrinsics.h"
77#include "llvm/IR/LLVMContext.h"
78#include "llvm/IR/Module.h"
79#include "llvm/IR/PassManager.h"
80#include "llvm/IR/Type.h"
81#include "llvm/IR/User.h"
82#include "llvm/IR/Value.h"
83#include "llvm/IR/ValueHandle.h"
84#include "llvm/InitializePasses.h"
85#include "llvm/Pass.h"
86#include "llvm/Support/Casting.h"
87#include "llvm/Support/CommandLine.h"
88#include "llvm/Support/Debug.h"
89#include "llvm/Support/raw_ostream.h"
90#include "llvm/Transforms/Scalar.h"
91#include "llvm/Transforms/Utils/BuildLibCalls.h"
92#include "llvm/Transforms/Utils/Local.h"
93#include "llvm/Transforms/Utils/LoopUtils.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <utility>
98#include <vector>
99
100using namespace llvm;
101
102#define DEBUG_TYPE"loop-idiom" "loop-idiom"
103
104STATISTIC(NumMemSet, "Number of memset's formed from loop stores")static llvm::Statistic NumMemSet = {"loop-idiom", "NumMemSet"
, "Number of memset's formed from loop stores"}
;
105STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores")static llvm::Statistic NumMemCpy = {"loop-idiom", "NumMemCpy"
, "Number of memcpy's formed from loop load+stores"}
;
106
107static cl::opt<bool> UseLIRCodeSizeHeurs(
108 "use-lir-code-size-heurs",
109 cl::desc("Use loop idiom recognition code size heuristics when compiling"
110 "with -Os/-Oz"),
111 cl::init(true), cl::Hidden);
112
113namespace {
114
115class LoopIdiomRecognize {
116 Loop *CurLoop = nullptr;
117 AliasAnalysis *AA;
118 DominatorTree *DT;
119 LoopInfo *LI;
120 ScalarEvolution *SE;
121 TargetLibraryInfo *TLI;
122 const TargetTransformInfo *TTI;
123 const DataLayout *DL;
124 OptimizationRemarkEmitter &ORE;
125 bool ApplyCodeSizeHeuristics;
126
127public:
128 explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
129 LoopInfo *LI, ScalarEvolution *SE,
130 TargetLibraryInfo *TLI,
131 const TargetTransformInfo *TTI,
132 const DataLayout *DL,
133 OptimizationRemarkEmitter &ORE)
134 : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {}
135
136 bool runOnLoop(Loop *L);
137
138private:
139 using StoreList = SmallVector<StoreInst *, 8>;
140 using StoreListMap = MapVector<Value *, StoreList>;
141
142 StoreListMap StoreRefsForMemset;
143 StoreListMap StoreRefsForMemsetPattern;
144 StoreList StoreRefsForMemcpy;
145 bool HasMemset;
146 bool HasMemsetPattern;
147 bool HasMemcpy;
148
149 /// Return code for isLegalStore()
150 enum LegalStoreKind {
151 None = 0,
152 Memset,
153 MemsetPattern,
154 Memcpy,
155 UnorderedAtomicMemcpy,
156 DontUse // Dummy retval never to be used. Allows catching errors in retval
157 // handling.
158 };
159
160 /// \name Countable Loop Idiom Handling
161 /// @{
162
163 bool runOnCountableLoop();
164 bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
165 SmallVectorImpl<BasicBlock *> &ExitBlocks);
166
167 void collectStores(BasicBlock *BB);
168 LegalStoreKind isLegalStore(StoreInst *SI);
169 enum class ForMemset { No, Yes };
170 bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
171 ForMemset For);
172 bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
173
174 bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
175 MaybeAlign StoreAlignment, Value *StoredVal,
176 Instruction *TheStore,
177 SmallPtrSetImpl<Instruction *> &Stores,
178 const SCEVAddRecExpr *Ev, const SCEV *BECount,
179 bool NegStride, bool IsLoopMemset = false);
180 bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
181 bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
182 bool IsLoopMemset = false);
183
184 /// @}
185 /// \name Noncountable Loop Idiom Handling
186 /// @{
187
188 bool runOnNoncountableLoop();
189
190 bool recognizePopcount();
191 void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
192 PHINode *CntPhi, Value *Var);
193 bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
194 void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
195 Instruction *CntInst, PHINode *CntPhi,
196 Value *Var, Instruction *DefX,
197 const DebugLoc &DL, bool ZeroCheck,
198 bool IsCntPhiUsedOutsideLoop);
199
200 /// @}
201};
202
203class LoopIdiomRecognizeLegacyPass : public LoopPass {
204public:
205 static char ID;
206
207 explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
208 initializeLoopIdiomRecognizeLegacyPassPass(
209 *PassRegistry::getPassRegistry());
210 }
211
212 bool runOnLoop(Loop *L, LPPassManager &LPM) override {
213 if (skipLoop(L))
214 return false;
215
216 AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
217 DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
218 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
219 ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
220 TargetLibraryInfo *TLI =
221 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
222 *L->getHeader()->getParent());
223 const TargetTransformInfo *TTI =
224 &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
225 *L->getHeader()->getParent());
226 const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
227
228 // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
229 // pass. Function analyses need to be preserved across loop transformations
230 // but ORE cannot be preserved (see comment before the pass definition).
231 OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
232
233 LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, ORE);
234 return LIR.runOnLoop(L);
235 }
236
237 /// This transformation requires natural loop information & requires that
238 /// loop preheaders be inserted into the CFG.
239 void getAnalysisUsage(AnalysisUsage &AU) const override {
240 AU.addRequired<TargetLibraryInfoWrapperPass>();
241 AU.addRequired<TargetTransformInfoWrapperPass>();
242 getLoopAnalysisUsage(AU);
243 }
244};
245
246} // end anonymous namespace
247
248char LoopIdiomRecognizeLegacyPass::ID = 0;
249
250PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
251 LoopStandardAnalysisResults &AR,
252 LPMUpdater &) {
253 const auto *DL = &L.getHeader()->getModule()->getDataLayout();
254
255 const auto &FAM =
256 AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
257 Function *F = L.getHeader()->getParent();
258
259 auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
260 // FIXME: This should probably be optional rather than required.
261 if (!ORE)
262 report_fatal_error(
263 "LoopIdiomRecognizePass: OptimizationRemarkEmitterAnalysis not cached "
264 "at a higher level");
265
266 LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL,
267 *ORE);
268 if (!LIR.runOnLoop(&L))
269 return PreservedAnalyses::all();
270
271 return getLoopPassPreservedAnalyses();
272}
273
274INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",static void *initializeLoopIdiomRecognizeLegacyPassPassOnce(PassRegistry
&Registry) {
275 "Recognize loop idioms", false, false)static void *initializeLoopIdiomRecognizeLegacyPassPassOnce(PassRegistry
&Registry) {
276INITIALIZE_PASS_DEPENDENCY(LoopPass)initializeLoopPassPass(Registry);
277INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)initializeTargetLibraryInfoWrapperPassPass(Registry);
278INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)initializeTargetTransformInfoWrapperPassPass(Registry);
279INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",PassInfo *PI = new PassInfo( "Recognize loop idioms", "loop-idiom"
, &LoopIdiomRecognizeLegacyPass::ID, PassInfo::NormalCtor_t
(callDefaultCtor<LoopIdiomRecognizeLegacyPass>), false,
false); Registry.registerPass(*PI, true); return PI; } static
llvm::once_flag InitializeLoopIdiomRecognizeLegacyPassPassFlag
; void llvm::initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry
&Registry) { llvm::call_once(InitializeLoopIdiomRecognizeLegacyPassPassFlag
, initializeLoopIdiomRecognizeLegacyPassPassOnce, std::ref(Registry
)); }
280 "Recognize loop idioms", false, false)PassInfo *PI = new PassInfo( "Recognize loop idioms", "loop-idiom"
, &LoopIdiomRecognizeLegacyPass::ID, PassInfo::NormalCtor_t
(callDefaultCtor<LoopIdiomRecognizeLegacyPass>), false,
false); Registry.registerPass(*PI, true); return PI; } static
llvm::once_flag InitializeLoopIdiomRecognizeLegacyPassPassFlag
; void llvm::initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry
&Registry) { llvm::call_once(InitializeLoopIdiomRecognizeLegacyPassPassFlag
, initializeLoopIdiomRecognizeLegacyPassPassOnce, std::ref(Registry
)); }
281
282Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
283
284static void deleteDeadInstruction(Instruction *I) {
285 I->replaceAllUsesWith(UndefValue::get(I->getType()));
286 I->eraseFromParent();
287}
288
289//===----------------------------------------------------------------------===//
290//
291// Implementation of LoopIdiomRecognize
292//
293//===----------------------------------------------------------------------===//
294
295bool LoopIdiomRecognize::runOnLoop(Loop *L) {
296 CurLoop = L;
297 // If the loop could not be converted to canonical form, it must have an
298 // indirectbr in it, just give up.
299 if (!L->getLoopPreheader())
300 return false;
301
302 // Disable loop idiom recognition if the function's name is a common idiom.
303 StringRef Name = L->getHeader()->getParent()->getName();
304 if (Name == "memset" || Name == "memcpy")
305 return false;
306
307 // Determine if code size heuristics need to be applied.
308 ApplyCodeSizeHeuristics =
309 L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
310
311 HasMemset = TLI->has(LibFunc_memset);
312 HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
313 HasMemcpy = TLI->has(LibFunc_memcpy);
314
315 if (HasMemset || HasMemsetPattern || HasMemcpy)
316 if (SE->hasLoopInvariantBackedgeTakenCount(L))
317 return runOnCountableLoop();
318
319 return runOnNoncountableLoop();
320}
321
322bool LoopIdiomRecognize::runOnCountableLoop() {
323 const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
324 assert(!isa<SCEVCouldNotCompute>(BECount) &&((!isa<SCEVCouldNotCompute>(BECount) && "runOnCountableLoop() called on a loop without a predictable"
"backedge-taken count") ? static_cast<void> (0) : __assert_fail
("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\""
, "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 326, __PRETTY_FUNCTION__))
325 "runOnCountableLoop() called on a loop without a predictable"((!isa<SCEVCouldNotCompute>(BECount) && "runOnCountableLoop() called on a loop without a predictable"
"backedge-taken count") ? static_cast<void> (0) : __assert_fail
("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\""
, "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 326, __PRETTY_FUNCTION__))
326 "backedge-taken count")((!isa<SCEVCouldNotCompute>(BECount) && "runOnCountableLoop() called on a loop without a predictable"
"backedge-taken count") ? static_cast<void> (0) : __assert_fail
("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\""
, "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 326, __PRETTY_FUNCTION__))
;
327
328 // If this loop executes exactly one time, then it should be peeled, not
329 // optimized by this pass.
330 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
331 if (BECst->getAPInt() == 0)
332 return false;
333
334 SmallVector<BasicBlock *, 8> ExitBlocks;
335 CurLoop->getUniqueExitBlocks(ExitBlocks);
336
337 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
338 << CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
339 << "] Countable Loop %" << CurLoop->getHeader()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
340 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
;
341
342 bool MadeChange = false;
343
344 // The following transforms hoist stores/memsets into the loop pre-header.
345 // Give up if the loop has instructions may throw.
346 SimpleLoopSafetyInfo SafetyInfo;
347 SafetyInfo.computeLoopSafetyInfo(CurLoop);
348 if (SafetyInfo.anyBlockMayThrow())
349 return MadeChange;
350
351 // Scan all the blocks in the loop that are not in subloops.
352 for (auto *BB : CurLoop->getBlocks()) {
353 // Ignore blocks in subloops.
354 if (LI->getLoopFor(BB) != CurLoop)
355 continue;
356
357 MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
358 }
359 return MadeChange;
360}
361
362static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
363 const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
364 return ConstStride->getAPInt();
365}
366
367/// getMemSetPatternValue - If a strided store of the specified value is safe to
368/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
369/// be passed in. Otherwise, return null.
370///
371/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
372/// just replicate their input array and then pass on to memset_pattern16.
373static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
374 // FIXME: This could check for UndefValue because it can be merged into any
375 // other valid pattern.
376
377 // If the value isn't a constant, we can't promote it to being in a constant
378 // array. We could theoretically do a store to an alloca or something, but
379 // that doesn't seem worthwhile.
380 Constant *C = dyn_cast<Constant>(V);
381 if (!C)
382 return nullptr;
383
384 // Only handle simple values that are a power of two bytes in size.
385 uint64_t Size = DL->getTypeSizeInBits(V->getType());
386 if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
387 return nullptr;
388
389 // Don't care enough about darwin/ppc to implement this.
390 if (DL->isBigEndian())
391 return nullptr;
392
393 // Convert to size in bytes.
394 Size /= 8;
395
396 // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
397 // if the top and bottom are the same (e.g. for vectors and large integers).
398 if (Size > 16)
399 return nullptr;
400
401 // If the constant is exactly 16 bytes, just use it.
402 if (Size == 16)
403 return C;
404
405 // Otherwise, we'll use an array of the constants.
406 unsigned ArraySize = 16 / Size;
407 ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
408 return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
409}
410
411LoopIdiomRecognize::LegalStoreKind
412LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
413 // Don't touch volatile stores.
414 if (SI->isVolatile())
415 return LegalStoreKind::None;
416 // We only want simple or unordered-atomic stores.
417 if (!SI->isUnordered())
418 return LegalStoreKind::None;
419
420 // Don't convert stores of non-integral pointer types to memsets (which stores
421 // integers).
422 if (DL->isNonIntegralPointerType(SI->getValueOperand()->getType()))
423 return LegalStoreKind::None;
424
425 // Avoid merging nontemporal stores.
426 if (SI->getMetadata(LLVMContext::MD_nontemporal))
427 return LegalStoreKind::None;
428
429 Value *StoredVal = SI->getValueOperand();
430 Value *StorePtr = SI->getPointerOperand();
431
432 // Reject stores that are so large that they overflow an unsigned.
433 uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
434 if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
435 return LegalStoreKind::None;
436
437 // See if the pointer expression is an AddRec like {base,+,1} on the current
438 // loop, which indicates a strided store. If we have something else, it's a
439 // random store we can't handle.
440 const SCEVAddRecExpr *StoreEv =
441 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
442 if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
443 return LegalStoreKind::None;
444
445 // Check to see if we have a constant stride.
446 if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
447 return LegalStoreKind::None;
448
449 // See if the store can be turned into a memset.
450
451 // If the stored value is a byte-wise value (like i32 -1), then it may be
452 // turned into a memset of i8 -1, assuming that all the consecutive bytes
453 // are stored. A store of i32 0x01020304 can never be turned into a memset,
454 // but it can be turned into memset_pattern if the target supports it.
455 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
456 Constant *PatternValue = nullptr;
457
458 // Note: memset and memset_pattern on unordered-atomic is yet not supported
459 bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
460
461 // If we're allowed to form a memset, and the stored value would be
462 // acceptable for memset, use it.
463 if (!UnorderedAtomic && HasMemset && SplatValue &&
464 // Verify that the stored value is loop invariant. If not, we can't
465 // promote the memset.
466 CurLoop->isLoopInvariant(SplatValue)) {
467 // It looks like we can use SplatValue.
468 return LegalStoreKind::Memset;
469 } else if (!UnorderedAtomic && HasMemsetPattern &&
470 // Don't create memset_pattern16s with address spaces.
471 StorePtr->getType()->getPointerAddressSpace() == 0 &&
472 (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
Although the value stored to 'PatternValue' is used in the enclosing expression, the value is never actually read from 'PatternValue'
473 // It looks like we can use PatternValue!
474 return LegalStoreKind::MemsetPattern;
475 }
476
477 // Otherwise, see if the store can be turned into a memcpy.
478 if (HasMemcpy) {
479 // Check to see if the stride matches the size of the store. If so, then we
480 // know that every byte is touched in the loop.
481 APInt Stride = getStoreStride(StoreEv);
482 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
483 if (StoreSize != Stride && StoreSize != -Stride)
484 return LegalStoreKind::None;
485
486 // The store must be feeding a non-volatile load.
487 LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
488
489 // Only allow non-volatile loads
490 if (!LI || LI->isVolatile())
491 return LegalStoreKind::None;
492 // Only allow simple or unordered-atomic loads
493 if (!LI->isUnordered())
494 return LegalStoreKind::None;
495
496 // See if the pointer expression is an AddRec like {base,+,1} on the current
497 // loop, which indicates a strided load. If we have something else, it's a
498 // random load we can't handle.
499 const SCEVAddRecExpr *LoadEv =
500 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
501 if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
502 return LegalStoreKind::None;
503
504 // The store and load must share the same stride.
505 if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
506 return LegalStoreKind::None;
507
508 // Success. This store can be converted into a memcpy.
509 UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
510 return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
511 : LegalStoreKind::Memcpy;
512 }
513 // This store can't be transformed into a memset/memcpy.
514 return LegalStoreKind::None;
515}
516
517void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
518 StoreRefsForMemset.clear();
519 StoreRefsForMemsetPattern.clear();
520 StoreRefsForMemcpy.clear();
521 for (Instruction &I : *BB) {
522 StoreInst *SI = dyn_cast<StoreInst>(&I);
523 if (!SI)
524 continue;
525
526 // Make sure this is a strided store with a constant stride.
527 switch (isLegalStore(SI)) {
528 case LegalStoreKind::None:
529 // Nothing to do
530 break;
531 case LegalStoreKind::Memset: {
532 // Find the base pointer.
533 Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
534 StoreRefsForMemset[Ptr].push_back(SI);
535 } break;
536 case LegalStoreKind::MemsetPattern: {
537 // Find the base pointer.
538 Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
539 StoreRefsForMemsetPattern[Ptr].push_back(SI);
540 } break;
541 case LegalStoreKind::Memcpy:
542 case LegalStoreKind::UnorderedAtomicMemcpy:
543 StoreRefsForMemcpy.push_back(SI);
544 break;
545 default:
546 assert(false && "unhandled return value")((false && "unhandled return value") ? static_cast<
void> (0) : __assert_fail ("false && \"unhandled return value\""
, "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 546, __PRETTY_FUNCTION__))
;
547 break;
548 }
549 }
550}
551
552/// runOnLoopBlock - Process the specified block, which lives in a counted loop
553/// with the specified backedge count. This block is known to be in the current
554/// loop and not in any subloops.
555bool LoopIdiomRecognize::runOnLoopBlock(
556 BasicBlock *BB, const SCEV *BECount,
557 SmallVectorImpl<BasicBlock *> &ExitBlocks) {
558 // We can only promote stores in this block if they are unconditionally
559 // executed in the loop. For a block to be unconditionally executed, it has
560 // to dominate all the exit blocks of the loop. Verify this now.
561 for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
562 if (!DT->dominates(BB, ExitBlocks[i]))
563 return false;
564
565 bool MadeChange = false;
566 // Look for store instructions, which may be optimized to memset/memcpy.
567 collectStores(BB);
568
569 // Look for a single store or sets of stores with a common base, which can be
570 // optimized into a memset (memset_pattern). The latter most commonly happens
571 // with structs and handunrolled loops.
572 for (auto &SL : StoreRefsForMemset)
573 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
574
575 for (auto &SL : StoreRefsForMemsetPattern)
576 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
577
578 // Optimize the store into a memcpy, if it feeds an similarly strided load.
579 for (auto &SI : StoreRefsForMemcpy)
580 MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
581
582 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
583 Instruction *Inst = &*I++;
584 // Look for memset instructions, which may be optimized to a larger memset.
585 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
586 WeakTrackingVH InstPtr(&*I);
587 if (!processLoopMemSet(MSI, BECount))
588 continue;
589 MadeChange = true;
590
591 // If processing the memset invalidated our iterator, start over from the
592 // top of the block.
593 if (!InstPtr)
594 I = BB->begin();
595 continue;
596 }
597 }
598
599 return MadeChange;
600}
601
602/// See if this store(s) can be promoted to a memset.
603bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
604 const SCEV *BECount, ForMemset For) {
605 // Try to find consecutive stores that can be transformed into memsets.
606 SetVector<StoreInst *> Heads, Tails;
607 SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
608
609 // Do a quadratic search on all of the given stores and find
610 // all of the pairs of stores that follow each other.
611 SmallVector<unsigned, 16> IndexQueue;
612 for (unsigned i = 0, e = SL.size(); i < e; ++i) {
613 assert(SL[i]->isSimple() && "Expected only non-volatile stores.")((SL[i]->isSimple() && "Expected only non-volatile stores."
) ? static_cast<void> (0) : __assert_fail ("SL[i]->isSimple() && \"Expected only non-volatile stores.\""
, "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 613, __PRETTY_FUNCTION__))
;
614
615 Value *FirstStoredVal = SL[i]->getValueOperand();
616 Value *FirstStorePtr = SL[i]->getPointerOperand();
617 const SCEVAddRecExpr *FirstStoreEv =
618 cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
619 APInt FirstStride = getStoreStride(FirstStoreEv);
620 unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
621
622 // See if we can optimize just this store in isolation.
623 if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
624 Heads.insert(SL[i]);
625 continue;
626 }
627
628 Value *FirstSplatValue = nullptr;
629 Constant *FirstPatternValue = nullptr;
630
631 if (For == ForMemset::Yes)
632 FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
633 else
634 FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
635
636 assert((FirstSplatValue || FirstPatternValue) &&(((FirstSplatValue || FirstPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(FirstSplatValue || FirstPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 637, __PRETTY_FUNCTION__))
637 "Expected either splat value or pattern value.")(((FirstSplatValue || FirstPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(FirstSplatValue || FirstPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 637, __PRETTY_FUNCTION__))
;
638
639 IndexQueue.clear();
640 // If a store has multiple consecutive store candidates, search Stores
641 // array according to the sequence: from i+1 to e, then from i-1 to 0.
642 // This is because usually pairing with immediate succeeding or preceding
643 // candidate create the best chance to find memset opportunity.
644 unsigned j = 0;
645 for (j = i + 1; j < e; ++j)
646 IndexQueue.push_back(j);
647 for (j = i; j > 0; --j)
648 IndexQueue.push_back(j - 1);
649
650 for (auto &k : IndexQueue) {
651 assert(SL[k]->isSimple() && "Expected only non-volatile stores.")((SL[k]->isSimple() && "Expected only non-volatile stores."
) ? static_cast<void> (0) : __assert_fail ("SL[k]->isSimple() && \"Expected only non-volatile stores.\""
, "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 651, __PRETTY_FUNCTION__))
;
652 Value *SecondStorePtr = SL[k]->getPointerOperand();
653 const SCEVAddRecExpr *SecondStoreEv =
654 cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
655 APInt SecondStride = getStoreStride(SecondStoreEv);
656
657 if (FirstStride != SecondStride)
658 continue;
659
660 Value *SecondStoredVal = SL[k]->getValueOperand();
661 Value *SecondSplatValue = nullptr;
662 Constant *SecondPatternValue = nullptr;
663
664 if (For == ForMemset::Yes)
665 SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
666 else
667 SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
668
669 assert((SecondSplatValue || SecondPatternValue) &&(((SecondSplatValue || SecondPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SecondSplatValue || SecondPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 670, __PRETTY_FUNCTION__))
670 "Expected either splat value or pattern value.")(((SecondSplatValue || SecondPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SecondSplatValue || SecondPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 670, __PRETTY_FUNCTION__))
;
671
672 if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
673 if (For == ForMemset::Yes) {
674 if (isa<UndefValue>(FirstSplatValue))
675 FirstSplatValue = SecondSplatValue;
676 if (FirstSplatValue != SecondSplatValue)
677 continue;
678 } else {
679 if (isa<UndefValue>(FirstPatternValue))
680 FirstPatternValue = SecondPatternValue;
681 if (FirstPatternValue != SecondPatternValue)
682 continue;
683 }
684 Tails.insert(SL[k]);
685 Heads.insert(SL[i]);
686 ConsecutiveChain[SL[i]] = SL[k];
687 break;
688 }
689 }
690 }
691
692 // We may run into multiple chains that merge into a single chain. We mark the
693 // stores that we transformed so that we don't visit the same store twice.
694 SmallPtrSet<Value *, 16> TransformedStores;
695 bool Changed = false;
696
697 // For stores that start but don't end a link in the chain:
698 for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
699 it != e; ++it) {
700 if (Tails.count(*it))
701 continue;
702
703 // We found a store instr that starts a chain. Now follow the chain and try
704 // to transform it.
705 SmallPtrSet<Instruction *, 8> AdjacentStores;
706 StoreInst *I = *it;
707
708 StoreInst *HeadStore = I;
709 unsigned StoreSize = 0;
710
711 // Collect the chain into a list.
712 while (Tails.count(I) || Heads.count(I)) {
713 if (TransformedStores.count(I))
714 break;
715 AdjacentStores.insert(I);
716
717 StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
718 // Move to the next value in the chain.
719 I = ConsecutiveChain[I];
720 }
721
722 Value *StoredVal = HeadStore->getValueOperand();
723 Value *StorePtr = HeadStore->getPointerOperand();
724 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
725 APInt Stride = getStoreStride(StoreEv);
726
727 // Check to see if the stride matches the size of the stores. If so, then
728 // we know that every byte is touched in the loop.
729 if (StoreSize != Stride && StoreSize != -Stride)
730 continue;
731
732 bool NegStride = StoreSize == -Stride;
733
734 if (processLoopStridedStore(StorePtr, StoreSize,
735 MaybeAlign(HeadStore->getAlignment()),
736 StoredVal, HeadStore, AdjacentStores, StoreEv,
737 BECount, NegStride)) {
738 TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
739 Changed = true;
740 }
741 }
742
743 return Changed;
744}
745
746/// processLoopMemSet - See if this memset can be promoted to a large memset.
747bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
748 const SCEV *BECount) {
749 // We can only handle non-volatile memsets with a constant size.
750 if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
751 return false;
752
753 // If we're not allowed to hack on memset, we fail.
754 if (!HasMemset)
755 return false;
756
757 Value *Pointer = MSI->getDest();
758
759 // See if the pointer expression is an AddRec like {base,+,1} on the current
760 // loop, which indicates a strided store. If we have something else, it's a
761 // random store we can't handle.
762 const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
763 if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
764 return false;
765
766 // Reject memsets that are so large that they overflow an unsigned.
767 uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
768 if ((SizeInBytes >> 32) != 0)
769 return false;
770
771 // Check to see if the stride matches the size of the memset. If so, then we
772 // know that every byte is touched in the loop.
773 const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
774 if (!ConstStride)
775 return false;
776
777 APInt Stride = ConstStride->getAPInt();
778 if (SizeInBytes != Stride && SizeInBytes != -Stride)
779 return false;
780
781 // Verify that the memset value is loop invariant. If not, we can't promote
782 // the memset.
783 Value *SplatValue = MSI->getValue();
784 if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
785 return false;
786
787 SmallPtrSet<Instruction *, 1> MSIs;
788 MSIs.insert(MSI);
789 bool NegStride = SizeInBytes == -Stride;
790 return processLoopStridedStore(
791 Pointer, (unsigned)SizeInBytes, MaybeAlign(MSI->getDestAlignment()),
792 SplatValue, MSI, MSIs, Ev, BECount, NegStride, /*IsLoopMemset=*/true);
793}
794
795/// mayLoopAccessLocation - Return true if the specified loop might access the
796/// specified pointer location, which is a loop-strided access. The 'Access'
797/// argument specifies what the verboten forms of access are (read or write).
798static bool
799mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
800 const SCEV *BECount, unsigned StoreSize,
801 AliasAnalysis &AA,
802 SmallPtrSetImpl<Instruction *> &IgnoredStores) {
803 // Get the location that may be stored across the loop. Since the access is
804 // strided positively through memory, we say that the modified location starts
805 // at the pointer and has infinite size.
806 LocationSize AccessSize = LocationSize::unknown();
807
808 // If the loop iterates a fixed number of times, we can refine the access size
809 // to be exactly the size of the memset, which is (BECount+1)*StoreSize
810 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
811 AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
812 StoreSize);
813
814 // TODO: For this to be really effective, we have to dive into the pointer
815 // operand in the store. Store to &A[i] of 100 will always return may alias
816 // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
817 // which will then no-alias a store to &A[100].
818 MemoryLocation StoreLoc(Ptr, AccessSize);
819
820 for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
821 ++BI)
822 for (Instruction &I : **BI)
823 if (IgnoredStores.count(&I) == 0 &&
824 isModOrRefSet(
825 intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
826 return true;
827
828 return false;
829}
830
831// If we have a negative stride, Start refers to the end of the memory location
832// we're trying to memset. Therefore, we need to recompute the base pointer,
833// which is just Start - BECount*Size.
834static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
835 Type *IntPtr, unsigned StoreSize,
836 ScalarEvolution *SE) {
837 const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
838 if (StoreSize != 1)
839 Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
840 SCEV::FlagNUW);
841 return SE->getMinusSCEV(Start, Index);
842}
843
844/// Compute the number of bytes as a SCEV from the backedge taken count.
845///
846/// This also maps the SCEV into the provided type and tries to handle the
847/// computation in a way that will fold cleanly.
848static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
849 unsigned StoreSize, Loop *CurLoop,
850 const DataLayout *DL, ScalarEvolution *SE) {
851 const SCEV *NumBytesS;
852 // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
853 // pointer size if it isn't already.
854 //
855 // If we're going to need to zero extend the BE count, check if we can add
856 // one to it prior to zero extending without overflow. Provided this is safe,
857 // it allows better simplification of the +1.
858 if (DL->getTypeSizeInBits(BECount->getType()) <
859 DL->getTypeSizeInBits(IntPtr) &&
860 SE->isLoopEntryGuardedByCond(
861 CurLoop, ICmpInst::ICMP_NE, BECount,
862 SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
863 NumBytesS = SE->getZeroExtendExpr(
864 SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
865 IntPtr);
866 } else {
867 NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
868 SE->getOne(IntPtr), SCEV::FlagNUW);
869 }
870
871 // And scale it based on the store size.
872 if (StoreSize != 1) {
873 NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
874 SCEV::FlagNUW);
875 }
876 return NumBytesS;
877}
878
879/// processLoopStridedStore - We see a strided store of some value. If we can
880/// transform this into a memset or memset_pattern in the loop preheader, do so.
881bool LoopIdiomRecognize::processLoopStridedStore(
882 Value *DestPtr, unsigned StoreSize, MaybeAlign StoreAlignment,
883 Value *StoredVal, Instruction *TheStore,
884 SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
885 const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
886 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
887 Constant *PatternValue = nullptr;
888
889 if (!SplatValue)
890 PatternValue = getMemSetPatternValue(StoredVal, DL);
891
892 assert((SplatValue || PatternValue) &&(((SplatValue || PatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SplatValue || PatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 893, __PRETTY_FUNCTION__))
893 "Expected either splat value or pattern value.")(((SplatValue || PatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SplatValue || PatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 893, __PRETTY_FUNCTION__))
;
894
895 // The trip count of the loop and the base pointer of the addrec SCEV is
896 // guaranteed to be loop invariant, which means that it should dominate the
897 // header. This allows us to insert code for it in the preheader.
898 unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
899 BasicBlock *Preheader = CurLoop->getLoopPreheader();
900 IRBuilder<> Builder(Preheader->getTerminator());
901 SCEVExpander Expander(*SE, *DL, "loop-idiom");
902
903 Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
904 Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
905
906 const SCEV *Start = Ev->getStart();
907 // Handle negative strided loops.
908 if (NegStride)
909 Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSize, SE);
910
911 // TODO: ideally we should still be able to generate memset if SCEV expander
912 // is taught to generate the dependencies at the latest point.
913 if (!isSafeToExpand(Start, *SE))
914 return false;
915
916 // Okay, we have a strided store "p[i]" of a splattable value. We can turn
917 // this into a memset in the loop preheader now if we want. However, this
918 // would be unsafe to do if there is anything else in the loop that may read
919 // or write to the aliased location. Check for any overlap by generating the
920 // base pointer and checking the region.
921 Value *BasePtr =
922 Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
923 if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
924 StoreSize, *AA, Stores)) {
925 Expander.clear();
926 // If we generated new code for the base pointer, clean up.
927 RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI);
928 return false;
929 }
930
931 if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
932 return false;
933
934 // Okay, everything looks good, insert the memset.
935
936 const SCEV *NumBytesS =
937 getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
938
939 // TODO: ideally we should still be able to generate memset if SCEV expander
940 // is taught to generate the dependencies at the latest point.
941 if (!isSafeToExpand(NumBytesS, *SE))
942 return false;
943
944 Value *NumBytes =
945 Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
946
947 CallInst *NewCall;
948 if (SplatValue) {
949 NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes,
950 MaybeAlign(StoreAlignment));
951 } else {
952 // Everything is emitted in default address space
953 Type *Int8PtrTy = DestInt8PtrTy;
954
955 Module *M = TheStore->getModule();
956 StringRef FuncName = "memset_pattern16";
957 FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
958 Int8PtrTy, Int8PtrTy, IntIdxTy);
959 inferLibFuncAttributes(M, FuncName, *TLI);
960
961 // Otherwise we should form a memset_pattern16. PatternValue is known to be
962 // an constant array of 16-bytes. Plop the value into a mergable global.
963 GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
964 GlobalValue::PrivateLinkage,
965 PatternValue, ".memset_pattern");
966 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
967 GV->setAlignment(Align(16));
968 Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
969 NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
970 }
971
972 LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memset: " <<
*NewCall << "\n" << " from store to: " <<
*Ev << " at: " << *TheStore << "\n"; } } while
(false)
973 << " from store to: " << *Ev << " at: " << *TheStoredo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memset: " <<
*NewCall << "\n" << " from store to: " <<
*Ev << " at: " << *TheStore << "\n"; } } while
(false)
974 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memset: " <<
*NewCall << "\n" << " from store to: " <<
*Ev << " at: " << *TheStore << "\n"; } } while
(false)
;
975 NewCall->setDebugLoc(TheStore->getDebugLoc());
976
977 ORE.emit([&]() {
978 return OptimizationRemark(DEBUG_TYPE"loop-idiom", "ProcessLoopStridedStore",
979 NewCall->getDebugLoc(), Preheader)
980 << "Transformed loop-strided store into a call to "
981 << ore::NV("NewFunction", NewCall->getCalledFunction())
982 << "() function";
983 });
984
985 // Okay, the memset has been formed. Zap the original store and anything that
986 // feeds into it.
987 for (auto *I : Stores)
988 deleteDeadInstruction(I);
989 ++NumMemSet;
990 return true;
991}
992
993/// If the stored value is a strided load in the same loop with the same stride
994/// this may be transformable into a memcpy. This kicks in for stuff like
995/// for (i) A[i] = B[i];
996bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
997 const SCEV *BECount) {
998 assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.")((SI->isUnordered() && "Expected only non-volatile non-ordered stores."
) ? static_cast<void> (0) : __assert_fail ("SI->isUnordered() && \"Expected only non-volatile non-ordered stores.\""
, "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 998, __PRETTY_FUNCTION__))
;
999
1000 Value *StorePtr = SI->getPointerOperand();
1001 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1002 APInt Stride = getStoreStride(StoreEv);
1003 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
1004 bool NegStride = StoreSize == -Stride;
1005
1006 // The store must be feeding a non-volatile load.
1007 LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
1008 assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.")((LI->isUnordered() && "Expected only non-volatile non-ordered loads."
) ? static_cast<void> (0) : __assert_fail ("LI->isUnordered() && \"Expected only non-volatile non-ordered loads.\""
, "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 1008, __PRETTY_FUNCTION__))
;
1009
1010 // See if the pointer expression is an AddRec like {base,+,1} on the current
1011 // loop, which indicates a strided load. If we have something else, it's a
1012 // random load we can't handle.
1013 const SCEVAddRecExpr *LoadEv =
1014 cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
1015
1016 // The trip count of the loop and the base pointer of the addrec SCEV is
1017 // guaranteed to be loop invariant, which means that it should dominate the
1018 // header. This allows us to insert code for it in the preheader.
1019 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1020 IRBuilder<> Builder(Preheader->getTerminator());
1021 SCEVExpander Expander(*SE, *DL, "loop-idiom");
1022
1023 const SCEV *StrStart = StoreEv->getStart();
1024 unsigned StrAS = SI->getPointerAddressSpace();
1025 Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
1026
1027 // Handle negative strided loops.
1028 if (NegStride)
1029 StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE);
1030
1031 // Okay, we have a strided store "p[i]" of a loaded value. We can turn
1032 // this into a memcpy in the loop preheader now if we want. However, this
1033 // would be unsafe to do if there is anything else in the loop that may read
1034 // or write the memory region we're storing to. This includes the load that
1035 // feeds the stores. Check for an alias by generating the base address and
1036 // checking everything.
1037 Value *StoreBasePtr = Expander.expandCodeFor(
1038 StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
1039
1040 SmallPtrSet<Instruction *, 1> Stores;
1041 Stores.insert(SI);
1042 if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1043 StoreSize, *AA, Stores)) {
1044 Expander.clear();
1045 // If we generated new code for the base pointer, clean up.
1046 RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
1047 return false;
1048 }
1049
1050 const SCEV *LdStart = LoadEv->getStart();
1051 unsigned LdAS = LI->getPointerAddressSpace();
1052
1053 // Handle negative strided loops.
1054 if (NegStride)
1055 LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE);
1056
1057 // For a memcpy, we have to make sure that the input array is not being
1058 // mutated by the loop.
1059 Value *LoadBasePtr = Expander.expandCodeFor(
1060 LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
1061
1062 if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
1063 StoreSize, *AA, Stores)) {
1064 Expander.clear();
1065 // If we generated new code for the base pointer, clean up.
1066 RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
1067 RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
1068 return false;
1069 }
1070
1071 if (avoidLIRForMultiBlockLoop())
1072 return false;
1073
1074 // Okay, everything is safe, we can transform this!
1075
1076 const SCEV *NumBytesS =
1077 getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
1078
1079 Value *NumBytes =
1080 Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1081
1082 CallInst *NewCall = nullptr;
1083 // Check whether to generate an unordered atomic memcpy:
1084 // If the load or store are atomic, then they must necessarily be unordered
1085 // by previous checks.
1086 if (!SI->isAtomic() && !LI->isAtomic())
1087 NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr,
1088 LI->getAlign(), NumBytes);
1089 else {
1090 // We cannot allow unaligned ops for unordered load/store, so reject
1091 // anything where the alignment isn't at least the element size.
1092 unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
1093 if (Align < StoreSize)
1094 return false;
1095
1096 // If the element.atomic memcpy is not lowered into explicit
1097 // loads/stores later, then it will be lowered into an element-size
1098 // specific lib call. If the lib call doesn't exist for our store size, then
1099 // we shouldn't generate the memcpy.
1100 if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1101 return false;
1102
1103 // Create the call.
1104 // Note that unordered atomic loads/stores are *required* by the spec to
1105 // have an alignment but non-atomic loads/stores may not.
1106 NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1107 StoreBasePtr, SI->getAlignment(), LoadBasePtr, LI->getAlignment(),
1108 NumBytes, StoreSize);
1109 }
1110 NewCall->setDebugLoc(SI->getDebugLoc());
1111
1112 LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
1113 << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
1114 << " from store ptr=" << *StoreEv << " at: " << *SIdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
1115 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
;
1116
1117 ORE.emit([&]() {
1118 return OptimizationRemark(DEBUG_TYPE"loop-idiom", "ProcessLoopStoreOfLoopLoad",
1119 NewCall->getDebugLoc(), Preheader)
1120 << "Formed a call to "
1121 << ore::NV("NewFunction", NewCall->getCalledFunction())
1122 << "() function";
1123 });
1124
1125 // Okay, the memcpy has been formed. Zap the original store and anything that
1126 // feeds into it.
1127 deleteDeadInstruction(SI);
1128 ++NumMemCpy;
1129 return true;
1130}
1131
1132// When compiling for codesize we avoid idiom recognition for a multi-block loop
1133// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1134//
1135bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1136 bool IsLoopMemset) {
1137 if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1138 if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) {
1139 LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " " << CurLoop->getHeader
()->getParent()->getName() << " : LIR " << (
IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n"
; } } while (false)
1140 << " : LIR " << (IsMemset ? "Memset" : "Memcpy")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " " << CurLoop->getHeader
()->getParent()->getName() << " : LIR " << (
IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n"
; } } while (false)
1141 << " avoided: multi-block top-level loop\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " " << CurLoop->getHeader
()->getParent()->getName() << " : LIR " << (
IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n"
; } } while (false)
;
1142 return true;
1143 }
1144 }
1145
1146 return false;
1147}
1148
1149bool LoopIdiomRecognize::runOnNoncountableLoop() {
1150 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
1151 << CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
1152 << "] Noncountable Loop %"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
1153 << CurLoop->getHeader()->getName() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
;
1154
1155 return recognizePopcount() || recognizeAndInsertFFS();
1156}
1157
1158/// Check if the given conditional branch is based on the comparison between
1159/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
1160/// true), the control yields to the loop entry. If the branch matches the
1161/// behavior, the variable involved in the comparison is returned. This function
1162/// will be called to see if the precondition and postcondition of the loop are
1163/// in desirable form.
1164static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
1165 bool JmpOnZero = false) {
1166 if (!BI || !BI->isConditional())
1167 return nullptr;
1168
1169 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1170 if (!Cond)
1171 return nullptr;
1172
1173 ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1174 if (!CmpZero || !CmpZero->isZero())
1175 return nullptr;
1176
1177 BasicBlock *TrueSucc = BI->getSuccessor(0);
1178 BasicBlock *FalseSucc = BI->getSuccessor(1);
1179 if (JmpOnZero)
1180 std::swap(TrueSucc, FalseSucc);
1181
1182 ICmpInst::Predicate Pred = Cond->getPredicate();
1183 if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1184 (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1185 return Cond->getOperand(0);
1186
1187 return nullptr;
1188}
1189
1190// Check if the recurrence variable `VarX` is in the right form to create
1191// the idiom. Returns the value coerced to a PHINode if so.
1192static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
1193 BasicBlock *LoopEntry) {
1194 auto *PhiX = dyn_cast<PHINode>(VarX);
1195 if (PhiX && PhiX->getParent() == LoopEntry &&
1196 (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1197 return PhiX;
1198 return nullptr;
1199}
1200
1201/// Return true iff the idiom is detected in the loop.
1202///
1203/// Additionally:
1204/// 1) \p CntInst is set to the instruction counting the population bit.
1205/// 2) \p CntPhi is set to the corresponding phi node.
1206/// 3) \p Var is set to the value whose population bits are being counted.
1207///
1208/// The core idiom we are trying to detect is:
1209/// \code
1210/// if (x0 != 0)
1211/// goto loop-exit // the precondition of the loop
1212/// cnt0 = init-val;
1213/// do {
1214/// x1 = phi (x0, x2);
1215/// cnt1 = phi(cnt0, cnt2);
1216///
1217/// cnt2 = cnt1 + 1;
1218/// ...
1219/// x2 = x1 & (x1 - 1);
1220/// ...
1221/// } while(x != 0);
1222///
1223/// loop-exit:
1224/// \endcode
1225static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1226 Instruction *&CntInst, PHINode *&CntPhi,
1227 Value *&Var) {
1228 // step 1: Check to see if the look-back branch match this pattern:
1229 // "if (a!=0) goto loop-entry".
1230 BasicBlock *LoopEntry;
1231 Instruction *DefX2, *CountInst;
1232 Value *VarX1, *VarX0;
1233 PHINode *PhiX, *CountPhi;
1234
1235 DefX2 = CountInst = nullptr;
1236 VarX1 = VarX0 = nullptr;
1237 PhiX = CountPhi = nullptr;
1238 LoopEntry = *(CurLoop->block_begin());
1239
1240 // step 1: Check if the loop-back branch is in desirable form.
1241 {
1242 if (Value *T = matchCondition(
1243 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1244 DefX2 = dyn_cast<Instruction>(T);
1245 else
1246 return false;
1247 }
1248
1249 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1250 {
1251 if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1252 return false;
1253
1254 BinaryOperator *SubOneOp;
1255
1256 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1257 VarX1 = DefX2->getOperand(1);
1258 else {
1259 VarX1 = DefX2->getOperand(0);
1260 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1261 }
1262 if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1263 return false;
1264
1265 ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1266 if (!Dec ||
1267 !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1268 (SubOneOp->getOpcode() == Instruction::Add &&
1269 Dec->isMinusOne()))) {
1270 return false;
1271 }
1272 }
1273
1274 // step 3: Check the recurrence of variable X
1275 PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1276 if (!PhiX)
1277 return false;
1278
1279 // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
1280 {
1281 CountInst = nullptr;
1282 for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1283 IterE = LoopEntry->end();
1284 Iter != IterE; Iter++) {
1285 Instruction *Inst = &*Iter;
1286 if (Inst->getOpcode() != Instruction::Add)
1287 continue;
1288
1289 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1290 if (!Inc || !Inc->isOne())
1291 continue;
1292
1293 PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1294 if (!Phi)
1295 continue;
1296
1297 // Check if the result of the instruction is live of the loop.
1298 bool LiveOutLoop = false;
1299 for (User *U : Inst->users()) {
1300 if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1301 LiveOutLoop = true;
1302 break;
1303 }
1304 }
1305
1306 if (LiveOutLoop) {
1307 CountInst = Inst;
1308 CountPhi = Phi;
1309 break;
1310 }
1311 }
1312
1313 if (!CountInst)
1314 return false;
1315 }
1316
1317 // step 5: check if the precondition is in this form:
1318 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1319 {
1320 auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1321 Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1322 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1323 return false;
1324
1325 CntInst = CountInst;
1326 CntPhi = CountPhi;
1327 Var = T;
1328 }
1329
1330 return true;
1331}
1332
1333/// Return true if the idiom is detected in the loop.
1334///
1335/// Additionally:
1336/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1337/// or nullptr if there is no such.
1338/// 2) \p CntPhi is set to the corresponding phi node
1339/// or nullptr if there is no such.
1340/// 3) \p Var is set to the value whose CTLZ could be used.
1341/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1342///
1343/// The core idiom we are trying to detect is:
1344/// \code
1345/// if (x0 == 0)
1346/// goto loop-exit // the precondition of the loop
1347/// cnt0 = init-val;
1348/// do {
1349/// x = phi (x0, x.next); //PhiX
1350/// cnt = phi(cnt0, cnt.next);
1351///
1352/// cnt.next = cnt + 1;
1353/// ...
1354/// x.next = x >> 1; // DefX
1355/// ...
1356/// } while(x.next != 0);
1357///
1358/// loop-exit:
1359/// \endcode
1360static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1361 Intrinsic::ID &IntrinID, Value *&InitX,
1362 Instruction *&CntInst, PHINode *&CntPhi,
1363 Instruction *&DefX) {
1364 BasicBlock *LoopEntry;
1365 Value *VarX = nullptr;
1366
1367 DefX = nullptr;
1368 CntInst = nullptr;
1369 CntPhi = nullptr;
1370 LoopEntry = *(CurLoop->block_begin());
1371
1372 // step 1: Check if the loop-back branch is in desirable form.
1373 if (Value *T = matchCondition(
1374 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1375 DefX = dyn_cast<Instruction>(T);
1376 else
1377 return false;
1378
1379 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1380 if (!DefX || !DefX->isShift())
1381 return false;
1382 IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1383 Intrinsic::ctlz;
1384 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1385 if (!Shft || !Shft->isOne())
1386 return false;
1387 VarX = DefX->getOperand(0);
1388
1389 // step 3: Check the recurrence of variable X
1390 PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1391 if (!PhiX)
1392 return false;
1393
1394 InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1395
1396 // Make sure the initial value can't be negative otherwise the ashr in the
1397 // loop might never reach zero which would make the loop infinite.
1398 if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1399 return false;
1400
1401 // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
1402 // TODO: We can skip the step. If loop trip count is known (CTLZ),
1403 // then all uses of "cnt.next" could be optimized to the trip count
1404 // plus "cnt0". Currently it is not optimized.
1405 // This step could be used to detect POPCNT instruction:
1406 // cnt.next = cnt + (x.next & 1)
1407 for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1408 IterE = LoopEntry->end();
1409 Iter != IterE; Iter++) {
1410 Instruction *Inst = &*Iter;
1411 if (Inst->getOpcode() != Instruction::Add)
1412 continue;
1413
1414 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1415 if (!Inc || !Inc->isOne())
1416 continue;
1417
1418 PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1419 if (!Phi)
1420 continue;
1421
1422 CntInst = Inst;
1423 CntPhi = Phi;
1424 break;
1425 }
1426 if (!CntInst)
1427 return false;
1428
1429 return true;
1430}
1431
1432/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1433/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
1434/// trip count returns true; otherwise, returns false.
1435bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1436 // Give up if the loop has multiple blocks or multiple backedges.
1437 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1438 return false;
1439
1440 Intrinsic::ID IntrinID;
1441 Value *InitX;
1442 Instruction *DefX = nullptr;
1443 PHINode *CntPhi = nullptr;
1444 Instruction *CntInst = nullptr;
1445 // Help decide if transformation is profitable. For ShiftUntilZero idiom,
1446 // this is always 6.
1447 size_t IdiomCanonicalSize = 6;
1448
1449 if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1450 CntInst, CntPhi, DefX))
1451 return false;
1452
1453 bool IsCntPhiUsedOutsideLoop = false;
1454 for (User *U : CntPhi->users())
1455 if (!CurLoop->contains(cast<Instruction>(U))) {
1456 IsCntPhiUsedOutsideLoop = true;
1457 break;
1458 }
1459 bool IsCntInstUsedOutsideLoop = false;
1460 for (User *U : CntInst->users())
1461 if (!CurLoop->contains(cast<Instruction>(U))) {
1462 IsCntInstUsedOutsideLoop = true;
1463 break;
1464 }
1465 // If both CntInst and CntPhi are used outside the loop the profitability
1466 // is questionable.
1467 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1468 return false;
1469
1470 // For some CPUs result of CTLZ(X) intrinsic is undefined
1471 // when X is 0. If we can not guarantee X != 0, we need to check this
1472 // when expand.
1473 bool ZeroCheck = false;
1474 // It is safe to assume Preheader exist as it was checked in
1475 // parent function RunOnLoop.
1476 BasicBlock *PH = CurLoop->getLoopPreheader();
1477
1478 // If we are using the count instruction outside the loop, make sure we
1479 // have a zero check as a precondition. Without the check the loop would run
1480 // one iteration for before any check of the input value. This means 0 and 1
1481 // would have identical behavior in the original loop and thus
1482 if (!IsCntPhiUsedOutsideLoop) {
1483 auto *PreCondBB = PH->getSinglePredecessor();
1484 if (!PreCondBB)
1485 return false;
1486 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1487 if (!PreCondBI)
1488 return false;
1489 if (matchCondition(PreCondBI, PH) != InitX)
1490 return false;
1491 ZeroCheck = true;
1492 }
1493
1494 // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1495 // profitable if we delete the loop.
1496
1497 // the loop has only 6 instructions:
1498 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1499 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1500 // %shr = ashr %n.addr.0, 1
1501 // %tobool = icmp eq %shr, 0
1502 // %inc = add nsw %i.0, 1
1503 // br i1 %tobool
1504
1505 const Value *Args[] =
1506 {InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
1507 : ConstantInt::getFalse(InitX->getContext())};
1508
1509 // @llvm.dbg doesn't count as they have no semantic effect.
1510 auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1511 uint32_t HeaderSize =
1512 std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1513
1514 if (HeaderSize != IdiomCanonicalSize &&
1515 TTI->getIntrinsicCost(IntrinID, InitX->getType(), Args) >
1516 TargetTransformInfo::TCC_Basic)
1517 return false;
1518
1519 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1520 DefX->getDebugLoc(), ZeroCheck,
1521 IsCntPhiUsedOutsideLoop);
1522 return true;
1523}
1524
1525/// Recognizes a population count idiom in a non-countable loop.
1526///
1527/// If detected, transforms the relevant code to issue the popcount intrinsic
1528/// function call, and returns true; otherwise, returns false.
1529bool LoopIdiomRecognize::recognizePopcount() {
1530 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1531 return false;
1532
1533 // Counting population are usually conducted by few arithmetic instructions.
1534 // Such instructions can be easily "absorbed" by vacant slots in a
1535 // non-compact loop. Therefore, recognizing popcount idiom only makes sense
1536 // in a compact loop.
1537
1538 // Give up if the loop has multiple blocks or multiple backedges.
1539 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1540 return false;
1541
1542 BasicBlock *LoopBody = *(CurLoop->block_begin());
1543 if (LoopBody->size() >= 20) {
1544 // The loop is too big, bail out.
1545 return false;
1546 }
1547
1548 // It should have a preheader containing nothing but an unconditional branch.
1549 BasicBlock *PH = CurLoop->getLoopPreheader();
1550 if (!PH || &PH->front() != PH->getTerminator())
1551 return false;
1552 auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1553 if (!EntryBI || EntryBI->isConditional())
1554 return false;
1555
1556 // It should have a precondition block where the generated popcount intrinsic
1557 // function can be inserted.
1558 auto *PreCondBB = PH->getSinglePredecessor();
1559 if (!PreCondBB)
1560 return false;
1561 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1562 if (!PreCondBI || PreCondBI->isUnconditional())
1563 return false;
1564
1565 Instruction *CntInst;
1566 PHINode *CntPhi;
1567 Value *Val;
1568 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1569 return false;
1570
1571 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1572 return true;
1573}
1574
1575static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1576 const DebugLoc &DL) {
1577 Value *Ops[] = {Val};
1578 Type *Tys[] = {Val->getType()};
1579
1580 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1581 Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1582 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1583 CI->setDebugLoc(DL);
1584
1585 return CI;
1586}
1587
1588static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1589 const DebugLoc &DL, bool ZeroCheck,
1590 Intrinsic::ID IID) {
1591 Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
1592 Type *Tys[] = {Val->getType()};
1593
1594 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1595 Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
1596 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1597 CI->setDebugLoc(DL);
1598
1599 return CI;
1600}
1601
1602/// Transform the following loop (Using CTLZ, CTTZ is similar):
1603/// loop:
1604/// CntPhi = PHI [Cnt0, CntInst]
1605/// PhiX = PHI [InitX, DefX]
1606/// CntInst = CntPhi + 1
1607/// DefX = PhiX >> 1
1608/// LOOP_BODY
1609/// Br: loop if (DefX != 0)
1610/// Use(CntPhi) or Use(CntInst)
1611///
1612/// Into:
1613/// If CntPhi used outside the loop:
1614/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
1615/// Count = CountPrev + 1
1616/// else
1617/// Count = BitWidth(InitX) - CTLZ(InitX)
1618/// loop:
1619/// CntPhi = PHI [Cnt0, CntInst]
1620/// PhiX = PHI [InitX, DefX]
1621/// PhiCount = PHI [Count, Dec]
1622/// CntInst = CntPhi + 1
1623/// DefX = PhiX >> 1
1624/// Dec = PhiCount - 1
1625/// LOOP_BODY
1626/// Br: loop if (Dec != 0)
1627/// Use(CountPrev + Cnt0) // Use(CntPhi)
1628/// or
1629/// Use(Count + Cnt0) // Use(CntInst)
1630///
1631/// If LOOP_BODY is empty the loop will be deleted.
1632/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
1633void LoopIdiomRecognize::transformLoopToCountable(
1634 Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
1635 PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
1636 bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
1637 BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
1638
1639 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
1640 IRBuilder<> Builder(PreheaderBr);
1641 Builder.SetCurrentDebugLocation(DL);
1642 Value *FFS, *Count, *CountPrev, *NewCount, *InitXNext;
1643
1644 // Count = BitWidth - CTLZ(InitX);
1645 // If there are uses of CntPhi create:
1646 // CountPrev = BitWidth - CTLZ(InitX >> 1);
1647 if (IsCntPhiUsedOutsideLoop) {
1648 if (DefX->getOpcode() == Instruction::AShr)
1649 InitXNext =
1650 Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1));
1651 else if (DefX->getOpcode() == Instruction::LShr)
1652 InitXNext =
1653 Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1));
1654 else if (DefX->getOpcode() == Instruction::Shl) // cttz
1655 InitXNext =
1656 Builder.CreateShl(InitX, ConstantInt::get(InitX->getType(), 1));
1657 else
1658 llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "/build/llvm-toolchain-snapshot-10~++20200112100611+7fa5290d5bd/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 1658)
;
1659 } else
1660 InitXNext = InitX;
1661 FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
1662 Count = Builder.CreateSub(
1663 ConstantInt::get(FFS->getType(),
1664 FFS->getType()->getIntegerBitWidth()),
1665 FFS);
1666 if (IsCntPhiUsedOutsideLoop) {
1667 CountPrev = Count;
1668 Count = Builder.CreateAdd(
1669 CountPrev,
1670 ConstantInt::get(CountPrev->getType(), 1));
1671 }
1672
1673 NewCount = Builder.CreateZExtOrTrunc(
1674 IsCntPhiUsedOutsideLoop ? CountPrev : Count,
1675 cast<IntegerType>(CntInst->getType()));
1676
1677 // If the counter's initial value is not zero, insert Add Inst.
1678 Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
1679 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1680 if (!InitConst || !InitConst->isZero())
1681 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1682
1683 // Step 2: Insert new IV and loop condition:
1684 // loop:
1685 // ...
1686 // PhiCount = PHI [Count, Dec]
1687 // ...
1688 // Dec = PhiCount - 1
1689 // ...
1690 // Br: loop if (Dec != 0)
1691 BasicBlock *Body = *(CurLoop->block_begin());
1692 auto *LbBr = cast<BranchInst>(Body->getTerminator());
1693 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1694 Type *Ty = Count->getType();
1695
1696 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1697
1698 Builder.SetInsertPoint(LbCond);
1699 Instruction *TcDec = cast<Instruction>(
1700 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1701 "tcdec", false, true));
1702
1703 TcPhi->addIncoming(Count, Preheader);
1704 TcPhi->addIncoming(TcDec, Body);
1705
1706 CmpInst::Predicate Pred =
1707 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
1708 LbCond->setPredicate(Pred);
1709 LbCond->setOperand(0, TcDec);
1710 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1711
1712 // Step 3: All the references to the original counter outside
1713 // the loop are replaced with the NewCount
1714 if (IsCntPhiUsedOutsideLoop)
1715 CntPhi->replaceUsesOutsideBlock(NewCount, Body);
1716 else
1717 CntInst->replaceUsesOutsideBlock(NewCount, Body);
1718
1719 // step 4: Forget the "non-computable" trip-count SCEV associated with the
1720 // loop. The loop would otherwise not be deleted even if it becomes empty.
1721 SE->forgetLoop(CurLoop);
1722}
1723
1724void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
1725 Instruction *CntInst,
1726 PHINode *CntPhi, Value *Var) {
1727 BasicBlock *PreHead = CurLoop->getLoopPreheader();
1728 auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
1729 const DebugLoc &DL = CntInst->getDebugLoc();
1730
1731 // Assuming before transformation, the loop is following:
1732 // if (x) // the precondition
1733 // do { cnt++; x &= x - 1; } while(x);
1734
1735 // Step 1: Insert the ctpop instruction at the end of the precondition block
1736 IRBuilder<> Builder(PreCondBr);
1737 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
1738 {
1739 PopCnt = createPopcntIntrinsic(Builder, Var, DL);
1740 NewCount = PopCntZext =
1741 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
1742
1743 if (NewCount != PopCnt)
1744 (cast<Instruction>(NewCount))->setDebugLoc(DL);
1745
1746 // TripCnt is exactly the number of iterations the loop has
1747 TripCnt = NewCount;
1748
1749 // If the population counter's initial value is not zero, insert Add Inst.
1750 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
1751 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1752 if (!InitConst || !InitConst->isZero()) {
1753 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1754 (cast<Instruction>(NewCount))->setDebugLoc(DL);
1755 }
1756 }
1757
1758 // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
1759 // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
1760 // function would be partial dead code, and downstream passes will drag
1761 // it back from the precondition block to the preheader.
1762 {
1763 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
1764
1765 Value *Opnd0 = PopCntZext;
1766 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
1767 if (PreCond->getOperand(0) != Var)
1768 std::swap(Opnd0, Opnd1);
1769
1770 ICmpInst *NewPreCond = cast<ICmpInst>(
1771 Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
1772 PreCondBr->setCondition(NewPreCond);
1773
1774 RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
1775 }
1776
1777 // Step 3: Note that the population count is exactly the trip count of the
1778 // loop in question, which enable us to convert the loop from noncountable
1779 // loop into a countable one. The benefit is twofold:
1780 //
1781 // - If the loop only counts population, the entire loop becomes dead after
1782 // the transformation. It is a lot easier to prove a countable loop dead
1783 // than to prove a noncountable one. (In some C dialects, an infinite loop
1784 // isn't dead even if it computes nothing useful. In general, DCE needs
1785 // to prove a noncountable loop finite before safely delete it.)
1786 //
1787 // - If the loop also performs something else, it remains alive.
1788 // Since it is transformed to countable form, it can be aggressively
1789 // optimized by some optimizations which are in general not applicable
1790 // to a noncountable loop.
1791 //
1792 // After this step, this loop (conceptually) would look like following:
1793 // newcnt = __builtin_ctpop(x);
1794 // t = newcnt;
1795 // if (x)
1796 // do { cnt++; x &= x-1; t--) } while (t > 0);
1797 BasicBlock *Body = *(CurLoop->block_begin());
1798 {
1799 auto *LbBr = cast<BranchInst>(Body->getTerminator());
1800 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1801 Type *Ty = TripCnt->getType();
1802
1803 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1804
1805 Builder.SetInsertPoint(LbCond);
1806 Instruction *TcDec = cast<Instruction>(
1807 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1808 "tcdec", false, true));
1809
1810 TcPhi->addIncoming(TripCnt, PreHead);
1811 TcPhi->addIncoming(TcDec, Body);
1812
1813 CmpInst::Predicate Pred =
1814 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
1815 LbCond->setPredicate(Pred);
1816 LbCond->setOperand(0, TcDec);
1817 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1818 }
1819
1820 // Step 4: All the references to the original population counter outside
1821 // the loop are replaced with the NewCount -- the value returned from
1822 // __builtin_ctpop().
1823 CntInst->replaceUsesOutsideBlock(NewCount, Body);
1824
1825 // step 5: Forget the "non-computable" trip-count SCEV associated with the
1826 // loop. The loop would otherwise not be deleted even if it becomes empty.
1827 SE->forgetLoop(CurLoop);
1828}