Bug Summary

File:llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
Warning:line 480, column 15
Although the value stored to 'PatternValue' is used in the enclosing expression, the value is never actually read from 'PatternValue'

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopIdiomRecognize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -mframe-pointer=none -fmath-errno -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -target-cpu x86-64 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-11/lib/clang/11.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/build-llvm/lib/Transforms/Scalar -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Scalar -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/build-llvm/include -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-11/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/build-llvm/lib/Transforms/Scalar -fdebug-prefix-map=/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347=. -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2020-03-09-184146-41876-1 -x c++ /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
1//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements an idiom recognizer that transforms simple loops into a
10// non-loop form. In cases that this kicks in, it can be a significant
11// performance win.
12//
13// If compiling for code size we avoid idiom recognition if the resulting
14// code could be larger than the code for the original loop. One way this could
15// happen is if the loop is not removable after idiom recognition due to the
16// presence of non-idiom instructions. The initial implementation of the
17// heuristics applies to idioms in multi-block loops.
18//
19//===----------------------------------------------------------------------===//
20//
21// TODO List:
22//
23// Future loop memory idioms to recognize:
24// memcmp, memmove, strlen, etc.
25// Future floating point idioms to recognize in -ffast-math mode:
26// fpowi
27// Future integer operation idioms to recognize:
28// ctpop
29//
30// Beware that isel's default lowering for ctpop is highly inefficient for
31// i64 and larger types when i64 is legal and the value has few bits set. It
32// would be good to enhance isel to emit a loop for ctpop in this case.
33//
34// This could recognize common matrix multiplies and dot product idioms and
35// replace them with calls to BLAS (if linked in??).
36//
37//===----------------------------------------------------------------------===//
38
39#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
40#include "llvm/ADT/APInt.h"
41#include "llvm/ADT/ArrayRef.h"
42#include "llvm/ADT/DenseMap.h"
43#include "llvm/ADT/MapVector.h"
44#include "llvm/ADT/SetVector.h"
45#include "llvm/ADT/SmallPtrSet.h"
46#include "llvm/ADT/SmallVector.h"
47#include "llvm/ADT/Statistic.h"
48#include "llvm/ADT/StringRef.h"
49#include "llvm/Analysis/AliasAnalysis.h"
50#include "llvm/Analysis/LoopAccessAnalysis.h"
51#include "llvm/Analysis/LoopInfo.h"
52#include "llvm/Analysis/LoopPass.h"
53#include "llvm/Analysis/MemoryLocation.h"
54#include "llvm/Analysis/MemorySSA.h"
55#include "llvm/Analysis/MemorySSAUpdater.h"
56#include "llvm/Analysis/OptimizationRemarkEmitter.h"
57#include "llvm/Analysis/ScalarEvolution.h"
58#include "llvm/Analysis/ScalarEvolutionExpander.h"
59#include "llvm/Analysis/ScalarEvolutionExpressions.h"
60#include "llvm/Analysis/TargetLibraryInfo.h"
61#include "llvm/Analysis/TargetTransformInfo.h"
62#include "llvm/Analysis/ValueTracking.h"
63#include "llvm/IR/Attributes.h"
64#include "llvm/IR/BasicBlock.h"
65#include "llvm/IR/Constant.h"
66#include "llvm/IR/Constants.h"
67#include "llvm/IR/DataLayout.h"
68#include "llvm/IR/DebugLoc.h"
69#include "llvm/IR/DerivedTypes.h"
70#include "llvm/IR/Dominators.h"
71#include "llvm/IR/GlobalValue.h"
72#include "llvm/IR/GlobalVariable.h"
73#include "llvm/IR/IRBuilder.h"
74#include "llvm/IR/InstrTypes.h"
75#include "llvm/IR/Instruction.h"
76#include "llvm/IR/Instructions.h"
77#include "llvm/IR/IntrinsicInst.h"
78#include "llvm/IR/Intrinsics.h"
79#include "llvm/IR/LLVMContext.h"
80#include "llvm/IR/Module.h"
81#include "llvm/IR/PassManager.h"
82#include "llvm/IR/Type.h"
83#include "llvm/IR/User.h"
84#include "llvm/IR/Value.h"
85#include "llvm/IR/ValueHandle.h"
86#include "llvm/InitializePasses.h"
87#include "llvm/Pass.h"
88#include "llvm/Support/Casting.h"
89#include "llvm/Support/CommandLine.h"
90#include "llvm/Support/Debug.h"
91#include "llvm/Support/raw_ostream.h"
92#include "llvm/Transforms/Scalar.h"
93#include "llvm/Transforms/Utils/BuildLibCalls.h"
94#include "llvm/Transforms/Utils/Local.h"
95#include "llvm/Transforms/Utils/LoopUtils.h"
96#include <algorithm>
97#include <cassert>
98#include <cstdint>
99#include <utility>
100#include <vector>
101
102using namespace llvm;
103
104#define DEBUG_TYPE"loop-idiom" "loop-idiom"
105
106STATISTIC(NumMemSet, "Number of memset's formed from loop stores")static llvm::Statistic NumMemSet = {"loop-idiom", "NumMemSet"
, "Number of memset's formed from loop stores"}
;
107STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores")static llvm::Statistic NumMemCpy = {"loop-idiom", "NumMemCpy"
, "Number of memcpy's formed from loop load+stores"}
;
108
109static cl::opt<bool> UseLIRCodeSizeHeurs(
110 "use-lir-code-size-heurs",
111 cl::desc("Use loop idiom recognition code size heuristics when compiling"
112 "with -Os/-Oz"),
113 cl::init(true), cl::Hidden);
114
115namespace {
116
117class LoopIdiomRecognize {
118 Loop *CurLoop = nullptr;
119 AliasAnalysis *AA;
120 DominatorTree *DT;
121 LoopInfo *LI;
122 ScalarEvolution *SE;
123 TargetLibraryInfo *TLI;
124 const TargetTransformInfo *TTI;
125 const DataLayout *DL;
126 OptimizationRemarkEmitter &ORE;
127 bool ApplyCodeSizeHeuristics;
128 std::unique_ptr<MemorySSAUpdater> MSSAU;
129
130public:
131 explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
132 LoopInfo *LI, ScalarEvolution *SE,
133 TargetLibraryInfo *TLI,
134 const TargetTransformInfo *TTI, MemorySSA *MSSA,
135 const DataLayout *DL,
136 OptimizationRemarkEmitter &ORE)
137 : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
138 if (MSSA)
139 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
140 }
141
142 bool runOnLoop(Loop *L);
143
144private:
145 using StoreList = SmallVector<StoreInst *, 8>;
146 using StoreListMap = MapVector<Value *, StoreList>;
147
148 StoreListMap StoreRefsForMemset;
149 StoreListMap StoreRefsForMemsetPattern;
150 StoreList StoreRefsForMemcpy;
151 bool HasMemset;
152 bool HasMemsetPattern;
153 bool HasMemcpy;
154
155 /// Return code for isLegalStore()
156 enum LegalStoreKind {
157 None = 0,
158 Memset,
159 MemsetPattern,
160 Memcpy,
161 UnorderedAtomicMemcpy,
162 DontUse // Dummy retval never to be used. Allows catching errors in retval
163 // handling.
164 };
165
166 /// \name Countable Loop Idiom Handling
167 /// @{
168
169 bool runOnCountableLoop();
170 bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
171 SmallVectorImpl<BasicBlock *> &ExitBlocks);
172
173 void collectStores(BasicBlock *BB);
174 LegalStoreKind isLegalStore(StoreInst *SI);
175 enum class ForMemset { No, Yes };
176 bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
177 ForMemset For);
178 bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
179
180 bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
181 MaybeAlign StoreAlignment, Value *StoredVal,
182 Instruction *TheStore,
183 SmallPtrSetImpl<Instruction *> &Stores,
184 const SCEVAddRecExpr *Ev, const SCEV *BECount,
185 bool NegStride, bool IsLoopMemset = false);
186 bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
187 bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
188 bool IsLoopMemset = false);
189
190 /// @}
191 /// \name Noncountable Loop Idiom Handling
192 /// @{
193
194 bool runOnNoncountableLoop();
195
196 bool recognizePopcount();
197 void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
198 PHINode *CntPhi, Value *Var);
199 bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
200 void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
201 Instruction *CntInst, PHINode *CntPhi,
202 Value *Var, Instruction *DefX,
203 const DebugLoc &DL, bool ZeroCheck,
204 bool IsCntPhiUsedOutsideLoop);
205
206 /// @}
207};
208
209class LoopIdiomRecognizeLegacyPass : public LoopPass {
210public:
211 static char ID;
212
213 explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
214 initializeLoopIdiomRecognizeLegacyPassPass(
215 *PassRegistry::getPassRegistry());
216 }
217
218 bool runOnLoop(Loop *L, LPPassManager &LPM) override {
219 if (skipLoop(L))
220 return false;
221
222 AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
223 DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
224 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
225 ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
226 TargetLibraryInfo *TLI =
227 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
228 *L->getHeader()->getParent());
229 const TargetTransformInfo *TTI =
230 &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
231 *L->getHeader()->getParent());
232 const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
233 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
234 MemorySSA *MSSA = nullptr;
235 if (MSSAAnalysis)
236 MSSA = &MSSAAnalysis->getMSSA();
237
238 // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
239 // pass. Function analyses need to be preserved across loop transformations
240 // but ORE cannot be preserved (see comment before the pass definition).
241 OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
242
243 LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
244 return LIR.runOnLoop(L);
245 }
246
247 /// This transformation requires natural loop information & requires that
248 /// loop preheaders be inserted into the CFG.
249 void getAnalysisUsage(AnalysisUsage &AU) const override {
250 AU.addRequired<TargetLibraryInfoWrapperPass>();
251 AU.addRequired<TargetTransformInfoWrapperPass>();
252 AU.addPreserved<MemorySSAWrapperPass>();
253 getLoopAnalysisUsage(AU);
254 }
255};
256
257} // end anonymous namespace
258
259char LoopIdiomRecognizeLegacyPass::ID = 0;
260
261PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
262 LoopStandardAnalysisResults &AR,
263 LPMUpdater &) {
264 const auto *DL = &L.getHeader()->getModule()->getDataLayout();
265
266 // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
267 // pass. Function analyses need to be preserved across loop transformations
268 // but ORE cannot be preserved (see comment before the pass definition).
269 OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
270
271 LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
272 AR.MSSA, DL, ORE);
273 if (!LIR.runOnLoop(&L))
274 return PreservedAnalyses::all();
275
276 auto PA = getLoopPassPreservedAnalyses();
277 if (AR.MSSA)
278 PA.preserve<MemorySSAAnalysis>();
279 return PA;
280}
281
282INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",static void *initializeLoopIdiomRecognizeLegacyPassPassOnce(PassRegistry
&Registry) {
283 "Recognize loop idioms", false, false)static void *initializeLoopIdiomRecognizeLegacyPassPassOnce(PassRegistry
&Registry) {
284INITIALIZE_PASS_DEPENDENCY(LoopPass)initializeLoopPassPass(Registry);
285INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)initializeTargetLibraryInfoWrapperPassPass(Registry);
286INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)initializeTargetTransformInfoWrapperPassPass(Registry);
287INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",PassInfo *PI = new PassInfo( "Recognize loop idioms", "loop-idiom"
, &LoopIdiomRecognizeLegacyPass::ID, PassInfo::NormalCtor_t
(callDefaultCtor<LoopIdiomRecognizeLegacyPass>), false,
false); Registry.registerPass(*PI, true); return PI; } static
llvm::once_flag InitializeLoopIdiomRecognizeLegacyPassPassFlag
; void llvm::initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry
&Registry) { llvm::call_once(InitializeLoopIdiomRecognizeLegacyPassPassFlag
, initializeLoopIdiomRecognizeLegacyPassPassOnce, std::ref(Registry
)); }
288 "Recognize loop idioms", false, false)PassInfo *PI = new PassInfo( "Recognize loop idioms", "loop-idiom"
, &LoopIdiomRecognizeLegacyPass::ID, PassInfo::NormalCtor_t
(callDefaultCtor<LoopIdiomRecognizeLegacyPass>), false,
false); Registry.registerPass(*PI, true); return PI; } static
llvm::once_flag InitializeLoopIdiomRecognizeLegacyPassPassFlag
; void llvm::initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry
&Registry) { llvm::call_once(InitializeLoopIdiomRecognizeLegacyPassPassFlag
, initializeLoopIdiomRecognizeLegacyPassPassOnce, std::ref(Registry
)); }
289
290Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
291
292static void deleteDeadInstruction(Instruction *I) {
293 I->replaceAllUsesWith(UndefValue::get(I->getType()));
294 I->eraseFromParent();
295}
296
297//===----------------------------------------------------------------------===//
298//
299// Implementation of LoopIdiomRecognize
300//
301//===----------------------------------------------------------------------===//
302
303bool LoopIdiomRecognize::runOnLoop(Loop *L) {
304 CurLoop = L;
305 // If the loop could not be converted to canonical form, it must have an
306 // indirectbr in it, just give up.
307 if (!L->getLoopPreheader())
308 return false;
309
310 // Disable loop idiom recognition if the function's name is a common idiom.
311 StringRef Name = L->getHeader()->getParent()->getName();
312 if (Name == "memset" || Name == "memcpy")
313 return false;
314
315 // Determine if code size heuristics need to be applied.
316 ApplyCodeSizeHeuristics =
317 L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
318
319 HasMemset = TLI->has(LibFunc_memset);
320 HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
321 HasMemcpy = TLI->has(LibFunc_memcpy);
322
323 if (HasMemset || HasMemsetPattern || HasMemcpy)
324 if (SE->hasLoopInvariantBackedgeTakenCount(L))
325 return runOnCountableLoop();
326
327 return runOnNoncountableLoop();
328}
329
330bool LoopIdiomRecognize::runOnCountableLoop() {
331 const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
332 assert(!isa<SCEVCouldNotCompute>(BECount) &&((!isa<SCEVCouldNotCompute>(BECount) && "runOnCountableLoop() called on a loop without a predictable"
"backedge-taken count") ? static_cast<void> (0) : __assert_fail
("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 334, __PRETTY_FUNCTION__))
333 "runOnCountableLoop() called on a loop without a predictable"((!isa<SCEVCouldNotCompute>(BECount) && "runOnCountableLoop() called on a loop without a predictable"
"backedge-taken count") ? static_cast<void> (0) : __assert_fail
("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 334, __PRETTY_FUNCTION__))
334 "backedge-taken count")((!isa<SCEVCouldNotCompute>(BECount) && "runOnCountableLoop() called on a loop without a predictable"
"backedge-taken count") ? static_cast<void> (0) : __assert_fail
("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 334, __PRETTY_FUNCTION__))
;
335
336 // If this loop executes exactly one time, then it should be peeled, not
337 // optimized by this pass.
338 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
339 if (BECst->getAPInt() == 0)
340 return false;
341
342 SmallVector<BasicBlock *, 8> ExitBlocks;
343 CurLoop->getUniqueExitBlocks(ExitBlocks);
344
345 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
346 << CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
347 << "] Countable Loop %" << CurLoop->getHeader()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
348 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
;
349
350 // The following transforms hoist stores/memsets into the loop pre-header.
351 // Give up if the loop has instructions that may throw.
352 SimpleLoopSafetyInfo SafetyInfo;
353 SafetyInfo.computeLoopSafetyInfo(CurLoop);
354 if (SafetyInfo.anyBlockMayThrow())
355 return false;
356
357 bool MadeChange = false;
358
359 // Scan all the blocks in the loop that are not in subloops.
360 for (auto *BB : CurLoop->getBlocks()) {
361 // Ignore blocks in subloops.
362 if (LI->getLoopFor(BB) != CurLoop)
363 continue;
364
365 MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
366 }
367 return MadeChange;
368}
369
370static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
371 const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
372 return ConstStride->getAPInt();
373}
374
375/// getMemSetPatternValue - If a strided store of the specified value is safe to
376/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
377/// be passed in. Otherwise, return null.
378///
379/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
380/// just replicate their input array and then pass on to memset_pattern16.
381static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
382 // FIXME: This could check for UndefValue because it can be merged into any
383 // other valid pattern.
384
385 // If the value isn't a constant, we can't promote it to being in a constant
386 // array. We could theoretically do a store to an alloca or something, but
387 // that doesn't seem worthwhile.
388 Constant *C = dyn_cast<Constant>(V);
389 if (!C)
390 return nullptr;
391
392 // Only handle simple values that are a power of two bytes in size.
393 uint64_t Size = DL->getTypeSizeInBits(V->getType());
394 if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
395 return nullptr;
396
397 // Don't care enough about darwin/ppc to implement this.
398 if (DL->isBigEndian())
399 return nullptr;
400
401 // Convert to size in bytes.
402 Size /= 8;
403
404 // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
405 // if the top and bottom are the same (e.g. for vectors and large integers).
406 if (Size > 16)
407 return nullptr;
408
409 // If the constant is exactly 16 bytes, just use it.
410 if (Size == 16)
411 return C;
412
413 // Otherwise, we'll use an array of the constants.
414 unsigned ArraySize = 16 / Size;
415 ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
416 return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
417}
418
419LoopIdiomRecognize::LegalStoreKind
420LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
421 // Don't touch volatile stores.
422 if (SI->isVolatile())
423 return LegalStoreKind::None;
424 // We only want simple or unordered-atomic stores.
425 if (!SI->isUnordered())
426 return LegalStoreKind::None;
427
428 // Don't convert stores of non-integral pointer types to memsets (which stores
429 // integers).
430 if (DL->isNonIntegralPointerType(SI->getValueOperand()->getType()))
431 return LegalStoreKind::None;
432
433 // Avoid merging nontemporal stores.
434 if (SI->getMetadata(LLVMContext::MD_nontemporal))
435 return LegalStoreKind::None;
436
437 Value *StoredVal = SI->getValueOperand();
438 Value *StorePtr = SI->getPointerOperand();
439
440 // Reject stores that are so large that they overflow an unsigned.
441 uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
442 if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
443 return LegalStoreKind::None;
444
445 // See if the pointer expression is an AddRec like {base,+,1} on the current
446 // loop, which indicates a strided store. If we have something else, it's a
447 // random store we can't handle.
448 const SCEVAddRecExpr *StoreEv =
449 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
450 if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
451 return LegalStoreKind::None;
452
453 // Check to see if we have a constant stride.
454 if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
455 return LegalStoreKind::None;
456
457 // See if the store can be turned into a memset.
458
459 // If the stored value is a byte-wise value (like i32 -1), then it may be
460 // turned into a memset of i8 -1, assuming that all the consecutive bytes
461 // are stored. A store of i32 0x01020304 can never be turned into a memset,
462 // but it can be turned into memset_pattern if the target supports it.
463 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
464 Constant *PatternValue = nullptr;
465
466 // Note: memset and memset_pattern on unordered-atomic is yet not supported
467 bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
468
469 // If we're allowed to form a memset, and the stored value would be
470 // acceptable for memset, use it.
471 if (!UnorderedAtomic && HasMemset && SplatValue &&
472 // Verify that the stored value is loop invariant. If not, we can't
473 // promote the memset.
474 CurLoop->isLoopInvariant(SplatValue)) {
475 // It looks like we can use SplatValue.
476 return LegalStoreKind::Memset;
477 } else if (!UnorderedAtomic && HasMemsetPattern &&
478 // Don't create memset_pattern16s with address spaces.
479 StorePtr->getType()->getPointerAddressSpace() == 0 &&
480 (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
Although the value stored to 'PatternValue' is used in the enclosing expression, the value is never actually read from 'PatternValue'
481 // It looks like we can use PatternValue!
482 return LegalStoreKind::MemsetPattern;
483 }
484
485 // Otherwise, see if the store can be turned into a memcpy.
486 if (HasMemcpy) {
487 // Check to see if the stride matches the size of the store. If so, then we
488 // know that every byte is touched in the loop.
489 APInt Stride = getStoreStride(StoreEv);
490 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
491 if (StoreSize != Stride && StoreSize != -Stride)
492 return LegalStoreKind::None;
493
494 // The store must be feeding a non-volatile load.
495 LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
496
497 // Only allow non-volatile loads
498 if (!LI || LI->isVolatile())
499 return LegalStoreKind::None;
500 // Only allow simple or unordered-atomic loads
501 if (!LI->isUnordered())
502 return LegalStoreKind::None;
503
504 // See if the pointer expression is an AddRec like {base,+,1} on the current
505 // loop, which indicates a strided load. If we have something else, it's a
506 // random load we can't handle.
507 const SCEVAddRecExpr *LoadEv =
508 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
509 if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
510 return LegalStoreKind::None;
511
512 // The store and load must share the same stride.
513 if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
514 return LegalStoreKind::None;
515
516 // Success. This store can be converted into a memcpy.
517 UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
518 return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
519 : LegalStoreKind::Memcpy;
520 }
521 // This store can't be transformed into a memset/memcpy.
522 return LegalStoreKind::None;
523}
524
525void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
526 StoreRefsForMemset.clear();
527 StoreRefsForMemsetPattern.clear();
528 StoreRefsForMemcpy.clear();
529 for (Instruction &I : *BB) {
530 StoreInst *SI = dyn_cast<StoreInst>(&I);
531 if (!SI)
532 continue;
533
534 // Make sure this is a strided store with a constant stride.
535 switch (isLegalStore(SI)) {
536 case LegalStoreKind::None:
537 // Nothing to do
538 break;
539 case LegalStoreKind::Memset: {
540 // Find the base pointer.
541 Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
542 StoreRefsForMemset[Ptr].push_back(SI);
543 } break;
544 case LegalStoreKind::MemsetPattern: {
545 // Find the base pointer.
546 Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
547 StoreRefsForMemsetPattern[Ptr].push_back(SI);
548 } break;
549 case LegalStoreKind::Memcpy:
550 case LegalStoreKind::UnorderedAtomicMemcpy:
551 StoreRefsForMemcpy.push_back(SI);
552 break;
553 default:
554 assert(false && "unhandled return value")((false && "unhandled return value") ? static_cast<
void> (0) : __assert_fail ("false && \"unhandled return value\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 554, __PRETTY_FUNCTION__))
;
555 break;
556 }
557 }
558}
559
560/// runOnLoopBlock - Process the specified block, which lives in a counted loop
561/// with the specified backedge count. This block is known to be in the current
562/// loop and not in any subloops.
563bool LoopIdiomRecognize::runOnLoopBlock(
564 BasicBlock *BB, const SCEV *BECount,
565 SmallVectorImpl<BasicBlock *> &ExitBlocks) {
566 // We can only promote stores in this block if they are unconditionally
567 // executed in the loop. For a block to be unconditionally executed, it has
568 // to dominate all the exit blocks of the loop. Verify this now.
569 for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
570 if (!DT->dominates(BB, ExitBlocks[i]))
571 return false;
572
573 bool MadeChange = false;
574 // Look for store instructions, which may be optimized to memset/memcpy.
575 collectStores(BB);
576
577 // Look for a single store or sets of stores with a common base, which can be
578 // optimized into a memset (memset_pattern). The latter most commonly happens
579 // with structs and handunrolled loops.
580 for (auto &SL : StoreRefsForMemset)
581 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
582
583 for (auto &SL : StoreRefsForMemsetPattern)
584 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
585
586 // Optimize the store into a memcpy, if it feeds an similarly strided load.
587 for (auto &SI : StoreRefsForMemcpy)
588 MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
589
590 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
591 Instruction *Inst = &*I++;
592 // Look for memset instructions, which may be optimized to a larger memset.
593 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
594 WeakTrackingVH InstPtr(&*I);
595 if (!processLoopMemSet(MSI, BECount))
596 continue;
597 MadeChange = true;
598
599 // If processing the memset invalidated our iterator, start over from the
600 // top of the block.
601 if (!InstPtr)
602 I = BB->begin();
603 continue;
604 }
605 }
606
607 return MadeChange;
608}
609
610/// See if this store(s) can be promoted to a memset.
611bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
612 const SCEV *BECount, ForMemset For) {
613 // Try to find consecutive stores that can be transformed into memsets.
614 SetVector<StoreInst *> Heads, Tails;
615 SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
616
617 // Do a quadratic search on all of the given stores and find
618 // all of the pairs of stores that follow each other.
619 SmallVector<unsigned, 16> IndexQueue;
620 for (unsigned i = 0, e = SL.size(); i < e; ++i) {
621 assert(SL[i]->isSimple() && "Expected only non-volatile stores.")((SL[i]->isSimple() && "Expected only non-volatile stores."
) ? static_cast<void> (0) : __assert_fail ("SL[i]->isSimple() && \"Expected only non-volatile stores.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 621, __PRETTY_FUNCTION__))
;
622
623 Value *FirstStoredVal = SL[i]->getValueOperand();
624 Value *FirstStorePtr = SL[i]->getPointerOperand();
625 const SCEVAddRecExpr *FirstStoreEv =
626 cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
627 APInt FirstStride = getStoreStride(FirstStoreEv);
628 unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
629
630 // See if we can optimize just this store in isolation.
631 if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
632 Heads.insert(SL[i]);
633 continue;
634 }
635
636 Value *FirstSplatValue = nullptr;
637 Constant *FirstPatternValue = nullptr;
638
639 if (For == ForMemset::Yes)
640 FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
641 else
642 FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
643
644 assert((FirstSplatValue || FirstPatternValue) &&(((FirstSplatValue || FirstPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(FirstSplatValue || FirstPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 645, __PRETTY_FUNCTION__))
645 "Expected either splat value or pattern value.")(((FirstSplatValue || FirstPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(FirstSplatValue || FirstPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 645, __PRETTY_FUNCTION__))
;
646
647 IndexQueue.clear();
648 // If a store has multiple consecutive store candidates, search Stores
649 // array according to the sequence: from i+1 to e, then from i-1 to 0.
650 // This is because usually pairing with immediate succeeding or preceding
651 // candidate create the best chance to find memset opportunity.
652 unsigned j = 0;
653 for (j = i + 1; j < e; ++j)
654 IndexQueue.push_back(j);
655 for (j = i; j > 0; --j)
656 IndexQueue.push_back(j - 1);
657
658 for (auto &k : IndexQueue) {
659 assert(SL[k]->isSimple() && "Expected only non-volatile stores.")((SL[k]->isSimple() && "Expected only non-volatile stores."
) ? static_cast<void> (0) : __assert_fail ("SL[k]->isSimple() && \"Expected only non-volatile stores.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 659, __PRETTY_FUNCTION__))
;
660 Value *SecondStorePtr = SL[k]->getPointerOperand();
661 const SCEVAddRecExpr *SecondStoreEv =
662 cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
663 APInt SecondStride = getStoreStride(SecondStoreEv);
664
665 if (FirstStride != SecondStride)
666 continue;
667
668 Value *SecondStoredVal = SL[k]->getValueOperand();
669 Value *SecondSplatValue = nullptr;
670 Constant *SecondPatternValue = nullptr;
671
672 if (For == ForMemset::Yes)
673 SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
674 else
675 SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
676
677 assert((SecondSplatValue || SecondPatternValue) &&(((SecondSplatValue || SecondPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SecondSplatValue || SecondPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 678, __PRETTY_FUNCTION__))
678 "Expected either splat value or pattern value.")(((SecondSplatValue || SecondPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SecondSplatValue || SecondPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 678, __PRETTY_FUNCTION__))
;
679
680 if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
681 if (For == ForMemset::Yes) {
682 if (isa<UndefValue>(FirstSplatValue))
683 FirstSplatValue = SecondSplatValue;
684 if (FirstSplatValue != SecondSplatValue)
685 continue;
686 } else {
687 if (isa<UndefValue>(FirstPatternValue))
688 FirstPatternValue = SecondPatternValue;
689 if (FirstPatternValue != SecondPatternValue)
690 continue;
691 }
692 Tails.insert(SL[k]);
693 Heads.insert(SL[i]);
694 ConsecutiveChain[SL[i]] = SL[k];
695 break;
696 }
697 }
698 }
699
700 // We may run into multiple chains that merge into a single chain. We mark the
701 // stores that we transformed so that we don't visit the same store twice.
702 SmallPtrSet<Value *, 16> TransformedStores;
703 bool Changed = false;
704
705 // For stores that start but don't end a link in the chain:
706 for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
707 it != e; ++it) {
708 if (Tails.count(*it))
709 continue;
710
711 // We found a store instr that starts a chain. Now follow the chain and try
712 // to transform it.
713 SmallPtrSet<Instruction *, 8> AdjacentStores;
714 StoreInst *I = *it;
715
716 StoreInst *HeadStore = I;
717 unsigned StoreSize = 0;
718
719 // Collect the chain into a list.
720 while (Tails.count(I) || Heads.count(I)) {
721 if (TransformedStores.count(I))
722 break;
723 AdjacentStores.insert(I);
724
725 StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
726 // Move to the next value in the chain.
727 I = ConsecutiveChain[I];
728 }
729
730 Value *StoredVal = HeadStore->getValueOperand();
731 Value *StorePtr = HeadStore->getPointerOperand();
732 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
733 APInt Stride = getStoreStride(StoreEv);
734
735 // Check to see if the stride matches the size of the stores. If so, then
736 // we know that every byte is touched in the loop.
737 if (StoreSize != Stride && StoreSize != -Stride)
738 continue;
739
740 bool NegStride = StoreSize == -Stride;
741
742 if (processLoopStridedStore(StorePtr, StoreSize,
743 MaybeAlign(HeadStore->getAlignment()),
744 StoredVal, HeadStore, AdjacentStores, StoreEv,
745 BECount, NegStride)) {
746 TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
747 Changed = true;
748 }
749 }
750
751 return Changed;
752}
753
754/// processLoopMemSet - See if this memset can be promoted to a large memset.
755bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
756 const SCEV *BECount) {
757 // We can only handle non-volatile memsets with a constant size.
758 if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
759 return false;
760
761 // If we're not allowed to hack on memset, we fail.
762 if (!HasMemset)
763 return false;
764
765 Value *Pointer = MSI->getDest();
766
767 // See if the pointer expression is an AddRec like {base,+,1} on the current
768 // loop, which indicates a strided store. If we have something else, it's a
769 // random store we can't handle.
770 const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
771 if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
772 return false;
773
774 // Reject memsets that are so large that they overflow an unsigned.
775 uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
776 if ((SizeInBytes >> 32) != 0)
777 return false;
778
779 // Check to see if the stride matches the size of the memset. If so, then we
780 // know that every byte is touched in the loop.
781 const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
782 if (!ConstStride)
783 return false;
784
785 APInt Stride = ConstStride->getAPInt();
786 if (SizeInBytes != Stride && SizeInBytes != -Stride)
787 return false;
788
789 // Verify that the memset value is loop invariant. If not, we can't promote
790 // the memset.
791 Value *SplatValue = MSI->getValue();
792 if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
793 return false;
794
795 SmallPtrSet<Instruction *, 1> MSIs;
796 MSIs.insert(MSI);
797 bool NegStride = SizeInBytes == -Stride;
798 return processLoopStridedStore(
799 Pointer, (unsigned)SizeInBytes, MaybeAlign(MSI->getDestAlignment()),
800 SplatValue, MSI, MSIs, Ev, BECount, NegStride, /*IsLoopMemset=*/true);
801}
802
803/// mayLoopAccessLocation - Return true if the specified loop might access the
804/// specified pointer location, which is a loop-strided access. The 'Access'
805/// argument specifies what the verboten forms of access are (read or write).
806static bool
807mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
808 const SCEV *BECount, unsigned StoreSize,
809 AliasAnalysis &AA,
810 SmallPtrSetImpl<Instruction *> &IgnoredStores) {
811 // Get the location that may be stored across the loop. Since the access is
812 // strided positively through memory, we say that the modified location starts
813 // at the pointer and has infinite size.
814 LocationSize AccessSize = LocationSize::unknown();
815
816 // If the loop iterates a fixed number of times, we can refine the access size
817 // to be exactly the size of the memset, which is (BECount+1)*StoreSize
818 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
819 AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
820 StoreSize);
821
822 // TODO: For this to be really effective, we have to dive into the pointer
823 // operand in the store. Store to &A[i] of 100 will always return may alias
824 // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
825 // which will then no-alias a store to &A[100].
826 MemoryLocation StoreLoc(Ptr, AccessSize);
827
828 for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
829 ++BI)
830 for (Instruction &I : **BI)
831 if (IgnoredStores.count(&I) == 0 &&
832 isModOrRefSet(
833 intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
834 return true;
835
836 return false;
837}
838
839// If we have a negative stride, Start refers to the end of the memory location
840// we're trying to memset. Therefore, we need to recompute the base pointer,
841// which is just Start - BECount*Size.
842static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
843 Type *IntPtr, unsigned StoreSize,
844 ScalarEvolution *SE) {
845 const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
846 if (StoreSize != 1)
847 Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
848 SCEV::FlagNUW);
849 return SE->getMinusSCEV(Start, Index);
850}
851
852/// Compute the number of bytes as a SCEV from the backedge taken count.
853///
854/// This also maps the SCEV into the provided type and tries to handle the
855/// computation in a way that will fold cleanly.
856static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
857 unsigned StoreSize, Loop *CurLoop,
858 const DataLayout *DL, ScalarEvolution *SE) {
859 const SCEV *NumBytesS;
860 // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
861 // pointer size if it isn't already.
862 //
863 // If we're going to need to zero extend the BE count, check if we can add
864 // one to it prior to zero extending without overflow. Provided this is safe,
865 // it allows better simplification of the +1.
866 if (DL->getTypeSizeInBits(BECount->getType()) <
867 DL->getTypeSizeInBits(IntPtr) &&
868 SE->isLoopEntryGuardedByCond(
869 CurLoop, ICmpInst::ICMP_NE, BECount,
870 SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
871 NumBytesS = SE->getZeroExtendExpr(
872 SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
873 IntPtr);
874 } else {
875 NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
876 SE->getOne(IntPtr), SCEV::FlagNUW);
877 }
878
879 // And scale it based on the store size.
880 if (StoreSize != 1) {
881 NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
882 SCEV::FlagNUW);
883 }
884 return NumBytesS;
885}
886
887/// processLoopStridedStore - We see a strided store of some value. If we can
888/// transform this into a memset or memset_pattern in the loop preheader, do so.
889bool LoopIdiomRecognize::processLoopStridedStore(
890 Value *DestPtr, unsigned StoreSize, MaybeAlign StoreAlignment,
891 Value *StoredVal, Instruction *TheStore,
892 SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
893 const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
894 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
895 Constant *PatternValue = nullptr;
896
897 if (!SplatValue)
898 PatternValue = getMemSetPatternValue(StoredVal, DL);
899
900 assert((SplatValue || PatternValue) &&(((SplatValue || PatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SplatValue || PatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 901, __PRETTY_FUNCTION__))
901 "Expected either splat value or pattern value.")(((SplatValue || PatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SplatValue || PatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 901, __PRETTY_FUNCTION__))
;
902
903 // The trip count of the loop and the base pointer of the addrec SCEV is
904 // guaranteed to be loop invariant, which means that it should dominate the
905 // header. This allows us to insert code for it in the preheader.
906 unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
907 BasicBlock *Preheader = CurLoop->getLoopPreheader();
908 IRBuilder<> Builder(Preheader->getTerminator());
909 SCEVExpander Expander(*SE, *DL, "loop-idiom");
910
911 Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
912 Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
913
914 const SCEV *Start = Ev->getStart();
915 // Handle negative strided loops.
916 if (NegStride)
917 Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSize, SE);
918
919 // TODO: ideally we should still be able to generate memset if SCEV expander
920 // is taught to generate the dependencies at the latest point.
921 if (!isSafeToExpand(Start, *SE))
922 return false;
923
924 // Okay, we have a strided store "p[i]" of a splattable value. We can turn
925 // this into a memset in the loop preheader now if we want. However, this
926 // would be unsafe to do if there is anything else in the loop that may read
927 // or write to the aliased location. Check for any overlap by generating the
928 // base pointer and checking the region.
929 Value *BasePtr =
930 Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
931 if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
932 StoreSize, *AA, Stores)) {
933 Expander.clear();
934 // If we generated new code for the base pointer, clean up.
935 RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI);
936 return false;
937 }
938
939 if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
940 return false;
941
942 // Okay, everything looks good, insert the memset.
943
944 const SCEV *NumBytesS =
945 getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
946
947 // TODO: ideally we should still be able to generate memset if SCEV expander
948 // is taught to generate the dependencies at the latest point.
949 if (!isSafeToExpand(NumBytesS, *SE))
950 return false;
951
952 Value *NumBytes =
953 Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
954
955 CallInst *NewCall;
956 if (SplatValue) {
957 NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes,
958 MaybeAlign(StoreAlignment));
959 } else {
960 // Everything is emitted in default address space
961 Type *Int8PtrTy = DestInt8PtrTy;
962
963 Module *M = TheStore->getModule();
964 StringRef FuncName = "memset_pattern16";
965 FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
966 Int8PtrTy, Int8PtrTy, IntIdxTy);
967 inferLibFuncAttributes(M, FuncName, *TLI);
968
969 // Otherwise we should form a memset_pattern16. PatternValue is known to be
970 // an constant array of 16-bytes. Plop the value into a mergable global.
971 GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
972 GlobalValue::PrivateLinkage,
973 PatternValue, ".memset_pattern");
974 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
975 GV->setAlignment(Align(16));
976 Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
977 NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
978 }
979 NewCall->setDebugLoc(TheStore->getDebugLoc());
980
981 if (MSSAU) {
982 MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
983 NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
984 MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
985 }
986
987 LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memset: " <<
*NewCall << "\n" << " from store to: " <<
*Ev << " at: " << *TheStore << "\n"; } } while
(false)
988 << " from store to: " << *Ev << " at: " << *TheStoredo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memset: " <<
*NewCall << "\n" << " from store to: " <<
*Ev << " at: " << *TheStore << "\n"; } } while
(false)
989 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memset: " <<
*NewCall << "\n" << " from store to: " <<
*Ev << " at: " << *TheStore << "\n"; } } while
(false)
;
990
991 ORE.emit([&]() {
992 return OptimizationRemark(DEBUG_TYPE"loop-idiom", "ProcessLoopStridedStore",
993 NewCall->getDebugLoc(), Preheader)
994 << "Transformed loop-strided store into a call to "
995 << ore::NV("NewFunction", NewCall->getCalledFunction())
996 << "() function";
997 });
998
999 // Okay, the memset has been formed. Zap the original store and anything that
1000 // feeds into it.
1001 for (auto *I : Stores) {
1002 if (MSSAU)
1003 MSSAU->removeMemoryAccess(I, true);
1004 deleteDeadInstruction(I);
1005 }
1006 if (MSSAU && VerifyMemorySSA)
1007 MSSAU->getMemorySSA()->verifyMemorySSA();
1008 ++NumMemSet;
1009 return true;
1010}
1011
1012/// If the stored value is a strided load in the same loop with the same stride
1013/// this may be transformable into a memcpy. This kicks in for stuff like
1014/// for (i) A[i] = B[i];
1015bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
1016 const SCEV *BECount) {
1017 assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.")((SI->isUnordered() && "Expected only non-volatile non-ordered stores."
) ? static_cast<void> (0) : __assert_fail ("SI->isUnordered() && \"Expected only non-volatile non-ordered stores.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 1017, __PRETTY_FUNCTION__))
;
1018
1019 Value *StorePtr = SI->getPointerOperand();
1020 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1021 APInt Stride = getStoreStride(StoreEv);
1022 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
1023 bool NegStride = StoreSize == -Stride;
1024
1025 // The store must be feeding a non-volatile load.
1026 LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
1027 assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.")((LI->isUnordered() && "Expected only non-volatile non-ordered loads."
) ? static_cast<void> (0) : __assert_fail ("LI->isUnordered() && \"Expected only non-volatile non-ordered loads.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 1027, __PRETTY_FUNCTION__))
;
1028
1029 // See if the pointer expression is an AddRec like {base,+,1} on the current
1030 // loop, which indicates a strided load. If we have something else, it's a
1031 // random load we can't handle.
1032 const SCEVAddRecExpr *LoadEv =
1033 cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
1034
1035 // The trip count of the loop and the base pointer of the addrec SCEV is
1036 // guaranteed to be loop invariant, which means that it should dominate the
1037 // header. This allows us to insert code for it in the preheader.
1038 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1039 IRBuilder<> Builder(Preheader->getTerminator());
1040 SCEVExpander Expander(*SE, *DL, "loop-idiom");
1041
1042 const SCEV *StrStart = StoreEv->getStart();
1043 unsigned StrAS = SI->getPointerAddressSpace();
1044 Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
1045
1046 // Handle negative strided loops.
1047 if (NegStride)
1048 StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE);
1049
1050 // Okay, we have a strided store "p[i]" of a loaded value. We can turn
1051 // this into a memcpy in the loop preheader now if we want. However, this
1052 // would be unsafe to do if there is anything else in the loop that may read
1053 // or write the memory region we're storing to. This includes the load that
1054 // feeds the stores. Check for an alias by generating the base address and
1055 // checking everything.
1056 Value *StoreBasePtr = Expander.expandCodeFor(
1057 StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
1058
1059 SmallPtrSet<Instruction *, 1> Stores;
1060 Stores.insert(SI);
1061 if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1062 StoreSize, *AA, Stores)) {
1063 Expander.clear();
1064 // If we generated new code for the base pointer, clean up.
1065 RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
1066 return false;
1067 }
1068
1069 const SCEV *LdStart = LoadEv->getStart();
1070 unsigned LdAS = LI->getPointerAddressSpace();
1071
1072 // Handle negative strided loops.
1073 if (NegStride)
1074 LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE);
1075
1076 // For a memcpy, we have to make sure that the input array is not being
1077 // mutated by the loop.
1078 Value *LoadBasePtr = Expander.expandCodeFor(
1079 LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
1080
1081 if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
1082 StoreSize, *AA, Stores)) {
1083 Expander.clear();
1084 // If we generated new code for the base pointer, clean up.
1085 RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
1086 RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
1087 return false;
1088 }
1089
1090 if (avoidLIRForMultiBlockLoop())
1091 return false;
1092
1093 // Okay, everything is safe, we can transform this!
1094
1095 const SCEV *NumBytesS =
1096 getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
1097
1098 Value *NumBytes =
1099 Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1100
1101 CallInst *NewCall = nullptr;
1102 // Check whether to generate an unordered atomic memcpy:
1103 // If the load or store are atomic, then they must necessarily be unordered
1104 // by previous checks.
1105 if (!SI->isAtomic() && !LI->isAtomic())
1106 NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr,
1107 LI->getAlign(), NumBytes);
1108 else {
1109 // We cannot allow unaligned ops for unordered load/store, so reject
1110 // anything where the alignment isn't at least the element size.
1111 const MaybeAlign StoreAlign = SI->getAlign();
1112 const MaybeAlign LoadAlign = LI->getAlign();
1113 if (StoreAlign == None || LoadAlign == None)
1114 return false;
1115 if (*StoreAlign < StoreSize || *LoadAlign < StoreSize)
1116 return false;
1117
1118 // If the element.atomic memcpy is not lowered into explicit
1119 // loads/stores later, then it will be lowered into an element-size
1120 // specific lib call. If the lib call doesn't exist for our store size, then
1121 // we shouldn't generate the memcpy.
1122 if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1123 return false;
1124
1125 // Create the call.
1126 // Note that unordered atomic loads/stores are *required* by the spec to
1127 // have an alignment but non-atomic loads/stores may not.
1128 NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1129 StoreBasePtr, *StoreAlign, LoadBasePtr, *LoadAlign, NumBytes,
1130 StoreSize);
1131 }
1132 NewCall->setDebugLoc(SI->getDebugLoc());
1133
1134 if (MSSAU) {
1135 MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1136 NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1137 MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1138 }
1139
1140 LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
1141 << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
1142 << " from store ptr=" << *StoreEv << " at: " << *SIdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
1143 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
;
1144
1145 ORE.emit([&]() {
1146 return OptimizationRemark(DEBUG_TYPE"loop-idiom", "ProcessLoopStoreOfLoopLoad",
1147 NewCall->getDebugLoc(), Preheader)
1148 << "Formed a call to "
1149 << ore::NV("NewFunction", NewCall->getCalledFunction())
1150 << "() function";
1151 });
1152
1153 // Okay, the memcpy has been formed. Zap the original store and anything that
1154 // feeds into it.
1155 if (MSSAU)
1156 MSSAU->removeMemoryAccess(SI, true);
1157 deleteDeadInstruction(SI);
1158 if (MSSAU && VerifyMemorySSA)
1159 MSSAU->getMemorySSA()->verifyMemorySSA();
1160 ++NumMemCpy;
1161 return true;
1162}
1163
1164// When compiling for codesize we avoid idiom recognition for a multi-block loop
1165// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1166//
1167bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1168 bool IsLoopMemset) {
1169 if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1170 if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) {
1171 LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " " << CurLoop->getHeader
()->getParent()->getName() << " : LIR " << (
IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n"
; } } while (false)
1172 << " : LIR " << (IsMemset ? "Memset" : "Memcpy")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " " << CurLoop->getHeader
()->getParent()->getName() << " : LIR " << (
IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n"
; } } while (false)
1173 << " avoided: multi-block top-level loop\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " " << CurLoop->getHeader
()->getParent()->getName() << " : LIR " << (
IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n"
; } } while (false)
;
1174 return true;
1175 }
1176 }
1177
1178 return false;
1179}
1180
1181bool LoopIdiomRecognize::runOnNoncountableLoop() {
1182 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
1183 << CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
1184 << "] Noncountable Loop %"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
1185 << CurLoop->getHeader()->getName() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
;
1186
1187 return recognizePopcount() || recognizeAndInsertFFS();
1188}
1189
1190/// Check if the given conditional branch is based on the comparison between
1191/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
1192/// true), the control yields to the loop entry. If the branch matches the
1193/// behavior, the variable involved in the comparison is returned. This function
1194/// will be called to see if the precondition and postcondition of the loop are
1195/// in desirable form.
1196static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
1197 bool JmpOnZero = false) {
1198 if (!BI || !BI->isConditional())
1199 return nullptr;
1200
1201 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1202 if (!Cond)
1203 return nullptr;
1204
1205 ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1206 if (!CmpZero || !CmpZero->isZero())
1207 return nullptr;
1208
1209 BasicBlock *TrueSucc = BI->getSuccessor(0);
1210 BasicBlock *FalseSucc = BI->getSuccessor(1);
1211 if (JmpOnZero)
1212 std::swap(TrueSucc, FalseSucc);
1213
1214 ICmpInst::Predicate Pred = Cond->getPredicate();
1215 if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1216 (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1217 return Cond->getOperand(0);
1218
1219 return nullptr;
1220}
1221
1222// Check if the recurrence variable `VarX` is in the right form to create
1223// the idiom. Returns the value coerced to a PHINode if so.
1224static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
1225 BasicBlock *LoopEntry) {
1226 auto *PhiX = dyn_cast<PHINode>(VarX);
1227 if (PhiX && PhiX->getParent() == LoopEntry &&
1228 (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1229 return PhiX;
1230 return nullptr;
1231}
1232
1233/// Return true iff the idiom is detected in the loop.
1234///
1235/// Additionally:
1236/// 1) \p CntInst is set to the instruction counting the population bit.
1237/// 2) \p CntPhi is set to the corresponding phi node.
1238/// 3) \p Var is set to the value whose population bits are being counted.
1239///
1240/// The core idiom we are trying to detect is:
1241/// \code
1242/// if (x0 != 0)
1243/// goto loop-exit // the precondition of the loop
1244/// cnt0 = init-val;
1245/// do {
1246/// x1 = phi (x0, x2);
1247/// cnt1 = phi(cnt0, cnt2);
1248///
1249/// cnt2 = cnt1 + 1;
1250/// ...
1251/// x2 = x1 & (x1 - 1);
1252/// ...
1253/// } while(x != 0);
1254///
1255/// loop-exit:
1256/// \endcode
1257static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1258 Instruction *&CntInst, PHINode *&CntPhi,
1259 Value *&Var) {
1260 // step 1: Check to see if the look-back branch match this pattern:
1261 // "if (a!=0) goto loop-entry".
1262 BasicBlock *LoopEntry;
1263 Instruction *DefX2, *CountInst;
1264 Value *VarX1, *VarX0;
1265 PHINode *PhiX, *CountPhi;
1266
1267 DefX2 = CountInst = nullptr;
1268 VarX1 = VarX0 = nullptr;
1269 PhiX = CountPhi = nullptr;
1270 LoopEntry = *(CurLoop->block_begin());
1271
1272 // step 1: Check if the loop-back branch is in desirable form.
1273 {
1274 if (Value *T = matchCondition(
1275 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1276 DefX2 = dyn_cast<Instruction>(T);
1277 else
1278 return false;
1279 }
1280
1281 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1282 {
1283 if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1284 return false;
1285
1286 BinaryOperator *SubOneOp;
1287
1288 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1289 VarX1 = DefX2->getOperand(1);
1290 else {
1291 VarX1 = DefX2->getOperand(0);
1292 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1293 }
1294 if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1295 return false;
1296
1297 ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1298 if (!Dec ||
1299 !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1300 (SubOneOp->getOpcode() == Instruction::Add &&
1301 Dec->isMinusOne()))) {
1302 return false;
1303 }
1304 }
1305
1306 // step 3: Check the recurrence of variable X
1307 PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1308 if (!PhiX)
1309 return false;
1310
1311 // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
1312 {
1313 CountInst = nullptr;
1314 for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1315 IterE = LoopEntry->end();
1316 Iter != IterE; Iter++) {
1317 Instruction *Inst = &*Iter;
1318 if (Inst->getOpcode() != Instruction::Add)
1319 continue;
1320
1321 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1322 if (!Inc || !Inc->isOne())
1323 continue;
1324
1325 PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1326 if (!Phi)
1327 continue;
1328
1329 // Check if the result of the instruction is live of the loop.
1330 bool LiveOutLoop = false;
1331 for (User *U : Inst->users()) {
1332 if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1333 LiveOutLoop = true;
1334 break;
1335 }
1336 }
1337
1338 if (LiveOutLoop) {
1339 CountInst = Inst;
1340 CountPhi = Phi;
1341 break;
1342 }
1343 }
1344
1345 if (!CountInst)
1346 return false;
1347 }
1348
1349 // step 5: check if the precondition is in this form:
1350 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1351 {
1352 auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1353 Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1354 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1355 return false;
1356
1357 CntInst = CountInst;
1358 CntPhi = CountPhi;
1359 Var = T;
1360 }
1361
1362 return true;
1363}
1364
1365/// Return true if the idiom is detected in the loop.
1366///
1367/// Additionally:
1368/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1369/// or nullptr if there is no such.
1370/// 2) \p CntPhi is set to the corresponding phi node
1371/// or nullptr if there is no such.
1372/// 3) \p Var is set to the value whose CTLZ could be used.
1373/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1374///
1375/// The core idiom we are trying to detect is:
1376/// \code
1377/// if (x0 == 0)
1378/// goto loop-exit // the precondition of the loop
1379/// cnt0 = init-val;
1380/// do {
1381/// x = phi (x0, x.next); //PhiX
1382/// cnt = phi(cnt0, cnt.next);
1383///
1384/// cnt.next = cnt + 1;
1385/// ...
1386/// x.next = x >> 1; // DefX
1387/// ...
1388/// } while(x.next != 0);
1389///
1390/// loop-exit:
1391/// \endcode
1392static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1393 Intrinsic::ID &IntrinID, Value *&InitX,
1394 Instruction *&CntInst, PHINode *&CntPhi,
1395 Instruction *&DefX) {
1396 BasicBlock *LoopEntry;
1397 Value *VarX = nullptr;
1398
1399 DefX = nullptr;
1400 CntInst = nullptr;
1401 CntPhi = nullptr;
1402 LoopEntry = *(CurLoop->block_begin());
1403
1404 // step 1: Check if the loop-back branch is in desirable form.
1405 if (Value *T = matchCondition(
1406 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1407 DefX = dyn_cast<Instruction>(T);
1408 else
1409 return false;
1410
1411 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1412 if (!DefX || !DefX->isShift())
1413 return false;
1414 IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1415 Intrinsic::ctlz;
1416 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1417 if (!Shft || !Shft->isOne())
1418 return false;
1419 VarX = DefX->getOperand(0);
1420
1421 // step 3: Check the recurrence of variable X
1422 PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1423 if (!PhiX)
1424 return false;
1425
1426 InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1427
1428 // Make sure the initial value can't be negative otherwise the ashr in the
1429 // loop might never reach zero which would make the loop infinite.
1430 if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1431 return false;
1432
1433 // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
1434 // TODO: We can skip the step. If loop trip count is known (CTLZ),
1435 // then all uses of "cnt.next" could be optimized to the trip count
1436 // plus "cnt0". Currently it is not optimized.
1437 // This step could be used to detect POPCNT instruction:
1438 // cnt.next = cnt + (x.next & 1)
1439 for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1440 IterE = LoopEntry->end();
1441 Iter != IterE; Iter++) {
1442 Instruction *Inst = &*Iter;
1443 if (Inst->getOpcode() != Instruction::Add)
1444 continue;
1445
1446 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1447 if (!Inc || !Inc->isOne())
1448 continue;
1449
1450 PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1451 if (!Phi)
1452 continue;
1453
1454 CntInst = Inst;
1455 CntPhi = Phi;
1456 break;
1457 }
1458 if (!CntInst)
1459 return false;
1460
1461 return true;
1462}
1463
1464/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1465/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
1466/// trip count returns true; otherwise, returns false.
1467bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1468 // Give up if the loop has multiple blocks or multiple backedges.
1469 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1470 return false;
1471
1472 Intrinsic::ID IntrinID;
1473 Value *InitX;
1474 Instruction *DefX = nullptr;
1475 PHINode *CntPhi = nullptr;
1476 Instruction *CntInst = nullptr;
1477 // Help decide if transformation is profitable. For ShiftUntilZero idiom,
1478 // this is always 6.
1479 size_t IdiomCanonicalSize = 6;
1480
1481 if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1482 CntInst, CntPhi, DefX))
1483 return false;
1484
1485 bool IsCntPhiUsedOutsideLoop = false;
1486 for (User *U : CntPhi->users())
1487 if (!CurLoop->contains(cast<Instruction>(U))) {
1488 IsCntPhiUsedOutsideLoop = true;
1489 break;
1490 }
1491 bool IsCntInstUsedOutsideLoop = false;
1492 for (User *U : CntInst->users())
1493 if (!CurLoop->contains(cast<Instruction>(U))) {
1494 IsCntInstUsedOutsideLoop = true;
1495 break;
1496 }
1497 // If both CntInst and CntPhi are used outside the loop the profitability
1498 // is questionable.
1499 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1500 return false;
1501
1502 // For some CPUs result of CTLZ(X) intrinsic is undefined
1503 // when X is 0. If we can not guarantee X != 0, we need to check this
1504 // when expand.
1505 bool ZeroCheck = false;
1506 // It is safe to assume Preheader exist as it was checked in
1507 // parent function RunOnLoop.
1508 BasicBlock *PH = CurLoop->getLoopPreheader();
1509
1510 // If we are using the count instruction outside the loop, make sure we
1511 // have a zero check as a precondition. Without the check the loop would run
1512 // one iteration for before any check of the input value. This means 0 and 1
1513 // would have identical behavior in the original loop and thus
1514 if (!IsCntPhiUsedOutsideLoop) {
1515 auto *PreCondBB = PH->getSinglePredecessor();
1516 if (!PreCondBB)
1517 return false;
1518 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1519 if (!PreCondBI)
1520 return false;
1521 if (matchCondition(PreCondBI, PH) != InitX)
1522 return false;
1523 ZeroCheck = true;
1524 }
1525
1526 // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1527 // profitable if we delete the loop.
1528
1529 // the loop has only 6 instructions:
1530 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1531 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1532 // %shr = ashr %n.addr.0, 1
1533 // %tobool = icmp eq %shr, 0
1534 // %inc = add nsw %i.0, 1
1535 // br i1 %tobool
1536
1537 const Value *Args[] =
1538 {InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
1539 : ConstantInt::getFalse(InitX->getContext())};
1540
1541 // @llvm.dbg doesn't count as they have no semantic effect.
1542 auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1543 uint32_t HeaderSize =
1544 std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1545
1546 if (HeaderSize != IdiomCanonicalSize &&
1547 TTI->getIntrinsicCost(IntrinID, InitX->getType(), Args) >
1548 TargetTransformInfo::TCC_Basic)
1549 return false;
1550
1551 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1552 DefX->getDebugLoc(), ZeroCheck,
1553 IsCntPhiUsedOutsideLoop);
1554 return true;
1555}
1556
1557/// Recognizes a population count idiom in a non-countable loop.
1558///
1559/// If detected, transforms the relevant code to issue the popcount intrinsic
1560/// function call, and returns true; otherwise, returns false.
1561bool LoopIdiomRecognize::recognizePopcount() {
1562 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1563 return false;
1564
1565 // Counting population are usually conducted by few arithmetic instructions.
1566 // Such instructions can be easily "absorbed" by vacant slots in a
1567 // non-compact loop. Therefore, recognizing popcount idiom only makes sense
1568 // in a compact loop.
1569
1570 // Give up if the loop has multiple blocks or multiple backedges.
1571 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1572 return false;
1573
1574 BasicBlock *LoopBody = *(CurLoop->block_begin());
1575 if (LoopBody->size() >= 20) {
1576 // The loop is too big, bail out.
1577 return false;
1578 }
1579
1580 // It should have a preheader containing nothing but an unconditional branch.
1581 BasicBlock *PH = CurLoop->getLoopPreheader();
1582 if (!PH || &PH->front() != PH->getTerminator())
1583 return false;
1584 auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1585 if (!EntryBI || EntryBI->isConditional())
1586 return false;
1587
1588 // It should have a precondition block where the generated popcount intrinsic
1589 // function can be inserted.
1590 auto *PreCondBB = PH->getSinglePredecessor();
1591 if (!PreCondBB)
1592 return false;
1593 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1594 if (!PreCondBI || PreCondBI->isUnconditional())
1595 return false;
1596
1597 Instruction *CntInst;
1598 PHINode *CntPhi;
1599 Value *Val;
1600 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1601 return false;
1602
1603 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1604 return true;
1605}
1606
1607static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1608 const DebugLoc &DL) {
1609 Value *Ops[] = {Val};
1610 Type *Tys[] = {Val->getType()};
1611
1612 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1613 Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1614 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1615 CI->setDebugLoc(DL);
1616
1617 return CI;
1618}
1619
1620static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1621 const DebugLoc &DL, bool ZeroCheck,
1622 Intrinsic::ID IID) {
1623 Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
1624 Type *Tys[] = {Val->getType()};
1625
1626 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1627 Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
1628 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1629 CI->setDebugLoc(DL);
1630
1631 return CI;
1632}
1633
1634/// Transform the following loop (Using CTLZ, CTTZ is similar):
1635/// loop:
1636/// CntPhi = PHI [Cnt0, CntInst]
1637/// PhiX = PHI [InitX, DefX]
1638/// CntInst = CntPhi + 1
1639/// DefX = PhiX >> 1
1640/// LOOP_BODY
1641/// Br: loop if (DefX != 0)
1642/// Use(CntPhi) or Use(CntInst)
1643///
1644/// Into:
1645/// If CntPhi used outside the loop:
1646/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
1647/// Count = CountPrev + 1
1648/// else
1649/// Count = BitWidth(InitX) - CTLZ(InitX)
1650/// loop:
1651/// CntPhi = PHI [Cnt0, CntInst]
1652/// PhiX = PHI [InitX, DefX]
1653/// PhiCount = PHI [Count, Dec]
1654/// CntInst = CntPhi + 1
1655/// DefX = PhiX >> 1
1656/// Dec = PhiCount - 1
1657/// LOOP_BODY
1658/// Br: loop if (Dec != 0)
1659/// Use(CountPrev + Cnt0) // Use(CntPhi)
1660/// or
1661/// Use(Count + Cnt0) // Use(CntInst)
1662///
1663/// If LOOP_BODY is empty the loop will be deleted.
1664/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
1665void LoopIdiomRecognize::transformLoopToCountable(
1666 Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
1667 PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
1668 bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
1669 BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
1670
1671 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
1672 IRBuilder<> Builder(PreheaderBr);
1673 Builder.SetCurrentDebugLocation(DL);
1674 Value *FFS, *Count, *CountPrev, *NewCount, *InitXNext;
1675
1676 // Count = BitWidth - CTLZ(InitX);
1677 // If there are uses of CntPhi create:
1678 // CountPrev = BitWidth - CTLZ(InitX >> 1);
1679 if (IsCntPhiUsedOutsideLoop) {
1680 if (DefX->getOpcode() == Instruction::AShr)
1681 InitXNext =
1682 Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1));
1683 else if (DefX->getOpcode() == Instruction::LShr)
1684 InitXNext =
1685 Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1));
1686 else if (DefX->getOpcode() == Instruction::Shl) // cttz
1687 InitXNext =
1688 Builder.CreateShl(InitX, ConstantInt::get(InitX->getType(), 1));
1689 else
1690 llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 1690)
;
1691 } else
1692 InitXNext = InitX;
1693 FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
1694 Count = Builder.CreateSub(
1695 ConstantInt::get(FFS->getType(),
1696 FFS->getType()->getIntegerBitWidth()),
1697 FFS);
1698 if (IsCntPhiUsedOutsideLoop) {
1699 CountPrev = Count;
1700 Count = Builder.CreateAdd(
1701 CountPrev,
1702 ConstantInt::get(CountPrev->getType(), 1));
1703 }
1704
1705 NewCount = Builder.CreateZExtOrTrunc(
1706 IsCntPhiUsedOutsideLoop ? CountPrev : Count,
1707 cast<IntegerType>(CntInst->getType()));
1708
1709 // If the counter's initial value is not zero, insert Add Inst.
1710 Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
1711 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1712 if (!InitConst || !InitConst->isZero())
1713 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1714
1715 // Step 2: Insert new IV and loop condition:
1716 // loop:
1717 // ...
1718 // PhiCount = PHI [Count, Dec]
1719 // ...
1720 // Dec = PhiCount - 1
1721 // ...
1722 // Br: loop if (Dec != 0)
1723 BasicBlock *Body = *(CurLoop->block_begin());
1724 auto *LbBr = cast<BranchInst>(Body->getTerminator());
1725 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1726 Type *Ty = Count->getType();
1727
1728 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1729
1730 Builder.SetInsertPoint(LbCond);
1731 Instruction *TcDec = cast<Instruction>(
1732 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1733 "tcdec", false, true));
1734
1735 TcPhi->addIncoming(Count, Preheader);
1736 TcPhi->addIncoming(TcDec, Body);
1737
1738 CmpInst::Predicate Pred =
1739 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
1740 LbCond->setPredicate(Pred);
1741 LbCond->setOperand(0, TcDec);
1742 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1743
1744 // Step 3: All the references to the original counter outside
1745 // the loop are replaced with the NewCount
1746 if (IsCntPhiUsedOutsideLoop)
1747 CntPhi->replaceUsesOutsideBlock(NewCount, Body);
1748 else
1749 CntInst->replaceUsesOutsideBlock(NewCount, Body);
1750
1751 // step 4: Forget the "non-computable" trip-count SCEV associated with the
1752 // loop. The loop would otherwise not be deleted even if it becomes empty.
1753 SE->forgetLoop(CurLoop);
1754}
1755
1756void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
1757 Instruction *CntInst,
1758 PHINode *CntPhi, Value *Var) {
1759 BasicBlock *PreHead = CurLoop->getLoopPreheader();
1760 auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
1761 const DebugLoc &DL = CntInst->getDebugLoc();
1762
1763 // Assuming before transformation, the loop is following:
1764 // if (x) // the precondition
1765 // do { cnt++; x &= x - 1; } while(x);
1766
1767 // Step 1: Insert the ctpop instruction at the end of the precondition block
1768 IRBuilder<> Builder(PreCondBr);
1769 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
1770 {
1771 PopCnt = createPopcntIntrinsic(Builder, Var, DL);
1772 NewCount = PopCntZext =
1773 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
1774
1775 if (NewCount != PopCnt)
1776 (cast<Instruction>(NewCount))->setDebugLoc(DL);
1777
1778 // TripCnt is exactly the number of iterations the loop has
1779 TripCnt = NewCount;
1780
1781 // If the population counter's initial value is not zero, insert Add Inst.
1782 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
1783 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1784 if (!InitConst || !InitConst->isZero()) {
1785 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1786 (cast<Instruction>(NewCount))->setDebugLoc(DL);
1787 }
1788 }
1789
1790 // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
1791 // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
1792 // function would be partial dead code, and downstream passes will drag
1793 // it back from the precondition block to the preheader.
1794 {
1795 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
1796
1797 Value *Opnd0 = PopCntZext;
1798 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
1799 if (PreCond->getOperand(0) != Var)
1800 std::swap(Opnd0, Opnd1);
1801
1802 ICmpInst *NewPreCond = cast<ICmpInst>(
1803 Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
1804 PreCondBr->setCondition(NewPreCond);
1805
1806 RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
1807 }
1808
1809 // Step 3: Note that the population count is exactly the trip count of the
1810 // loop in question, which enable us to convert the loop from noncountable
1811 // loop into a countable one. The benefit is twofold:
1812 //
1813 // - If the loop only counts population, the entire loop becomes dead after
1814 // the transformation. It is a lot easier to prove a countable loop dead
1815 // than to prove a noncountable one. (In some C dialects, an infinite loop
1816 // isn't dead even if it computes nothing useful. In general, DCE needs
1817 // to prove a noncountable loop finite before safely delete it.)
1818 //
1819 // - If the loop also performs something else, it remains alive.
1820 // Since it is transformed to countable form, it can be aggressively
1821 // optimized by some optimizations which are in general not applicable
1822 // to a noncountable loop.
1823 //
1824 // After this step, this loop (conceptually) would look like following:
1825 // newcnt = __builtin_ctpop(x);
1826 // t = newcnt;
1827 // if (x)
1828 // do { cnt++; x &= x-1; t--) } while (t > 0);
1829 BasicBlock *Body = *(CurLoop->block_begin());
1830 {
1831 auto *LbBr = cast<BranchInst>(Body->getTerminator());
1832 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1833 Type *Ty = TripCnt->getType();
1834
1835 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1836
1837 Builder.SetInsertPoint(LbCond);
1838 Instruction *TcDec = cast<Instruction>(
1839 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1840 "tcdec", false, true));
1841
1842 TcPhi->addIncoming(TripCnt, PreHead);
1843 TcPhi->addIncoming(TcDec, Body);
1844
1845 CmpInst::Predicate Pred =
1846 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
1847 LbCond->setPredicate(Pred);
1848 LbCond->setOperand(0, TcDec);
1849 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1850 }
1851
1852 // Step 4: All the references to the original population counter outside
1853 // the loop are replaced with the NewCount -- the value returned from
1854 // __builtin_ctpop().
1855 CntInst->replaceUsesOutsideBlock(NewCount, Body);
1856
1857 // step 5: Forget the "non-computable" trip-count SCEV associated with the
1858 // loop. The loop would otherwise not be deleted even if it becomes empty.
1859 SE->forgetLoop(CurLoop);
1860}