Bug Summary

File:llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
Warning:line 513, column 15
Although the value stored to 'PatternValue' is used in the enclosing expression, the value is never actually read from 'PatternValue'

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopIdiomRecognize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-12/lib/clang/12.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/build-llvm/lib/Transforms/Scalar -I /build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Scalar -I /build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/build-llvm/include -I /build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-12/lib/clang/12.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/build-llvm/lib/Transforms/Scalar -fdebug-prefix-map=/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb=. -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2020-09-26-161721-17566-1 -x c++ /build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
1//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements an idiom recognizer that transforms simple loops into a
10// non-loop form. In cases that this kicks in, it can be a significant
11// performance win.
12//
13// If compiling for code size we avoid idiom recognition if the resulting
14// code could be larger than the code for the original loop. One way this could
15// happen is if the loop is not removable after idiom recognition due to the
16// presence of non-idiom instructions. The initial implementation of the
17// heuristics applies to idioms in multi-block loops.
18//
19//===----------------------------------------------------------------------===//
20//
21// TODO List:
22//
23// Future loop memory idioms to recognize:
24// memcmp, memmove, strlen, etc.
25// Future floating point idioms to recognize in -ffast-math mode:
26// fpowi
27// Future integer operation idioms to recognize:
28// ctpop
29//
30// Beware that isel's default lowering for ctpop is highly inefficient for
31// i64 and larger types when i64 is legal and the value has few bits set. It
32// would be good to enhance isel to emit a loop for ctpop in this case.
33//
34// This could recognize common matrix multiplies and dot product idioms and
35// replace them with calls to BLAS (if linked in??).
36//
37//===----------------------------------------------------------------------===//
38
39#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
40#include "llvm/ADT/APInt.h"
41#include "llvm/ADT/ArrayRef.h"
42#include "llvm/ADT/DenseMap.h"
43#include "llvm/ADT/MapVector.h"
44#include "llvm/ADT/SetVector.h"
45#include "llvm/ADT/SmallPtrSet.h"
46#include "llvm/ADT/SmallVector.h"
47#include "llvm/ADT/Statistic.h"
48#include "llvm/ADT/StringRef.h"
49#include "llvm/Analysis/AliasAnalysis.h"
50#include "llvm/Analysis/LoopAccessAnalysis.h"
51#include "llvm/Analysis/LoopInfo.h"
52#include "llvm/Analysis/LoopPass.h"
53#include "llvm/Analysis/MemoryLocation.h"
54#include "llvm/Analysis/MemorySSA.h"
55#include "llvm/Analysis/MemorySSAUpdater.h"
56#include "llvm/Analysis/MustExecute.h"
57#include "llvm/Analysis/OptimizationRemarkEmitter.h"
58#include "llvm/Analysis/ScalarEvolution.h"
59#include "llvm/Analysis/ScalarEvolutionExpressions.h"
60#include "llvm/Analysis/TargetLibraryInfo.h"
61#include "llvm/Analysis/TargetTransformInfo.h"
62#include "llvm/Analysis/ValueTracking.h"
63#include "llvm/IR/Attributes.h"
64#include "llvm/IR/BasicBlock.h"
65#include "llvm/IR/Constant.h"
66#include "llvm/IR/Constants.h"
67#include "llvm/IR/DataLayout.h"
68#include "llvm/IR/DebugLoc.h"
69#include "llvm/IR/DerivedTypes.h"
70#include "llvm/IR/Dominators.h"
71#include "llvm/IR/GlobalValue.h"
72#include "llvm/IR/GlobalVariable.h"
73#include "llvm/IR/IRBuilder.h"
74#include "llvm/IR/InstrTypes.h"
75#include "llvm/IR/Instruction.h"
76#include "llvm/IR/Instructions.h"
77#include "llvm/IR/IntrinsicInst.h"
78#include "llvm/IR/Intrinsics.h"
79#include "llvm/IR/LLVMContext.h"
80#include "llvm/IR/Module.h"
81#include "llvm/IR/PassManager.h"
82#include "llvm/IR/Type.h"
83#include "llvm/IR/User.h"
84#include "llvm/IR/Value.h"
85#include "llvm/IR/ValueHandle.h"
86#include "llvm/InitializePasses.h"
87#include "llvm/Pass.h"
88#include "llvm/Support/Casting.h"
89#include "llvm/Support/CommandLine.h"
90#include "llvm/Support/Debug.h"
91#include "llvm/Support/raw_ostream.h"
92#include "llvm/Transforms/Scalar.h"
93#include "llvm/Transforms/Utils/BuildLibCalls.h"
94#include "llvm/Transforms/Utils/Local.h"
95#include "llvm/Transforms/Utils/LoopUtils.h"
96#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
97#include <algorithm>
98#include <cassert>
99#include <cstdint>
100#include <utility>
101#include <vector>
102
103using namespace llvm;
104
105#define DEBUG_TYPE"loop-idiom" "loop-idiom"
106
107STATISTIC(NumMemSet, "Number of memset's formed from loop stores")static llvm::Statistic NumMemSet = {"loop-idiom", "NumMemSet"
, "Number of memset's formed from loop stores"}
;
108STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores")static llvm::Statistic NumMemCpy = {"loop-idiom", "NumMemCpy"
, "Number of memcpy's formed from loop load+stores"}
;
109
110bool DisableLIRP::All;
111static cl::opt<bool, true>
112 DisableLIRPAll("disable-" DEBUG_TYPE"loop-idiom" "-all",
113 cl::desc("Options to disable Loop Idiom Recognize Pass."),
114 cl::location(DisableLIRP::All), cl::init(false),
115 cl::ReallyHidden);
116
117bool DisableLIRP::Memset;
118static cl::opt<bool, true>
119 DisableLIRPMemset("disable-" DEBUG_TYPE"loop-idiom" "-memset",
120 cl::desc("Proceed with loop idiom recognize pass, but do "
121 "not convert loop(s) to memset."),
122 cl::location(DisableLIRP::Memset), cl::init(false),
123 cl::ReallyHidden);
124
125bool DisableLIRP::Memcpy;
126static cl::opt<bool, true>
127 DisableLIRPMemcpy("disable-" DEBUG_TYPE"loop-idiom" "-memcpy",
128 cl::desc("Proceed with loop idiom recognize pass, but do "
129 "not convert loop(s) to memcpy."),
130 cl::location(DisableLIRP::Memcpy), cl::init(false),
131 cl::ReallyHidden);
132
133static cl::opt<bool> UseLIRCodeSizeHeurs(
134 "use-lir-code-size-heurs",
135 cl::desc("Use loop idiom recognition code size heuristics when compiling"
136 "with -Os/-Oz"),
137 cl::init(true), cl::Hidden);
138
139namespace {
140
141class LoopIdiomRecognize {
142 Loop *CurLoop = nullptr;
143 AliasAnalysis *AA;
144 DominatorTree *DT;
145 LoopInfo *LI;
146 ScalarEvolution *SE;
147 TargetLibraryInfo *TLI;
148 const TargetTransformInfo *TTI;
149 const DataLayout *DL;
150 OptimizationRemarkEmitter &ORE;
151 bool ApplyCodeSizeHeuristics;
152 std::unique_ptr<MemorySSAUpdater> MSSAU;
153
154public:
155 explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
156 LoopInfo *LI, ScalarEvolution *SE,
157 TargetLibraryInfo *TLI,
158 const TargetTransformInfo *TTI, MemorySSA *MSSA,
159 const DataLayout *DL,
160 OptimizationRemarkEmitter &ORE)
161 : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
162 if (MSSA)
163 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
164 }
165
166 bool runOnLoop(Loop *L);
167
168private:
169 using StoreList = SmallVector<StoreInst *, 8>;
170 using StoreListMap = MapVector<Value *, StoreList>;
171
172 StoreListMap StoreRefsForMemset;
173 StoreListMap StoreRefsForMemsetPattern;
174 StoreList StoreRefsForMemcpy;
175 bool HasMemset;
176 bool HasMemsetPattern;
177 bool HasMemcpy;
178
179 /// Return code for isLegalStore()
180 enum LegalStoreKind {
181 None = 0,
182 Memset,
183 MemsetPattern,
184 Memcpy,
185 UnorderedAtomicMemcpy,
186 DontUse // Dummy retval never to be used. Allows catching errors in retval
187 // handling.
188 };
189
190 /// \name Countable Loop Idiom Handling
191 /// @{
192
193 bool runOnCountableLoop();
194 bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
195 SmallVectorImpl<BasicBlock *> &ExitBlocks);
196
197 void collectStores(BasicBlock *BB);
198 LegalStoreKind isLegalStore(StoreInst *SI);
199 enum class ForMemset { No, Yes };
200 bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
201 ForMemset For);
202 bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
203
204 bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
205 MaybeAlign StoreAlignment, Value *StoredVal,
206 Instruction *TheStore,
207 SmallPtrSetImpl<Instruction *> &Stores,
208 const SCEVAddRecExpr *Ev, const SCEV *BECount,
209 bool NegStride, bool IsLoopMemset = false);
210 bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
211 bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
212 bool IsLoopMemset = false);
213
214 /// @}
215 /// \name Noncountable Loop Idiom Handling
216 /// @{
217
218 bool runOnNoncountableLoop();
219
220 bool recognizePopcount();
221 void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
222 PHINode *CntPhi, Value *Var);
223 bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
224 void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
225 Instruction *CntInst, PHINode *CntPhi,
226 Value *Var, Instruction *DefX,
227 const DebugLoc &DL, bool ZeroCheck,
228 bool IsCntPhiUsedOutsideLoop);
229
230 /// @}
231};
232
233class LoopIdiomRecognizeLegacyPass : public LoopPass {
234public:
235 static char ID;
236
237 explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
238 initializeLoopIdiomRecognizeLegacyPassPass(
239 *PassRegistry::getPassRegistry());
240 }
241
242 bool runOnLoop(Loop *L, LPPassManager &LPM) override {
243 if (DisableLIRP::All)
244 return false;
245
246 if (skipLoop(L))
247 return false;
248
249 AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
250 DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
251 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
252 ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
253 TargetLibraryInfo *TLI =
254 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
255 *L->getHeader()->getParent());
256 const TargetTransformInfo *TTI =
257 &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
258 *L->getHeader()->getParent());
259 const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
260 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
261 MemorySSA *MSSA = nullptr;
262 if (MSSAAnalysis)
263 MSSA = &MSSAAnalysis->getMSSA();
264
265 // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
266 // pass. Function analyses need to be preserved across loop transformations
267 // but ORE cannot be preserved (see comment before the pass definition).
268 OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
269
270 LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
271 return LIR.runOnLoop(L);
272 }
273
274 /// This transformation requires natural loop information & requires that
275 /// loop preheaders be inserted into the CFG.
276 void getAnalysisUsage(AnalysisUsage &AU) const override {
277 AU.addRequired<TargetLibraryInfoWrapperPass>();
278 AU.addRequired<TargetTransformInfoWrapperPass>();
279 AU.addPreserved<MemorySSAWrapperPass>();
280 getLoopAnalysisUsage(AU);
281 }
282};
283
284} // end anonymous namespace
285
286char LoopIdiomRecognizeLegacyPass::ID = 0;
287
288PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
289 LoopStandardAnalysisResults &AR,
290 LPMUpdater &) {
291 if (DisableLIRP::All)
292 return PreservedAnalyses::all();
293
294 const auto *DL = &L.getHeader()->getModule()->getDataLayout();
295
296 // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
297 // pass. Function analyses need to be preserved across loop transformations
298 // but ORE cannot be preserved (see comment before the pass definition).
299 OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
300
301 LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
302 AR.MSSA, DL, ORE);
303 if (!LIR.runOnLoop(&L))
304 return PreservedAnalyses::all();
305
306 auto PA = getLoopPassPreservedAnalyses();
307 if (AR.MSSA)
308 PA.preserve<MemorySSAAnalysis>();
309 return PA;
310}
311
312INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",static void *initializeLoopIdiomRecognizeLegacyPassPassOnce(PassRegistry
&Registry) {
313 "Recognize loop idioms", false, false)static void *initializeLoopIdiomRecognizeLegacyPassPassOnce(PassRegistry
&Registry) {
314INITIALIZE_PASS_DEPENDENCY(LoopPass)initializeLoopPassPass(Registry);
315INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)initializeTargetLibraryInfoWrapperPassPass(Registry);
316INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)initializeTargetTransformInfoWrapperPassPass(Registry);
317INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",PassInfo *PI = new PassInfo( "Recognize loop idioms", "loop-idiom"
, &LoopIdiomRecognizeLegacyPass::ID, PassInfo::NormalCtor_t
(callDefaultCtor<LoopIdiomRecognizeLegacyPass>), false,
false); Registry.registerPass(*PI, true); return PI; } static
llvm::once_flag InitializeLoopIdiomRecognizeLegacyPassPassFlag
; void llvm::initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry
&Registry) { llvm::call_once(InitializeLoopIdiomRecognizeLegacyPassPassFlag
, initializeLoopIdiomRecognizeLegacyPassPassOnce, std::ref(Registry
)); }
318 "Recognize loop idioms", false, false)PassInfo *PI = new PassInfo( "Recognize loop idioms", "loop-idiom"
, &LoopIdiomRecognizeLegacyPass::ID, PassInfo::NormalCtor_t
(callDefaultCtor<LoopIdiomRecognizeLegacyPass>), false,
false); Registry.registerPass(*PI, true); return PI; } static
llvm::once_flag InitializeLoopIdiomRecognizeLegacyPassPassFlag
; void llvm::initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry
&Registry) { llvm::call_once(InitializeLoopIdiomRecognizeLegacyPassPassFlag
, initializeLoopIdiomRecognizeLegacyPassPassOnce, std::ref(Registry
)); }
319
320Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
321
322static void deleteDeadInstruction(Instruction *I) {
323 I->replaceAllUsesWith(UndefValue::get(I->getType()));
324 I->eraseFromParent();
325}
326
327//===----------------------------------------------------------------------===//
328//
329// Implementation of LoopIdiomRecognize
330//
331//===----------------------------------------------------------------------===//
332
333bool LoopIdiomRecognize::runOnLoop(Loop *L) {
334 CurLoop = L;
335 // If the loop could not be converted to canonical form, it must have an
336 // indirectbr in it, just give up.
337 if (!L->getLoopPreheader())
338 return false;
339
340 // Disable loop idiom recognition if the function's name is a common idiom.
341 StringRef Name = L->getHeader()->getParent()->getName();
342 if (Name == "memset" || Name == "memcpy")
343 return false;
344
345 // Determine if code size heuristics need to be applied.
346 ApplyCodeSizeHeuristics =
347 L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
348
349 HasMemset = TLI->has(LibFunc_memset);
350 HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
351 HasMemcpy = TLI->has(LibFunc_memcpy);
352
353 if (HasMemset || HasMemsetPattern || HasMemcpy)
354 if (SE->hasLoopInvariantBackedgeTakenCount(L))
355 return runOnCountableLoop();
356
357 return runOnNoncountableLoop();
358}
359
360bool LoopIdiomRecognize::runOnCountableLoop() {
361 const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
362 assert(!isa<SCEVCouldNotCompute>(BECount) &&((!isa<SCEVCouldNotCompute>(BECount) && "runOnCountableLoop() called on a loop without a predictable"
"backedge-taken count") ? static_cast<void> (0) : __assert_fail
("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\""
, "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 364, __PRETTY_FUNCTION__))
363 "runOnCountableLoop() called on a loop without a predictable"((!isa<SCEVCouldNotCompute>(BECount) && "runOnCountableLoop() called on a loop without a predictable"
"backedge-taken count") ? static_cast<void> (0) : __assert_fail
("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\""
, "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 364, __PRETTY_FUNCTION__))
364 "backedge-taken count")((!isa<SCEVCouldNotCompute>(BECount) && "runOnCountableLoop() called on a loop without a predictable"
"backedge-taken count") ? static_cast<void> (0) : __assert_fail
("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\""
, "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 364, __PRETTY_FUNCTION__))
;
365
366 // If this loop executes exactly one time, then it should be peeled, not
367 // optimized by this pass.
368 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
369 if (BECst->getAPInt() == 0)
370 return false;
371
372 SmallVector<BasicBlock *, 8> ExitBlocks;
373 CurLoop->getUniqueExitBlocks(ExitBlocks);
374
375 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
376 << CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
377 << "] Countable Loop %" << CurLoop->getHeader()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
378 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
;
379
380 // The following transforms hoist stores/memsets into the loop pre-header.
381 // Give up if the loop has instructions that may throw.
382 SimpleLoopSafetyInfo SafetyInfo;
383 SafetyInfo.computeLoopSafetyInfo(CurLoop);
384 if (SafetyInfo.anyBlockMayThrow())
385 return false;
386
387 bool MadeChange = false;
388
389 // Scan all the blocks in the loop that are not in subloops.
390 for (auto *BB : CurLoop->getBlocks()) {
391 // Ignore blocks in subloops.
392 if (LI->getLoopFor(BB) != CurLoop)
393 continue;
394
395 MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
396 }
397 return MadeChange;
398}
399
400static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
401 const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
402 return ConstStride->getAPInt();
403}
404
405/// getMemSetPatternValue - If a strided store of the specified value is safe to
406/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
407/// be passed in. Otherwise, return null.
408///
409/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
410/// just replicate their input array and then pass on to memset_pattern16.
411static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
412 // FIXME: This could check for UndefValue because it can be merged into any
413 // other valid pattern.
414
415 // If the value isn't a constant, we can't promote it to being in a constant
416 // array. We could theoretically do a store to an alloca or something, but
417 // that doesn't seem worthwhile.
418 Constant *C = dyn_cast<Constant>(V);
419 if (!C)
420 return nullptr;
421
422 // Only handle simple values that are a power of two bytes in size.
423 uint64_t Size = DL->getTypeSizeInBits(V->getType());
424 if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
425 return nullptr;
426
427 // Don't care enough about darwin/ppc to implement this.
428 if (DL->isBigEndian())
429 return nullptr;
430
431 // Convert to size in bytes.
432 Size /= 8;
433
434 // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
435 // if the top and bottom are the same (e.g. for vectors and large integers).
436 if (Size > 16)
437 return nullptr;
438
439 // If the constant is exactly 16 bytes, just use it.
440 if (Size == 16)
441 return C;
442
443 // Otherwise, we'll use an array of the constants.
444 unsigned ArraySize = 16 / Size;
445 ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
446 return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
447}
448
449LoopIdiomRecognize::LegalStoreKind
450LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
451 // Don't touch volatile stores.
452 if (SI->isVolatile())
453 return LegalStoreKind::None;
454 // We only want simple or unordered-atomic stores.
455 if (!SI->isUnordered())
456 return LegalStoreKind::None;
457
458 // Don't convert stores of non-integral pointer types to memsets (which stores
459 // integers).
460 if (DL->isNonIntegralPointerType(SI->getValueOperand()->getType()))
461 return LegalStoreKind::None;
462
463 // Avoid merging nontemporal stores.
464 if (SI->getMetadata(LLVMContext::MD_nontemporal))
465 return LegalStoreKind::None;
466
467 Value *StoredVal = SI->getValueOperand();
468 Value *StorePtr = SI->getPointerOperand();
469
470 // Reject stores that are so large that they overflow an unsigned.
471 // When storing out scalable vectors we bail out for now, since the code
472 // below currently only works for constant strides.
473 TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
474 if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) ||
475 (SizeInBits.getFixedSize() >> 32) != 0)
476 return LegalStoreKind::None;
477
478 // See if the pointer expression is an AddRec like {base,+,1} on the current
479 // loop, which indicates a strided store. If we have something else, it's a
480 // random store we can't handle.
481 const SCEVAddRecExpr *StoreEv =
482 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
483 if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
484 return LegalStoreKind::None;
485
486 // Check to see if we have a constant stride.
487 if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
488 return LegalStoreKind::None;
489
490 // See if the store can be turned into a memset.
491
492 // If the stored value is a byte-wise value (like i32 -1), then it may be
493 // turned into a memset of i8 -1, assuming that all the consecutive bytes
494 // are stored. A store of i32 0x01020304 can never be turned into a memset,
495 // but it can be turned into memset_pattern if the target supports it.
496 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
497 Constant *PatternValue = nullptr;
498
499 // Note: memset and memset_pattern on unordered-atomic is yet not supported
500 bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
501
502 // If we're allowed to form a memset, and the stored value would be
503 // acceptable for memset, use it.
504 if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
505 // Verify that the stored value is loop invariant. If not, we can't
506 // promote the memset.
507 CurLoop->isLoopInvariant(SplatValue)) {
508 // It looks like we can use SplatValue.
509 return LegalStoreKind::Memset;
510 } else if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
511 // Don't create memset_pattern16s with address spaces.
512 StorePtr->getType()->getPointerAddressSpace() == 0 &&
513 (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
Although the value stored to 'PatternValue' is used in the enclosing expression, the value is never actually read from 'PatternValue'
514 // It looks like we can use PatternValue!
515 return LegalStoreKind::MemsetPattern;
516 }
517
518 // Otherwise, see if the store can be turned into a memcpy.
519 if (HasMemcpy && !DisableLIRP::Memcpy) {
520 // Check to see if the stride matches the size of the store. If so, then we
521 // know that every byte is touched in the loop.
522 APInt Stride = getStoreStride(StoreEv);
523 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
524 if (StoreSize != Stride && StoreSize != -Stride)
525 return LegalStoreKind::None;
526
527 // The store must be feeding a non-volatile load.
528 LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
529
530 // Only allow non-volatile loads
531 if (!LI || LI->isVolatile())
532 return LegalStoreKind::None;
533 // Only allow simple or unordered-atomic loads
534 if (!LI->isUnordered())
535 return LegalStoreKind::None;
536
537 // See if the pointer expression is an AddRec like {base,+,1} on the current
538 // loop, which indicates a strided load. If we have something else, it's a
539 // random load we can't handle.
540 const SCEVAddRecExpr *LoadEv =
541 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
542 if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
543 return LegalStoreKind::None;
544
545 // The store and load must share the same stride.
546 if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
547 return LegalStoreKind::None;
548
549 // Success. This store can be converted into a memcpy.
550 UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
551 return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
552 : LegalStoreKind::Memcpy;
553 }
554 // This store can't be transformed into a memset/memcpy.
555 return LegalStoreKind::None;
556}
557
558void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
559 StoreRefsForMemset.clear();
560 StoreRefsForMemsetPattern.clear();
561 StoreRefsForMemcpy.clear();
562 for (Instruction &I : *BB) {
563 StoreInst *SI = dyn_cast<StoreInst>(&I);
564 if (!SI)
565 continue;
566
567 // Make sure this is a strided store with a constant stride.
568 switch (isLegalStore(SI)) {
569 case LegalStoreKind::None:
570 // Nothing to do
571 break;
572 case LegalStoreKind::Memset: {
573 // Find the base pointer.
574 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
575 StoreRefsForMemset[Ptr].push_back(SI);
576 } break;
577 case LegalStoreKind::MemsetPattern: {
578 // Find the base pointer.
579 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
580 StoreRefsForMemsetPattern[Ptr].push_back(SI);
581 } break;
582 case LegalStoreKind::Memcpy:
583 case LegalStoreKind::UnorderedAtomicMemcpy:
584 StoreRefsForMemcpy.push_back(SI);
585 break;
586 default:
587 assert(false && "unhandled return value")((false && "unhandled return value") ? static_cast<
void> (0) : __assert_fail ("false && \"unhandled return value\""
, "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 587, __PRETTY_FUNCTION__))
;
588 break;
589 }
590 }
591}
592
593/// runOnLoopBlock - Process the specified block, which lives in a counted loop
594/// with the specified backedge count. This block is known to be in the current
595/// loop and not in any subloops.
596bool LoopIdiomRecognize::runOnLoopBlock(
597 BasicBlock *BB, const SCEV *BECount,
598 SmallVectorImpl<BasicBlock *> &ExitBlocks) {
599 // We can only promote stores in this block if they are unconditionally
600 // executed in the loop. For a block to be unconditionally executed, it has
601 // to dominate all the exit blocks of the loop. Verify this now.
602 for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
603 if (!DT->dominates(BB, ExitBlocks[i]))
604 return false;
605
606 bool MadeChange = false;
607 // Look for store instructions, which may be optimized to memset/memcpy.
608 collectStores(BB);
609
610 // Look for a single store or sets of stores with a common base, which can be
611 // optimized into a memset (memset_pattern). The latter most commonly happens
612 // with structs and handunrolled loops.
613 for (auto &SL : StoreRefsForMemset)
614 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
615
616 for (auto &SL : StoreRefsForMemsetPattern)
617 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
618
619 // Optimize the store into a memcpy, if it feeds an similarly strided load.
620 for (auto &SI : StoreRefsForMemcpy)
621 MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
622
623 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
624 Instruction *Inst = &*I++;
625 // Look for memset instructions, which may be optimized to a larger memset.
626 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
627 WeakTrackingVH InstPtr(&*I);
628 if (!processLoopMemSet(MSI, BECount))
629 continue;
630 MadeChange = true;
631
632 // If processing the memset invalidated our iterator, start over from the
633 // top of the block.
634 if (!InstPtr)
635 I = BB->begin();
636 continue;
637 }
638 }
639
640 return MadeChange;
641}
642
643/// See if this store(s) can be promoted to a memset.
644bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
645 const SCEV *BECount, ForMemset For) {
646 // Try to find consecutive stores that can be transformed into memsets.
647 SetVector<StoreInst *> Heads, Tails;
648 SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
649
650 // Do a quadratic search on all of the given stores and find
651 // all of the pairs of stores that follow each other.
652 SmallVector<unsigned, 16> IndexQueue;
653 for (unsigned i = 0, e = SL.size(); i < e; ++i) {
654 assert(SL[i]->isSimple() && "Expected only non-volatile stores.")((SL[i]->isSimple() && "Expected only non-volatile stores."
) ? static_cast<void> (0) : __assert_fail ("SL[i]->isSimple() && \"Expected only non-volatile stores.\""
, "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 654, __PRETTY_FUNCTION__))
;
655
656 Value *FirstStoredVal = SL[i]->getValueOperand();
657 Value *FirstStorePtr = SL[i]->getPointerOperand();
658 const SCEVAddRecExpr *FirstStoreEv =
659 cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
660 APInt FirstStride = getStoreStride(FirstStoreEv);
661 unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
662
663 // See if we can optimize just this store in isolation.
664 if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
665 Heads.insert(SL[i]);
666 continue;
667 }
668
669 Value *FirstSplatValue = nullptr;
670 Constant *FirstPatternValue = nullptr;
671
672 if (For == ForMemset::Yes)
673 FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
674 else
675 FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
676
677 assert((FirstSplatValue || FirstPatternValue) &&(((FirstSplatValue || FirstPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(FirstSplatValue || FirstPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 678, __PRETTY_FUNCTION__))
678 "Expected either splat value or pattern value.")(((FirstSplatValue || FirstPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(FirstSplatValue || FirstPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 678, __PRETTY_FUNCTION__))
;
679
680 IndexQueue.clear();
681 // If a store has multiple consecutive store candidates, search Stores
682 // array according to the sequence: from i+1 to e, then from i-1 to 0.
683 // This is because usually pairing with immediate succeeding or preceding
684 // candidate create the best chance to find memset opportunity.
685 unsigned j = 0;
686 for (j = i + 1; j < e; ++j)
687 IndexQueue.push_back(j);
688 for (j = i; j > 0; --j)
689 IndexQueue.push_back(j - 1);
690
691 for (auto &k : IndexQueue) {
692 assert(SL[k]->isSimple() && "Expected only non-volatile stores.")((SL[k]->isSimple() && "Expected only non-volatile stores."
) ? static_cast<void> (0) : __assert_fail ("SL[k]->isSimple() && \"Expected only non-volatile stores.\""
, "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 692, __PRETTY_FUNCTION__))
;
693 Value *SecondStorePtr = SL[k]->getPointerOperand();
694 const SCEVAddRecExpr *SecondStoreEv =
695 cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
696 APInt SecondStride = getStoreStride(SecondStoreEv);
697
698 if (FirstStride != SecondStride)
699 continue;
700
701 Value *SecondStoredVal = SL[k]->getValueOperand();
702 Value *SecondSplatValue = nullptr;
703 Constant *SecondPatternValue = nullptr;
704
705 if (For == ForMemset::Yes)
706 SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
707 else
708 SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
709
710 assert((SecondSplatValue || SecondPatternValue) &&(((SecondSplatValue || SecondPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SecondSplatValue || SecondPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 711, __PRETTY_FUNCTION__))
711 "Expected either splat value or pattern value.")(((SecondSplatValue || SecondPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SecondSplatValue || SecondPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 711, __PRETTY_FUNCTION__))
;
712
713 if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
714 if (For == ForMemset::Yes) {
715 if (isa<UndefValue>(FirstSplatValue))
716 FirstSplatValue = SecondSplatValue;
717 if (FirstSplatValue != SecondSplatValue)
718 continue;
719 } else {
720 if (isa<UndefValue>(FirstPatternValue))
721 FirstPatternValue = SecondPatternValue;
722 if (FirstPatternValue != SecondPatternValue)
723 continue;
724 }
725 Tails.insert(SL[k]);
726 Heads.insert(SL[i]);
727 ConsecutiveChain[SL[i]] = SL[k];
728 break;
729 }
730 }
731 }
732
733 // We may run into multiple chains that merge into a single chain. We mark the
734 // stores that we transformed so that we don't visit the same store twice.
735 SmallPtrSet<Value *, 16> TransformedStores;
736 bool Changed = false;
737
738 // For stores that start but don't end a link in the chain:
739 for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
740 it != e; ++it) {
741 if (Tails.count(*it))
742 continue;
743
744 // We found a store instr that starts a chain. Now follow the chain and try
745 // to transform it.
746 SmallPtrSet<Instruction *, 8> AdjacentStores;
747 StoreInst *I = *it;
748
749 StoreInst *HeadStore = I;
750 unsigned StoreSize = 0;
751
752 // Collect the chain into a list.
753 while (Tails.count(I) || Heads.count(I)) {
754 if (TransformedStores.count(I))
755 break;
756 AdjacentStores.insert(I);
757
758 StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
759 // Move to the next value in the chain.
760 I = ConsecutiveChain[I];
761 }
762
763 Value *StoredVal = HeadStore->getValueOperand();
764 Value *StorePtr = HeadStore->getPointerOperand();
765 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
766 APInt Stride = getStoreStride(StoreEv);
767
768 // Check to see if the stride matches the size of the stores. If so, then
769 // we know that every byte is touched in the loop.
770 if (StoreSize != Stride && StoreSize != -Stride)
771 continue;
772
773 bool NegStride = StoreSize == -Stride;
774
775 if (processLoopStridedStore(StorePtr, StoreSize,
776 MaybeAlign(HeadStore->getAlignment()),
777 StoredVal, HeadStore, AdjacentStores, StoreEv,
778 BECount, NegStride)) {
779 TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
780 Changed = true;
781 }
782 }
783
784 return Changed;
785}
786
787/// processLoopMemSet - See if this memset can be promoted to a large memset.
788bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
789 const SCEV *BECount) {
790 // We can only handle non-volatile memsets with a constant size.
791 if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
792 return false;
793
794 // If we're not allowed to hack on memset, we fail.
795 if (!HasMemset)
796 return false;
797
798 Value *Pointer = MSI->getDest();
799
800 // See if the pointer expression is an AddRec like {base,+,1} on the current
801 // loop, which indicates a strided store. If we have something else, it's a
802 // random store we can't handle.
803 const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
804 if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
805 return false;
806
807 // Reject memsets that are so large that they overflow an unsigned.
808 uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
809 if ((SizeInBytes >> 32) != 0)
810 return false;
811
812 // Check to see if the stride matches the size of the memset. If so, then we
813 // know that every byte is touched in the loop.
814 const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
815 if (!ConstStride)
816 return false;
817
818 APInt Stride = ConstStride->getAPInt();
819 if (SizeInBytes != Stride && SizeInBytes != -Stride)
820 return false;
821
822 // Verify that the memset value is loop invariant. If not, we can't promote
823 // the memset.
824 Value *SplatValue = MSI->getValue();
825 if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
826 return false;
827
828 SmallPtrSet<Instruction *, 1> MSIs;
829 MSIs.insert(MSI);
830 bool NegStride = SizeInBytes == -Stride;
831 return processLoopStridedStore(
832 Pointer, (unsigned)SizeInBytes, MaybeAlign(MSI->getDestAlignment()),
833 SplatValue, MSI, MSIs, Ev, BECount, NegStride, /*IsLoopMemset=*/true);
834}
835
836/// mayLoopAccessLocation - Return true if the specified loop might access the
837/// specified pointer location, which is a loop-strided access. The 'Access'
838/// argument specifies what the verboten forms of access are (read or write).
839static bool
840mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
841 const SCEV *BECount, unsigned StoreSize,
842 AliasAnalysis &AA,
843 SmallPtrSetImpl<Instruction *> &IgnoredStores) {
844 // Get the location that may be stored across the loop. Since the access is
845 // strided positively through memory, we say that the modified location starts
846 // at the pointer and has infinite size.
847 LocationSize AccessSize = LocationSize::unknown();
848
849 // If the loop iterates a fixed number of times, we can refine the access size
850 // to be exactly the size of the memset, which is (BECount+1)*StoreSize
851 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
852 AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
853 StoreSize);
854
855 // TODO: For this to be really effective, we have to dive into the pointer
856 // operand in the store. Store to &A[i] of 100 will always return may alias
857 // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
858 // which will then no-alias a store to &A[100].
859 MemoryLocation StoreLoc(Ptr, AccessSize);
860
861 for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
862 ++BI)
863 for (Instruction &I : **BI)
864 if (IgnoredStores.count(&I) == 0 &&
865 isModOrRefSet(
866 intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
867 return true;
868
869 return false;
870}
871
872// If we have a negative stride, Start refers to the end of the memory location
873// we're trying to memset. Therefore, we need to recompute the base pointer,
874// which is just Start - BECount*Size.
875static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
876 Type *IntPtr, unsigned StoreSize,
877 ScalarEvolution *SE) {
878 const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
879 if (StoreSize != 1)
880 Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
881 SCEV::FlagNUW);
882 return SE->getMinusSCEV(Start, Index);
883}
884
885/// Compute the number of bytes as a SCEV from the backedge taken count.
886///
887/// This also maps the SCEV into the provided type and tries to handle the
888/// computation in a way that will fold cleanly.
889static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
890 unsigned StoreSize, Loop *CurLoop,
891 const DataLayout *DL, ScalarEvolution *SE) {
892 const SCEV *NumBytesS;
893 // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
894 // pointer size if it isn't already.
895 //
896 // If we're going to need to zero extend the BE count, check if we can add
897 // one to it prior to zero extending without overflow. Provided this is safe,
898 // it allows better simplification of the +1.
899 if (DL->getTypeSizeInBits(BECount->getType()) <
900 DL->getTypeSizeInBits(IntPtr) &&
901 SE->isLoopEntryGuardedByCond(
902 CurLoop, ICmpInst::ICMP_NE, BECount,
903 SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
904 NumBytesS = SE->getZeroExtendExpr(
905 SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
906 IntPtr);
907 } else {
908 NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
909 SE->getOne(IntPtr), SCEV::FlagNUW);
910 }
911
912 // And scale it based on the store size.
913 if (StoreSize != 1) {
914 NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
915 SCEV::FlagNUW);
916 }
917 return NumBytesS;
918}
919
920/// processLoopStridedStore - We see a strided store of some value. If we can
921/// transform this into a memset or memset_pattern in the loop preheader, do so.
922bool LoopIdiomRecognize::processLoopStridedStore(
923 Value *DestPtr, unsigned StoreSize, MaybeAlign StoreAlignment,
924 Value *StoredVal, Instruction *TheStore,
925 SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
926 const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
927 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
928 Constant *PatternValue = nullptr;
929
930 if (!SplatValue)
931 PatternValue = getMemSetPatternValue(StoredVal, DL);
932
933 assert((SplatValue || PatternValue) &&(((SplatValue || PatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SplatValue || PatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 934, __PRETTY_FUNCTION__))
934 "Expected either splat value or pattern value.")(((SplatValue || PatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SplatValue || PatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 934, __PRETTY_FUNCTION__))
;
935
936 // The trip count of the loop and the base pointer of the addrec SCEV is
937 // guaranteed to be loop invariant, which means that it should dominate the
938 // header. This allows us to insert code for it in the preheader.
939 unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
940 BasicBlock *Preheader = CurLoop->getLoopPreheader();
941 IRBuilder<> Builder(Preheader->getTerminator());
942 SCEVExpander Expander(*SE, *DL, "loop-idiom");
943 SCEVExpanderCleaner ExpCleaner(Expander, *DT);
944
945 Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
946 Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
947
948 bool Changed = false;
949 const SCEV *Start = Ev->getStart();
950 // Handle negative strided loops.
951 if (NegStride)
952 Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSize, SE);
953
954 // TODO: ideally we should still be able to generate memset if SCEV expander
955 // is taught to generate the dependencies at the latest point.
956 if (!isSafeToExpand(Start, *SE))
957 return Changed;
958
959 // Okay, we have a strided store "p[i]" of a splattable value. We can turn
960 // this into a memset in the loop preheader now if we want. However, this
961 // would be unsafe to do if there is anything else in the loop that may read
962 // or write to the aliased location. Check for any overlap by generating the
963 // base pointer and checking the region.
964 Value *BasePtr =
965 Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
966
967 // From here on out, conservatively report to the pass manager that we've
968 // changed the IR, even if we later clean up these added instructions. There
969 // may be structural differences e.g. in the order of use lists not accounted
970 // for in just a textual dump of the IR. This is written as a variable, even
971 // though statically all the places this dominates could be replaced with
972 // 'true', with the hope that anyone trying to be clever / "more precise" with
973 // the return value will read this comment, and leave them alone.
974 Changed = true;
975
976 if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
977 StoreSize, *AA, Stores))
978 return Changed;
979
980 if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
981 return Changed;
982
983 // Okay, everything looks good, insert the memset.
984
985 const SCEV *NumBytesS =
986 getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
987
988 // TODO: ideally we should still be able to generate memset if SCEV expander
989 // is taught to generate the dependencies at the latest point.
990 if (!isSafeToExpand(NumBytesS, *SE))
991 return Changed;
992
993 Value *NumBytes =
994 Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
995
996 CallInst *NewCall;
997 if (SplatValue) {
998 NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes,
999 MaybeAlign(StoreAlignment));
1000 } else {
1001 // Everything is emitted in default address space
1002 Type *Int8PtrTy = DestInt8PtrTy;
1003
1004 Module *M = TheStore->getModule();
1005 StringRef FuncName = "memset_pattern16";
1006 FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
1007 Int8PtrTy, Int8PtrTy, IntIdxTy);
1008 inferLibFuncAttributes(M, FuncName, *TLI);
1009
1010 // Otherwise we should form a memset_pattern16. PatternValue is known to be
1011 // an constant array of 16-bytes. Plop the value into a mergable global.
1012 GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
1013 GlobalValue::PrivateLinkage,
1014 PatternValue, ".memset_pattern");
1015 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
1016 GV->setAlignment(Align(16));
1017 Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
1018 NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
1019 }
1020 NewCall->setDebugLoc(TheStore->getDebugLoc());
1021
1022 if (MSSAU) {
1023 MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1024 NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1025 MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1026 }
1027
1028 LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memset: " <<
*NewCall << "\n" << " from store to: " <<
*Ev << " at: " << *TheStore << "\n"; } } while
(false)
1029 << " from store to: " << *Ev << " at: " << *TheStoredo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memset: " <<
*NewCall << "\n" << " from store to: " <<
*Ev << " at: " << *TheStore << "\n"; } } while
(false)
1030 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memset: " <<
*NewCall << "\n" << " from store to: " <<
*Ev << " at: " << *TheStore << "\n"; } } while
(false)
;
1031
1032 ORE.emit([&]() {
1033 return OptimizationRemark(DEBUG_TYPE"loop-idiom", "ProcessLoopStridedStore",
1034 NewCall->getDebugLoc(), Preheader)
1035 << "Transformed loop-strided store into a call to "
1036 << ore::NV("NewFunction", NewCall->getCalledFunction())
1037 << "() function";
1038 });
1039
1040 // Okay, the memset has been formed. Zap the original store and anything that
1041 // feeds into it.
1042 for (auto *I : Stores) {
1043 if (MSSAU)
1044 MSSAU->removeMemoryAccess(I, true);
1045 deleteDeadInstruction(I);
1046 }
1047 if (MSSAU && VerifyMemorySSA)
1048 MSSAU->getMemorySSA()->verifyMemorySSA();
1049 ++NumMemSet;
1050 ExpCleaner.markResultUsed();
1051 return true;
1052}
1053
1054/// If the stored value is a strided load in the same loop with the same stride
1055/// this may be transformable into a memcpy. This kicks in for stuff like
1056/// for (i) A[i] = B[i];
1057bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
1058 const SCEV *BECount) {
1059 assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.")((SI->isUnordered() && "Expected only non-volatile non-ordered stores."
) ? static_cast<void> (0) : __assert_fail ("SI->isUnordered() && \"Expected only non-volatile non-ordered stores.\""
, "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 1059, __PRETTY_FUNCTION__))
;
1060
1061 Value *StorePtr = SI->getPointerOperand();
1062 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1063 APInt Stride = getStoreStride(StoreEv);
1064 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
1065 bool NegStride = StoreSize == -Stride;
1066
1067 // The store must be feeding a non-volatile load.
1068 LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
1069 assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.")((LI->isUnordered() && "Expected only non-volatile non-ordered loads."
) ? static_cast<void> (0) : __assert_fail ("LI->isUnordered() && \"Expected only non-volatile non-ordered loads.\""
, "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 1069, __PRETTY_FUNCTION__))
;
1070
1071 // See if the pointer expression is an AddRec like {base,+,1} on the current
1072 // loop, which indicates a strided load. If we have something else, it's a
1073 // random load we can't handle.
1074 const SCEVAddRecExpr *LoadEv =
1075 cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
1076
1077 // The trip count of the loop and the base pointer of the addrec SCEV is
1078 // guaranteed to be loop invariant, which means that it should dominate the
1079 // header. This allows us to insert code for it in the preheader.
1080 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1081 IRBuilder<> Builder(Preheader->getTerminator());
1082 SCEVExpander Expander(*SE, *DL, "loop-idiom");
1083
1084 SCEVExpanderCleaner ExpCleaner(Expander, *DT);
1085
1086 bool Changed = false;
1087 const SCEV *StrStart = StoreEv->getStart();
1088 unsigned StrAS = SI->getPointerAddressSpace();
1089 Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
1090
1091 // Handle negative strided loops.
1092 if (NegStride)
1093 StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE);
1094
1095 // Okay, we have a strided store "p[i]" of a loaded value. We can turn
1096 // this into a memcpy in the loop preheader now if we want. However, this
1097 // would be unsafe to do if there is anything else in the loop that may read
1098 // or write the memory region we're storing to. This includes the load that
1099 // feeds the stores. Check for an alias by generating the base address and
1100 // checking everything.
1101 Value *StoreBasePtr = Expander.expandCodeFor(
1102 StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
1103
1104 // From here on out, conservatively report to the pass manager that we've
1105 // changed the IR, even if we later clean up these added instructions. There
1106 // may be structural differences e.g. in the order of use lists not accounted
1107 // for in just a textual dump of the IR. This is written as a variable, even
1108 // though statically all the places this dominates could be replaced with
1109 // 'true', with the hope that anyone trying to be clever / "more precise" with
1110 // the return value will read this comment, and leave them alone.
1111 Changed = true;
1112
1113 SmallPtrSet<Instruction *, 1> Stores;
1114 Stores.insert(SI);
1115 if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1116 StoreSize, *AA, Stores))
1117 return Changed;
1118
1119 const SCEV *LdStart = LoadEv->getStart();
1120 unsigned LdAS = LI->getPointerAddressSpace();
1121
1122 // Handle negative strided loops.
1123 if (NegStride)
1124 LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE);
1125
1126 // For a memcpy, we have to make sure that the input array is not being
1127 // mutated by the loop.
1128 Value *LoadBasePtr = Expander.expandCodeFor(
1129 LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
1130
1131 if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
1132 StoreSize, *AA, Stores))
1133 return Changed;
1134
1135 if (avoidLIRForMultiBlockLoop())
1136 return Changed;
1137
1138 // Okay, everything is safe, we can transform this!
1139
1140 const SCEV *NumBytesS =
1141 getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
1142
1143 Value *NumBytes =
1144 Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1145
1146 CallInst *NewCall = nullptr;
1147 // Check whether to generate an unordered atomic memcpy:
1148 // If the load or store are atomic, then they must necessarily be unordered
1149 // by previous checks.
1150 if (!SI->isAtomic() && !LI->isAtomic())
1151 NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr,
1152 LI->getAlign(), NumBytes);
1153 else {
1154 // We cannot allow unaligned ops for unordered load/store, so reject
1155 // anything where the alignment isn't at least the element size.
1156 const Align StoreAlign = SI->getAlign();
1157 const Align LoadAlign = LI->getAlign();
1158 if (StoreAlign < StoreSize || LoadAlign < StoreSize)
1159 return Changed;
1160
1161 // If the element.atomic memcpy is not lowered into explicit
1162 // loads/stores later, then it will be lowered into an element-size
1163 // specific lib call. If the lib call doesn't exist for our store size, then
1164 // we shouldn't generate the memcpy.
1165 if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1166 return Changed;
1167
1168 // Create the call.
1169 // Note that unordered atomic loads/stores are *required* by the spec to
1170 // have an alignment but non-atomic loads/stores may not.
1171 NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1172 StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
1173 StoreSize);
1174 }
1175 NewCall->setDebugLoc(SI->getDebugLoc());
1176
1177 if (MSSAU) {
1178 MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1179 NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1180 MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1181 }
1182
1183 LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
1184 << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
1185 << " from store ptr=" << *StoreEv << " at: " << *SIdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
1186 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
;
1187
1188 ORE.emit([&]() {
1189 return OptimizationRemark(DEBUG_TYPE"loop-idiom", "ProcessLoopStoreOfLoopLoad",
1190 NewCall->getDebugLoc(), Preheader)
1191 << "Formed a call to "
1192 << ore::NV("NewFunction", NewCall->getCalledFunction())
1193 << "() function";
1194 });
1195
1196 // Okay, the memcpy has been formed. Zap the original store and anything that
1197 // feeds into it.
1198 if (MSSAU)
1199 MSSAU->removeMemoryAccess(SI, true);
1200 deleteDeadInstruction(SI);
1201 if (MSSAU && VerifyMemorySSA)
1202 MSSAU->getMemorySSA()->verifyMemorySSA();
1203 ++NumMemCpy;
1204 ExpCleaner.markResultUsed();
1205 return true;
1206}
1207
1208// When compiling for codesize we avoid idiom recognition for a multi-block loop
1209// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1210//
1211bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1212 bool IsLoopMemset) {
1213 if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1214 if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
1215 LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " " << CurLoop->getHeader
()->getParent()->getName() << " : LIR " << (
IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n"
; } } while (false)
1216 << " : LIR " << (IsMemset ? "Memset" : "Memcpy")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " " << CurLoop->getHeader
()->getParent()->getName() << " : LIR " << (
IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n"
; } } while (false)
1217 << " avoided: multi-block top-level loop\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " " << CurLoop->getHeader
()->getParent()->getName() << " : LIR " << (
IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n"
; } } while (false)
;
1218 return true;
1219 }
1220 }
1221
1222 return false;
1223}
1224
1225bool LoopIdiomRecognize::runOnNoncountableLoop() {
1226 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
1227 << CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
1228 << "] Noncountable Loop %"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
1229 << CurLoop->getHeader()->getName() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
;
1230
1231 return recognizePopcount() || recognizeAndInsertFFS();
1232}
1233
1234/// Check if the given conditional branch is based on the comparison between
1235/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
1236/// true), the control yields to the loop entry. If the branch matches the
1237/// behavior, the variable involved in the comparison is returned. This function
1238/// will be called to see if the precondition and postcondition of the loop are
1239/// in desirable form.
1240static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
1241 bool JmpOnZero = false) {
1242 if (!BI || !BI->isConditional())
1243 return nullptr;
1244
1245 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1246 if (!Cond)
1247 return nullptr;
1248
1249 ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1250 if (!CmpZero || !CmpZero->isZero())
1251 return nullptr;
1252
1253 BasicBlock *TrueSucc = BI->getSuccessor(0);
1254 BasicBlock *FalseSucc = BI->getSuccessor(1);
1255 if (JmpOnZero)
1256 std::swap(TrueSucc, FalseSucc);
1257
1258 ICmpInst::Predicate Pred = Cond->getPredicate();
1259 if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1260 (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1261 return Cond->getOperand(0);
1262
1263 return nullptr;
1264}
1265
1266// Check if the recurrence variable `VarX` is in the right form to create
1267// the idiom. Returns the value coerced to a PHINode if so.
1268static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
1269 BasicBlock *LoopEntry) {
1270 auto *PhiX = dyn_cast<PHINode>(VarX);
1271 if (PhiX && PhiX->getParent() == LoopEntry &&
1272 (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1273 return PhiX;
1274 return nullptr;
1275}
1276
1277/// Return true iff the idiom is detected in the loop.
1278///
1279/// Additionally:
1280/// 1) \p CntInst is set to the instruction counting the population bit.
1281/// 2) \p CntPhi is set to the corresponding phi node.
1282/// 3) \p Var is set to the value whose population bits are being counted.
1283///
1284/// The core idiom we are trying to detect is:
1285/// \code
1286/// if (x0 != 0)
1287/// goto loop-exit // the precondition of the loop
1288/// cnt0 = init-val;
1289/// do {
1290/// x1 = phi (x0, x2);
1291/// cnt1 = phi(cnt0, cnt2);
1292///
1293/// cnt2 = cnt1 + 1;
1294/// ...
1295/// x2 = x1 & (x1 - 1);
1296/// ...
1297/// } while(x != 0);
1298///
1299/// loop-exit:
1300/// \endcode
1301static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1302 Instruction *&CntInst, PHINode *&CntPhi,
1303 Value *&Var) {
1304 // step 1: Check to see if the look-back branch match this pattern:
1305 // "if (a!=0) goto loop-entry".
1306 BasicBlock *LoopEntry;
1307 Instruction *DefX2, *CountInst;
1308 Value *VarX1, *VarX0;
1309 PHINode *PhiX, *CountPhi;
1310
1311 DefX2 = CountInst = nullptr;
1312 VarX1 = VarX0 = nullptr;
1313 PhiX = CountPhi = nullptr;
1314 LoopEntry = *(CurLoop->block_begin());
1315
1316 // step 1: Check if the loop-back branch is in desirable form.
1317 {
1318 if (Value *T = matchCondition(
1319 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1320 DefX2 = dyn_cast<Instruction>(T);
1321 else
1322 return false;
1323 }
1324
1325 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1326 {
1327 if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1328 return false;
1329
1330 BinaryOperator *SubOneOp;
1331
1332 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1333 VarX1 = DefX2->getOperand(1);
1334 else {
1335 VarX1 = DefX2->getOperand(0);
1336 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1337 }
1338 if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1339 return false;
1340
1341 ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1342 if (!Dec ||
1343 !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1344 (SubOneOp->getOpcode() == Instruction::Add &&
1345 Dec->isMinusOne()))) {
1346 return false;
1347 }
1348 }
1349
1350 // step 3: Check the recurrence of variable X
1351 PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1352 if (!PhiX)
1353 return false;
1354
1355 // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
1356 {
1357 CountInst = nullptr;
1358 for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1359 IterE = LoopEntry->end();
1360 Iter != IterE; Iter++) {
1361 Instruction *Inst = &*Iter;
1362 if (Inst->getOpcode() != Instruction::Add)
1363 continue;
1364
1365 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1366 if (!Inc || !Inc->isOne())
1367 continue;
1368
1369 PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1370 if (!Phi)
1371 continue;
1372
1373 // Check if the result of the instruction is live of the loop.
1374 bool LiveOutLoop = false;
1375 for (User *U : Inst->users()) {
1376 if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1377 LiveOutLoop = true;
1378 break;
1379 }
1380 }
1381
1382 if (LiveOutLoop) {
1383 CountInst = Inst;
1384 CountPhi = Phi;
1385 break;
1386 }
1387 }
1388
1389 if (!CountInst)
1390 return false;
1391 }
1392
1393 // step 5: check if the precondition is in this form:
1394 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1395 {
1396 auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1397 Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1398 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1399 return false;
1400
1401 CntInst = CountInst;
1402 CntPhi = CountPhi;
1403 Var = T;
1404 }
1405
1406 return true;
1407}
1408
1409/// Return true if the idiom is detected in the loop.
1410///
1411/// Additionally:
1412/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1413/// or nullptr if there is no such.
1414/// 2) \p CntPhi is set to the corresponding phi node
1415/// or nullptr if there is no such.
1416/// 3) \p Var is set to the value whose CTLZ could be used.
1417/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1418///
1419/// The core idiom we are trying to detect is:
1420/// \code
1421/// if (x0 == 0)
1422/// goto loop-exit // the precondition of the loop
1423/// cnt0 = init-val;
1424/// do {
1425/// x = phi (x0, x.next); //PhiX
1426/// cnt = phi(cnt0, cnt.next);
1427///
1428/// cnt.next = cnt + 1;
1429/// ...
1430/// x.next = x >> 1; // DefX
1431/// ...
1432/// } while(x.next != 0);
1433///
1434/// loop-exit:
1435/// \endcode
1436static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1437 Intrinsic::ID &IntrinID, Value *&InitX,
1438 Instruction *&CntInst, PHINode *&CntPhi,
1439 Instruction *&DefX) {
1440 BasicBlock *LoopEntry;
1441 Value *VarX = nullptr;
1442
1443 DefX = nullptr;
1444 CntInst = nullptr;
1445 CntPhi = nullptr;
1446 LoopEntry = *(CurLoop->block_begin());
1447
1448 // step 1: Check if the loop-back branch is in desirable form.
1449 if (Value *T = matchCondition(
1450 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1451 DefX = dyn_cast<Instruction>(T);
1452 else
1453 return false;
1454
1455 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1456 if (!DefX || !DefX->isShift())
1457 return false;
1458 IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1459 Intrinsic::ctlz;
1460 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1461 if (!Shft || !Shft->isOne())
1462 return false;
1463 VarX = DefX->getOperand(0);
1464
1465 // step 3: Check the recurrence of variable X
1466 PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1467 if (!PhiX)
1468 return false;
1469
1470 InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1471
1472 // Make sure the initial value can't be negative otherwise the ashr in the
1473 // loop might never reach zero which would make the loop infinite.
1474 if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1475 return false;
1476
1477 // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
1478 // TODO: We can skip the step. If loop trip count is known (CTLZ),
1479 // then all uses of "cnt.next" could be optimized to the trip count
1480 // plus "cnt0". Currently it is not optimized.
1481 // This step could be used to detect POPCNT instruction:
1482 // cnt.next = cnt + (x.next & 1)
1483 for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1484 IterE = LoopEntry->end();
1485 Iter != IterE; Iter++) {
1486 Instruction *Inst = &*Iter;
1487 if (Inst->getOpcode() != Instruction::Add)
1488 continue;
1489
1490 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1491 if (!Inc || !Inc->isOne())
1492 continue;
1493
1494 PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1495 if (!Phi)
1496 continue;
1497
1498 CntInst = Inst;
1499 CntPhi = Phi;
1500 break;
1501 }
1502 if (!CntInst)
1503 return false;
1504
1505 return true;
1506}
1507
1508/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1509/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
1510/// trip count returns true; otherwise, returns false.
1511bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1512 // Give up if the loop has multiple blocks or multiple backedges.
1513 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1514 return false;
1515
1516 Intrinsic::ID IntrinID;
1517 Value *InitX;
1518 Instruction *DefX = nullptr;
1519 PHINode *CntPhi = nullptr;
1520 Instruction *CntInst = nullptr;
1521 // Help decide if transformation is profitable. For ShiftUntilZero idiom,
1522 // this is always 6.
1523 size_t IdiomCanonicalSize = 6;
1524
1525 if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1526 CntInst, CntPhi, DefX))
1527 return false;
1528
1529 bool IsCntPhiUsedOutsideLoop = false;
1530 for (User *U : CntPhi->users())
1531 if (!CurLoop->contains(cast<Instruction>(U))) {
1532 IsCntPhiUsedOutsideLoop = true;
1533 break;
1534 }
1535 bool IsCntInstUsedOutsideLoop = false;
1536 for (User *U : CntInst->users())
1537 if (!CurLoop->contains(cast<Instruction>(U))) {
1538 IsCntInstUsedOutsideLoop = true;
1539 break;
1540 }
1541 // If both CntInst and CntPhi are used outside the loop the profitability
1542 // is questionable.
1543 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1544 return false;
1545
1546 // For some CPUs result of CTLZ(X) intrinsic is undefined
1547 // when X is 0. If we can not guarantee X != 0, we need to check this
1548 // when expand.
1549 bool ZeroCheck = false;
1550 // It is safe to assume Preheader exist as it was checked in
1551 // parent function RunOnLoop.
1552 BasicBlock *PH = CurLoop->getLoopPreheader();
1553
1554 // If we are using the count instruction outside the loop, make sure we
1555 // have a zero check as a precondition. Without the check the loop would run
1556 // one iteration for before any check of the input value. This means 0 and 1
1557 // would have identical behavior in the original loop and thus
1558 if (!IsCntPhiUsedOutsideLoop) {
1559 auto *PreCondBB = PH->getSinglePredecessor();
1560 if (!PreCondBB)
1561 return false;
1562 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1563 if (!PreCondBI)
1564 return false;
1565 if (matchCondition(PreCondBI, PH) != InitX)
1566 return false;
1567 ZeroCheck = true;
1568 }
1569
1570 // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1571 // profitable if we delete the loop.
1572
1573 // the loop has only 6 instructions:
1574 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1575 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1576 // %shr = ashr %n.addr.0, 1
1577 // %tobool = icmp eq %shr, 0
1578 // %inc = add nsw %i.0, 1
1579 // br i1 %tobool
1580
1581 const Value *Args[] = {
1582 InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
1583 : ConstantInt::getFalse(InitX->getContext())};
1584
1585 // @llvm.dbg doesn't count as they have no semantic effect.
1586 auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1587 uint32_t HeaderSize =
1588 std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1589
1590 IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1591 int Cost =
1592 TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
1593 if (HeaderSize != IdiomCanonicalSize &&
1594 Cost > TargetTransformInfo::TCC_Basic)
1595 return false;
1596
1597 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1598 DefX->getDebugLoc(), ZeroCheck,
1599 IsCntPhiUsedOutsideLoop);
1600 return true;
1601}
1602
1603/// Recognizes a population count idiom in a non-countable loop.
1604///
1605/// If detected, transforms the relevant code to issue the popcount intrinsic
1606/// function call, and returns true; otherwise, returns false.
1607bool LoopIdiomRecognize::recognizePopcount() {
1608 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1609 return false;
1610
1611 // Counting population are usually conducted by few arithmetic instructions.
1612 // Such instructions can be easily "absorbed" by vacant slots in a
1613 // non-compact loop. Therefore, recognizing popcount idiom only makes sense
1614 // in a compact loop.
1615
1616 // Give up if the loop has multiple blocks or multiple backedges.
1617 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1618 return false;
1619
1620 BasicBlock *LoopBody = *(CurLoop->block_begin());
1621 if (LoopBody->size() >= 20) {
1622 // The loop is too big, bail out.
1623 return false;
1624 }
1625
1626 // It should have a preheader containing nothing but an unconditional branch.
1627 BasicBlock *PH = CurLoop->getLoopPreheader();
1628 if (!PH || &PH->front() != PH->getTerminator())
1629 return false;
1630 auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1631 if (!EntryBI || EntryBI->isConditional())
1632 return false;
1633
1634 // It should have a precondition block where the generated popcount intrinsic
1635 // function can be inserted.
1636 auto *PreCondBB = PH->getSinglePredecessor();
1637 if (!PreCondBB)
1638 return false;
1639 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1640 if (!PreCondBI || PreCondBI->isUnconditional())
1641 return false;
1642
1643 Instruction *CntInst;
1644 PHINode *CntPhi;
1645 Value *Val;
1646 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1647 return false;
1648
1649 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1650 return true;
1651}
1652
1653static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1654 const DebugLoc &DL) {
1655 Value *Ops[] = {Val};
1656 Type *Tys[] = {Val->getType()};
1657
1658 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1659 Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1660 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1661 CI->setDebugLoc(DL);
1662
1663 return CI;
1664}
1665
1666static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1667 const DebugLoc &DL, bool ZeroCheck,
1668 Intrinsic::ID IID) {
1669 Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
1670 Type *Tys[] = {Val->getType()};
1671
1672 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1673 Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
1674 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1675 CI->setDebugLoc(DL);
1676
1677 return CI;
1678}
1679
1680/// Transform the following loop (Using CTLZ, CTTZ is similar):
1681/// loop:
1682/// CntPhi = PHI [Cnt0, CntInst]
1683/// PhiX = PHI [InitX, DefX]
1684/// CntInst = CntPhi + 1
1685/// DefX = PhiX >> 1
1686/// LOOP_BODY
1687/// Br: loop if (DefX != 0)
1688/// Use(CntPhi) or Use(CntInst)
1689///
1690/// Into:
1691/// If CntPhi used outside the loop:
1692/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
1693/// Count = CountPrev + 1
1694/// else
1695/// Count = BitWidth(InitX) - CTLZ(InitX)
1696/// loop:
1697/// CntPhi = PHI [Cnt0, CntInst]
1698/// PhiX = PHI [InitX, DefX]
1699/// PhiCount = PHI [Count, Dec]
1700/// CntInst = CntPhi + 1
1701/// DefX = PhiX >> 1
1702/// Dec = PhiCount - 1
1703/// LOOP_BODY
1704/// Br: loop if (Dec != 0)
1705/// Use(CountPrev + Cnt0) // Use(CntPhi)
1706/// or
1707/// Use(Count + Cnt0) // Use(CntInst)
1708///
1709/// If LOOP_BODY is empty the loop will be deleted.
1710/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
1711void LoopIdiomRecognize::transformLoopToCountable(
1712 Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
1713 PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
1714 bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
1715 BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
1716
1717 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
1718 IRBuilder<> Builder(PreheaderBr);
1719 Builder.SetCurrentDebugLocation(DL);
1720 Value *FFS, *Count, *CountPrev, *NewCount, *InitXNext;
1721
1722 // Count = BitWidth - CTLZ(InitX);
1723 // If there are uses of CntPhi create:
1724 // CountPrev = BitWidth - CTLZ(InitX >> 1);
1725 if (IsCntPhiUsedOutsideLoop) {
1726 if (DefX->getOpcode() == Instruction::AShr)
1727 InitXNext =
1728 Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1));
1729 else if (DefX->getOpcode() == Instruction::LShr)
1730 InitXNext =
1731 Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1));
1732 else if (DefX->getOpcode() == Instruction::Shl) // cttz
1733 InitXNext =
1734 Builder.CreateShl(InitX, ConstantInt::get(InitX->getType(), 1));
1735 else
1736 llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "/build/llvm-toolchain-snapshot-12~++20200926111128+c6c5629f2fb/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 1736)
;
1737 } else
1738 InitXNext = InitX;
1739 FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
1740 Count = Builder.CreateSub(
1741 ConstantInt::get(FFS->getType(),
1742 FFS->getType()->getIntegerBitWidth()),
1743 FFS);
1744 if (IsCntPhiUsedOutsideLoop) {
1745 CountPrev = Count;
1746 Count = Builder.CreateAdd(
1747 CountPrev,
1748 ConstantInt::get(CountPrev->getType(), 1));
1749 }
1750
1751 NewCount = Builder.CreateZExtOrTrunc(
1752 IsCntPhiUsedOutsideLoop ? CountPrev : Count,
1753 cast<IntegerType>(CntInst->getType()));
1754
1755 // If the counter's initial value is not zero, insert Add Inst.
1756 Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
1757 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1758 if (!InitConst || !InitConst->isZero())
1759 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1760
1761 // Step 2: Insert new IV and loop condition:
1762 // loop:
1763 // ...
1764 // PhiCount = PHI [Count, Dec]
1765 // ...
1766 // Dec = PhiCount - 1
1767 // ...
1768 // Br: loop if (Dec != 0)
1769 BasicBlock *Body = *(CurLoop->block_begin());
1770 auto *LbBr = cast<BranchInst>(Body->getTerminator());
1771 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1772 Type *Ty = Count->getType();
1773
1774 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1775
1776 Builder.SetInsertPoint(LbCond);
1777 Instruction *TcDec = cast<Instruction>(
1778 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1779 "tcdec", false, true));
1780
1781 TcPhi->addIncoming(Count, Preheader);
1782 TcPhi->addIncoming(TcDec, Body);
1783
1784 CmpInst::Predicate Pred =
1785 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
1786 LbCond->setPredicate(Pred);
1787 LbCond->setOperand(0, TcDec);
1788 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1789
1790 // Step 3: All the references to the original counter outside
1791 // the loop are replaced with the NewCount
1792 if (IsCntPhiUsedOutsideLoop)
1793 CntPhi->replaceUsesOutsideBlock(NewCount, Body);
1794 else
1795 CntInst->replaceUsesOutsideBlock(NewCount, Body);
1796
1797 // step 4: Forget the "non-computable" trip-count SCEV associated with the
1798 // loop. The loop would otherwise not be deleted even if it becomes empty.
1799 SE->forgetLoop(CurLoop);
1800}
1801
1802void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
1803 Instruction *CntInst,
1804 PHINode *CntPhi, Value *Var) {
1805 BasicBlock *PreHead = CurLoop->getLoopPreheader();
1806 auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
1807 const DebugLoc &DL = CntInst->getDebugLoc();
1808
1809 // Assuming before transformation, the loop is following:
1810 // if (x) // the precondition
1811 // do { cnt++; x &= x - 1; } while(x);
1812
1813 // Step 1: Insert the ctpop instruction at the end of the precondition block
1814 IRBuilder<> Builder(PreCondBr);
1815 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
1816 {
1817 PopCnt = createPopcntIntrinsic(Builder, Var, DL);
1818 NewCount = PopCntZext =
1819 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
1820
1821 if (NewCount != PopCnt)
1822 (cast<Instruction>(NewCount))->setDebugLoc(DL);
1823
1824 // TripCnt is exactly the number of iterations the loop has
1825 TripCnt = NewCount;
1826
1827 // If the population counter's initial value is not zero, insert Add Inst.
1828 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
1829 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1830 if (!InitConst || !InitConst->isZero()) {
1831 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1832 (cast<Instruction>(NewCount))->setDebugLoc(DL);
1833 }
1834 }
1835
1836 // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
1837 // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
1838 // function would be partial dead code, and downstream passes will drag
1839 // it back from the precondition block to the preheader.
1840 {
1841 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
1842
1843 Value *Opnd0 = PopCntZext;
1844 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
1845 if (PreCond->getOperand(0) != Var)
1846 std::swap(Opnd0, Opnd1);
1847
1848 ICmpInst *NewPreCond = cast<ICmpInst>(
1849 Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
1850 PreCondBr->setCondition(NewPreCond);
1851
1852 RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
1853 }
1854
1855 // Step 3: Note that the population count is exactly the trip count of the
1856 // loop in question, which enable us to convert the loop from noncountable
1857 // loop into a countable one. The benefit is twofold:
1858 //
1859 // - If the loop only counts population, the entire loop becomes dead after
1860 // the transformation. It is a lot easier to prove a countable loop dead
1861 // than to prove a noncountable one. (In some C dialects, an infinite loop
1862 // isn't dead even if it computes nothing useful. In general, DCE needs
1863 // to prove a noncountable loop finite before safely delete it.)
1864 //
1865 // - If the loop also performs something else, it remains alive.
1866 // Since it is transformed to countable form, it can be aggressively
1867 // optimized by some optimizations which are in general not applicable
1868 // to a noncountable loop.
1869 //
1870 // After this step, this loop (conceptually) would look like following:
1871 // newcnt = __builtin_ctpop(x);
1872 // t = newcnt;
1873 // if (x)
1874 // do { cnt++; x &= x-1; t--) } while (t > 0);
1875 BasicBlock *Body = *(CurLoop->block_begin());
1876 {
1877 auto *LbBr = cast<BranchInst>(Body->getTerminator());
1878 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1879 Type *Ty = TripCnt->getType();
1880
1881 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1882
1883 Builder.SetInsertPoint(LbCond);
1884 Instruction *TcDec = cast<Instruction>(
1885 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1886 "tcdec", false, true));
1887
1888 TcPhi->addIncoming(TripCnt, PreHead);
1889 TcPhi->addIncoming(TcDec, Body);
1890
1891 CmpInst::Predicate Pred =
1892 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
1893 LbCond->setPredicate(Pred);
1894 LbCond->setOperand(0, TcDec);
1895 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1896 }
1897
1898 // Step 4: All the references to the original population counter outside
1899 // the loop are replaced with the NewCount -- the value returned from
1900 // __builtin_ctpop().
1901 CntInst->replaceUsesOutsideBlock(NewCount, Body);
1902
1903 // step 5: Forget the "non-computable" trip-count SCEV associated with the
1904 // loop. The loop would otherwise not be deleted even if it becomes empty.
1905 SE->forgetLoop(CurLoop);
1906}