Bug Summary

File:llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
Warning:line 510, column 15
Although the value stored to 'PatternValue' is used in the enclosing expression, the value is never actually read from 'PatternValue'

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopIdiomRecognize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-12/lib/clang/12.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3/build-llvm/lib/Transforms/Scalar -I /build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3/llvm/lib/Transforms/Scalar -I /build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3/build-llvm/include -I /build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-12/lib/clang/12.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3/build-llvm/lib/Transforms/Scalar -fdebug-prefix-map=/build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3=. -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2020-09-11-161148-48261-1 -x c++ /build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
1//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements an idiom recognizer that transforms simple loops into a
10// non-loop form. In cases that this kicks in, it can be a significant
11// performance win.
12//
13// If compiling for code size we avoid idiom recognition if the resulting
14// code could be larger than the code for the original loop. One way this could
15// happen is if the loop is not removable after idiom recognition due to the
16// presence of non-idiom instructions. The initial implementation of the
17// heuristics applies to idioms in multi-block loops.
18//
19//===----------------------------------------------------------------------===//
20//
21// TODO List:
22//
23// Future loop memory idioms to recognize:
24// memcmp, memmove, strlen, etc.
25// Future floating point idioms to recognize in -ffast-math mode:
26// fpowi
27// Future integer operation idioms to recognize:
28// ctpop
29//
30// Beware that isel's default lowering for ctpop is highly inefficient for
31// i64 and larger types when i64 is legal and the value has few bits set. It
32// would be good to enhance isel to emit a loop for ctpop in this case.
33//
34// This could recognize common matrix multiplies and dot product idioms and
35// replace them with calls to BLAS (if linked in??).
36//
37//===----------------------------------------------------------------------===//
38
39#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
40#include "llvm/ADT/APInt.h"
41#include "llvm/ADT/ArrayRef.h"
42#include "llvm/ADT/DenseMap.h"
43#include "llvm/ADT/MapVector.h"
44#include "llvm/ADT/SetVector.h"
45#include "llvm/ADT/SmallPtrSet.h"
46#include "llvm/ADT/SmallVector.h"
47#include "llvm/ADT/Statistic.h"
48#include "llvm/ADT/StringRef.h"
49#include "llvm/Analysis/AliasAnalysis.h"
50#include "llvm/Analysis/LoopAccessAnalysis.h"
51#include "llvm/Analysis/LoopInfo.h"
52#include "llvm/Analysis/LoopPass.h"
53#include "llvm/Analysis/MemoryLocation.h"
54#include "llvm/Analysis/MemorySSA.h"
55#include "llvm/Analysis/MemorySSAUpdater.h"
56#include "llvm/Analysis/MustExecute.h"
57#include "llvm/Analysis/OptimizationRemarkEmitter.h"
58#include "llvm/Analysis/ScalarEvolution.h"
59#include "llvm/Analysis/ScalarEvolutionExpressions.h"
60#include "llvm/Analysis/TargetLibraryInfo.h"
61#include "llvm/Analysis/TargetTransformInfo.h"
62#include "llvm/Analysis/ValueTracking.h"
63#include "llvm/IR/Attributes.h"
64#include "llvm/IR/BasicBlock.h"
65#include "llvm/IR/Constant.h"
66#include "llvm/IR/Constants.h"
67#include "llvm/IR/DataLayout.h"
68#include "llvm/IR/DebugLoc.h"
69#include "llvm/IR/DerivedTypes.h"
70#include "llvm/IR/Dominators.h"
71#include "llvm/IR/GlobalValue.h"
72#include "llvm/IR/GlobalVariable.h"
73#include "llvm/IR/IRBuilder.h"
74#include "llvm/IR/InstrTypes.h"
75#include "llvm/IR/Instruction.h"
76#include "llvm/IR/Instructions.h"
77#include "llvm/IR/IntrinsicInst.h"
78#include "llvm/IR/Intrinsics.h"
79#include "llvm/IR/LLVMContext.h"
80#include "llvm/IR/Module.h"
81#include "llvm/IR/PassManager.h"
82#include "llvm/IR/Type.h"
83#include "llvm/IR/User.h"
84#include "llvm/IR/Value.h"
85#include "llvm/IR/ValueHandle.h"
86#include "llvm/InitializePasses.h"
87#include "llvm/Pass.h"
88#include "llvm/Support/Casting.h"
89#include "llvm/Support/CommandLine.h"
90#include "llvm/Support/Debug.h"
91#include "llvm/Support/raw_ostream.h"
92#include "llvm/Transforms/Scalar.h"
93#include "llvm/Transforms/Utils/BuildLibCalls.h"
94#include "llvm/Transforms/Utils/Local.h"
95#include "llvm/Transforms/Utils/LoopUtils.h"
96#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
97#include <algorithm>
98#include <cassert>
99#include <cstdint>
100#include <utility>
101#include <vector>
102
103using namespace llvm;
104
105#define DEBUG_TYPE"loop-idiom" "loop-idiom"
106
107STATISTIC(NumMemSet, "Number of memset's formed from loop stores")static llvm::Statistic NumMemSet = {"loop-idiom", "NumMemSet"
, "Number of memset's formed from loop stores"}
;
108STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores")static llvm::Statistic NumMemCpy = {"loop-idiom", "NumMemCpy"
, "Number of memcpy's formed from loop load+stores"}
;
109
110bool DisableLIRP::All;
111static cl::opt<bool, true>
112 DisableLIRPAll("disable-" DEBUG_TYPE"loop-idiom" "-all",
113 cl::desc("Options to disable Loop Idiom Recognize Pass."),
114 cl::location(DisableLIRP::All), cl::init(false),
115 cl::ReallyHidden);
116
117bool DisableLIRP::Memset;
118static cl::opt<bool, true>
119 DisableLIRPMemset("disable-" DEBUG_TYPE"loop-idiom" "-memset",
120 cl::desc("Proceed with loop idiom recognize pass, but do "
121 "not convert loop(s) to memset."),
122 cl::location(DisableLIRP::Memset), cl::init(false),
123 cl::ReallyHidden);
124
125bool DisableLIRP::Memcpy;
126static cl::opt<bool, true>
127 DisableLIRPMemcpy("disable-" DEBUG_TYPE"loop-idiom" "-memcpy",
128 cl::desc("Proceed with loop idiom recognize pass, but do "
129 "not convert loop(s) to memcpy."),
130 cl::location(DisableLIRP::Memcpy), cl::init(false),
131 cl::ReallyHidden);
132
133static cl::opt<bool> UseLIRCodeSizeHeurs(
134 "use-lir-code-size-heurs",
135 cl::desc("Use loop idiom recognition code size heuristics when compiling"
136 "with -Os/-Oz"),
137 cl::init(true), cl::Hidden);
138
139namespace {
140
141class LoopIdiomRecognize {
142 Loop *CurLoop = nullptr;
143 AliasAnalysis *AA;
144 DominatorTree *DT;
145 LoopInfo *LI;
146 ScalarEvolution *SE;
147 TargetLibraryInfo *TLI;
148 const TargetTransformInfo *TTI;
149 const DataLayout *DL;
150 OptimizationRemarkEmitter &ORE;
151 bool ApplyCodeSizeHeuristics;
152 std::unique_ptr<MemorySSAUpdater> MSSAU;
153
154public:
155 explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
156 LoopInfo *LI, ScalarEvolution *SE,
157 TargetLibraryInfo *TLI,
158 const TargetTransformInfo *TTI, MemorySSA *MSSA,
159 const DataLayout *DL,
160 OptimizationRemarkEmitter &ORE)
161 : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
162 if (MSSA)
163 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
164 }
165
166 bool runOnLoop(Loop *L);
167
168private:
169 using StoreList = SmallVector<StoreInst *, 8>;
170 using StoreListMap = MapVector<Value *, StoreList>;
171
172 StoreListMap StoreRefsForMemset;
173 StoreListMap StoreRefsForMemsetPattern;
174 StoreList StoreRefsForMemcpy;
175 bool HasMemset;
176 bool HasMemsetPattern;
177 bool HasMemcpy;
178
179 /// Return code for isLegalStore()
180 enum LegalStoreKind {
181 None = 0,
182 Memset,
183 MemsetPattern,
184 Memcpy,
185 UnorderedAtomicMemcpy,
186 DontUse // Dummy retval never to be used. Allows catching errors in retval
187 // handling.
188 };
189
190 /// \name Countable Loop Idiom Handling
191 /// @{
192
193 bool runOnCountableLoop();
194 bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
195 SmallVectorImpl<BasicBlock *> &ExitBlocks);
196
197 void collectStores(BasicBlock *BB);
198 LegalStoreKind isLegalStore(StoreInst *SI);
199 enum class ForMemset { No, Yes };
200 bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
201 ForMemset For);
202 bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
203
204 bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
205 MaybeAlign StoreAlignment, Value *StoredVal,
206 Instruction *TheStore,
207 SmallPtrSetImpl<Instruction *> &Stores,
208 const SCEVAddRecExpr *Ev, const SCEV *BECount,
209 bool NegStride, bool IsLoopMemset = false);
210 bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
211 bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
212 bool IsLoopMemset = false);
213
214 /// @}
215 /// \name Noncountable Loop Idiom Handling
216 /// @{
217
218 bool runOnNoncountableLoop();
219
220 bool recognizePopcount();
221 void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
222 PHINode *CntPhi, Value *Var);
223 bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
224 void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
225 Instruction *CntInst, PHINode *CntPhi,
226 Value *Var, Instruction *DefX,
227 const DebugLoc &DL, bool ZeroCheck,
228 bool IsCntPhiUsedOutsideLoop);
229
230 /// @}
231};
232
233class LoopIdiomRecognizeLegacyPass : public LoopPass {
234public:
235 static char ID;
236
237 explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
238 initializeLoopIdiomRecognizeLegacyPassPass(
239 *PassRegistry::getPassRegistry());
240 }
241
242 bool runOnLoop(Loop *L, LPPassManager &LPM) override {
243 if (DisableLIRP::All)
244 return false;
245
246 if (skipLoop(L))
247 return false;
248
249 AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
250 DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
251 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
252 ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
253 TargetLibraryInfo *TLI =
254 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
255 *L->getHeader()->getParent());
256 const TargetTransformInfo *TTI =
257 &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
258 *L->getHeader()->getParent());
259 const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
260 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
261 MemorySSA *MSSA = nullptr;
262 if (MSSAAnalysis)
263 MSSA = &MSSAAnalysis->getMSSA();
264
265 // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
266 // pass. Function analyses need to be preserved across loop transformations
267 // but ORE cannot be preserved (see comment before the pass definition).
268 OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
269
270 LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
271 return LIR.runOnLoop(L);
272 }
273
274 /// This transformation requires natural loop information & requires that
275 /// loop preheaders be inserted into the CFG.
276 void getAnalysisUsage(AnalysisUsage &AU) const override {
277 AU.addRequired<TargetLibraryInfoWrapperPass>();
278 AU.addRequired<TargetTransformInfoWrapperPass>();
279 AU.addPreserved<MemorySSAWrapperPass>();
280 getLoopAnalysisUsage(AU);
281 }
282};
283
284} // end anonymous namespace
285
286char LoopIdiomRecognizeLegacyPass::ID = 0;
287
288PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
289 LoopStandardAnalysisResults &AR,
290 LPMUpdater &) {
291 if (DisableLIRP::All)
292 return PreservedAnalyses::all();
293
294 const auto *DL = &L.getHeader()->getModule()->getDataLayout();
295
296 // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
297 // pass. Function analyses need to be preserved across loop transformations
298 // but ORE cannot be preserved (see comment before the pass definition).
299 OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
300
301 LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
302 AR.MSSA, DL, ORE);
303 if (!LIR.runOnLoop(&L))
304 return PreservedAnalyses::all();
305
306 auto PA = getLoopPassPreservedAnalyses();
307 if (AR.MSSA)
308 PA.preserve<MemorySSAAnalysis>();
309 return PA;
310}
311
312INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",static void *initializeLoopIdiomRecognizeLegacyPassPassOnce(PassRegistry
&Registry) {
313 "Recognize loop idioms", false, false)static void *initializeLoopIdiomRecognizeLegacyPassPassOnce(PassRegistry
&Registry) {
314INITIALIZE_PASS_DEPENDENCY(LoopPass)initializeLoopPassPass(Registry);
315INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)initializeTargetLibraryInfoWrapperPassPass(Registry);
316INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)initializeTargetTransformInfoWrapperPassPass(Registry);
317INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",PassInfo *PI = new PassInfo( "Recognize loop idioms", "loop-idiom"
, &LoopIdiomRecognizeLegacyPass::ID, PassInfo::NormalCtor_t
(callDefaultCtor<LoopIdiomRecognizeLegacyPass>), false,
false); Registry.registerPass(*PI, true); return PI; } static
llvm::once_flag InitializeLoopIdiomRecognizeLegacyPassPassFlag
; void llvm::initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry
&Registry) { llvm::call_once(InitializeLoopIdiomRecognizeLegacyPassPassFlag
, initializeLoopIdiomRecognizeLegacyPassPassOnce, std::ref(Registry
)); }
318 "Recognize loop idioms", false, false)PassInfo *PI = new PassInfo( "Recognize loop idioms", "loop-idiom"
, &LoopIdiomRecognizeLegacyPass::ID, PassInfo::NormalCtor_t
(callDefaultCtor<LoopIdiomRecognizeLegacyPass>), false,
false); Registry.registerPass(*PI, true); return PI; } static
llvm::once_flag InitializeLoopIdiomRecognizeLegacyPassPassFlag
; void llvm::initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry
&Registry) { llvm::call_once(InitializeLoopIdiomRecognizeLegacyPassPassFlag
, initializeLoopIdiomRecognizeLegacyPassPassOnce, std::ref(Registry
)); }
319
320Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
321
322static void deleteDeadInstruction(Instruction *I) {
323 I->replaceAllUsesWith(UndefValue::get(I->getType()));
324 I->eraseFromParent();
325}
326
327//===----------------------------------------------------------------------===//
328//
329// Implementation of LoopIdiomRecognize
330//
331//===----------------------------------------------------------------------===//
332
333bool LoopIdiomRecognize::runOnLoop(Loop *L) {
334 CurLoop = L;
335 // If the loop could not be converted to canonical form, it must have an
336 // indirectbr in it, just give up.
337 if (!L->getLoopPreheader())
338 return false;
339
340 // Disable loop idiom recognition if the function's name is a common idiom.
341 StringRef Name = L->getHeader()->getParent()->getName();
342 if (Name == "memset" || Name == "memcpy")
343 return false;
344
345 // Determine if code size heuristics need to be applied.
346 ApplyCodeSizeHeuristics =
347 L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
348
349 HasMemset = TLI->has(LibFunc_memset);
350 HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
351 HasMemcpy = TLI->has(LibFunc_memcpy);
352
353 if (HasMemset || HasMemsetPattern || HasMemcpy)
354 if (SE->hasLoopInvariantBackedgeTakenCount(L))
355 return runOnCountableLoop();
356
357 return runOnNoncountableLoop();
358}
359
360bool LoopIdiomRecognize::runOnCountableLoop() {
361 const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
362 assert(!isa<SCEVCouldNotCompute>(BECount) &&((!isa<SCEVCouldNotCompute>(BECount) && "runOnCountableLoop() called on a loop without a predictable"
"backedge-taken count") ? static_cast<void> (0) : __assert_fail
("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\""
, "/build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 364, __PRETTY_FUNCTION__))
363 "runOnCountableLoop() called on a loop without a predictable"((!isa<SCEVCouldNotCompute>(BECount) && "runOnCountableLoop() called on a loop without a predictable"
"backedge-taken count") ? static_cast<void> (0) : __assert_fail
("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\""
, "/build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 364, __PRETTY_FUNCTION__))
364 "backedge-taken count")((!isa<SCEVCouldNotCompute>(BECount) && "runOnCountableLoop() called on a loop without a predictable"
"backedge-taken count") ? static_cast<void> (0) : __assert_fail
("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\""
, "/build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 364, __PRETTY_FUNCTION__))
;
365
366 // If this loop executes exactly one time, then it should be peeled, not
367 // optimized by this pass.
368 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
369 if (BECst->getAPInt() == 0)
370 return false;
371
372 SmallVector<BasicBlock *, 8> ExitBlocks;
373 CurLoop->getUniqueExitBlocks(ExitBlocks);
374
375 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
376 << CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
377 << "] Countable Loop %" << CurLoop->getHeader()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
378 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
;
379
380 // The following transforms hoist stores/memsets into the loop pre-header.
381 // Give up if the loop has instructions that may throw.
382 SimpleLoopSafetyInfo SafetyInfo;
383 SafetyInfo.computeLoopSafetyInfo(CurLoop);
384 if (SafetyInfo.anyBlockMayThrow())
385 return false;
386
387 bool MadeChange = false;
388
389 // Scan all the blocks in the loop that are not in subloops.
390 for (auto *BB : CurLoop->getBlocks()) {
391 // Ignore blocks in subloops.
392 if (LI->getLoopFor(BB) != CurLoop)
393 continue;
394
395 MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
396 }
397 return MadeChange;
398}
399
400static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
401 const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
402 return ConstStride->getAPInt();
403}
404
405/// getMemSetPatternValue - If a strided store of the specified value is safe to
406/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
407/// be passed in. Otherwise, return null.
408///
409/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
410/// just replicate their input array and then pass on to memset_pattern16.
411static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
412 // FIXME: This could check for UndefValue because it can be merged into any
413 // other valid pattern.
414
415 // If the value isn't a constant, we can't promote it to being in a constant
416 // array. We could theoretically do a store to an alloca or something, but
417 // that doesn't seem worthwhile.
418 Constant *C = dyn_cast<Constant>(V);
419 if (!C)
420 return nullptr;
421
422 // Only handle simple values that are a power of two bytes in size.
423 uint64_t Size = DL->getTypeSizeInBits(V->getType());
424 if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
425 return nullptr;
426
427 // Don't care enough about darwin/ppc to implement this.
428 if (DL->isBigEndian())
429 return nullptr;
430
431 // Convert to size in bytes.
432 Size /= 8;
433
434 // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
435 // if the top and bottom are the same (e.g. for vectors and large integers).
436 if (Size > 16)
437 return nullptr;
438
439 // If the constant is exactly 16 bytes, just use it.
440 if (Size == 16)
441 return C;
442
443 // Otherwise, we'll use an array of the constants.
444 unsigned ArraySize = 16 / Size;
445 ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
446 return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
447}
448
449LoopIdiomRecognize::LegalStoreKind
450LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
451 // Don't touch volatile stores.
452 if (SI->isVolatile())
453 return LegalStoreKind::None;
454 // We only want simple or unordered-atomic stores.
455 if (!SI->isUnordered())
456 return LegalStoreKind::None;
457
458 // Don't convert stores of non-integral pointer types to memsets (which stores
459 // integers).
460 if (DL->isNonIntegralPointerType(SI->getValueOperand()->getType()))
461 return LegalStoreKind::None;
462
463 // Avoid merging nontemporal stores.
464 if (SI->getMetadata(LLVMContext::MD_nontemporal))
465 return LegalStoreKind::None;
466
467 Value *StoredVal = SI->getValueOperand();
468 Value *StorePtr = SI->getPointerOperand();
469
470 // Reject stores that are so large that they overflow an unsigned.
471 uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
472 if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
473 return LegalStoreKind::None;
474
475 // See if the pointer expression is an AddRec like {base,+,1} on the current
476 // loop, which indicates a strided store. If we have something else, it's a
477 // random store we can't handle.
478 const SCEVAddRecExpr *StoreEv =
479 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
480 if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
481 return LegalStoreKind::None;
482
483 // Check to see if we have a constant stride.
484 if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
485 return LegalStoreKind::None;
486
487 // See if the store can be turned into a memset.
488
489 // If the stored value is a byte-wise value (like i32 -1), then it may be
490 // turned into a memset of i8 -1, assuming that all the consecutive bytes
491 // are stored. A store of i32 0x01020304 can never be turned into a memset,
492 // but it can be turned into memset_pattern if the target supports it.
493 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
494 Constant *PatternValue = nullptr;
495
496 // Note: memset and memset_pattern on unordered-atomic is yet not supported
497 bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
498
499 // If we're allowed to form a memset, and the stored value would be
500 // acceptable for memset, use it.
501 if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
502 // Verify that the stored value is loop invariant. If not, we can't
503 // promote the memset.
504 CurLoop->isLoopInvariant(SplatValue)) {
505 // It looks like we can use SplatValue.
506 return LegalStoreKind::Memset;
507 } else if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
508 // Don't create memset_pattern16s with address spaces.
509 StorePtr->getType()->getPointerAddressSpace() == 0 &&
510 (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
Although the value stored to 'PatternValue' is used in the enclosing expression, the value is never actually read from 'PatternValue'
511 // It looks like we can use PatternValue!
512 return LegalStoreKind::MemsetPattern;
513 }
514
515 // Otherwise, see if the store can be turned into a memcpy.
516 if (HasMemcpy && !DisableLIRP::Memcpy) {
517 // Check to see if the stride matches the size of the store. If so, then we
518 // know that every byte is touched in the loop.
519 APInt Stride = getStoreStride(StoreEv);
520 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
521 if (StoreSize != Stride && StoreSize != -Stride)
522 return LegalStoreKind::None;
523
524 // The store must be feeding a non-volatile load.
525 LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
526
527 // Only allow non-volatile loads
528 if (!LI || LI->isVolatile())
529 return LegalStoreKind::None;
530 // Only allow simple or unordered-atomic loads
531 if (!LI->isUnordered())
532 return LegalStoreKind::None;
533
534 // See if the pointer expression is an AddRec like {base,+,1} on the current
535 // loop, which indicates a strided load. If we have something else, it's a
536 // random load we can't handle.
537 const SCEVAddRecExpr *LoadEv =
538 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
539 if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
540 return LegalStoreKind::None;
541
542 // The store and load must share the same stride.
543 if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
544 return LegalStoreKind::None;
545
546 // Success. This store can be converted into a memcpy.
547 UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
548 return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
549 : LegalStoreKind::Memcpy;
550 }
551 // This store can't be transformed into a memset/memcpy.
552 return LegalStoreKind::None;
553}
554
555void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
556 StoreRefsForMemset.clear();
557 StoreRefsForMemsetPattern.clear();
558 StoreRefsForMemcpy.clear();
559 for (Instruction &I : *BB) {
560 StoreInst *SI = dyn_cast<StoreInst>(&I);
561 if (!SI)
562 continue;
563
564 // Make sure this is a strided store with a constant stride.
565 switch (isLegalStore(SI)) {
566 case LegalStoreKind::None:
567 // Nothing to do
568 break;
569 case LegalStoreKind::Memset: {
570 // Find the base pointer.
571 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
572 StoreRefsForMemset[Ptr].push_back(SI);
573 } break;
574 case LegalStoreKind::MemsetPattern: {
575 // Find the base pointer.
576 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
577 StoreRefsForMemsetPattern[Ptr].push_back(SI);
578 } break;
579 case LegalStoreKind::Memcpy:
580 case LegalStoreKind::UnorderedAtomicMemcpy:
581 StoreRefsForMemcpy.push_back(SI);
582 break;
583 default:
584 assert(false && "unhandled return value")((false && "unhandled return value") ? static_cast<
void> (0) : __assert_fail ("false && \"unhandled return value\""
, "/build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 584, __PRETTY_FUNCTION__))
;
585 break;
586 }
587 }
588}
589
590/// runOnLoopBlock - Process the specified block, which lives in a counted loop
591/// with the specified backedge count. This block is known to be in the current
592/// loop and not in any subloops.
593bool LoopIdiomRecognize::runOnLoopBlock(
594 BasicBlock *BB, const SCEV *BECount,
595 SmallVectorImpl<BasicBlock *> &ExitBlocks) {
596 // We can only promote stores in this block if they are unconditionally
597 // executed in the loop. For a block to be unconditionally executed, it has
598 // to dominate all the exit blocks of the loop. Verify this now.
599 for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
600 if (!DT->dominates(BB, ExitBlocks[i]))
601 return false;
602
603 bool MadeChange = false;
604 // Look for store instructions, which may be optimized to memset/memcpy.
605 collectStores(BB);
606
607 // Look for a single store or sets of stores with a common base, which can be
608 // optimized into a memset (memset_pattern). The latter most commonly happens
609 // with structs and handunrolled loops.
610 for (auto &SL : StoreRefsForMemset)
611 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
612
613 for (auto &SL : StoreRefsForMemsetPattern)
614 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
615
616 // Optimize the store into a memcpy, if it feeds an similarly strided load.
617 for (auto &SI : StoreRefsForMemcpy)
618 MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
619
620 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
621 Instruction *Inst = &*I++;
622 // Look for memset instructions, which may be optimized to a larger memset.
623 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
624 WeakTrackingVH InstPtr(&*I);
625 if (!processLoopMemSet(MSI, BECount))
626 continue;
627 MadeChange = true;
628
629 // If processing the memset invalidated our iterator, start over from the
630 // top of the block.
631 if (!InstPtr)
632 I = BB->begin();
633 continue;
634 }
635 }
636
637 return MadeChange;
638}
639
640/// See if this store(s) can be promoted to a memset.
641bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
642 const SCEV *BECount, ForMemset For) {
643 // Try to find consecutive stores that can be transformed into memsets.
644 SetVector<StoreInst *> Heads, Tails;
645 SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
646
647 // Do a quadratic search on all of the given stores and find
648 // all of the pairs of stores that follow each other.
649 SmallVector<unsigned, 16> IndexQueue;
650 for (unsigned i = 0, e = SL.size(); i < e; ++i) {
651 assert(SL[i]->isSimple() && "Expected only non-volatile stores.")((SL[i]->isSimple() && "Expected only non-volatile stores."
) ? static_cast<void> (0) : __assert_fail ("SL[i]->isSimple() && \"Expected only non-volatile stores.\""
, "/build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 651, __PRETTY_FUNCTION__))
;
652
653 Value *FirstStoredVal = SL[i]->getValueOperand();
654 Value *FirstStorePtr = SL[i]->getPointerOperand();
655 const SCEVAddRecExpr *FirstStoreEv =
656 cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
657 APInt FirstStride = getStoreStride(FirstStoreEv);
658 unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
659
660 // See if we can optimize just this store in isolation.
661 if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
662 Heads.insert(SL[i]);
663 continue;
664 }
665
666 Value *FirstSplatValue = nullptr;
667 Constant *FirstPatternValue = nullptr;
668
669 if (For == ForMemset::Yes)
670 FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
671 else
672 FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
673
674 assert((FirstSplatValue || FirstPatternValue) &&(((FirstSplatValue || FirstPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(FirstSplatValue || FirstPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 675, __PRETTY_FUNCTION__))
675 "Expected either splat value or pattern value.")(((FirstSplatValue || FirstPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(FirstSplatValue || FirstPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 675, __PRETTY_FUNCTION__))
;
676
677 IndexQueue.clear();
678 // If a store has multiple consecutive store candidates, search Stores
679 // array according to the sequence: from i+1 to e, then from i-1 to 0.
680 // This is because usually pairing with immediate succeeding or preceding
681 // candidate create the best chance to find memset opportunity.
682 unsigned j = 0;
683 for (j = i + 1; j < e; ++j)
684 IndexQueue.push_back(j);
685 for (j = i; j > 0; --j)
686 IndexQueue.push_back(j - 1);
687
688 for (auto &k : IndexQueue) {
689 assert(SL[k]->isSimple() && "Expected only non-volatile stores.")((SL[k]->isSimple() && "Expected only non-volatile stores."
) ? static_cast<void> (0) : __assert_fail ("SL[k]->isSimple() && \"Expected only non-volatile stores.\""
, "/build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 689, __PRETTY_FUNCTION__))
;
690 Value *SecondStorePtr = SL[k]->getPointerOperand();
691 const SCEVAddRecExpr *SecondStoreEv =
692 cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
693 APInt SecondStride = getStoreStride(SecondStoreEv);
694
695 if (FirstStride != SecondStride)
696 continue;
697
698 Value *SecondStoredVal = SL[k]->getValueOperand();
699 Value *SecondSplatValue = nullptr;
700 Constant *SecondPatternValue = nullptr;
701
702 if (For == ForMemset::Yes)
703 SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
704 else
705 SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
706
707 assert((SecondSplatValue || SecondPatternValue) &&(((SecondSplatValue || SecondPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SecondSplatValue || SecondPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 708, __PRETTY_FUNCTION__))
708 "Expected either splat value or pattern value.")(((SecondSplatValue || SecondPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SecondSplatValue || SecondPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 708, __PRETTY_FUNCTION__))
;
709
710 if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
711 if (For == ForMemset::Yes) {
712 if (isa<UndefValue>(FirstSplatValue))
713 FirstSplatValue = SecondSplatValue;
714 if (FirstSplatValue != SecondSplatValue)
715 continue;
716 } else {
717 if (isa<UndefValue>(FirstPatternValue))
718 FirstPatternValue = SecondPatternValue;
719 if (FirstPatternValue != SecondPatternValue)
720 continue;
721 }
722 Tails.insert(SL[k]);
723 Heads.insert(SL[i]);
724 ConsecutiveChain[SL[i]] = SL[k];
725 break;
726 }
727 }
728 }
729
730 // We may run into multiple chains that merge into a single chain. We mark the
731 // stores that we transformed so that we don't visit the same store twice.
732 SmallPtrSet<Value *, 16> TransformedStores;
733 bool Changed = false;
734
735 // For stores that start but don't end a link in the chain:
736 for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
737 it != e; ++it) {
738 if (Tails.count(*it))
739 continue;
740
741 // We found a store instr that starts a chain. Now follow the chain and try
742 // to transform it.
743 SmallPtrSet<Instruction *, 8> AdjacentStores;
744 StoreInst *I = *it;
745
746 StoreInst *HeadStore = I;
747 unsigned StoreSize = 0;
748
749 // Collect the chain into a list.
750 while (Tails.count(I) || Heads.count(I)) {
751 if (TransformedStores.count(I))
752 break;
753 AdjacentStores.insert(I);
754
755 StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
756 // Move to the next value in the chain.
757 I = ConsecutiveChain[I];
758 }
759
760 Value *StoredVal = HeadStore->getValueOperand();
761 Value *StorePtr = HeadStore->getPointerOperand();
762 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
763 APInt Stride = getStoreStride(StoreEv);
764
765 // Check to see if the stride matches the size of the stores. If so, then
766 // we know that every byte is touched in the loop.
767 if (StoreSize != Stride && StoreSize != -Stride)
768 continue;
769
770 bool NegStride = StoreSize == -Stride;
771
772 if (processLoopStridedStore(StorePtr, StoreSize,
773 MaybeAlign(HeadStore->getAlignment()),
774 StoredVal, HeadStore, AdjacentStores, StoreEv,
775 BECount, NegStride)) {
776 TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
777 Changed = true;
778 }
779 }
780
781 return Changed;
782}
783
784/// processLoopMemSet - See if this memset can be promoted to a large memset.
785bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
786 const SCEV *BECount) {
787 // We can only handle non-volatile memsets with a constant size.
788 if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
789 return false;
790
791 // If we're not allowed to hack on memset, we fail.
792 if (!HasMemset)
793 return false;
794
795 Value *Pointer = MSI->getDest();
796
797 // See if the pointer expression is an AddRec like {base,+,1} on the current
798 // loop, which indicates a strided store. If we have something else, it's a
799 // random store we can't handle.
800 const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
801 if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
802 return false;
803
804 // Reject memsets that are so large that they overflow an unsigned.
805 uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
806 if ((SizeInBytes >> 32) != 0)
807 return false;
808
809 // Check to see if the stride matches the size of the memset. If so, then we
810 // know that every byte is touched in the loop.
811 const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
812 if (!ConstStride)
813 return false;
814
815 APInt Stride = ConstStride->getAPInt();
816 if (SizeInBytes != Stride && SizeInBytes != -Stride)
817 return false;
818
819 // Verify that the memset value is loop invariant. If not, we can't promote
820 // the memset.
821 Value *SplatValue = MSI->getValue();
822 if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
823 return false;
824
825 SmallPtrSet<Instruction *, 1> MSIs;
826 MSIs.insert(MSI);
827 bool NegStride = SizeInBytes == -Stride;
828 return processLoopStridedStore(
829 Pointer, (unsigned)SizeInBytes, MaybeAlign(MSI->getDestAlignment()),
830 SplatValue, MSI, MSIs, Ev, BECount, NegStride, /*IsLoopMemset=*/true);
831}
832
833/// mayLoopAccessLocation - Return true if the specified loop might access the
834/// specified pointer location, which is a loop-strided access. The 'Access'
835/// argument specifies what the verboten forms of access are (read or write).
836static bool
837mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
838 const SCEV *BECount, unsigned StoreSize,
839 AliasAnalysis &AA,
840 SmallPtrSetImpl<Instruction *> &IgnoredStores) {
841 // Get the location that may be stored across the loop. Since the access is
842 // strided positively through memory, we say that the modified location starts
843 // at the pointer and has infinite size.
844 LocationSize AccessSize = LocationSize::unknown();
845
846 // If the loop iterates a fixed number of times, we can refine the access size
847 // to be exactly the size of the memset, which is (BECount+1)*StoreSize
848 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
849 AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
850 StoreSize);
851
852 // TODO: For this to be really effective, we have to dive into the pointer
853 // operand in the store. Store to &A[i] of 100 will always return may alias
854 // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
855 // which will then no-alias a store to &A[100].
856 MemoryLocation StoreLoc(Ptr, AccessSize);
857
858 for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
859 ++BI)
860 for (Instruction &I : **BI)
861 if (IgnoredStores.count(&I) == 0 &&
862 isModOrRefSet(
863 intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
864 return true;
865
866 return false;
867}
868
869// If we have a negative stride, Start refers to the end of the memory location
870// we're trying to memset. Therefore, we need to recompute the base pointer,
871// which is just Start - BECount*Size.
872static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
873 Type *IntPtr, unsigned StoreSize,
874 ScalarEvolution *SE) {
875 const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
876 if (StoreSize != 1)
877 Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
878 SCEV::FlagNUW);
879 return SE->getMinusSCEV(Start, Index);
880}
881
882/// Compute the number of bytes as a SCEV from the backedge taken count.
883///
884/// This also maps the SCEV into the provided type and tries to handle the
885/// computation in a way that will fold cleanly.
886static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
887 unsigned StoreSize, Loop *CurLoop,
888 const DataLayout *DL, ScalarEvolution *SE) {
889 const SCEV *NumBytesS;
890 // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
891 // pointer size if it isn't already.
892 //
893 // If we're going to need to zero extend the BE count, check if we can add
894 // one to it prior to zero extending without overflow. Provided this is safe,
895 // it allows better simplification of the +1.
896 if (DL->getTypeSizeInBits(BECount->getType()) <
897 DL->getTypeSizeInBits(IntPtr) &&
898 SE->isLoopEntryGuardedByCond(
899 CurLoop, ICmpInst::ICMP_NE, BECount,
900 SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
901 NumBytesS = SE->getZeroExtendExpr(
902 SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
903 IntPtr);
904 } else {
905 NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
906 SE->getOne(IntPtr), SCEV::FlagNUW);
907 }
908
909 // And scale it based on the store size.
910 if (StoreSize != 1) {
911 NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
912 SCEV::FlagNUW);
913 }
914 return NumBytesS;
915}
916
917/// processLoopStridedStore - We see a strided store of some value. If we can
918/// transform this into a memset or memset_pattern in the loop preheader, do so.
919bool LoopIdiomRecognize::processLoopStridedStore(
920 Value *DestPtr, unsigned StoreSize, MaybeAlign StoreAlignment,
921 Value *StoredVal, Instruction *TheStore,
922 SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
923 const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
924 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
925 Constant *PatternValue = nullptr;
926
927 if (!SplatValue)
928 PatternValue = getMemSetPatternValue(StoredVal, DL);
929
930 assert((SplatValue || PatternValue) &&(((SplatValue || PatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SplatValue || PatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 931, __PRETTY_FUNCTION__))
931 "Expected either splat value or pattern value.")(((SplatValue || PatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SplatValue || PatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 931, __PRETTY_FUNCTION__))
;
932
933 // The trip count of the loop and the base pointer of the addrec SCEV is
934 // guaranteed to be loop invariant, which means that it should dominate the
935 // header. This allows us to insert code for it in the preheader.
936 unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
937 BasicBlock *Preheader = CurLoop->getLoopPreheader();
938 IRBuilder<> Builder(Preheader->getTerminator());
939 SCEVExpander Expander(*SE, *DL, "loop-idiom");
940 SCEVExpanderCleaner ExpCleaner(Expander, *DT);
941
942 Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
943 Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
944
945 bool Changed = false;
946 const SCEV *Start = Ev->getStart();
947 // Handle negative strided loops.
948 if (NegStride)
949 Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSize, SE);
950
951 // TODO: ideally we should still be able to generate memset if SCEV expander
952 // is taught to generate the dependencies at the latest point.
953 if (!isSafeToExpand(Start, *SE))
954 return Changed;
955
956 // Okay, we have a strided store "p[i]" of a splattable value. We can turn
957 // this into a memset in the loop preheader now if we want. However, this
958 // would be unsafe to do if there is anything else in the loop that may read
959 // or write to the aliased location. Check for any overlap by generating the
960 // base pointer and checking the region.
961 Value *BasePtr =
962 Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
963
964 // From here on out, conservatively report to the pass manager that we've
965 // changed the IR, even if we later clean up these added instructions. There
966 // may be structural differences e.g. in the order of use lists not accounted
967 // for in just a textual dump of the IR. This is written as a variable, even
968 // though statically all the places this dominates could be replaced with
969 // 'true', with the hope that anyone trying to be clever / "more precise" with
970 // the return value will read this comment, and leave them alone.
971 Changed = true;
972
973 if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
974 StoreSize, *AA, Stores))
975 return Changed;
976
977 if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
978 return Changed;
979
980 // Okay, everything looks good, insert the memset.
981
982 const SCEV *NumBytesS =
983 getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
984
985 // TODO: ideally we should still be able to generate memset if SCEV expander
986 // is taught to generate the dependencies at the latest point.
987 if (!isSafeToExpand(NumBytesS, *SE))
988 return Changed;
989
990 Value *NumBytes =
991 Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
992
993 CallInst *NewCall;
994 if (SplatValue) {
995 NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes,
996 MaybeAlign(StoreAlignment));
997 } else {
998 // Everything is emitted in default address space
999 Type *Int8PtrTy = DestInt8PtrTy;
1000
1001 Module *M = TheStore->getModule();
1002 StringRef FuncName = "memset_pattern16";
1003 FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
1004 Int8PtrTy, Int8PtrTy, IntIdxTy);
1005 inferLibFuncAttributes(M, FuncName, *TLI);
1006
1007 // Otherwise we should form a memset_pattern16. PatternValue is known to be
1008 // an constant array of 16-bytes. Plop the value into a mergable global.
1009 GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
1010 GlobalValue::PrivateLinkage,
1011 PatternValue, ".memset_pattern");
1012 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
1013 GV->setAlignment(Align(16));
1014 Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
1015 NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
1016 }
1017 NewCall->setDebugLoc(TheStore->getDebugLoc());
1018
1019 if (MSSAU) {
1020 MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1021 NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1022 MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1023 }
1024
1025 LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memset: " <<
*NewCall << "\n" << " from store to: " <<
*Ev << " at: " << *TheStore << "\n"; } } while
(false)
1026 << " from store to: " << *Ev << " at: " << *TheStoredo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memset: " <<
*NewCall << "\n" << " from store to: " <<
*Ev << " at: " << *TheStore << "\n"; } } while
(false)
1027 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memset: " <<
*NewCall << "\n" << " from store to: " <<
*Ev << " at: " << *TheStore << "\n"; } } while
(false)
;
1028
1029 ORE.emit([&]() {
1030 return OptimizationRemark(DEBUG_TYPE"loop-idiom", "ProcessLoopStridedStore",
1031 NewCall->getDebugLoc(), Preheader)
1032 << "Transformed loop-strided store into a call to "
1033 << ore::NV("NewFunction", NewCall->getCalledFunction())
1034 << "() function";
1035 });
1036
1037 // Okay, the memset has been formed. Zap the original store and anything that
1038 // feeds into it.
1039 for (auto *I : Stores) {
1040 if (MSSAU)
1041 MSSAU->removeMemoryAccess(I, true);
1042 deleteDeadInstruction(I);
1043 }
1044 if (MSSAU && VerifyMemorySSA)
1045 MSSAU->getMemorySSA()->verifyMemorySSA();
1046 ++NumMemSet;
1047 ExpCleaner.markResultUsed();
1048 return true;
1049}
1050
1051/// If the stored value is a strided load in the same loop with the same stride
1052/// this may be transformable into a memcpy. This kicks in for stuff like
1053/// for (i) A[i] = B[i];
1054bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
1055 const SCEV *BECount) {
1056 assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.")((SI->isUnordered() && "Expected only non-volatile non-ordered stores."
) ? static_cast<void> (0) : __assert_fail ("SI->isUnordered() && \"Expected only non-volatile non-ordered stores.\""
, "/build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 1056, __PRETTY_FUNCTION__))
;
1057
1058 Value *StorePtr = SI->getPointerOperand();
1059 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1060 APInt Stride = getStoreStride(StoreEv);
1061 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
1062 bool NegStride = StoreSize == -Stride;
1063
1064 // The store must be feeding a non-volatile load.
1065 LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
1066 assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.")((LI->isUnordered() && "Expected only non-volatile non-ordered loads."
) ? static_cast<void> (0) : __assert_fail ("LI->isUnordered() && \"Expected only non-volatile non-ordered loads.\""
, "/build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 1066, __PRETTY_FUNCTION__))
;
1067
1068 // See if the pointer expression is an AddRec like {base,+,1} on the current
1069 // loop, which indicates a strided load. If we have something else, it's a
1070 // random load we can't handle.
1071 const SCEVAddRecExpr *LoadEv =
1072 cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
1073
1074 // The trip count of the loop and the base pointer of the addrec SCEV is
1075 // guaranteed to be loop invariant, which means that it should dominate the
1076 // header. This allows us to insert code for it in the preheader.
1077 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1078 IRBuilder<> Builder(Preheader->getTerminator());
1079 SCEVExpander Expander(*SE, *DL, "loop-idiom");
1080
1081 SCEVExpanderCleaner ExpCleaner(Expander, *DT);
1082
1083 bool Changed = false;
1084 const SCEV *StrStart = StoreEv->getStart();
1085 unsigned StrAS = SI->getPointerAddressSpace();
1086 Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
1087
1088 // Handle negative strided loops.
1089 if (NegStride)
1090 StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE);
1091
1092 // Okay, we have a strided store "p[i]" of a loaded value. We can turn
1093 // this into a memcpy in the loop preheader now if we want. However, this
1094 // would be unsafe to do if there is anything else in the loop that may read
1095 // or write the memory region we're storing to. This includes the load that
1096 // feeds the stores. Check for an alias by generating the base address and
1097 // checking everything.
1098 Value *StoreBasePtr = Expander.expandCodeFor(
1099 StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
1100
1101 // From here on out, conservatively report to the pass manager that we've
1102 // changed the IR, even if we later clean up these added instructions. There
1103 // may be structural differences e.g. in the order of use lists not accounted
1104 // for in just a textual dump of the IR. This is written as a variable, even
1105 // though statically all the places this dominates could be replaced with
1106 // 'true', with the hope that anyone trying to be clever / "more precise" with
1107 // the return value will read this comment, and leave them alone.
1108 Changed = true;
1109
1110 SmallPtrSet<Instruction *, 1> Stores;
1111 Stores.insert(SI);
1112 if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1113 StoreSize, *AA, Stores))
1114 return Changed;
1115
1116 const SCEV *LdStart = LoadEv->getStart();
1117 unsigned LdAS = LI->getPointerAddressSpace();
1118
1119 // Handle negative strided loops.
1120 if (NegStride)
1121 LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE);
1122
1123 // For a memcpy, we have to make sure that the input array is not being
1124 // mutated by the loop.
1125 Value *LoadBasePtr = Expander.expandCodeFor(
1126 LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
1127
1128 if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
1129 StoreSize, *AA, Stores))
1130 return Changed;
1131
1132 if (avoidLIRForMultiBlockLoop())
1133 return Changed;
1134
1135 // Okay, everything is safe, we can transform this!
1136
1137 const SCEV *NumBytesS =
1138 getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
1139
1140 Value *NumBytes =
1141 Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1142
1143 CallInst *NewCall = nullptr;
1144 // Check whether to generate an unordered atomic memcpy:
1145 // If the load or store are atomic, then they must necessarily be unordered
1146 // by previous checks.
1147 if (!SI->isAtomic() && !LI->isAtomic())
1148 NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr,
1149 LI->getAlign(), NumBytes);
1150 else {
1151 // We cannot allow unaligned ops for unordered load/store, so reject
1152 // anything where the alignment isn't at least the element size.
1153 const Align StoreAlign = SI->getAlign();
1154 const Align LoadAlign = LI->getAlign();
1155 if (StoreAlign < StoreSize || LoadAlign < StoreSize)
1156 return Changed;
1157
1158 // If the element.atomic memcpy is not lowered into explicit
1159 // loads/stores later, then it will be lowered into an element-size
1160 // specific lib call. If the lib call doesn't exist for our store size, then
1161 // we shouldn't generate the memcpy.
1162 if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1163 return Changed;
1164
1165 // Create the call.
1166 // Note that unordered atomic loads/stores are *required* by the spec to
1167 // have an alignment but non-atomic loads/stores may not.
1168 NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1169 StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
1170 StoreSize);
1171 }
1172 NewCall->setDebugLoc(SI->getDebugLoc());
1173
1174 if (MSSAU) {
1175 MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1176 NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1177 MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1178 }
1179
1180 LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
1181 << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
1182 << " from store ptr=" << *StoreEv << " at: " << *SIdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
1183 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
;
1184
1185 ORE.emit([&]() {
1186 return OptimizationRemark(DEBUG_TYPE"loop-idiom", "ProcessLoopStoreOfLoopLoad",
1187 NewCall->getDebugLoc(), Preheader)
1188 << "Formed a call to "
1189 << ore::NV("NewFunction", NewCall->getCalledFunction())
1190 << "() function";
1191 });
1192
1193 // Okay, the memcpy has been formed. Zap the original store and anything that
1194 // feeds into it.
1195 if (MSSAU)
1196 MSSAU->removeMemoryAccess(SI, true);
1197 deleteDeadInstruction(SI);
1198 if (MSSAU && VerifyMemorySSA)
1199 MSSAU->getMemorySSA()->verifyMemorySSA();
1200 ++NumMemCpy;
1201 ExpCleaner.markResultUsed();
1202 return true;
1203}
1204
1205// When compiling for codesize we avoid idiom recognition for a multi-block loop
1206// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1207//
1208bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1209 bool IsLoopMemset) {
1210 if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1211 if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) {
1212 LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " " << CurLoop->getHeader
()->getParent()->getName() << " : LIR " << (
IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n"
; } } while (false)
1213 << " : LIR " << (IsMemset ? "Memset" : "Memcpy")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " " << CurLoop->getHeader
()->getParent()->getName() << " : LIR " << (
IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n"
; } } while (false)
1214 << " avoided: multi-block top-level loop\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " " << CurLoop->getHeader
()->getParent()->getName() << " : LIR " << (
IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n"
; } } while (false)
;
1215 return true;
1216 }
1217 }
1218
1219 return false;
1220}
1221
1222bool LoopIdiomRecognize::runOnNoncountableLoop() {
1223 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
1224 << CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
1225 << "] Noncountable Loop %"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
1226 << CurLoop->getHeader()->getName() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
;
1227
1228 return recognizePopcount() || recognizeAndInsertFFS();
1229}
1230
1231/// Check if the given conditional branch is based on the comparison between
1232/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
1233/// true), the control yields to the loop entry. If the branch matches the
1234/// behavior, the variable involved in the comparison is returned. This function
1235/// will be called to see if the precondition and postcondition of the loop are
1236/// in desirable form.
1237static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
1238 bool JmpOnZero = false) {
1239 if (!BI || !BI->isConditional())
1240 return nullptr;
1241
1242 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1243 if (!Cond)
1244 return nullptr;
1245
1246 ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1247 if (!CmpZero || !CmpZero->isZero())
1248 return nullptr;
1249
1250 BasicBlock *TrueSucc = BI->getSuccessor(0);
1251 BasicBlock *FalseSucc = BI->getSuccessor(1);
1252 if (JmpOnZero)
1253 std::swap(TrueSucc, FalseSucc);
1254
1255 ICmpInst::Predicate Pred = Cond->getPredicate();
1256 if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1257 (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1258 return Cond->getOperand(0);
1259
1260 return nullptr;
1261}
1262
1263// Check if the recurrence variable `VarX` is in the right form to create
1264// the idiom. Returns the value coerced to a PHINode if so.
1265static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
1266 BasicBlock *LoopEntry) {
1267 auto *PhiX = dyn_cast<PHINode>(VarX);
1268 if (PhiX && PhiX->getParent() == LoopEntry &&
1269 (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1270 return PhiX;
1271 return nullptr;
1272}
1273
1274/// Return true iff the idiom is detected in the loop.
1275///
1276/// Additionally:
1277/// 1) \p CntInst is set to the instruction counting the population bit.
1278/// 2) \p CntPhi is set to the corresponding phi node.
1279/// 3) \p Var is set to the value whose population bits are being counted.
1280///
1281/// The core idiom we are trying to detect is:
1282/// \code
1283/// if (x0 != 0)
1284/// goto loop-exit // the precondition of the loop
1285/// cnt0 = init-val;
1286/// do {
1287/// x1 = phi (x0, x2);
1288/// cnt1 = phi(cnt0, cnt2);
1289///
1290/// cnt2 = cnt1 + 1;
1291/// ...
1292/// x2 = x1 & (x1 - 1);
1293/// ...
1294/// } while(x != 0);
1295///
1296/// loop-exit:
1297/// \endcode
1298static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1299 Instruction *&CntInst, PHINode *&CntPhi,
1300 Value *&Var) {
1301 // step 1: Check to see if the look-back branch match this pattern:
1302 // "if (a!=0) goto loop-entry".
1303 BasicBlock *LoopEntry;
1304 Instruction *DefX2, *CountInst;
1305 Value *VarX1, *VarX0;
1306 PHINode *PhiX, *CountPhi;
1307
1308 DefX2 = CountInst = nullptr;
1309 VarX1 = VarX0 = nullptr;
1310 PhiX = CountPhi = nullptr;
1311 LoopEntry = *(CurLoop->block_begin());
1312
1313 // step 1: Check if the loop-back branch is in desirable form.
1314 {
1315 if (Value *T = matchCondition(
1316 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1317 DefX2 = dyn_cast<Instruction>(T);
1318 else
1319 return false;
1320 }
1321
1322 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1323 {
1324 if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1325 return false;
1326
1327 BinaryOperator *SubOneOp;
1328
1329 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1330 VarX1 = DefX2->getOperand(1);
1331 else {
1332 VarX1 = DefX2->getOperand(0);
1333 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1334 }
1335 if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1336 return false;
1337
1338 ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1339 if (!Dec ||
1340 !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1341 (SubOneOp->getOpcode() == Instruction::Add &&
1342 Dec->isMinusOne()))) {
1343 return false;
1344 }
1345 }
1346
1347 // step 3: Check the recurrence of variable X
1348 PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1349 if (!PhiX)
1350 return false;
1351
1352 // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
1353 {
1354 CountInst = nullptr;
1355 for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1356 IterE = LoopEntry->end();
1357 Iter != IterE; Iter++) {
1358 Instruction *Inst = &*Iter;
1359 if (Inst->getOpcode() != Instruction::Add)
1360 continue;
1361
1362 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1363 if (!Inc || !Inc->isOne())
1364 continue;
1365
1366 PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1367 if (!Phi)
1368 continue;
1369
1370 // Check if the result of the instruction is live of the loop.
1371 bool LiveOutLoop = false;
1372 for (User *U : Inst->users()) {
1373 if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1374 LiveOutLoop = true;
1375 break;
1376 }
1377 }
1378
1379 if (LiveOutLoop) {
1380 CountInst = Inst;
1381 CountPhi = Phi;
1382 break;
1383 }
1384 }
1385
1386 if (!CountInst)
1387 return false;
1388 }
1389
1390 // step 5: check if the precondition is in this form:
1391 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1392 {
1393 auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1394 Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1395 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1396 return false;
1397
1398 CntInst = CountInst;
1399 CntPhi = CountPhi;
1400 Var = T;
1401 }
1402
1403 return true;
1404}
1405
1406/// Return true if the idiom is detected in the loop.
1407///
1408/// Additionally:
1409/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1410/// or nullptr if there is no such.
1411/// 2) \p CntPhi is set to the corresponding phi node
1412/// or nullptr if there is no such.
1413/// 3) \p Var is set to the value whose CTLZ could be used.
1414/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1415///
1416/// The core idiom we are trying to detect is:
1417/// \code
1418/// if (x0 == 0)
1419/// goto loop-exit // the precondition of the loop
1420/// cnt0 = init-val;
1421/// do {
1422/// x = phi (x0, x.next); //PhiX
1423/// cnt = phi(cnt0, cnt.next);
1424///
1425/// cnt.next = cnt + 1;
1426/// ...
1427/// x.next = x >> 1; // DefX
1428/// ...
1429/// } while(x.next != 0);
1430///
1431/// loop-exit:
1432/// \endcode
1433static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1434 Intrinsic::ID &IntrinID, Value *&InitX,
1435 Instruction *&CntInst, PHINode *&CntPhi,
1436 Instruction *&DefX) {
1437 BasicBlock *LoopEntry;
1438 Value *VarX = nullptr;
1439
1440 DefX = nullptr;
1441 CntInst = nullptr;
1442 CntPhi = nullptr;
1443 LoopEntry = *(CurLoop->block_begin());
1444
1445 // step 1: Check if the loop-back branch is in desirable form.
1446 if (Value *T = matchCondition(
1447 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1448 DefX = dyn_cast<Instruction>(T);
1449 else
1450 return false;
1451
1452 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1453 if (!DefX || !DefX->isShift())
1454 return false;
1455 IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1456 Intrinsic::ctlz;
1457 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1458 if (!Shft || !Shft->isOne())
1459 return false;
1460 VarX = DefX->getOperand(0);
1461
1462 // step 3: Check the recurrence of variable X
1463 PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1464 if (!PhiX)
1465 return false;
1466
1467 InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1468
1469 // Make sure the initial value can't be negative otherwise the ashr in the
1470 // loop might never reach zero which would make the loop infinite.
1471 if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1472 return false;
1473
1474 // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
1475 // TODO: We can skip the step. If loop trip count is known (CTLZ),
1476 // then all uses of "cnt.next" could be optimized to the trip count
1477 // plus "cnt0". Currently it is not optimized.
1478 // This step could be used to detect POPCNT instruction:
1479 // cnt.next = cnt + (x.next & 1)
1480 for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1481 IterE = LoopEntry->end();
1482 Iter != IterE; Iter++) {
1483 Instruction *Inst = &*Iter;
1484 if (Inst->getOpcode() != Instruction::Add)
1485 continue;
1486
1487 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1488 if (!Inc || !Inc->isOne())
1489 continue;
1490
1491 PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1492 if (!Phi)
1493 continue;
1494
1495 CntInst = Inst;
1496 CntPhi = Phi;
1497 break;
1498 }
1499 if (!CntInst)
1500 return false;
1501
1502 return true;
1503}
1504
1505/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1506/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
1507/// trip count returns true; otherwise, returns false.
1508bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1509 // Give up if the loop has multiple blocks or multiple backedges.
1510 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1511 return false;
1512
1513 Intrinsic::ID IntrinID;
1514 Value *InitX;
1515 Instruction *DefX = nullptr;
1516 PHINode *CntPhi = nullptr;
1517 Instruction *CntInst = nullptr;
1518 // Help decide if transformation is profitable. For ShiftUntilZero idiom,
1519 // this is always 6.
1520 size_t IdiomCanonicalSize = 6;
1521
1522 if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1523 CntInst, CntPhi, DefX))
1524 return false;
1525
1526 bool IsCntPhiUsedOutsideLoop = false;
1527 for (User *U : CntPhi->users())
1528 if (!CurLoop->contains(cast<Instruction>(U))) {
1529 IsCntPhiUsedOutsideLoop = true;
1530 break;
1531 }
1532 bool IsCntInstUsedOutsideLoop = false;
1533 for (User *U : CntInst->users())
1534 if (!CurLoop->contains(cast<Instruction>(U))) {
1535 IsCntInstUsedOutsideLoop = true;
1536 break;
1537 }
1538 // If both CntInst and CntPhi are used outside the loop the profitability
1539 // is questionable.
1540 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1541 return false;
1542
1543 // For some CPUs result of CTLZ(X) intrinsic is undefined
1544 // when X is 0. If we can not guarantee X != 0, we need to check this
1545 // when expand.
1546 bool ZeroCheck = false;
1547 // It is safe to assume Preheader exist as it was checked in
1548 // parent function RunOnLoop.
1549 BasicBlock *PH = CurLoop->getLoopPreheader();
1550
1551 // If we are using the count instruction outside the loop, make sure we
1552 // have a zero check as a precondition. Without the check the loop would run
1553 // one iteration for before any check of the input value. This means 0 and 1
1554 // would have identical behavior in the original loop and thus
1555 if (!IsCntPhiUsedOutsideLoop) {
1556 auto *PreCondBB = PH->getSinglePredecessor();
1557 if (!PreCondBB)
1558 return false;
1559 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1560 if (!PreCondBI)
1561 return false;
1562 if (matchCondition(PreCondBI, PH) != InitX)
1563 return false;
1564 ZeroCheck = true;
1565 }
1566
1567 // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1568 // profitable if we delete the loop.
1569
1570 // the loop has only 6 instructions:
1571 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1572 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1573 // %shr = ashr %n.addr.0, 1
1574 // %tobool = icmp eq %shr, 0
1575 // %inc = add nsw %i.0, 1
1576 // br i1 %tobool
1577
1578 const Value *Args[] = {
1579 InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
1580 : ConstantInt::getFalse(InitX->getContext())};
1581
1582 // @llvm.dbg doesn't count as they have no semantic effect.
1583 auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1584 uint32_t HeaderSize =
1585 std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1586
1587 IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1588 int Cost =
1589 TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
1590 if (HeaderSize != IdiomCanonicalSize &&
1591 Cost > TargetTransformInfo::TCC_Basic)
1592 return false;
1593
1594 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1595 DefX->getDebugLoc(), ZeroCheck,
1596 IsCntPhiUsedOutsideLoop);
1597 return true;
1598}
1599
1600/// Recognizes a population count idiom in a non-countable loop.
1601///
1602/// If detected, transforms the relevant code to issue the popcount intrinsic
1603/// function call, and returns true; otherwise, returns false.
1604bool LoopIdiomRecognize::recognizePopcount() {
1605 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1606 return false;
1607
1608 // Counting population are usually conducted by few arithmetic instructions.
1609 // Such instructions can be easily "absorbed" by vacant slots in a
1610 // non-compact loop. Therefore, recognizing popcount idiom only makes sense
1611 // in a compact loop.
1612
1613 // Give up if the loop has multiple blocks or multiple backedges.
1614 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1615 return false;
1616
1617 BasicBlock *LoopBody = *(CurLoop->block_begin());
1618 if (LoopBody->size() >= 20) {
1619 // The loop is too big, bail out.
1620 return false;
1621 }
1622
1623 // It should have a preheader containing nothing but an unconditional branch.
1624 BasicBlock *PH = CurLoop->getLoopPreheader();
1625 if (!PH || &PH->front() != PH->getTerminator())
1626 return false;
1627 auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1628 if (!EntryBI || EntryBI->isConditional())
1629 return false;
1630
1631 // It should have a precondition block where the generated popcount intrinsic
1632 // function can be inserted.
1633 auto *PreCondBB = PH->getSinglePredecessor();
1634 if (!PreCondBB)
1635 return false;
1636 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1637 if (!PreCondBI || PreCondBI->isUnconditional())
1638 return false;
1639
1640 Instruction *CntInst;
1641 PHINode *CntPhi;
1642 Value *Val;
1643 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1644 return false;
1645
1646 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1647 return true;
1648}
1649
1650static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1651 const DebugLoc &DL) {
1652 Value *Ops[] = {Val};
1653 Type *Tys[] = {Val->getType()};
1654
1655 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1656 Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1657 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1658 CI->setDebugLoc(DL);
1659
1660 return CI;
1661}
1662
1663static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1664 const DebugLoc &DL, bool ZeroCheck,
1665 Intrinsic::ID IID) {
1666 Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
1667 Type *Tys[] = {Val->getType()};
1668
1669 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1670 Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
1671 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1672 CI->setDebugLoc(DL);
1673
1674 return CI;
1675}
1676
1677/// Transform the following loop (Using CTLZ, CTTZ is similar):
1678/// loop:
1679/// CntPhi = PHI [Cnt0, CntInst]
1680/// PhiX = PHI [InitX, DefX]
1681/// CntInst = CntPhi + 1
1682/// DefX = PhiX >> 1
1683/// LOOP_BODY
1684/// Br: loop if (DefX != 0)
1685/// Use(CntPhi) or Use(CntInst)
1686///
1687/// Into:
1688/// If CntPhi used outside the loop:
1689/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
1690/// Count = CountPrev + 1
1691/// else
1692/// Count = BitWidth(InitX) - CTLZ(InitX)
1693/// loop:
1694/// CntPhi = PHI [Cnt0, CntInst]
1695/// PhiX = PHI [InitX, DefX]
1696/// PhiCount = PHI [Count, Dec]
1697/// CntInst = CntPhi + 1
1698/// DefX = PhiX >> 1
1699/// Dec = PhiCount - 1
1700/// LOOP_BODY
1701/// Br: loop if (Dec != 0)
1702/// Use(CountPrev + Cnt0) // Use(CntPhi)
1703/// or
1704/// Use(Count + Cnt0) // Use(CntInst)
1705///
1706/// If LOOP_BODY is empty the loop will be deleted.
1707/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
1708void LoopIdiomRecognize::transformLoopToCountable(
1709 Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
1710 PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
1711 bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
1712 BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
1713
1714 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
1715 IRBuilder<> Builder(PreheaderBr);
1716 Builder.SetCurrentDebugLocation(DL);
1717 Value *FFS, *Count, *CountPrev, *NewCount, *InitXNext;
1718
1719 // Count = BitWidth - CTLZ(InitX);
1720 // If there are uses of CntPhi create:
1721 // CountPrev = BitWidth - CTLZ(InitX >> 1);
1722 if (IsCntPhiUsedOutsideLoop) {
1723 if (DefX->getOpcode() == Instruction::AShr)
1724 InitXNext =
1725 Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1));
1726 else if (DefX->getOpcode() == Instruction::LShr)
1727 InitXNext =
1728 Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1));
1729 else if (DefX->getOpcode() == Instruction::Shl) // cttz
1730 InitXNext =
1731 Builder.CreateShl(InitX, ConstantInt::get(InitX->getType(), 1));
1732 else
1733 llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "/build/llvm-toolchain-snapshot-12~++20200911111112+c0825fa5fc3/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 1733)
;
1734 } else
1735 InitXNext = InitX;
1736 FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
1737 Count = Builder.CreateSub(
1738 ConstantInt::get(FFS->getType(),
1739 FFS->getType()->getIntegerBitWidth()),
1740 FFS);
1741 if (IsCntPhiUsedOutsideLoop) {
1742 CountPrev = Count;
1743 Count = Builder.CreateAdd(
1744 CountPrev,
1745 ConstantInt::get(CountPrev->getType(), 1));
1746 }
1747
1748 NewCount = Builder.CreateZExtOrTrunc(
1749 IsCntPhiUsedOutsideLoop ? CountPrev : Count,
1750 cast<IntegerType>(CntInst->getType()));
1751
1752 // If the counter's initial value is not zero, insert Add Inst.
1753 Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
1754 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1755 if (!InitConst || !InitConst->isZero())
1756 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1757
1758 // Step 2: Insert new IV and loop condition:
1759 // loop:
1760 // ...
1761 // PhiCount = PHI [Count, Dec]
1762 // ...
1763 // Dec = PhiCount - 1
1764 // ...
1765 // Br: loop if (Dec != 0)
1766 BasicBlock *Body = *(CurLoop->block_begin());
1767 auto *LbBr = cast<BranchInst>(Body->getTerminator());
1768 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1769 Type *Ty = Count->getType();
1770
1771 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1772
1773 Builder.SetInsertPoint(LbCond);
1774 Instruction *TcDec = cast<Instruction>(
1775 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1776 "tcdec", false, true));
1777
1778 TcPhi->addIncoming(Count, Preheader);
1779 TcPhi->addIncoming(TcDec, Body);
1780
1781 CmpInst::Predicate Pred =
1782 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
1783 LbCond->setPredicate(Pred);
1784 LbCond->setOperand(0, TcDec);
1785 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1786
1787 // Step 3: All the references to the original counter outside
1788 // the loop are replaced with the NewCount
1789 if (IsCntPhiUsedOutsideLoop)
1790 CntPhi->replaceUsesOutsideBlock(NewCount, Body);
1791 else
1792 CntInst->replaceUsesOutsideBlock(NewCount, Body);
1793
1794 // step 4: Forget the "non-computable" trip-count SCEV associated with the
1795 // loop. The loop would otherwise not be deleted even if it becomes empty.
1796 SE->forgetLoop(CurLoop);
1797}
1798
1799void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
1800 Instruction *CntInst,
1801 PHINode *CntPhi, Value *Var) {
1802 BasicBlock *PreHead = CurLoop->getLoopPreheader();
1803 auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
1804 const DebugLoc &DL = CntInst->getDebugLoc();
1805
1806 // Assuming before transformation, the loop is following:
1807 // if (x) // the precondition
1808 // do { cnt++; x &= x - 1; } while(x);
1809
1810 // Step 1: Insert the ctpop instruction at the end of the precondition block
1811 IRBuilder<> Builder(PreCondBr);
1812 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
1813 {
1814 PopCnt = createPopcntIntrinsic(Builder, Var, DL);
1815 NewCount = PopCntZext =
1816 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
1817
1818 if (NewCount != PopCnt)
1819 (cast<Instruction>(NewCount))->setDebugLoc(DL);
1820
1821 // TripCnt is exactly the number of iterations the loop has
1822 TripCnt = NewCount;
1823
1824 // If the population counter's initial value is not zero, insert Add Inst.
1825 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
1826 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1827 if (!InitConst || !InitConst->isZero()) {
1828 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1829 (cast<Instruction>(NewCount))->setDebugLoc(DL);
1830 }
1831 }
1832
1833 // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
1834 // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
1835 // function would be partial dead code, and downstream passes will drag
1836 // it back from the precondition block to the preheader.
1837 {
1838 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
1839
1840 Value *Opnd0 = PopCntZext;
1841 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
1842 if (PreCond->getOperand(0) != Var)
1843 std::swap(Opnd0, Opnd1);
1844
1845 ICmpInst *NewPreCond = cast<ICmpInst>(
1846 Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
1847 PreCondBr->setCondition(NewPreCond);
1848
1849 RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
1850 }
1851
1852 // Step 3: Note that the population count is exactly the trip count of the
1853 // loop in question, which enable us to convert the loop from noncountable
1854 // loop into a countable one. The benefit is twofold:
1855 //
1856 // - If the loop only counts population, the entire loop becomes dead after
1857 // the transformation. It is a lot easier to prove a countable loop dead
1858 // than to prove a noncountable one. (In some C dialects, an infinite loop
1859 // isn't dead even if it computes nothing useful. In general, DCE needs
1860 // to prove a noncountable loop finite before safely delete it.)
1861 //
1862 // - If the loop also performs something else, it remains alive.
1863 // Since it is transformed to countable form, it can be aggressively
1864 // optimized by some optimizations which are in general not applicable
1865 // to a noncountable loop.
1866 //
1867 // After this step, this loop (conceptually) would look like following:
1868 // newcnt = __builtin_ctpop(x);
1869 // t = newcnt;
1870 // if (x)
1871 // do { cnt++; x &= x-1; t--) } while (t > 0);
1872 BasicBlock *Body = *(CurLoop->block_begin());
1873 {
1874 auto *LbBr = cast<BranchInst>(Body->getTerminator());
1875 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1876 Type *Ty = TripCnt->getType();
1877
1878 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1879
1880 Builder.SetInsertPoint(LbCond);
1881 Instruction *TcDec = cast<Instruction>(
1882 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1883 "tcdec", false, true));
1884
1885 TcPhi->addIncoming(TripCnt, PreHead);
1886 TcPhi->addIncoming(TcDec, Body);
1887
1888 CmpInst::Predicate Pred =
1889 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
1890 LbCond->setPredicate(Pred);
1891 LbCond->setOperand(0, TcDec);
1892 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1893 }
1894
1895 // Step 4: All the references to the original population counter outside
1896 // the loop are replaced with the NewCount -- the value returned from
1897 // __builtin_ctpop().
1898 CntInst->replaceUsesOutsideBlock(NewCount, Body);
1899
1900 // step 5: Forget the "non-computable" trip-count SCEV associated with the
1901 // loop. The loop would otherwise not be deleted even if it becomes empty.
1902 SE->forgetLoop(CurLoop);
1903}