Bug Summary

File: llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
Warning: line 521, column 15
Although the value stored to 'PatternValue' is used in the enclosing expression, the value is never actually read from 'PatternValue'
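
For context, the diagnosed shape is an assignment inside a condition where only
the assignment's result feeds the enclosing null test, and the variable itself
is never read afterwards. A minimal sketch of the pattern (hypothetical code,
not the LLVM source):

  struct Constant;
  Constant *getPattern();  // stand-in for getMemSetPatternValue()

  bool isLegal() {
    Constant *PatternValue = nullptr;
    if ((PatternValue = getPattern()) != nullptr) // the call's result is tested...
      return true;            // ...but PatternValue itself is never read again
    return false;
  }

A typical cleanup is to drop the dead local and test the call directly, e.g.
"if (getPattern() != nullptr)".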

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopIdiomRecognize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/build-llvm/lib/Transforms/Scalar -resource-dir /usr/lib/llvm-13/lib/clang/13.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/build-llvm/lib/Transforms/Scalar -I /build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar -I /build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/build-llvm/include -I /build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/lib/llvm-13/lib/clang/13.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/build-llvm/lib/Transforms/Scalar -fdebug-prefix-map=/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f=. -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-04-14-063029-18377-1 -x c++ /build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
1//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements an idiom recognizer that transforms simple loops into a
10// non-loop form. In cases where this kicks in, it can be a significant
11// performance win.
12//
13// If compiling for code size we avoid idiom recognition if the resulting
14// code could be larger than the code for the original loop. One way this could
15// happen is if the loop is not removable after idiom recognition due to the
16// presence of non-idiom instructions. The initial implementation of the
17// heuristics applies to idioms in multi-block loops.
18//
19//===----------------------------------------------------------------------===//
20//
21// TODO List:
22//
23// Future loop memory idioms to recognize:
24// memcmp, memmove, strlen, etc.
25// Future floating point idioms to recognize in -ffast-math mode:
26// fpowi
27// Future integer operation idioms to recognize:
28// ctpop
29//
30// Beware that isel's default lowering for ctpop is highly inefficient for
31// i64 and larger types when i64 is legal and the value has few bits set. It
32// would be good to enhance isel to emit a loop for ctpop in this case.
33//
34// This could recognize common matrix multiplies and dot product idioms and
35// replace them with calls to BLAS (if linked in??).
36//
37//===----------------------------------------------------------------------===//
38
39#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
40#include "llvm/ADT/APInt.h"
41#include "llvm/ADT/ArrayRef.h"
42#include "llvm/ADT/DenseMap.h"
43#include "llvm/ADT/MapVector.h"
44#include "llvm/ADT/SetVector.h"
45#include "llvm/ADT/SmallPtrSet.h"
46#include "llvm/ADT/SmallVector.h"
47#include "llvm/ADT/Statistic.h"
48#include "llvm/ADT/StringRef.h"
49#include "llvm/Analysis/AliasAnalysis.h"
50#include "llvm/Analysis/CmpInstAnalysis.h"
51#include "llvm/Analysis/LoopAccessAnalysis.h"
52#include "llvm/Analysis/LoopInfo.h"
53#include "llvm/Analysis/LoopPass.h"
54#include "llvm/Analysis/MemoryLocation.h"
55#include "llvm/Analysis/MemorySSA.h"
56#include "llvm/Analysis/MemorySSAUpdater.h"
57#include "llvm/Analysis/MustExecute.h"
58#include "llvm/Analysis/OptimizationRemarkEmitter.h"
59#include "llvm/Analysis/ScalarEvolution.h"
60#include "llvm/Analysis/ScalarEvolutionExpressions.h"
61#include "llvm/Analysis/TargetLibraryInfo.h"
62#include "llvm/Analysis/TargetTransformInfo.h"
63#include "llvm/Analysis/ValueTracking.h"
64#include "llvm/IR/Attributes.h"
65#include "llvm/IR/BasicBlock.h"
66#include "llvm/IR/Constant.h"
67#include "llvm/IR/Constants.h"
68#include "llvm/IR/DataLayout.h"
69#include "llvm/IR/DebugLoc.h"
70#include "llvm/IR/DerivedTypes.h"
71#include "llvm/IR/Dominators.h"
72#include "llvm/IR/GlobalValue.h"
73#include "llvm/IR/GlobalVariable.h"
74#include "llvm/IR/IRBuilder.h"
75#include "llvm/IR/InstrTypes.h"
76#include "llvm/IR/Instruction.h"
77#include "llvm/IR/Instructions.h"
78#include "llvm/IR/IntrinsicInst.h"
79#include "llvm/IR/Intrinsics.h"
80#include "llvm/IR/LLVMContext.h"
81#include "llvm/IR/Module.h"
82#include "llvm/IR/PassManager.h"
83#include "llvm/IR/PatternMatch.h"
84#include "llvm/IR/Type.h"
85#include "llvm/IR/User.h"
86#include "llvm/IR/Value.h"
87#include "llvm/IR/ValueHandle.h"
88#include "llvm/InitializePasses.h"
89#include "llvm/Pass.h"
90#include "llvm/Support/Casting.h"
91#include "llvm/Support/CommandLine.h"
92#include "llvm/Support/Debug.h"
93#include "llvm/Support/InstructionCost.h"
94#include "llvm/Support/raw_ostream.h"
95#include "llvm/Transforms/Scalar.h"
96#include "llvm/Transforms/Utils/BuildLibCalls.h"
97#include "llvm/Transforms/Utils/Local.h"
98#include "llvm/Transforms/Utils/LoopUtils.h"
99#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
100#include <algorithm>
101#include <cassert>
102#include <cstdint>
103#include <utility>
104#include <vector>
105
106using namespace llvm;
107
108#define DEBUG_TYPE "loop-idiom"
109
110STATISTIC(NumMemSet, "Number of memset's formed from loop stores")
111STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores")
112STATISTIC(
113 NumShiftUntilBitTest,
114 "Number of uncountable loops recognized as 'shift until bit test' idiom")
115
116bool DisableLIRP::All;
117static cl::opt<bool, true>
118 DisableLIRPAll("disable-" DEBUG_TYPE "-all",
119 cl::desc("Options to disable Loop Idiom Recognize Pass."),
120 cl::location(DisableLIRP::All), cl::init(false),
121 cl::ReallyHidden);
122
123bool DisableLIRP::Memset;
124static cl::opt<bool, true>
125 DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
126 cl::desc("Proceed with loop idiom recognize pass, but do "
127 "not convert loop(s) to memset."),
128 cl::location(DisableLIRP::Memset), cl::init(false),
129 cl::ReallyHidden);
130
131bool DisableLIRP::Memcpy;
132static cl::opt<bool, true>
133 DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
134 cl::desc("Proceed with loop idiom recognize pass, but do "
135 "not convert loop(s) to memcpy."),
136 cl::location(DisableLIRP::Memcpy), cl::init(false),
137 cl::ReallyHidden);
138
139static cl::opt<bool> UseLIRCodeSizeHeurs(
140 "use-lir-code-size-heurs",
141 cl::desc("Use loop idiom recognition code size heuristics when compiling "
142 "with -Os/-Oz"),
143 cl::init(true), cl::Hidden);
144
145namespace {
146
147class LoopIdiomRecognize {
148 Loop *CurLoop = nullptr;
149 AliasAnalysis *AA;
150 DominatorTree *DT;
151 LoopInfo *LI;
152 ScalarEvolution *SE;
153 TargetLibraryInfo *TLI;
154 const TargetTransformInfo *TTI;
155 const DataLayout *DL;
156 OptimizationRemarkEmitter &ORE;
157 bool ApplyCodeSizeHeuristics;
158 std::unique_ptr<MemorySSAUpdater> MSSAU;
159
160public:
161 explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
162 LoopInfo *LI, ScalarEvolution *SE,
163 TargetLibraryInfo *TLI,
164 const TargetTransformInfo *TTI, MemorySSA *MSSA,
165 const DataLayout *DL,
166 OptimizationRemarkEmitter &ORE)
167 : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
168 if (MSSA)
169 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
170 }
171
172 bool runOnLoop(Loop *L);
173
174private:
175 using StoreList = SmallVector<StoreInst *, 8>;
176 using StoreListMap = MapVector<Value *, StoreList>;
177
178 StoreListMap StoreRefsForMemset;
179 StoreListMap StoreRefsForMemsetPattern;
180 StoreList StoreRefsForMemcpy;
181 bool HasMemset;
182 bool HasMemsetPattern;
183 bool HasMemcpy;
184
185 /// Return code for isLegalStore()
186 enum LegalStoreKind {
187 None = 0,
188 Memset,
189 MemsetPattern,
190 Memcpy,
191 UnorderedAtomicMemcpy,
192 DontUse // Dummy retval never to be used. Allows catching errors in retval
193 // handling.
194 };
195
196 /// \name Countable Loop Idiom Handling
197 /// @{
198
199 bool runOnCountableLoop();
200 bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
201 SmallVectorImpl<BasicBlock *> &ExitBlocks);
202
203 void collectStores(BasicBlock *BB);
204 LegalStoreKind isLegalStore(StoreInst *SI);
205 enum class ForMemset { No, Yes };
206 bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
207 ForMemset For);
208 bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
209
210 bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
211 MaybeAlign StoreAlignment, Value *StoredVal,
212 Instruction *TheStore,
213 SmallPtrSetImpl<Instruction *> &Stores,
214 const SCEVAddRecExpr *Ev, const SCEV *BECount,
215 bool NegStride, bool IsLoopMemset = false);
216 bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
217 bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
218 bool IsLoopMemset = false);
219
220 /// @}
221 /// \name Noncountable Loop Idiom Handling
222 /// @{
223
224 bool runOnNoncountableLoop();
225
226 bool recognizePopcount();
227 void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
228 PHINode *CntPhi, Value *Var);
229 bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
230 void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
231 Instruction *CntInst, PHINode *CntPhi,
232 Value *Var, Instruction *DefX,
233 const DebugLoc &DL, bool ZeroCheck,
234 bool IsCntPhiUsedOutsideLoop);
235
236 bool recognizeShiftUntilBitTest();
237
238 /// @}
239};
240
241class LoopIdiomRecognizeLegacyPass : public LoopPass {
242public:
243 static char ID;
244
245 explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
246 initializeLoopIdiomRecognizeLegacyPassPass(
247 *PassRegistry::getPassRegistry());
248 }
249
250 bool runOnLoop(Loop *L, LPPassManager &LPM) override {
251 if (DisableLIRP::All)
252 return false;
253
254 if (skipLoop(L))
255 return false;
256
257 AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
258 DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
259 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
260 ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
261 TargetLibraryInfo *TLI =
262 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
263 *L->getHeader()->getParent());
264 const TargetTransformInfo *TTI =
265 &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
266 *L->getHeader()->getParent());
267 const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
268 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
269 MemorySSA *MSSA = nullptr;
270 if (MSSAAnalysis)
271 MSSA = &MSSAAnalysis->getMSSA();
272
273 // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
274 // pass. Function analyses need to be preserved across loop transformations
275 // but ORE cannot be preserved (see comment before the pass definition).
276 OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
277
278 LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
279 return LIR.runOnLoop(L);
280 }
281
282 /// This transformation requires natural loop information & requires that
283 /// loop preheaders be inserted into the CFG.
284 void getAnalysisUsage(AnalysisUsage &AU) const override {
285 AU.addRequired<TargetLibraryInfoWrapperPass>();
286 AU.addRequired<TargetTransformInfoWrapperPass>();
287 AU.addPreserved<MemorySSAWrapperPass>();
288 getLoopAnalysisUsage(AU);
289 }
290};
291
292} // end anonymous namespace
293
294char LoopIdiomRecognizeLegacyPass::ID = 0;
295
296PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
297 LoopStandardAnalysisResults &AR,
298 LPMUpdater &) {
299 if (DisableLIRP::All)
300 return PreservedAnalyses::all();
301
302 const auto *DL = &L.getHeader()->getModule()->getDataLayout();
303
304 // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
305 // pass. Function analyses need to be preserved across loop transformations
306 // but ORE cannot be preserved (see comment before the pass definition).
307 OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
308
309 LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
310 AR.MSSA, DL, ORE);
311 if (!LIR.runOnLoop(&L))
312 return PreservedAnalyses::all();
313
314 auto PA = getLoopPassPreservedAnalyses();
315 if (AR.MSSA)
316 PA.preserve<MemorySSAAnalysis>();
317 return PA;
318}
319
320INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
321 "Recognize loop idioms", false, false)
322INITIALIZE_PASS_DEPENDENCY(LoopPass)
323INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
324INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
325INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
326 "Recognize loop idioms", false, false)
327
328Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
329
330static void deleteDeadInstruction(Instruction *I) {
331 I->replaceAllUsesWith(UndefValue::get(I->getType()));
332 I->eraseFromParent();
333}
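// Editorial note: replacing all uses with undef before erasing keeps the IR
// well formed when other soon-to-be-deleted instructions still reference I.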
334
335//===----------------------------------------------------------------------===//
336//
337// Implementation of LoopIdiomRecognize
338//
339//===----------------------------------------------------------------------===//
340
341bool LoopIdiomRecognize::runOnLoop(Loop *L) {
342 CurLoop = L;
343 // If the loop could not be converted to canonical form, it must have an
344 // indirectbr in it, just give up.
345 if (!L->getLoopPreheader())
346 return false;
347
348 // Disable loop idiom recognition if the function's name is a common idiom.
349 StringRef Name = L->getHeader()->getParent()->getName();
350 if (Name == "memset" || Name == "memcpy")
351 return false;
352
353 // Determine if code size heuristics need to be applied.
354 ApplyCodeSizeHeuristics =
355 L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
356
357 HasMemset = TLI->has(LibFunc_memset);
358 HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
359 HasMemcpy = TLI->has(LibFunc_memcpy);
360
361 if (HasMemset || HasMemsetPattern || HasMemcpy)
362 if (SE->hasLoopInvariantBackedgeTakenCount(L))
363 return runOnCountableLoop();
364
365 return runOnNoncountableLoop();
366}
367
368bool LoopIdiomRecognize::runOnCountableLoop() {
369 const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
370 assert(!isa<SCEVCouldNotCompute>(BECount) &&
371 "runOnCountableLoop() called on a loop without a predictable "
372 "backedge-taken count");
373
374 // If this loop executes exactly one time, then it should be peeled, not
375 // optimized by this pass.
376 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
377 if (BECst->getAPInt() == 0)
378 return false;
379
380 SmallVector<BasicBlock *, 8> ExitBlocks;
381 CurLoop->getUniqueExitBlocks(ExitBlocks);
382
383 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
384 << CurLoop->getHeader()->getParent()->getName()
385 << "] Countable Loop %" << CurLoop->getHeader()->getName()
386 << "\n");
387
388 // The following transforms hoist stores/memsets into the loop pre-header.
389 // Give up if the loop has instructions that may throw.
390 SimpleLoopSafetyInfo SafetyInfo;
391 SafetyInfo.computeLoopSafetyInfo(CurLoop);
392 if (SafetyInfo.anyBlockMayThrow())
393 return false;
394
395 bool MadeChange = false;
396
397 // Scan all the blocks in the loop that are not in subloops.
398 for (auto *BB : CurLoop->getBlocks()) {
399 // Ignore blocks in subloops.
400 if (LI->getLoopFor(BB) != CurLoop)
401 continue;
402
403 MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
404 }
405 return MadeChange;
406}
407
408static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
409 const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
410 return ConstStride->getAPInt();
411}
412
413/// getMemSetPatternValue - If a strided store of the specified value is safe to
414/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
415/// be passed in. Otherwise, return null.
416///
417/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
418/// just replicate their input array and then pass on to memset_pattern16.
419static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
420 // FIXME: This could check for UndefValue because it can be merged into any
421 // other valid pattern.
422
423 // If the value isn't a constant, we can't promote it to being in a constant
424 // array. We could theoretically do a store to an alloca or something, but
425 // that doesn't seem worthwhile.
426 Constant *C = dyn_cast<Constant>(V);
427 if (!C)
428 return nullptr;
429
430 // Only handle simple values that are a power of two bytes in size.
431 uint64_t Size = DL->getTypeSizeInBits(V->getType());
432 if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
433 return nullptr;
434
435 // Don't care enough about darwin/ppc to implement this.
436 if (DL->isBigEndian())
437 return nullptr;
438
439 // Convert to size in bytes.
440 Size /= 8;
441
442 // TODO: If C is larger than 16 bytes, we can try slicing it in half to see
443 // if the top and bottom are the same (e.g. for vectors and large integers).
444 if (Size > 16)
445 return nullptr;
446
447 // If the constant is exactly 16 bytes, just use it.
448 if (Size == 16)
449 return C;
450
451 // Otherwise, we'll use an array of the constants.
452 unsigned ArraySize = 16 / Size;
453 ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
454 return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
455}
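// Editorial note (illustrative, not in the original source): an i32 store of
// 0x01020304 (Size == 4 after the divide) yields a [4 x i32] ConstantArray
// repeating the value to fill the 16-byte memset_pattern16 buffer, while a
// 16-byte constant is returned unchanged.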
456
457LoopIdiomRecognize::LegalStoreKind
458LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
459 // Don't touch volatile stores.
460 if (SI->isVolatile())
461 return LegalStoreKind::None;
462 // We only want simple or unordered-atomic stores.
463 if (!SI->isUnordered())
464 return LegalStoreKind::None;
465
466 // Avoid merging nontemporal stores.
467 if (SI->getMetadata(LLVMContext::MD_nontemporal))
468 return LegalStoreKind::None;
469
470 Value *StoredVal = SI->getValueOperand();
471 Value *StorePtr = SI->getPointerOperand();
472
473 // Don't convert stores of non-integral pointer types to memsets (which stores
474 // integers).
475 if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
476 return LegalStoreKind::None;
477
478 // Reject stores that are so large that they overflow an unsigned.
479 // When storing out scalable vectors we bail out for now, since the code
480 // below currently only works for constant strides.
481 TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
482 if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) ||
483 (SizeInBits.getFixedSize() >> 32) != 0)
484 return LegalStoreKind::None;
485
486 // See if the pointer expression is an AddRec like {base,+,1} on the current
487 // loop, which indicates a strided store. If we have something else, it's a
488 // random store we can't handle.
489 const SCEVAddRecExpr *StoreEv =
490 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
491 if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
492 return LegalStoreKind::None;
493
494 // Check to see if we have a constant stride.
495 if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
496 return LegalStoreKind::None;
497
498 // See if the store can be turned into a memset.
499
500 // If the stored value is a byte-wise value (like i32 -1), then it may be
501 // turned into a memset of i8 -1, assuming that all the consecutive bytes
502 // are stored. A store of i32 0x01020304 can never be turned into a memset,
503 // but it can be turned into memset_pattern if the target supports it.
504 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
505 Constant *PatternValue = nullptr;
506
507 // Note: memset and memset_pattern on unordered-atomic are not yet supported.
508 bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
509
510 // If we're allowed to form a memset, and the stored value would be
511 // acceptable for memset, use it.
512 if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
513 // Verify that the stored value is loop invariant. If not, we can't
514 // promote the memset.
515 CurLoop->isLoopInvariant(SplatValue)) {
516 // It looks like we can use SplatValue.
517 return LegalStoreKind::Memset;
518 } else if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
519 // Don't create memset_pattern16s with address spaces.
520 StorePtr->getType()->getPointerAddressSpace() == 0 &&
521 (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
Although the value stored to 'PatternValue' is used in the enclosing expression, the value is never actually read from 'PatternValue'
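Editorial note: only the null test in the enclosing condition consumes the
assignment's result; once isLegalStore() returns LegalStoreKind::MemsetPattern,
the pattern is recomputed from the stored value in processLoopStridedStore()
(see the getMemSetPatternValue() call at line 939), so 'PatternValue' is indeed
never read here. One possible cleanup, sketched here rather than taken from
upstream, is to drop the local and test
getMemSetPatternValue(StoredVal, DL) != nullptr directly.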
522 // It looks like we can use PatternValue!
523 return LegalStoreKind::MemsetPattern;
524 }
525
526 // Otherwise, see if the store can be turned into a memcpy.
527 if (HasMemcpy && !DisableLIRP::Memcpy) {
528 // Check to see if the stride matches the size of the store. If so, then we
529 // know that every byte is touched in the loop.
530 APInt Stride = getStoreStride(StoreEv);
531 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
532 if (StoreSize != Stride && StoreSize != -Stride)
533 return LegalStoreKind::None;
534
535 // The store must be feeding a non-volatile load.
536 LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
537
538 // Only allow non-volatile loads
539 if (!LI || LI->isVolatile())
540 return LegalStoreKind::None;
541 // Only allow simple or unordered-atomic loads
542 if (!LI->isUnordered())
543 return LegalStoreKind::None;
544
545 // See if the pointer expression is an AddRec like {base,+,1} on the current
546 // loop, which indicates a strided load. If we have something else, it's a
547 // random load we can't handle.
548 const SCEVAddRecExpr *LoadEv =
549 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
550 if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
551 return LegalStoreKind::None;
552
553 // The store and load must share the same stride.
554 if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
555 return LegalStoreKind::None;
556
557 // Success. This store can be converted into a memcpy.
558 UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
559 return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
560 : LegalStoreKind::Memcpy;
561 }
562 // This store can't be transformed into a memset/memcpy.
563 return LegalStoreKind::None;
564}
565
566void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
567 StoreRefsForMemset.clear();
568 StoreRefsForMemsetPattern.clear();
569 StoreRefsForMemcpy.clear();
570 for (Instruction &I : *BB) {
571 StoreInst *SI = dyn_cast<StoreInst>(&I);
572 if (!SI)
573 continue;
574
575 // Make sure this is a strided store with a constant stride.
576 switch (isLegalStore(SI)) {
577 case LegalStoreKind::None:
578 // Nothing to do
579 break;
580 case LegalStoreKind::Memset: {
581 // Find the base pointer.
582 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
583 StoreRefsForMemset[Ptr].push_back(SI);
584 } break;
585 case LegalStoreKind::MemsetPattern: {
586 // Find the base pointer.
587 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
588 StoreRefsForMemsetPattern[Ptr].push_back(SI);
589 } break;
590 case LegalStoreKind::Memcpy:
591 case LegalStoreKind::UnorderedAtomicMemcpy:
592 StoreRefsForMemcpy.push_back(SI);
593 break;
594 default:
595 assert(false && "unhandled return value");
596 break;
597 }
598 }
599}
600
601/// runOnLoopBlock - Process the specified block, which lives in a counted loop
602/// with the specified backedge count. This block is known to be in the current
603/// loop and not in any subloops.
604bool LoopIdiomRecognize::runOnLoopBlock(
605 BasicBlock *BB, const SCEV *BECount,
606 SmallVectorImpl<BasicBlock *> &ExitBlocks) {
607 // We can only promote stores in this block if they are unconditionally
608 // executed in the loop. For a block to be unconditionally executed, it has
609 // to dominate all the exit blocks of the loop. Verify this now.
610 for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
611 if (!DT->dominates(BB, ExitBlocks[i]))
612 return false;
613
614 bool MadeChange = false;
615 // Look for store instructions, which may be optimized to memset/memcpy.
616 collectStores(BB);
617
618 // Look for a single store or sets of stores with a common base, which can be
619 // optimized into a memset (memset_pattern). The latter most commonly happens
620 // with structs and hand-unrolled loops.
621 for (auto &SL : StoreRefsForMemset)
622 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
623
624 for (auto &SL : StoreRefsForMemsetPattern)
625 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
626
627 // Optimize the store into a memcpy, if it feeds a similarly strided load.
628 for (auto &SI : StoreRefsForMemcpy)
629 MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
630
631 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
632 Instruction *Inst = &*I++;
633 // Look for memset instructions, which may be optimized to a larger memset.
634 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
635 WeakTrackingVH InstPtr(&*I);
636 if (!processLoopMemSet(MSI, BECount))
637 continue;
638 MadeChange = true;
639
640 // If processing the memset invalidated our iterator, start over from the
641 // top of the block.
642 if (!InstPtr)
643 I = BB->begin();
644 continue;
645 }
646 }
647
648 return MadeChange;
649}
650
651/// See if this store(s) can be promoted to a memset.
652bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
653 const SCEV *BECount, ForMemset For) {
654 // Try to find consecutive stores that can be transformed into memsets.
655 SetVector<StoreInst *> Heads, Tails;
656 SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
657
658 // Do a quadratic search on all of the given stores and find
659 // all of the pairs of stores that follow each other.
660 SmallVector<unsigned, 16> IndexQueue;
661 for (unsigned i = 0, e = SL.size(); i < e; ++i) {
662 assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
663
664 Value *FirstStoredVal = SL[i]->getValueOperand();
665 Value *FirstStorePtr = SL[i]->getPointerOperand();
666 const SCEVAddRecExpr *FirstStoreEv =
667 cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
668 APInt FirstStride = getStoreStride(FirstStoreEv);
669 unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
670
671 // See if we can optimize just this store in isolation.
672 if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
673 Heads.insert(SL[i]);
674 continue;
675 }
676
677 Value *FirstSplatValue = nullptr;
678 Constant *FirstPatternValue = nullptr;
679
680 if (For == ForMemset::Yes)
681 FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
682 else
683 FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
684
685 assert((FirstSplatValue || FirstPatternValue) &&
686 "Expected either splat value or pattern value.");
687
688 IndexQueue.clear();
689 // If a store has multiple consecutive store candidates, search Stores
690 // array according to the sequence: from i+1 to e, then from i-1 to 0.
691 // This is because pairing with the immediately succeeding or preceding
692 // candidate usually creates the best chance to find a memset opportunity.
693 unsigned j = 0;
694 for (j = i + 1; j < e; ++j)
695 IndexQueue.push_back(j);
696 for (j = i; j > 0; --j)
697 IndexQueue.push_back(j - 1);
698
699 for (auto &k : IndexQueue) {
700 assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
701 Value *SecondStorePtr = SL[k]->getPointerOperand();
702 const SCEVAddRecExpr *SecondStoreEv =
703 cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
704 APInt SecondStride = getStoreStride(SecondStoreEv);
705
706 if (FirstStride != SecondStride)
707 continue;
708
709 Value *SecondStoredVal = SL[k]->getValueOperand();
710 Value *SecondSplatValue = nullptr;
711 Constant *SecondPatternValue = nullptr;
712
713 if (For == ForMemset::Yes)
714 SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
715 else
716 SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
717
718 assert((SecondSplatValue || SecondPatternValue) &&
719 "Expected either splat value or pattern value.");
720
721 if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
722 if (For == ForMemset::Yes) {
723 if (isa<UndefValue>(FirstSplatValue))
724 FirstSplatValue = SecondSplatValue;
725 if (FirstSplatValue != SecondSplatValue)
726 continue;
727 } else {
728 if (isa<UndefValue>(FirstPatternValue))
729 FirstPatternValue = SecondPatternValue;
730 if (FirstPatternValue != SecondPatternValue)
731 continue;
732 }
733 Tails.insert(SL[k]);
734 Heads.insert(SL[i]);
735 ConsecutiveChain[SL[i]] = SL[k];
736 break;
737 }
738 }
739 }
740
741 // We may run into multiple chains that merge into a single chain. We mark the
742 // stores that we transformed so that we don't visit the same store twice.
743 SmallPtrSet<Value *, 16> TransformedStores;
744 bool Changed = false;
745
746 // For stores that start but don't end a link in the chain:
747 for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
748 it != e; ++it) {
749 if (Tails.count(*it))
750 continue;
751
752 // We found a store instr that starts a chain. Now follow the chain and try
753 // to transform it.
754 SmallPtrSet<Instruction *, 8> AdjacentStores;
755 StoreInst *I = *it;
756
757 StoreInst *HeadStore = I;
758 unsigned StoreSize = 0;
759
760 // Collect the chain into a list.
761 while (Tails.count(I) || Heads.count(I)) {
762 if (TransformedStores.count(I))
763 break;
764 AdjacentStores.insert(I);
765
766 StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
767 // Move to the next value in the chain.
768 I = ConsecutiveChain[I];
769 }
770
771 Value *StoredVal = HeadStore->getValueOperand();
772 Value *StorePtr = HeadStore->getPointerOperand();
773 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
774 APInt Stride = getStoreStride(StoreEv);
775
776 // Check to see if the stride matches the size of the stores. If so, then
777 // we know that every byte is touched in the loop.
778 if (StoreSize != Stride && StoreSize != -Stride)
779 continue;
780
781 bool NegStride = StoreSize == -Stride;
782
783 if (processLoopStridedStore(StorePtr, StoreSize,
784 MaybeAlign(HeadStore->getAlignment()),
785 StoredVal, HeadStore, AdjacentStores, StoreEv,
786 BECount, NegStride)) {
787 TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
788 Changed = true;
789 }
790 }
791
792 return Changed;
793}
794
795/// processLoopMemSet - See if this memset can be promoted to a large memset.
796bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
797 const SCEV *BECount) {
798 // We can only handle non-volatile memsets with a constant size.
799 if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
800 return false;
801
802 // If we're not allowed to hack on memset, we fail.
803 if (!HasMemset)
804 return false;
805
806 Value *Pointer = MSI->getDest();
807
808 // See if the pointer expression is an AddRec like {base,+,1} on the current
809 // loop, which indicates a strided store. If we have something else, it's a
810 // random store we can't handle.
811 const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
812 if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
813 return false;
814
815 // Reject memsets that are so large that they overflow an unsigned.
816 uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
817 if ((SizeInBytes >> 32) != 0)
818 return false;
819
820 // Check to see if the stride matches the size of the memset. If so, then we
821 // know that every byte is touched in the loop.
822 const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
823 if (!ConstStride)
824 return false;
825
826 APInt Stride = ConstStride->getAPInt();
827 if (SizeInBytes != Stride && SizeInBytes != -Stride)
828 return false;
829
830 // Verify that the memset value is loop invariant. If not, we can't promote
831 // the memset.
832 Value *SplatValue = MSI->getValue();
833 if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
834 return false;
835
836 SmallPtrSet<Instruction *, 1> MSIs;
837 MSIs.insert(MSI);
838 bool NegStride = SizeInBytes == -Stride;
839 return processLoopStridedStore(
840 Pointer, (unsigned)SizeInBytes, MaybeAlign(MSI->getDestAlignment()),
841 SplatValue, MSI, MSIs, Ev, BECount, NegStride, /*IsLoopMemset=*/true);
842}
843
844/// mayLoopAccessLocation - Return true if the specified loop might access the
845/// specified pointer location, which is a loop-strided access. The 'Access'
846/// argument specifies what the verboten forms of access are (read or write).
847static bool
848mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
849 const SCEV *BECount, unsigned StoreSize,
850 AliasAnalysis &AA,
851 SmallPtrSetImpl<Instruction *> &IgnoredStores) {
852 // Get the location that may be stored across the loop. Since the access is
853 // strided positively through memory, we say that the modified location starts
854 // at the pointer and has infinite size.
855 LocationSize AccessSize = LocationSize::afterPointer();
856
857 // If the loop iterates a fixed number of times, we can refine the access size
858 // to be exactly the size of the memset, which is (BECount+1)*StoreSize
859 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
860 AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
861 StoreSize);
862
863 // TODO: For this to be really effective, we have to dive into the pointer
864 // operand in the store. A store to &A[i] over 100 iterations will always
865 // be reported as may-alias with a store to &A[100]; we need StoreLoc to be
866 // "A" with a size of 100, which will then no-alias a store to &A[100].
867 MemoryLocation StoreLoc(Ptr, AccessSize);
868
869 for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
870 ++BI)
871 for (Instruction &I : **BI)
872 if (IgnoredStores.count(&I) == 0 &&
873 isModOrRefSet(
874 intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
875 return true;
876
877 return false;
878}
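// Editorial note (illustrative): with a constant backedge-taken count of 99
// and a 4-byte store, the refined access size above is (99 + 1) * 4 = 400
// bytes starting at Ptr.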
879
880// If we have a negative stride, Start refers to the end of the memory location
881// we're trying to memset. Therefore, we need to recompute the base pointer,
882// which is just Start - BECount*Size.
883static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
884 Type *IntPtr, unsigned StoreSize,
885 ScalarEvolution *SE) {
886 const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
887 if (StoreSize != 1)
888 Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
889 SCEV::FlagNUW);
890 return SE->getMinusSCEV(Start, Index);
891}
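// Editorial note (illustrative): for a loop storing 4-byte elements at
// descending addresses with backedge-taken count n - 1, the computed base is
// Start - (n - 1) * 4, i.e. the lowest address the loop writes.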
892
893/// Compute the number of bytes as a SCEV from the backedge taken count.
894///
895/// This also maps the SCEV into the provided type and tries to handle the
896/// computation in a way that will fold cleanly.
897static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
898 unsigned StoreSize, Loop *CurLoop,
899 const DataLayout *DL, ScalarEvolution *SE) {
900 const SCEV *NumBytesS;
901 // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
902 // pointer size if it isn't already.
903 //
904 // If we're going to need to zero extend the BE count, check if we can add
905 // one to it prior to zero extending without overflow. Provided this is safe,
906 // it allows better simplification of the +1.
907 if (DL->getTypeSizeInBits(BECount->getType()).getFixedSize() <
908 DL->getTypeSizeInBits(IntPtr).getFixedSize() &&
909 SE->isLoopEntryGuardedByCond(
910 CurLoop, ICmpInst::ICMP_NE, BECount,
911 SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
912 NumBytesS = SE->getZeroExtendExpr(
913 SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
914 IntPtr);
915 } else {
916 NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
917 SE->getOne(IntPtr), SCEV::FlagNUW);
918 }
919
920 // And scale it based on the store size.
921 if (StoreSize != 1) {
922 NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
923 SCEV::FlagNUW);
924 }
925 return NumBytesS;
926}
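// Editorial note (illustrative): the byte count is (BECount + 1) * StoreSize,
// e.g. (7 + 1) * 4 = 32 bytes for BECount = 7 and StoreSize = 4. The first
// branch above adds 1 before zero-extending, which is safe only because the
// loop guard proves BECount != -1 in the narrow type.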
927
928/// processLoopStridedStore - We see a strided store of some value. If we can
929/// transform this into a memset or memset_pattern in the loop preheader, do so.
930bool LoopIdiomRecognize::processLoopStridedStore(
931 Value *DestPtr, unsigned StoreSize, MaybeAlign StoreAlignment,
932 Value *StoredVal, Instruction *TheStore,
933 SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
934 const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
935 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
936 Constant *PatternValue = nullptr;
937
938 if (!SplatValue)
939 PatternValue = getMemSetPatternValue(StoredVal, DL);
940
941 assert((SplatValue || PatternValue) &&
942 "Expected either splat value or pattern value.");
943
944 // The trip count of the loop and the base pointer of the addrec SCEV are
945 // guaranteed to be loop invariant, which means they dominate the loop
946 // header. This allows us to insert code for them in the preheader.
947 unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
948 BasicBlock *Preheader = CurLoop->getLoopPreheader();
949 IRBuilder<> Builder(Preheader->getTerminator());
950 SCEVExpander Expander(*SE, *DL, "loop-idiom");
951 SCEVExpanderCleaner ExpCleaner(Expander, *DT);
952
953 Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
954 Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
955
956 bool Changed = false;
957 const SCEV *Start = Ev->getStart();
958 // Handle negative strided loops.
959 if (NegStride)
960 Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSize, SE);
961
962 // TODO: ideally we should still be able to generate memset if SCEV expander
963 // is taught to generate the dependencies at the latest point.
964 if (!isSafeToExpand(Start, *SE))
965 return Changed;
966
967 // Okay, we have a strided store "p[i]" of a splattable value. We can turn
968 // this into a memset in the loop preheader now if we want. However, this
969 // would be unsafe to do if there is anything else in the loop that may read
970 // or write to the aliased location. Check for any overlap by generating the
971 // base pointer and checking the region.
972 Value *BasePtr =
973 Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
974
975 // From here on out, conservatively report to the pass manager that we've
976 // changed the IR, even if we later clean up these added instructions. There
977 // may be structural differences e.g. in the order of use lists not accounted
978 // for in just a textual dump of the IR. This is written as a variable, even
979 // though statically all the places this dominates could be replaced with
980 // 'true', with the hope that anyone trying to be clever / "more precise" with
981 // the return value will read this comment, and leave them alone.
982 Changed = true;
983
984 if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
985 StoreSize, *AA, Stores))
986 return Changed;
987
988 if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
989 return Changed;
990
991 // Okay, everything looks good, insert the memset.
992
993 const SCEV *NumBytesS =
994 getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
995
996 // TODO: ideally we should still be able to generate memset if SCEV expander
997 // is taught to generate the dependencies at the latest point.
998 if (!isSafeToExpand(NumBytesS, *SE))
999 return Changed;
1000
1001 Value *NumBytes =
1002 Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1003
1004 CallInst *NewCall;
1005 if (SplatValue) {
1006 NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes,
1007 MaybeAlign(StoreAlignment));
1008 } else {
1009 // Everything is emitted in the default address space.
1010 Type *Int8PtrTy = DestInt8PtrTy;
1011
1012 Module *M = TheStore->getModule();
1013 StringRef FuncName = "memset_pattern16";
1014 FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
1015 Int8PtrTy, Int8PtrTy, IntIdxTy);
1016 inferLibFuncAttributes(M, FuncName, *TLI);
1017
1018 // Otherwise we should form a memset_pattern16. PatternValue is known to be
1019 // a constant array of 16 bytes. Plop the value into a mergeable global.
1020 GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
1021 GlobalValue::PrivateLinkage,
1022 PatternValue, ".memset_pattern");
1023 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
1024 GV->setAlignment(Align(16));
1025 Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
1026 NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
1027 }
1028 NewCall->setDebugLoc(TheStore->getDebugLoc());
1029
1030 if (MSSAU) {
1031 MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1032 NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1033 MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1034 }
1035
1036 LLVM_DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
1037 << "    from store to: " << *Ev << " at: " << *TheStore
1038 << "\n");
1039
1040 ORE.emit([&]() {
1041 return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore",
1042 NewCall->getDebugLoc(), Preheader)
1043 << "Transformed loop-strided store into a call to "
1044 << ore::NV("NewFunction", NewCall->getCalledFunction())
1045 << "() function";
1046 });
1047
1048 // Okay, the memset has been formed. Zap the original store and anything that
1049 // feeds into it.
1050 for (auto *I : Stores) {
1051 if (MSSAU)
1052 MSSAU->removeMemoryAccess(I, true);
1053 deleteDeadInstruction(I);
1054 }
1055 if (MSSAU && VerifyMemorySSA)
1056 MSSAU->getMemorySSA()->verifyMemorySSA();
1057 ++NumMemSet;
1058 ExpCleaner.markResultUsed();
1059 return true;
1060}
1061
1062 /// If the stored value is a strided load in the same loop with the same stride,
1063/// this may be transformable into a memcpy. This kicks in for stuff like
1064/// for (i) A[i] = B[i];
1065bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
1066 const SCEV *BECount) {
1067 assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
1068
1069 Value *StorePtr = SI->getPointerOperand();
1070 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1071 APInt Stride = getStoreStride(StoreEv);
1072 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
1073 bool NegStride = StoreSize == -Stride;
1074
1075 // The store must be feeding a non-volatile load.
1076 LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
1077 assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
1078
1079 // See if the pointer expression is an AddRec like {base,+,1} on the current
1080 // loop, which indicates a strided load. If we have something else, it's a
1081 // random load we can't handle.
1082 const SCEVAddRecExpr *LoadEv =
1083 cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
1084
1085 // The trip count of the loop and the base pointer of the addrec SCEV are
1086 // guaranteed to be loop invariant, which means they dominate the loop
1087 // header. This allows us to insert code for them in the preheader.
1088 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1089 IRBuilder<> Builder(Preheader->getTerminator());
1090 SCEVExpander Expander(*SE, *DL, "loop-idiom");
1091
1092 SCEVExpanderCleaner ExpCleaner(Expander, *DT);
1093
1094 bool Changed = false;
1095 const SCEV *StrStart = StoreEv->getStart();
1096 unsigned StrAS = SI->getPointerAddressSpace();
1097 Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
1098
1099 // Handle negative strided loops.
1100 if (NegStride)
1101 StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE);
1102
1103 // Okay, we have a strided store "p[i]" of a loaded value. We can turn
1104 // this into a memcpy in the loop preheader now if we want. However, this
1105 // would be unsafe to do if there is anything else in the loop that may read
1106 // or write the memory region we're storing to. This includes the load that
1107 // feeds the stores. Check for an alias by generating the base address and
1108 // checking everything.
1109 Value *StoreBasePtr = Expander.expandCodeFor(
1110 StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
1111
1112 // From here on out, conservatively report to the pass manager that we've
1113 // changed the IR, even if we later clean up these added instructions. There
1114 // may be structural differences e.g. in the order of use lists not accounted
1115 // for in just a textual dump of the IR. This is written as a variable, even
1116 // though statically all the places this dominates could be replaced with
1117 // 'true', with the hope that anyone trying to be clever / "more precise" with
1118 // the return value will read this comment, and leave them alone.
1119 Changed = true;
1120
1121 SmallPtrSet<Instruction *, 1> Stores;
1122 Stores.insert(SI);
1123 if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1124 StoreSize, *AA, Stores))
1125 return Changed;
1126
1127 const SCEV *LdStart = LoadEv->getStart();
1128 unsigned LdAS = LI->getPointerAddressSpace();
1129
1130 // Handle negative strided loops.
1131 if (NegStride)
1132 LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE);
1133
1134 // For a memcpy, we have to make sure that the input array is not being
1135 // mutated by the loop.
1136 Value *LoadBasePtr = Expander.expandCodeFor(
1137 LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
1138
1139 if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
1140 StoreSize, *AA, Stores))
1141 return Changed;
1142
1143 if (avoidLIRForMultiBlockLoop())
1144 return Changed;
1145
1146 // Okay, everything is safe, we can transform this!
1147
1148 const SCEV *NumBytesS =
1149 getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
1150
1151 Value *NumBytes =
1152 Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1153
1154 CallInst *NewCall = nullptr;
1155 // Check whether to generate an unordered atomic memcpy:
1156 // If the load or store is atomic, then it must necessarily be unordered
1157 // by previous checks.
1158 if (!SI->isAtomic() && !LI->isAtomic())
1159 NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr,
1160 LI->getAlign(), NumBytes);
1161 else {
1162 // We cannot allow unaligned ops for unordered load/store, so reject
1163 // anything where the alignment isn't at least the element size.
1164 const Align StoreAlign = SI->getAlign();
1165 const Align LoadAlign = LI->getAlign();
1166 if (StoreAlign < StoreSize || LoadAlign < StoreSize)
1167 return Changed;
1168
1169 // If the element.atomic memcpy is not lowered into explicit
1170 // loads/stores later, then it will be lowered into an element-size
1171 // specific lib call. If the lib call doesn't exist for our store size, then
1172 // we shouldn't generate the memcpy.
1173 if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1174 return Changed;
1175
1176 // Create the call.
1177 // Note that unordered atomic loads/stores are *required* by the spec to
1178 // have an alignment but non-atomic loads/stores may not.
1179 NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1180 StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
1181 StoreSize);
1182 }
1183 NewCall->setDebugLoc(SI->getDebugLoc());
1184
1185 if (MSSAU) {
1186 MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1187 NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1188 MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1189 }
1190
1191 LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
1192 << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
1193 << " from store ptr=" << *StoreEv << " at: " << *SI
1194 << "\n");
1195
1196 ORE.emit([&]() {
1197 return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
1198 NewCall->getDebugLoc(), Preheader)
1199 << "Formed a call to "
1200 << ore::NV("NewFunction", NewCall->getCalledFunction())
1201 << "() function";
1202 });
1203
1204 // Okay, the memcpy has been formed. Zap the original store and anything that
1205 // feeds into it.
1206 if (MSSAU)
1207 MSSAU->removeMemoryAccess(SI, true);
1208 deleteDeadInstruction(SI);
1209 if (MSSAU && VerifyMemorySSA)
1210 MSSAU->getMemorySSA()->verifyMemorySSA();
1211 ++NumMemCpy;
1212 ExpCleaner.markResultUsed();
1213 return true;
1214}
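// A minimal sketch of the kind of source loop the routine above rewrites; the
// names dst, src and n are hypothetical, not taken from this file:
//
//   for (unsigned i = 0; i != n; ++i)
//     dst[i] = src[i];   // strided store fed by a non-volatile strided load
//
// Assuming the two alias checks succeed, the loop is replaced by a single call
// emitted in the preheader, conceptually:
//
//   memcpy(dst, src, n * sizeof(*dst));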
1215
1216// When compiling for codesize we avoid idiom recognition for a multi-block loop
1217// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1218//
1219bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1220 bool IsLoopMemset) {
1221 if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1222 if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
1223 LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
1224 << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
1225 << " avoided: multi-block top-level loop\n");
1226 return true;
1227 }
1228 }
1229
1230 return false;
1231}
1232
1233bool LoopIdiomRecognize::runOnNoncountableLoop() {
1234 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
1235 << CurLoop->getHeader()->getParent()->getName()
1236 << "] Noncountable Loop %"
1237 << CurLoop->getHeader()->getName() << "\n");
1238
1239 return recognizePopcount() || recognizeAndInsertFFS() ||
1240 recognizeShiftUntilBitTest();
1241}
1242
1243/// Check if the given conditional branch is based on a comparison between
1244/// a variable and zero, and if the variable is non-zero (or zero, when
1245/// JmpOnZero is true) the control yields to the loop entry. If the branch
1246/// matches that behavior, the variable involved in the comparison is
1247/// returned. This function is called to see if the precondition and
1248/// postcondition of the loop are in desirable form.
1249static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
1250 bool JmpOnZero = false) {
1251 if (!BI || !BI->isConditional())
1252 return nullptr;
1253
1254 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1255 if (!Cond)
1256 return nullptr;
1257
1258 ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1259 if (!CmpZero || !CmpZero->isZero())
1260 return nullptr;
1261
1262 BasicBlock *TrueSucc = BI->getSuccessor(0);
1263 BasicBlock *FalseSucc = BI->getSuccessor(1);
1264 if (JmpOnZero)
1265 std::swap(TrueSucc, FalseSucc);
1266
1267 ICmpInst::Predicate Pred = Cond->getPredicate();
1268 if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1269 (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1270 return Cond->getOperand(0);
1271
1272 return nullptr;
1273}
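// An illustrative branch that matchCondition accepts (hypothetical IR, with
// LoopEntry being %loop); the returned value is %x2:
//
//   %cmp = icmp ne i32 %x2, 0
//   br i1 %cmp, label %loop, label %exit
//
// With JmpOnZero == true the successors are swapped first, so a branch that
// enters the loop when the value is zero (icmp eq) matches instead.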
1274
1275// Check if the recurrence variable `VarX` is in the right form to create
1276// the idiom. Returns the value coerced to a PHINode if so.
1277static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
1278 BasicBlock *LoopEntry) {
1279 auto *PhiX = dyn_cast<PHINode>(VarX);
1280 if (PhiX && PhiX->getParent() == LoopEntry &&
1281 (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1282 return PhiX;
1283 return nullptr;
1284}
1285
1286/// Return true iff the idiom is detected in the loop.
1287///
1288/// Additionally:
1289/// 1) \p CntInst is set to the instruction counting the population bit.
1290/// 2) \p CntPhi is set to the corresponding phi node.
1291/// 3) \p Var is set to the value whose population bits are being counted.
1292///
1293/// The core idiom we are trying to detect is:
1294/// \code
1295/// if (x0 != 0)
1296/// goto loop-exit // the precondition of the loop
1297/// cnt0 = init-val;
1298/// do {
1299/// x1 = phi (x0, x2);
1300/// cnt1 = phi(cnt0, cnt2);
1301///
1302/// cnt2 = cnt1 + 1;
1303/// ...
1304/// x2 = x1 & (x1 - 1);
1305/// ...
1306/// } while(x != 0);
1307///
1308/// loop-exit:
1309/// \endcode
1310static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1311 Instruction *&CntInst, PHINode *&CntPhi,
1312 Value *&Var) {
1313 // step 1: Check to see if the loop-back branch matches this pattern:
1314 // "if (a!=0) goto loop-entry".
1315 BasicBlock *LoopEntry;
1316 Instruction *DefX2, *CountInst;
1317 Value *VarX1, *VarX0;
1318 PHINode *PhiX, *CountPhi;
1319
1320 DefX2 = CountInst = nullptr;
1321 VarX1 = VarX0 = nullptr;
1322 PhiX = CountPhi = nullptr;
1323 LoopEntry = *(CurLoop->block_begin());
1324
1325 // step 1: Check if the loop-back branch is in desirable form.
1326 {
1327 if (Value *T = matchCondition(
1328 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1329 DefX2 = dyn_cast<Instruction>(T);
1330 else
1331 return false;
1332 }
1333
1334 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1335 {
1336 if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1337 return false;
1338
1339 BinaryOperator *SubOneOp;
1340
1341 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1342 VarX1 = DefX2->getOperand(1);
1343 else {
1344 VarX1 = DefX2->getOperand(0);
1345 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1346 }
1347 if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1348 return false;
1349
1350 ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1351 if (!Dec ||
1352 !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1353 (SubOneOp->getOpcode() == Instruction::Add &&
1354 Dec->isMinusOne()))) {
1355 return false;
1356 }
1357 }
1358
1359 // step 3: Check the recurrence of variable X
1360 PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1361 if (!PhiX)
1362 return false;
1363
1364 // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
1365 {
1366 CountInst = nullptr;
1367 for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1368 IterE = LoopEntry->end();
1369 Iter != IterE; Iter++) {
1370 Instruction *Inst = &*Iter;
1371 if (Inst->getOpcode() != Instruction::Add)
1372 continue;
1373
1374 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1375 if (!Inc || !Inc->isOne())
1376 continue;
1377
1378 PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1379 if (!Phi)
1380 continue;
1381
1382 // Check if the result of the instruction is live out of the loop.
1383 bool LiveOutLoop = false;
1384 for (User *U : Inst->users()) {
1385 if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1386 LiveOutLoop = true;
1387 break;
1388 }
1389 }
1390
1391 if (LiveOutLoop) {
1392 CountInst = Inst;
1393 CountPhi = Phi;
1394 break;
1395 }
1396 }
1397
1398 if (!CountInst)
1399 return false;
1400 }
1401
1402 // step 5: check if the precondition is in this form:
1403 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1404 {
1405 auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1406 Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1407 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1408 return false;
1409
1410 CntInst = CountInst;
1411 CntPhi = CountPhi;
1412 Var = T;
1413 }
1414
1415 return true;
1416}
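// At the source level, the idiom detected above is roughly the classic
// Kernighan population-count loop (hypothetical C, illustrative names):
//
//   int cnt = 0;          // cnt0 = init-val
//   while (x) {           // precondition plus loop-back test
//     x &= x - 1;         // x2 = x1 & (x1 - 1), clears the lowest set bit
//     cnt++;              // cnt2 = cnt1 + 1, the counting instruction
//   }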
1417
1418/// Return true if the idiom is detected in the loop.
1419///
1420/// Additionally:
1421/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1422/// or nullptr if there is no such.
1423/// 2) \p CntPhi is set to the corresponding phi node
1424/// or nullptr if there is no such.
1425/// 3) \p Var is set to the value whose CTLZ could be used.
1426/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1427///
1428/// The core idiom we are trying to detect is:
1429/// \code
1430/// if (x0 == 0)
1431/// goto loop-exit // the precondition of the loop
1432/// cnt0 = init-val;
1433/// do {
1434/// x = phi (x0, x.next); //PhiX
1435/// cnt = phi(cnt0, cnt.next);
1436///
1437/// cnt.next = cnt + 1;
1438/// ...
1439/// x.next = x >> 1; // DefX
1440/// ...
1441/// } while(x.next != 0);
1442///
1443/// loop-exit:
1444/// \endcode
1445static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1446 Intrinsic::ID &IntrinID, Value *&InitX,
1447 Instruction *&CntInst, PHINode *&CntPhi,
1448 Instruction *&DefX) {
1449 BasicBlock *LoopEntry;
1450 Value *VarX = nullptr;
1451
1452 DefX = nullptr;
1453 CntInst = nullptr;
1454 CntPhi = nullptr;
1455 LoopEntry = *(CurLoop->block_begin());
1456
1457 // step 1: Check if the loop-back branch is in desirable form.
1458 if (Value *T = matchCondition(
1459 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1460 DefX = dyn_cast<Instruction>(T);
1461 else
1462 return false;
1463
1464 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1465 if (!DefX || !DefX->isShift())
1466 return false;
1467 IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1468 Intrinsic::ctlz;
1469 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1470 if (!Shft || !Shft->isOne())
1471 return false;
1472 VarX = DefX->getOperand(0);
1473
1474 // step 3: Check the recurrence of variable X
1475 PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1476 if (!PhiX)
1477 return false;
1478
1479 InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1480
1481 // Make sure the initial value can't be negative otherwise the ashr in the
1482 // loop might never reach zero which would make the loop infinite.
1483 if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1484 return false;
1485
1486 // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
1487 // or cnt.next = cnt + -1.
1488 // TODO: We can skip the step. If loop trip count is known (CTLZ),
1489 // then all uses of "cnt.next" could be optimized to the trip count
1490 // plus "cnt0". Currently it is not optimized.
1491 // This step could be used to detect POPCNT instruction:
1492 // cnt.next = cnt + (x.next & 1)
1493 for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1494 IterE = LoopEntry->end();
1495 Iter != IterE; Iter++) {
1496 Instruction *Inst = &*Iter;
1497 if (Inst->getOpcode() != Instruction::Add)
1498 continue;
1499
1500 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1501 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
1502 continue;
1503
1504 PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1505 if (!Phi)
1506 continue;
1507
1508 CntInst = Inst;
1509 CntPhi = Phi;
1510 break;
1511 }
1512 if (!CntInst)
1513 return false;
1514
1515 return true;
1516}
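// A source-level sketch of the shift-until-zero idiom detected above
// (hypothetical C; a lshr/ashr recurrence maps to ctlz, a shl one to cttz):
//
//   int cnt = 0;
//   do {
//     cnt++;              // cnt.next = cnt + 1
//     x >>= 1;            // x.next = x >> 1 (DefX)
//   } while (x != 0);     // loop-back test on x.next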
1517
1518/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1519/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ is inserted as
1520/// a new trip count, returns true; otherwise, returns false.
1521bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1522 // Give up if the loop has multiple blocks or multiple backedges.
1523 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1524 return false;
1525
1526 Intrinsic::ID IntrinID;
1527 Value *InitX;
1528 Instruction *DefX = nullptr;
1529 PHINode *CntPhi = nullptr;
1530 Instruction *CntInst = nullptr;
1531 // Help decide if transformation is profitable. For ShiftUntilZero idiom,
1532 // this is always 6.
1533 size_t IdiomCanonicalSize = 6;
1534
1535 if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1536 CntInst, CntPhi, DefX))
1537 return false;
1538
1539 bool IsCntPhiUsedOutsideLoop = false;
1540 for (User *U : CntPhi->users())
1541 if (!CurLoop->contains(cast<Instruction>(U))) {
1542 IsCntPhiUsedOutsideLoop = true;
1543 break;
1544 }
1545 bool IsCntInstUsedOutsideLoop = false;
1546 for (User *U : CntInst->users())
1547 if (!CurLoop->contains(cast<Instruction>(U))) {
1548 IsCntInstUsedOutsideLoop = true;
1549 break;
1550 }
1551 // If both CntInst and CntPhi are used outside the loop the profitability
1552 // is questionable.
1553 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1554 return false;
1555
1556 // For some CPUs the result of the CTLZ(X) intrinsic is undefined
1557 // when X is 0. If we cannot guarantee X != 0, we need to check for this
1558 // when we expand the intrinsic.
1559 bool ZeroCheck = false;
1560 // It is safe to assume the preheader exists, as it was checked in the
1561 // parent function runOnLoop.
1562 BasicBlock *PH = CurLoop->getLoopPreheader();
1563
1564 // If we are using the count instruction outside the loop, make sure we
1565 // have a zero check as a precondition. Without the check, the loop would run
1566 // one iteration before any check of the input value, so 0 and 1 would have
1567 // identical behavior in the original loop; the zero check preserves this.
1568 if (!IsCntPhiUsedOutsideLoop) {
1569 auto *PreCondBB = PH->getSinglePredecessor();
1570 if (!PreCondBB)
1571 return false;
1572 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1573 if (!PreCondBI)
1574 return false;
1575 if (matchCondition(PreCondBI, PH) != InitX)
1576 return false;
1577 ZeroCheck = true;
1578 }
1579
1580 // Check if the CTLZ / CTTZ intrinsic is profitable. Assume it is always
1581 // profitable if we delete the loop.
1582
1583 // The loop has only 6 instructions:
1584 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1585 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1586 // %shr = ashr %n.addr.0, 1
1587 // %tobool = icmp eq %shr, 0
1588 // %inc = add nsw %i.0, 1
1589 // br i1 %tobool
1590
1591 const Value *Args[] = {InitX,
1592 ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
1593
1594 // @llvm.dbg intrinsics don't count as they have no semantic effect.
1595 auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1596 uint32_t HeaderSize =
1597 std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1598
1599 IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1600 InstructionCost Cost =
1601 TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
1602 if (HeaderSize != IdiomCanonicalSize &&
1603 Cost > TargetTransformInfo::TCC_Basic)
1604 return false;
1605
1606 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1607 DefX->getDebugLoc(), ZeroCheck,
1608 IsCntPhiUsedOutsideLoop);
1609 return true;
1610}
1611
1612/// Recognizes a population count idiom in a non-countable loop.
1613///
1614/// If detected, transforms the relevant code to issue the popcount intrinsic
1615/// function call, and returns true; otherwise, returns false.
1616bool LoopIdiomRecognize::recognizePopcount() {
1617 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1618 return false;
1619
1620 // Counting the population is usually done with a few arithmetic
1621 // instructions. Such instructions can easily be "absorbed" by vacant slots
1622 // in a non-compact loop. Therefore, recognizing the popcount idiom only
1623 // makes sense in a compact loop.
1624
1625 // Give up if the loop has multiple blocks or multiple backedges.
1626 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1627 return false;
1628
1629 BasicBlock *LoopBody = *(CurLoop->block_begin());
1630 if (LoopBody->size() >= 20) {
1631 // The loop is too big, bail out.
1632 return false;
1633 }
1634
1635 // It should have a preheader containing nothing but an unconditional branch.
1636 BasicBlock *PH = CurLoop->getLoopPreheader();
1637 if (!PH || &PH->front() != PH->getTerminator())
1638 return false;
1639 auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1640 if (!EntryBI || EntryBI->isConditional())
1641 return false;
1642
1643 // It should have a precondition block where the generated popcount intrinsic
1644 // function can be inserted.
1645 auto *PreCondBB = PH->getSinglePredecessor();
1646 if (!PreCondBB)
1647 return false;
1648 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1649 if (!PreCondBI || PreCondBI->isUnconditional())
1650 return false;
1651
1652 Instruction *CntInst;
1653 PHINode *CntPhi;
1654 Value *Val;
1655 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1656 return false;
1657
1658 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1659 return true;
1660}
1661
1662static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1663 const DebugLoc &DL) {
1664 Value *Ops[] = {Val};
1665 Type *Tys[] = {Val->getType()};
1666
1667 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1668 Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1669 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1670 CI->setDebugLoc(DL);
1671
1672 return CI;
1673}
1674
1675static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1676 const DebugLoc &DL, bool ZeroCheck,
1677 Intrinsic::ID IID) {
1678 Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
1679 Type *Tys[] = {Val->getType()};
1680
1681 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1682 Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
1683 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1684 CI->setDebugLoc(DL);
1685
1686 return CI;
1687}
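// For example (hypothetical operands), createFFSIntrinsic(Builder, %x, DL,
// /*ZeroCheck=*/true, Intrinsic::ctlz) emits IR along the lines of:
//
//   %0 = call i32 @llvm.ctlz.i32(i32 %x, i1 true)
//
// where the trailing i1 flag records that a zero input yields an undefined
// result; passing true is only valid when the input is known non-zero.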
1688
1689/// Transform the following loop (Using CTLZ, CTTZ is similar):
1690/// loop:
1691/// CntPhi = PHI [Cnt0, CntInst]
1692/// PhiX = PHI [InitX, DefX]
1693/// CntInst = CntPhi + 1
1694/// DefX = PhiX >> 1
1695/// LOOP_BODY
1696/// Br: loop if (DefX != 0)
1697/// Use(CntPhi) or Use(CntInst)
1698///
1699/// Into:
1700/// If CntPhi used outside the loop:
1701/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
1702/// Count = CountPrev + 1
1703/// else
1704/// Count = BitWidth(InitX) - CTLZ(InitX)
1705/// loop:
1706/// CntPhi = PHI [Cnt0, CntInst]
1707/// PhiX = PHI [InitX, DefX]
1708/// PhiCount = PHI [Count, Dec]
1709/// CntInst = CntPhi + 1
1710/// DefX = PhiX >> 1
1711/// Dec = PhiCount - 1
1712/// LOOP_BODY
1713/// Br: loop if (Dec != 0)
1714/// Use(CountPrev + Cnt0) // Use(CntPhi)
1715/// or
1716/// Use(Count + Cnt0) // Use(CntInst)
1717///
1718/// If LOOP_BODY is empty the loop will be deleted.
1719/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
1720void LoopIdiomRecognize::transformLoopToCountable(
1721 Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
1722 PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
1723 bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
1724 BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
1725
1726 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
1727 IRBuilder<> Builder(PreheaderBr);
1728 Builder.SetCurrentDebugLocation(DL);
1729
1730 // If there are no uses of CntPhi create:
1731 // Count = BitWidth - CTLZ(InitX);
1732 // NewCount = Count;
1733 // If there are uses of CntPhi create:
1734 // NewCount = BitWidth - CTLZ(InitX >> 1);
1735 // Count = NewCount + 1;
1736 Value *InitXNext;
1737 if (IsCntPhiUsedOutsideLoop) {
1738 if (DefX->getOpcode() == Instruction::AShr)
1739 InitXNext = Builder.CreateAShr(InitX, 1);
1740 else if (DefX->getOpcode() == Instruction::LShr)
1741 InitXNext = Builder.CreateLShr(InitX, 1);
1742 else if (DefX->getOpcode() == Instruction::Shl) // cttz
1743 InitXNext = Builder.CreateShl(InitX, 1);
1744 else
1745 llvm_unreachable("Unexpected opcode!");
1746 } else
1747 InitXNext = InitX;
1748 Value *Count =
1749 createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
1750 Type *CountTy = Count->getType();
1751 Count = Builder.CreateSub(
1752 ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
1753 Value *NewCount = Count;
1754 if (IsCntPhiUsedOutsideLoop)
1755 Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
1756
1757 NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
1758
1759 Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
1760 if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
1761 // If the counter was being incremented in the loop, add NewCount to the
1762 // counter's initial value, but only if the initial value is not zero.
1763 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1764 if (!InitConst || !InitConst->isZero())
1765 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1766 } else {
1767 // If the count was being decremented in the loop, subtract NewCount from
1768 // the counter's initial value.
1769 NewCount = Builder.CreateSub(CntInitVal, NewCount);
1770 }
1771
1772 // Step 2: Insert new IV and loop condition:
1773 // loop:
1774 // ...
1775 // PhiCount = PHI [Count, Dec]
1776 // ...
1777 // Dec = PhiCount - 1
1778 // ...
1779 // Br: loop if (Dec != 0)
1780 BasicBlock *Body = *(CurLoop->block_begin());
1781 auto *LbBr = cast<BranchInst>(Body->getTerminator());
1782 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1783
1784 PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi", &Body->front());
1785
1786 Builder.SetInsertPoint(LbCond);
1787 Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
1788 TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
1789
1790 TcPhi->addIncoming(Count, Preheader);
1791 TcPhi->addIncoming(TcDec, Body);
1792
1793 CmpInst::Predicate Pred =
1794 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
1795 LbCond->setPredicate(Pred);
1796 LbCond->setOperand(0, TcDec);
1797 LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
1798
1799 // Step 3: All the references to the original counter outside
1800 // the loop are replaced with the NewCount
1801 if (IsCntPhiUsedOutsideLoop)
1802 CntPhi->replaceUsesOutsideBlock(NewCount, Body);
1803 else
1804 CntInst->replaceUsesOutsideBlock(NewCount, Body);
1805
1806 // step 4: Forget the "non-computable" trip-count SCEV associated with the
1807 // loop. The loop would otherwise not be deleted even if it becomes empty.
1808 SE->forgetLoop(CurLoop);
1809}
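// A worked instance of the Count formula above, for a hypothetical i32 input
// InitX = 16 with a lshr recurrence (so IntrinID is ctlz):
//
//   CTLZ(16) = 27                       // 16 = 0b10000; 27 leading zeros
//   Count    = BitWidth - CTLZ(InitX)   // 32 - 27 = 5
//
// which matches the five iterations of "do { cnt++; x >>= 1; } while (x);".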
1810
1811void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
1812 Instruction *CntInst,
1813 PHINode *CntPhi, Value *Var) {
1814 BasicBlock *PreHead = CurLoop->getLoopPreheader();
1815 auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
1816 const DebugLoc &DL = CntInst->getDebugLoc();
1817
1818 // Assume that before the transformation the loop looks like this:
1819 // if (x) // the precondition
1820 // do { cnt++; x &= x - 1; } while(x);
1821
1822 // Step 1: Insert the ctpop instruction at the end of the precondition block
1823 IRBuilder<> Builder(PreCondBr);
1824 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
1825 {
1826 PopCnt = createPopcntIntrinsic(Builder, Var, DL);
1827 NewCount = PopCntZext =
1828 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
1829
1830 if (NewCount != PopCnt)
1831 (cast<Instruction>(NewCount))->setDebugLoc(DL);
1832
1833 // TripCnt is exactly the number of iterations the loop has
1834 TripCnt = NewCount;
1835
1836 // If the population counter's initial value is not zero, insert Add Inst.
1837 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
1838 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1839 if (!InitConst || !InitConst->isZero()) {
1840 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1841 (cast<Instruction>(NewCount))->setDebugLoc(DL);
1842 }
1843 }
1844
1845 // Step 2: Replace the precondition "if (x == 0) goto loop-exit" with
1846 // "if (NewCount == 0) goto loop-exit". Without this change, the intrinsic
1847 // call would be partially dead code, and downstream passes would drag
1848 // it back from the precondition block to the preheader.
1849 {
1850 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
1851
1852 Value *Opnd0 = PopCntZext;
1853 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
1854 if (PreCond->getOperand(0) != Var)
1855 std::swap(Opnd0, Opnd1);
1856
1857 ICmpInst *NewPreCond = cast<ICmpInst>(
1858 Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
1859 PreCondBr->setCondition(NewPreCond);
1860
1861 RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
1862 }
1863
1864 // Step 3: Note that the population count is exactly the trip count of the
1865 // loop in question, which enables us to convert the loop from a
1866 // noncountable loop into a countable one. The benefit is twofold:
1867 //
1868 // - If the loop only counts population, the entire loop becomes dead after
1869 // the transformation. It is a lot easier to prove a countable loop dead
1870 // than to prove a noncountable one. (In some C dialects, an infinite loop
1871 // isn't dead even if it computes nothing useful. In general, DCE needs
1872 // to prove a noncountable loop finite before safely deleting it.)
1873 //
1874 // - If the loop also performs something else, it remains alive.
1875 // Since it is transformed to countable form, it can be aggressively
1876 // optimized by some optimizations which are in general not applicable
1877 // to a noncountable loop.
1878 //
1879 // After this step, this loop (conceptually) would look like the following:
1880 // newcnt = __builtin_ctpop(x);
1881 // t = newcnt;
1882 // if (x)
1883 // do { cnt++; x &= x-1; t--; } while (t > 0);
1884 BasicBlock *Body = *(CurLoop->block_begin());
1885 {
1886 auto *LbBr = cast<BranchInst>(Body->getTerminator());
1887 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1888 Type *Ty = TripCnt->getType();
1889
1890 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1891
1892 Builder.SetInsertPoint(LbCond);
1893 Instruction *TcDec = cast<Instruction>(
1894 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1895 "tcdec", false, true));
1896
1897 TcPhi->addIncoming(TripCnt, PreHead);
1898 TcPhi->addIncoming(TcDec, Body);
1899
1900 CmpInst::Predicate Pred =
1901 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
1902 LbCond->setPredicate(Pred);
1903 LbCond->setOperand(0, TcDec);
1904 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1905 }
1906
1907 // Step 4: All the references to the original population counter outside
1908 // the loop are replaced with the NewCount -- the value returned from
1909 // __builtin_ctpop().
1910 CntInst->replaceUsesOutsideBlock(NewCount, Body);
1911
1912 // step 5: Forget the "non-computable" trip-count SCEV associated with the
1913 // loop. The loop would otherwise not be deleted even if it becomes empty.
1914 SE->forgetLoop(CurLoop);
1915}
1916
1917/// Match loop-invariant value.
1918template <typename SubPattern_t> struct match_LoopInvariant {
1919 SubPattern_t SubPattern;
1920 const Loop *L;
1921
1922 match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
1923 : SubPattern(SP), L(L) {}
1924
1925 template <typename ITy> bool match(ITy *V) {
1926 return L->isLoopInvariant(V) && SubPattern.match(V);
1927 }
1928};
1929
1930/// Matches if the value is loop-invariant.
1931template <typename Ty>
1932inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
1933 return match_LoopInvariant<Ty>(M, L);
1934}
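// Usage sketch, mirroring how the matcher is composed with PatternMatch in
// detectShiftUntilBitTestIdiom below (V and BitPos are illustrative):
//
//   Value *BitPos;
//   if (match(V, m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)), CurLoop)))
//     ...; // V is a "1 << BitPos" that is invariant in CurLoop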
1935
1936/// Return true if the idiom is detected in the loop.
1937///
1938/// The core idiom we are trying to detect is:
1939/// \code
1940/// entry:
1941/// <...>
1942/// %bitmask = shl i32 1, %bitpos
1943/// br label %loop
1944///
1945/// loop:
1946/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
1947/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
1948/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
1949/// %x.next = shl i32 %x.curr, 1
1950/// <...>
1951/// br i1 %x.curr.isbitunset, label %loop, label %end
1952///
1953/// end:
1954/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
1955/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
1956/// <...>
1957/// \endcode
1958static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
1959 Value *&BitMask, Value *&BitPos,
1960 Value *&CurrX, Instruction *&NextX) {
1961 LLVM_DEBUG(dbgs() << DEBUG_TYPE
1962 " Performing shift-until-bittest idiom detection.\n");
1963
1964 // Give up if the loop has multiple blocks or multiple backedges.
1965 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
1966 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
1967 return false;
1968 }
1969
1970 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
1971 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
1972 assert(LoopPreheaderBB && "There is always a loop preheader.");
1973
1974 using namespace PatternMatch;
1975
1976 // Step 1: Check if the loop backedge is in desirable form.
1977
1978 ICmpInst::Predicate Pred;
1979 Value *CmpLHS, *CmpRHS;
1980 BasicBlock *TrueBB, *FalseBB;
1981 if (!match(LoopHeaderBB->getTerminator(),
1982 m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
1983 m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
1984 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
1985 return false;
1986 }
1987
1988 // Step 2: Check if the backedge's condition is in desirable form.
1989
1990 auto MatchVariableBitMask = [&]() {
1991 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
1992 match(CmpLHS,
1993 m_c_And(m_Value(CurrX),
1994 m_CombineAnd(
1995 m_Value(BitMask),
1996 m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
1997 CurLoop))));
1998 };
1999 auto MatchConstantBitMask = [&]() {
2000 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2001 match(CmpLHS, m_And(m_Value(CurrX),
2002 m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
2003 (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
2004 };
2005 auto MatchDecomposableConstantBitMask = [&]() {
2006 APInt Mask;
2007 return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
2008 ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
2009 (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
2010 (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
2011 };
2012
2013 if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
2014 !MatchDecomposableConstantBitMask()) {
2015 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2016 return false;
2017 }
2018
2019 // Step 3: Check if the recurrence is in desirable form.
2020 auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2021 if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
2022 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2023 return false;
2024 }
2025
2026 BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2027 NextX =
2028 dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2029
2030 if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2031 // FIXME: support right-shift?
2032 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2033 return false;
2034 }
2035
2036 // Step 4: Check if the backedge's destinations are in desirable form.
2037
2038 assert(ICmpInst::isEquality(Pred) &&
2039 "Should only get equality predicates here.");
2040
2041 // cmp-br is commutative, so canonicalize to a single variant.
2042 if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2043 Pred = ICmpInst::getInversePredicate(Pred);
2044 std::swap(TrueBB, FalseBB);
2045 }
2046
2047 // We expect to exit loop when comparison yields false,
2048 // so when it yields true we should branch back to loop header.
2049 if (TrueBB != LoopHeaderBB) {
2050 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2051 return false;
2052 }
2053
2054 // Okay, idiom checks out.
2055 return true;
2056}
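// At the source level, the idiom detected above is roughly (hypothetical C,
// with a loop-invariant bit position):
//
//   unsigned mask = 1u << bitpos;    // %bitmask
//   while ((x & mask) == 0)          // %x.curr.isbitunset
//     x <<= 1;                       // %x.next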
2057
2058/// Look for the following loop:
2059/// \code
2060/// entry:
2061/// <...>
2062/// %bitmask = shl i32 1, %bitpos
2063/// br label %loop
2064///
2065/// loop:
2066/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2067/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2068/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2069/// %x.next = shl i32 %x.curr, 1
2070/// <...>
2071/// br i1 %x.curr.isbitunset, label %loop, label %end
2072///
2073/// end:
2074/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2075/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2076/// <...>
2077/// \endcode
2078///
2079/// And transform it into:
2080/// \code
2081/// entry:
2082/// %bitmask = shl i32 1, %bitpos
2083/// %lowbitmask = add i32 %bitmask, -1
2084/// %mask = or i32 %lowbitmask, %bitmask
2085/// %x.masked = and i32 %x, %mask
2086/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
2087/// i1 true)
2088/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
2089/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
2090/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
2091/// %tripcount = add i32 %backedgetakencount, 1
2092/// %x.curr = shl i32 %x, %backedgetakencount
2093/// %x.next = shl i32 %x, %tripcount
2094/// br label %loop
2095///
2096/// loop:
2097/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
2098/// %loop.iv.next = add nuw i32 %loop.iv, 1
2099/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
2100/// <...>
2101/// br i1 %loop.ivcheck, label %end, label %loop
2102///
2103/// end:
2104/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2105/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2106/// <...>
2107/// \endcode
2108bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
2109 bool MadeChange = false;
2110
2111 Value *X, *BitMask, *BitPos, *XCurr;
2112 Instruction *XNext;
2113 if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
2114 XNext)) {
2115 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2116 " shift-until-bittest idiom detection failed.\n");
2117 return MadeChange;
2118 }
2119 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
2120
2121 // Ok, it is the idiom we were looking for, we *could* transform this loop,
2122 // but is it profitable to transform?
2123
2124 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2125 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2126 assert(LoopPreheaderBB && "There is always a loop preheader.");
2127
2128 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2129 assert(SuccessorBB && "There is only a single successor.");
2130
2131 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2132 Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
2133
2134 Intrinsic::ID IntrID = Intrinsic::ctlz;
2135 Type *Ty = X->getType();
2136 unsigned Bitwidth = Ty->getScalarSizeInBits();
2137
2138 TargetTransformInfo::TargetCostKind CostKind =
2139 TargetTransformInfo::TCK_SizeAndLatency;
2140
2141 // The rewrite is considered to be unprofitable if and only if the
2142 // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
2143 // making the loop countable, even if nothing else changes.
2144 IntrinsicCostAttributes Attrs(
2145 IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()});
2146 InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2147 if (Cost > TargetTransformInfo::TCC_Basic) {
2148 LLVM_DEBUG(dbgs() << DEBUG_TYPE
2149 " Intrinsic is too costly, not beneficial\n");
2150 return MadeChange;
2151 }
2152 if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
2153 TargetTransformInfo::TCC_Basic) {
2154 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
2155 return MadeChange;
2156 }
2157
2158 // Ok, transform appears worthwhile.
2159 MadeChange = true;
2160
2161 // Step 1: Compute the loop trip count.
2162
2163 Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
2164 BitPos->getName() + ".lowbitmask");
2165 Value *Mask =
2166 Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
2167 Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
2168 CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
2169 IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()},
2170 /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
2171 Value *XMaskedNumActiveBits = Builder.CreateSub(
2172 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
2173 XMasked->getName() + ".numactivebits", /*HasNUW=*/true,
2174 /*HasNSW=*/Bitwidth != 2);
2175 Value *XMaskedLeadingOnePos =
2176 Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
2177 XMasked->getName() + ".leadingonepos", /*HasNUW=*/false,
2178 /*HasNSW=*/Bitwidth > 2);
2179
2180 Value *LoopBackedgeTakenCount = Builder.CreateSub(
2181 BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
2182 /*HasNUW=*/true, /*HasNSW=*/true);
2183 // We know the loop's backedge-taken count, but what is its trip count?
2184 // Note that NUW is always safe, while NSW is only safe for bitwidths != 2.
2185 Value *LoopTripCount =
2186 Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2187 CurLoop->getName() + ".tripcount", /*HasNUW=*/true,
2188 /*HasNSW=*/Bitwidth != 2);
2189
2190 // Step 2: Compute the recurrence's final value without a loop.
2191
2192 // NewX is always safe to compute, because `LoopBackedgeTakenCount`
2193 // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
2194 Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
2195 NewX->takeName(XCurr);
2196 if (auto *I = dyn_cast<Instruction>(NewX))
2197 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2198
2199 Value *NewXNext;
2200 // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
2201 // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
2202 // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
2203 // that isn't the case, we'll need to emit an alternative, safe IR.
2204 if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
2205 PatternMatch::match(
2206 BitPos, PatternMatch::m_SpecificInt_ICMP(
2207 ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
2208 Ty->getScalarSizeInBits() - 1))))
2209 NewXNext = Builder.CreateShl(X, LoopTripCount);
2210 else {
2211 // Otherwise, just additionally shift by one. It's the smallest solution;
2212 // alternatively, we could check that NewX is INT_MIN (or that BitPos is
2213 // bitwidth(x) - 1) and select 0 instead.
2214 NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
2215 }
2216
2217 NewXNext->takeName(XNext);
2218 if (auto *I = dyn_cast<Instruction>(NewXNext))
2219 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2220
2221 // Step 3: Adjust the successor basic block to receive the computed
2222 // recurrence's final value instead of the recurrence itself.
2223
2224 XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
2225 XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
2226
2227 // Step 4: Rewrite the loop into a countable form, with canonical IV.
2228
2229 // The new canonical induction variable.
2230 Builder.SetInsertPoint(&LoopHeaderBB->front());
2231 auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2232
2233 // The induction itself.
2234 // Note that NUW is always safe, while NSW is only safe for bitwidths != 2.
2235 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2236 auto *IVNext =
2237 Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
2238 /*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);
2239
2240 // The loop trip count check.
2241 auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
2242 CurLoop->getName() + ".ivcheck");
2243 Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
2244 LoopHeaderBB->getTerminator()->eraseFromParent();
2245
2246 // Populate the IV PHI.
2247 IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2248 IV->addIncoming(IVNext, LoopHeaderBB);
2249
2250 // Step 5: Forget the "non-computable" trip-count SCEV associated with the
2251 // loop. The loop would otherwise not be deleted even if it becomes empty.
2252
2253 SE->forgetLoop(CurLoop);
2254
2255 // Other passes will take care of actually deleting the loop if possible.
2256
2257 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
2258
2259 ++NumShiftUntilBitTest;
2260 return MadeChange;
2261}
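// A worked instance of the trip-count arithmetic above, for hypothetical i32
// values x = 0b0010 and bitpos = 4:
//
//   mask               = 0b11111                      // bitmask | lowbitmask
//   x.masked           = 0b00010
//   numactivebits      = 32 - ctlz(x.masked) = 32 - 30 = 2
//   leadingonepos      = 2 - 1 = 1
//   backedgetakencount = bitpos - leadingonepos = 4 - 1 = 3
//   tripcount          = 3 + 1 = 4
//
// so x.curr = x << 3 = 0b10000 (bit 4 set) and x.next = x << 4, matching what
// the original loop would have produced.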