Bug Summary

File:llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
Warning:line 520, column 15
Although the value stored to 'PatternValue' is used in the enclosing expression, the value is never actually read from 'PatternValue'

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopIdiomRecognize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-12/lib/clang/12.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/build-llvm/lib/Transforms/Scalar -I /build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar -I /build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/build-llvm/include -I /build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-12/lib/clang/12.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/build-llvm/lib/Transforms/Scalar -fdebug-prefix-map=/build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1=. -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2021-01-24-223304-31662-1 -x c++ /build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
1//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements an idiom recognizer that transforms simple loops into a
10// non-loop form. In cases that this kicks in, it can be a significant
11// performance win.
12//
13// If compiling for code size we avoid idiom recognition if the resulting
14// code could be larger than the code for the original loop. One way this could
15// happen is if the loop is not removable after idiom recognition due to the
16// presence of non-idiom instructions. The initial implementation of the
17// heuristics applies to idioms in multi-block loops.
18//
19//===----------------------------------------------------------------------===//
20//
21// TODO List:
22//
23// Future loop memory idioms to recognize:
24// memcmp, memmove, strlen, etc.
25// Future floating point idioms to recognize in -ffast-math mode:
26// fpowi
27// Future integer operation idioms to recognize:
28// ctpop
29//
30// Beware that isel's default lowering for ctpop is highly inefficient for
31// i64 and larger types when i64 is legal and the value has few bits set. It
32// would be good to enhance isel to emit a loop for ctpop in this case.
33//
34// This could recognize common matrix multiplies and dot product idioms and
35// replace them with calls to BLAS (if linked in??).
36//
37//===----------------------------------------------------------------------===//
38
39#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
40#include "llvm/ADT/APInt.h"
41#include "llvm/ADT/ArrayRef.h"
42#include "llvm/ADT/DenseMap.h"
43#include "llvm/ADT/MapVector.h"
44#include "llvm/ADT/SetVector.h"
45#include "llvm/ADT/SmallPtrSet.h"
46#include "llvm/ADT/SmallVector.h"
47#include "llvm/ADT/Statistic.h"
48#include "llvm/ADT/StringRef.h"
49#include "llvm/Analysis/AliasAnalysis.h"
50#include "llvm/Analysis/CmpInstAnalysis.h"
51#include "llvm/Analysis/LoopAccessAnalysis.h"
52#include "llvm/Analysis/LoopInfo.h"
53#include "llvm/Analysis/LoopPass.h"
54#include "llvm/Analysis/MemoryLocation.h"
55#include "llvm/Analysis/MemorySSA.h"
56#include "llvm/Analysis/MemorySSAUpdater.h"
57#include "llvm/Analysis/MustExecute.h"
58#include "llvm/Analysis/OptimizationRemarkEmitter.h"
59#include "llvm/Analysis/ScalarEvolution.h"
60#include "llvm/Analysis/ScalarEvolutionExpressions.h"
61#include "llvm/Analysis/TargetLibraryInfo.h"
62#include "llvm/Analysis/TargetTransformInfo.h"
63#include "llvm/Analysis/ValueTracking.h"
64#include "llvm/IR/Attributes.h"
65#include "llvm/IR/BasicBlock.h"
66#include "llvm/IR/Constant.h"
67#include "llvm/IR/Constants.h"
68#include "llvm/IR/DataLayout.h"
69#include "llvm/IR/DebugLoc.h"
70#include "llvm/IR/DerivedTypes.h"
71#include "llvm/IR/Dominators.h"
72#include "llvm/IR/GlobalValue.h"
73#include "llvm/IR/GlobalVariable.h"
74#include "llvm/IR/IRBuilder.h"
75#include "llvm/IR/InstrTypes.h"
76#include "llvm/IR/Instruction.h"
77#include "llvm/IR/Instructions.h"
78#include "llvm/IR/IntrinsicInst.h"
79#include "llvm/IR/Intrinsics.h"
80#include "llvm/IR/LLVMContext.h"
81#include "llvm/IR/Module.h"
82#include "llvm/IR/PassManager.h"
83#include "llvm/IR/PatternMatch.h"
84#include "llvm/IR/Type.h"
85#include "llvm/IR/User.h"
86#include "llvm/IR/Value.h"
87#include "llvm/IR/ValueHandle.h"
88#include "llvm/InitializePasses.h"
89#include "llvm/Pass.h"
90#include "llvm/Support/Casting.h"
91#include "llvm/Support/CommandLine.h"
92#include "llvm/Support/Debug.h"
93#include "llvm/Support/raw_ostream.h"
94#include "llvm/Transforms/Scalar.h"
95#include "llvm/Transforms/Utils/BuildLibCalls.h"
96#include "llvm/Transforms/Utils/Local.h"
97#include "llvm/Transforms/Utils/LoopUtils.h"
98#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
99#include <algorithm>
100#include <cassert>
101#include <cstdint>
102#include <utility>
103#include <vector>
104
105using namespace llvm;
106
107#define DEBUG_TYPE"loop-idiom" "loop-idiom"
108
109STATISTIC(NumMemSet, "Number of memset's formed from loop stores")static llvm::Statistic NumMemSet = {"loop-idiom", "NumMemSet"
, "Number of memset's formed from loop stores"}
;
110STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores")static llvm::Statistic NumMemCpy = {"loop-idiom", "NumMemCpy"
, "Number of memcpy's formed from loop load+stores"}
;
111STATISTIC(static llvm::Statistic NumShiftUntilBitTest = {"loop-idiom", "NumShiftUntilBitTest"
, "Number of uncountable loops recognized as 'shift until bitttest' idiom"
}
112 NumShiftUntilBitTest,static llvm::Statistic NumShiftUntilBitTest = {"loop-idiom", "NumShiftUntilBitTest"
, "Number of uncountable loops recognized as 'shift until bitttest' idiom"
}
113 "Number of uncountable loops recognized as 'shift until bitttest' idiom")static llvm::Statistic NumShiftUntilBitTest = {"loop-idiom", "NumShiftUntilBitTest"
, "Number of uncountable loops recognized as 'shift until bitttest' idiom"
}
;
114
115bool DisableLIRP::All;
116static cl::opt<bool, true>
117 DisableLIRPAll("disable-" DEBUG_TYPE"loop-idiom" "-all",
118 cl::desc("Options to disable Loop Idiom Recognize Pass."),
119 cl::location(DisableLIRP::All), cl::init(false),
120 cl::ReallyHidden);
121
122bool DisableLIRP::Memset;
123static cl::opt<bool, true>
124 DisableLIRPMemset("disable-" DEBUG_TYPE"loop-idiom" "-memset",
125 cl::desc("Proceed with loop idiom recognize pass, but do "
126 "not convert loop(s) to memset."),
127 cl::location(DisableLIRP::Memset), cl::init(false),
128 cl::ReallyHidden);
129
130bool DisableLIRP::Memcpy;
131static cl::opt<bool, true>
132 DisableLIRPMemcpy("disable-" DEBUG_TYPE"loop-idiom" "-memcpy",
133 cl::desc("Proceed with loop idiom recognize pass, but do "
134 "not convert loop(s) to memcpy."),
135 cl::location(DisableLIRP::Memcpy), cl::init(false),
136 cl::ReallyHidden);
137
138static cl::opt<bool> UseLIRCodeSizeHeurs(
139 "use-lir-code-size-heurs",
140 cl::desc("Use loop idiom recognition code size heuristics when compiling"
141 "with -Os/-Oz"),
142 cl::init(true), cl::Hidden);
143
144namespace {
145
146class LoopIdiomRecognize {
147 Loop *CurLoop = nullptr;
148 AliasAnalysis *AA;
149 DominatorTree *DT;
150 LoopInfo *LI;
151 ScalarEvolution *SE;
152 TargetLibraryInfo *TLI;
153 const TargetTransformInfo *TTI;
154 const DataLayout *DL;
155 OptimizationRemarkEmitter &ORE;
156 bool ApplyCodeSizeHeuristics;
157 std::unique_ptr<MemorySSAUpdater> MSSAU;
158
159public:
160 explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
161 LoopInfo *LI, ScalarEvolution *SE,
162 TargetLibraryInfo *TLI,
163 const TargetTransformInfo *TTI, MemorySSA *MSSA,
164 const DataLayout *DL,
165 OptimizationRemarkEmitter &ORE)
166 : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
167 if (MSSA)
168 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
169 }
170
171 bool runOnLoop(Loop *L);
172
173private:
174 using StoreList = SmallVector<StoreInst *, 8>;
175 using StoreListMap = MapVector<Value *, StoreList>;
176
177 StoreListMap StoreRefsForMemset;
178 StoreListMap StoreRefsForMemsetPattern;
179 StoreList StoreRefsForMemcpy;
180 bool HasMemset;
181 bool HasMemsetPattern;
182 bool HasMemcpy;
183
184 /// Return code for isLegalStore()
185 enum LegalStoreKind {
186 None = 0,
187 Memset,
188 MemsetPattern,
189 Memcpy,
190 UnorderedAtomicMemcpy,
191 DontUse // Dummy retval never to be used. Allows catching errors in retval
192 // handling.
193 };
194
195 /// \name Countable Loop Idiom Handling
196 /// @{
197
198 bool runOnCountableLoop();
199 bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
200 SmallVectorImpl<BasicBlock *> &ExitBlocks);
201
202 void collectStores(BasicBlock *BB);
203 LegalStoreKind isLegalStore(StoreInst *SI);
204 enum class ForMemset { No, Yes };
205 bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
206 ForMemset For);
207 bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
208
209 bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
210 MaybeAlign StoreAlignment, Value *StoredVal,
211 Instruction *TheStore,
212 SmallPtrSetImpl<Instruction *> &Stores,
213 const SCEVAddRecExpr *Ev, const SCEV *BECount,
214 bool NegStride, bool IsLoopMemset = false);
215 bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
216 bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
217 bool IsLoopMemset = false);
218
219 /// @}
220 /// \name Noncountable Loop Idiom Handling
221 /// @{
222
223 bool runOnNoncountableLoop();
224
225 bool recognizePopcount();
226 void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
227 PHINode *CntPhi, Value *Var);
228 bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
229 void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
230 Instruction *CntInst, PHINode *CntPhi,
231 Value *Var, Instruction *DefX,
232 const DebugLoc &DL, bool ZeroCheck,
233 bool IsCntPhiUsedOutsideLoop);
234
235 bool recognizeShiftUntilBitTest();
236
237 /// @}
238};
239
240class LoopIdiomRecognizeLegacyPass : public LoopPass {
241public:
242 static char ID;
243
244 explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
245 initializeLoopIdiomRecognizeLegacyPassPass(
246 *PassRegistry::getPassRegistry());
247 }
248
249 bool runOnLoop(Loop *L, LPPassManager &LPM) override {
250 if (DisableLIRP::All)
251 return false;
252
253 if (skipLoop(L))
254 return false;
255
256 AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
257 DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
258 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
259 ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
260 TargetLibraryInfo *TLI =
261 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
262 *L->getHeader()->getParent());
263 const TargetTransformInfo *TTI =
264 &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
265 *L->getHeader()->getParent());
266 const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
267 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
268 MemorySSA *MSSA = nullptr;
269 if (MSSAAnalysis)
270 MSSA = &MSSAAnalysis->getMSSA();
271
272 // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
273 // pass. Function analyses need to be preserved across loop transformations
274 // but ORE cannot be preserved (see comment before the pass definition).
275 OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
276
277 LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
278 return LIR.runOnLoop(L);
279 }
280
281 /// This transformation requires natural loop information & requires that
282 /// loop preheaders be inserted into the CFG.
283 void getAnalysisUsage(AnalysisUsage &AU) const override {
284 AU.addRequired<TargetLibraryInfoWrapperPass>();
285 AU.addRequired<TargetTransformInfoWrapperPass>();
286 AU.addPreserved<MemorySSAWrapperPass>();
287 getLoopAnalysisUsage(AU);
288 }
289};
290
291} // end anonymous namespace
292
293char LoopIdiomRecognizeLegacyPass::ID = 0;
294
295PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
296 LoopStandardAnalysisResults &AR,
297 LPMUpdater &) {
298 if (DisableLIRP::All)
299 return PreservedAnalyses::all();
300
301 const auto *DL = &L.getHeader()->getModule()->getDataLayout();
302
303 // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
304 // pass. Function analyses need to be preserved across loop transformations
305 // but ORE cannot be preserved (see comment before the pass definition).
306 OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
307
308 LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
309 AR.MSSA, DL, ORE);
310 if (!LIR.runOnLoop(&L))
311 return PreservedAnalyses::all();
312
313 auto PA = getLoopPassPreservedAnalyses();
314 if (AR.MSSA)
315 PA.preserve<MemorySSAAnalysis>();
316 return PA;
317}
318
319INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",static void *initializeLoopIdiomRecognizeLegacyPassPassOnce(PassRegistry
&Registry) {
320 "Recognize loop idioms", false, false)static void *initializeLoopIdiomRecognizeLegacyPassPassOnce(PassRegistry
&Registry) {
321INITIALIZE_PASS_DEPENDENCY(LoopPass)initializeLoopPassPass(Registry);
322INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)initializeTargetLibraryInfoWrapperPassPass(Registry);
323INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)initializeTargetTransformInfoWrapperPassPass(Registry);
324INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",PassInfo *PI = new PassInfo( "Recognize loop idioms", "loop-idiom"
, &LoopIdiomRecognizeLegacyPass::ID, PassInfo::NormalCtor_t
(callDefaultCtor<LoopIdiomRecognizeLegacyPass>), false,
false); Registry.registerPass(*PI, true); return PI; } static
llvm::once_flag InitializeLoopIdiomRecognizeLegacyPassPassFlag
; void llvm::initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry
&Registry) { llvm::call_once(InitializeLoopIdiomRecognizeLegacyPassPassFlag
, initializeLoopIdiomRecognizeLegacyPassPassOnce, std::ref(Registry
)); }
325 "Recognize loop idioms", false, false)PassInfo *PI = new PassInfo( "Recognize loop idioms", "loop-idiom"
, &LoopIdiomRecognizeLegacyPass::ID, PassInfo::NormalCtor_t
(callDefaultCtor<LoopIdiomRecognizeLegacyPass>), false,
false); Registry.registerPass(*PI, true); return PI; } static
llvm::once_flag InitializeLoopIdiomRecognizeLegacyPassPassFlag
; void llvm::initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry
&Registry) { llvm::call_once(InitializeLoopIdiomRecognizeLegacyPassPassFlag
, initializeLoopIdiomRecognizeLegacyPassPassOnce, std::ref(Registry
)); }
326
327Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
328
329static void deleteDeadInstruction(Instruction *I) {
330 I->replaceAllUsesWith(UndefValue::get(I->getType()));
331 I->eraseFromParent();
332}
333
334//===----------------------------------------------------------------------===//
335//
336// Implementation of LoopIdiomRecognize
337//
338//===----------------------------------------------------------------------===//
339
340bool LoopIdiomRecognize::runOnLoop(Loop *L) {
341 CurLoop = L;
342 // If the loop could not be converted to canonical form, it must have an
343 // indirectbr in it, just give up.
344 if (!L->getLoopPreheader())
345 return false;
346
347 // Disable loop idiom recognition if the function's name is a common idiom.
348 StringRef Name = L->getHeader()->getParent()->getName();
349 if (Name == "memset" || Name == "memcpy")
350 return false;
351
352 // Determine if code size heuristics need to be applied.
353 ApplyCodeSizeHeuristics =
354 L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
355
356 HasMemset = TLI->has(LibFunc_memset);
357 HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
358 HasMemcpy = TLI->has(LibFunc_memcpy);
359
360 if (HasMemset || HasMemsetPattern || HasMemcpy)
361 if (SE->hasLoopInvariantBackedgeTakenCount(L))
362 return runOnCountableLoop();
363
364 return runOnNoncountableLoop();
365}
366
367bool LoopIdiomRecognize::runOnCountableLoop() {
368 const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
369 assert(!isa<SCEVCouldNotCompute>(BECount) &&((!isa<SCEVCouldNotCompute>(BECount) && "runOnCountableLoop() called on a loop without a predictable"
"backedge-taken count") ? static_cast<void> (0) : __assert_fail
("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\""
, "/build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 371, __PRETTY_FUNCTION__))
370 "runOnCountableLoop() called on a loop without a predictable"((!isa<SCEVCouldNotCompute>(BECount) && "runOnCountableLoop() called on a loop without a predictable"
"backedge-taken count") ? static_cast<void> (0) : __assert_fail
("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\""
, "/build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 371, __PRETTY_FUNCTION__))
371 "backedge-taken count")((!isa<SCEVCouldNotCompute>(BECount) && "runOnCountableLoop() called on a loop without a predictable"
"backedge-taken count") ? static_cast<void> (0) : __assert_fail
("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\""
, "/build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 371, __PRETTY_FUNCTION__))
;
372
373 // If this loop executes exactly one time, then it should be peeled, not
374 // optimized by this pass.
375 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
376 if (BECst->getAPInt() == 0)
377 return false;
378
379 SmallVector<BasicBlock *, 8> ExitBlocks;
380 CurLoop->getUniqueExitBlocks(ExitBlocks);
381
382 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
383 << CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
384 << "] Countable Loop %" << CurLoop->getHeader()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
385 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Countable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
;
386
387 // The following transforms hoist stores/memsets into the loop pre-header.
388 // Give up if the loop has instructions that may throw.
389 SimpleLoopSafetyInfo SafetyInfo;
390 SafetyInfo.computeLoopSafetyInfo(CurLoop);
391 if (SafetyInfo.anyBlockMayThrow())
392 return false;
393
394 bool MadeChange = false;
395
396 // Scan all the blocks in the loop that are not in subloops.
397 for (auto *BB : CurLoop->getBlocks()) {
398 // Ignore blocks in subloops.
399 if (LI->getLoopFor(BB) != CurLoop)
400 continue;
401
402 MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
403 }
404 return MadeChange;
405}
406
407static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
408 const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
409 return ConstStride->getAPInt();
410}
411
412/// getMemSetPatternValue - If a strided store of the specified value is safe to
413/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
414/// be passed in. Otherwise, return null.
415///
416/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
417/// just replicate their input array and then pass on to memset_pattern16.
418static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
419 // FIXME: This could check for UndefValue because it can be merged into any
420 // other valid pattern.
421
422 // If the value isn't a constant, we can't promote it to being in a constant
423 // array. We could theoretically do a store to an alloca or something, but
424 // that doesn't seem worthwhile.
425 Constant *C = dyn_cast<Constant>(V);
426 if (!C)
427 return nullptr;
428
429 // Only handle simple values that are a power of two bytes in size.
430 uint64_t Size = DL->getTypeSizeInBits(V->getType());
431 if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
432 return nullptr;
433
434 // Don't care enough about darwin/ppc to implement this.
435 if (DL->isBigEndian())
436 return nullptr;
437
438 // Convert to size in bytes.
439 Size /= 8;
440
441 // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
442 // if the top and bottom are the same (e.g. for vectors and large integers).
443 if (Size > 16)
444 return nullptr;
445
446 // If the constant is exactly 16 bytes, just use it.
447 if (Size == 16)
448 return C;
449
450 // Otherwise, we'll use an array of the constants.
451 unsigned ArraySize = 16 / Size;
452 ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
453 return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
454}
455
456LoopIdiomRecognize::LegalStoreKind
457LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
458 // Don't touch volatile stores.
459 if (SI->isVolatile())
460 return LegalStoreKind::None;
461 // We only want simple or unordered-atomic stores.
462 if (!SI->isUnordered())
463 return LegalStoreKind::None;
464
465 // Avoid merging nontemporal stores.
466 if (SI->getMetadata(LLVMContext::MD_nontemporal))
467 return LegalStoreKind::None;
468
469 Value *StoredVal = SI->getValueOperand();
470 Value *StorePtr = SI->getPointerOperand();
471
472 // Don't convert stores of non-integral pointer types to memsets (which stores
473 // integers).
474 if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
475 return LegalStoreKind::None;
476
477 // Reject stores that are so large that they overflow an unsigned.
478 // When storing out scalable vectors we bail out for now, since the code
479 // below currently only works for constant strides.
480 TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
481 if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) ||
482 (SizeInBits.getFixedSize() >> 32) != 0)
483 return LegalStoreKind::None;
484
485 // See if the pointer expression is an AddRec like {base,+,1} on the current
486 // loop, which indicates a strided store. If we have something else, it's a
487 // random store we can't handle.
488 const SCEVAddRecExpr *StoreEv =
489 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
490 if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
491 return LegalStoreKind::None;
492
493 // Check to see if we have a constant stride.
494 if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
495 return LegalStoreKind::None;
496
497 // See if the store can be turned into a memset.
498
499 // If the stored value is a byte-wise value (like i32 -1), then it may be
500 // turned into a memset of i8 -1, assuming that all the consecutive bytes
501 // are stored. A store of i32 0x01020304 can never be turned into a memset,
502 // but it can be turned into memset_pattern if the target supports it.
503 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
504 Constant *PatternValue = nullptr;
505
506 // Note: memset and memset_pattern on unordered-atomic is yet not supported
507 bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
508
509 // If we're allowed to form a memset, and the stored value would be
510 // acceptable for memset, use it.
511 if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
512 // Verify that the stored value is loop invariant. If not, we can't
513 // promote the memset.
514 CurLoop->isLoopInvariant(SplatValue)) {
515 // It looks like we can use SplatValue.
516 return LegalStoreKind::Memset;
517 } else if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
518 // Don't create memset_pattern16s with address spaces.
519 StorePtr->getType()->getPointerAddressSpace() == 0 &&
520 (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
Although the value stored to 'PatternValue' is used in the enclosing expression, the value is never actually read from 'PatternValue'
521 // It looks like we can use PatternValue!
522 return LegalStoreKind::MemsetPattern;
523 }
524
525 // Otherwise, see if the store can be turned into a memcpy.
526 if (HasMemcpy && !DisableLIRP::Memcpy) {
527 // Check to see if the stride matches the size of the store. If so, then we
528 // know that every byte is touched in the loop.
529 APInt Stride = getStoreStride(StoreEv);
530 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
531 if (StoreSize != Stride && StoreSize != -Stride)
532 return LegalStoreKind::None;
533
534 // The store must be feeding a non-volatile load.
535 LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
536
537 // Only allow non-volatile loads
538 if (!LI || LI->isVolatile())
539 return LegalStoreKind::None;
540 // Only allow simple or unordered-atomic loads
541 if (!LI->isUnordered())
542 return LegalStoreKind::None;
543
544 // See if the pointer expression is an AddRec like {base,+,1} on the current
545 // loop, which indicates a strided load. If we have something else, it's a
546 // random load we can't handle.
547 const SCEVAddRecExpr *LoadEv =
548 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
549 if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
550 return LegalStoreKind::None;
551
552 // The store and load must share the same stride.
553 if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
554 return LegalStoreKind::None;
555
556 // Success. This store can be converted into a memcpy.
557 UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
558 return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
559 : LegalStoreKind::Memcpy;
560 }
561 // This store can't be transformed into a memset/memcpy.
562 return LegalStoreKind::None;
563}
564
565void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
566 StoreRefsForMemset.clear();
567 StoreRefsForMemsetPattern.clear();
568 StoreRefsForMemcpy.clear();
569 for (Instruction &I : *BB) {
570 StoreInst *SI = dyn_cast<StoreInst>(&I);
571 if (!SI)
572 continue;
573
574 // Make sure this is a strided store with a constant stride.
575 switch (isLegalStore(SI)) {
576 case LegalStoreKind::None:
577 // Nothing to do
578 break;
579 case LegalStoreKind::Memset: {
580 // Find the base pointer.
581 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
582 StoreRefsForMemset[Ptr].push_back(SI);
583 } break;
584 case LegalStoreKind::MemsetPattern: {
585 // Find the base pointer.
586 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
587 StoreRefsForMemsetPattern[Ptr].push_back(SI);
588 } break;
589 case LegalStoreKind::Memcpy:
590 case LegalStoreKind::UnorderedAtomicMemcpy:
591 StoreRefsForMemcpy.push_back(SI);
592 break;
593 default:
594 assert(false && "unhandled return value")((false && "unhandled return value") ? static_cast<
void> (0) : __assert_fail ("false && \"unhandled return value\""
, "/build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 594, __PRETTY_FUNCTION__))
;
595 break;
596 }
597 }
598}
599
600/// runOnLoopBlock - Process the specified block, which lives in a counted loop
601/// with the specified backedge count. This block is known to be in the current
602/// loop and not in any subloops.
603bool LoopIdiomRecognize::runOnLoopBlock(
604 BasicBlock *BB, const SCEV *BECount,
605 SmallVectorImpl<BasicBlock *> &ExitBlocks) {
606 // We can only promote stores in this block if they are unconditionally
607 // executed in the loop. For a block to be unconditionally executed, it has
608 // to dominate all the exit blocks of the loop. Verify this now.
609 for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
610 if (!DT->dominates(BB, ExitBlocks[i]))
611 return false;
612
613 bool MadeChange = false;
614 // Look for store instructions, which may be optimized to memset/memcpy.
615 collectStores(BB);
616
617 // Look for a single store or sets of stores with a common base, which can be
618 // optimized into a memset (memset_pattern). The latter most commonly happens
619 // with structs and handunrolled loops.
620 for (auto &SL : StoreRefsForMemset)
621 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
622
623 for (auto &SL : StoreRefsForMemsetPattern)
624 MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
625
626 // Optimize the store into a memcpy, if it feeds an similarly strided load.
627 for (auto &SI : StoreRefsForMemcpy)
628 MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
629
630 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
631 Instruction *Inst = &*I++;
632 // Look for memset instructions, which may be optimized to a larger memset.
633 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
634 WeakTrackingVH InstPtr(&*I);
635 if (!processLoopMemSet(MSI, BECount))
636 continue;
637 MadeChange = true;
638
639 // If processing the memset invalidated our iterator, start over from the
640 // top of the block.
641 if (!InstPtr)
642 I = BB->begin();
643 continue;
644 }
645 }
646
647 return MadeChange;
648}
649
650/// See if this store(s) can be promoted to a memset.
651bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
652 const SCEV *BECount, ForMemset For) {
653 // Try to find consecutive stores that can be transformed into memsets.
654 SetVector<StoreInst *> Heads, Tails;
655 SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
656
657 // Do a quadratic search on all of the given stores and find
658 // all of the pairs of stores that follow each other.
659 SmallVector<unsigned, 16> IndexQueue;
660 for (unsigned i = 0, e = SL.size(); i < e; ++i) {
661 assert(SL[i]->isSimple() && "Expected only non-volatile stores.")((SL[i]->isSimple() && "Expected only non-volatile stores."
) ? static_cast<void> (0) : __assert_fail ("SL[i]->isSimple() && \"Expected only non-volatile stores.\""
, "/build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 661, __PRETTY_FUNCTION__))
;
662
663 Value *FirstStoredVal = SL[i]->getValueOperand();
664 Value *FirstStorePtr = SL[i]->getPointerOperand();
665 const SCEVAddRecExpr *FirstStoreEv =
666 cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
667 APInt FirstStride = getStoreStride(FirstStoreEv);
668 unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
669
670 // See if we can optimize just this store in isolation.
671 if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
672 Heads.insert(SL[i]);
673 continue;
674 }
675
676 Value *FirstSplatValue = nullptr;
677 Constant *FirstPatternValue = nullptr;
678
679 if (For == ForMemset::Yes)
680 FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
681 else
682 FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
683
684 assert((FirstSplatValue || FirstPatternValue) &&(((FirstSplatValue || FirstPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(FirstSplatValue || FirstPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 685, __PRETTY_FUNCTION__))
685 "Expected either splat value or pattern value.")(((FirstSplatValue || FirstPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(FirstSplatValue || FirstPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 685, __PRETTY_FUNCTION__))
;
686
687 IndexQueue.clear();
688 // If a store has multiple consecutive store candidates, search Stores
689 // array according to the sequence: from i+1 to e, then from i-1 to 0.
690 // This is because usually pairing with immediate succeeding or preceding
691 // candidate create the best chance to find memset opportunity.
692 unsigned j = 0;
693 for (j = i + 1; j < e; ++j)
694 IndexQueue.push_back(j);
695 for (j = i; j > 0; --j)
696 IndexQueue.push_back(j - 1);
697
698 for (auto &k : IndexQueue) {
699 assert(SL[k]->isSimple() && "Expected only non-volatile stores.")((SL[k]->isSimple() && "Expected only non-volatile stores."
) ? static_cast<void> (0) : __assert_fail ("SL[k]->isSimple() && \"Expected only non-volatile stores.\""
, "/build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 699, __PRETTY_FUNCTION__))
;
700 Value *SecondStorePtr = SL[k]->getPointerOperand();
701 const SCEVAddRecExpr *SecondStoreEv =
702 cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
703 APInt SecondStride = getStoreStride(SecondStoreEv);
704
705 if (FirstStride != SecondStride)
706 continue;
707
708 Value *SecondStoredVal = SL[k]->getValueOperand();
709 Value *SecondSplatValue = nullptr;
710 Constant *SecondPatternValue = nullptr;
711
712 if (For == ForMemset::Yes)
713 SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
714 else
715 SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
716
717 assert((SecondSplatValue || SecondPatternValue) &&(((SecondSplatValue || SecondPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SecondSplatValue || SecondPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 718, __PRETTY_FUNCTION__))
718 "Expected either splat value or pattern value.")(((SecondSplatValue || SecondPatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SecondSplatValue || SecondPatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 718, __PRETTY_FUNCTION__))
;
719
720 if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
721 if (For == ForMemset::Yes) {
722 if (isa<UndefValue>(FirstSplatValue))
723 FirstSplatValue = SecondSplatValue;
724 if (FirstSplatValue != SecondSplatValue)
725 continue;
726 } else {
727 if (isa<UndefValue>(FirstPatternValue))
728 FirstPatternValue = SecondPatternValue;
729 if (FirstPatternValue != SecondPatternValue)
730 continue;
731 }
732 Tails.insert(SL[k]);
733 Heads.insert(SL[i]);
734 ConsecutiveChain[SL[i]] = SL[k];
735 break;
736 }
737 }
738 }
739
740 // We may run into multiple chains that merge into a single chain. We mark the
741 // stores that we transformed so that we don't visit the same store twice.
742 SmallPtrSet<Value *, 16> TransformedStores;
743 bool Changed = false;
744
745 // For stores that start but don't end a link in the chain:
746 for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
747 it != e; ++it) {
748 if (Tails.count(*it))
749 continue;
750
751 // We found a store instr that starts a chain. Now follow the chain and try
752 // to transform it.
753 SmallPtrSet<Instruction *, 8> AdjacentStores;
754 StoreInst *I = *it;
755
756 StoreInst *HeadStore = I;
757 unsigned StoreSize = 0;
758
759 // Collect the chain into a list.
760 while (Tails.count(I) || Heads.count(I)) {
761 if (TransformedStores.count(I))
762 break;
763 AdjacentStores.insert(I);
764
765 StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
766 // Move to the next value in the chain.
767 I = ConsecutiveChain[I];
768 }
769
770 Value *StoredVal = HeadStore->getValueOperand();
771 Value *StorePtr = HeadStore->getPointerOperand();
772 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
773 APInt Stride = getStoreStride(StoreEv);
774
775 // Check to see if the stride matches the size of the stores. If so, then
776 // we know that every byte is touched in the loop.
777 if (StoreSize != Stride && StoreSize != -Stride)
778 continue;
779
780 bool NegStride = StoreSize == -Stride;
781
782 if (processLoopStridedStore(StorePtr, StoreSize,
783 MaybeAlign(HeadStore->getAlignment()),
784 StoredVal, HeadStore, AdjacentStores, StoreEv,
785 BECount, NegStride)) {
786 TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
787 Changed = true;
788 }
789 }
790
791 return Changed;
792}
793
794/// processLoopMemSet - See if this memset can be promoted to a large memset.
795bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
796 const SCEV *BECount) {
797 // We can only handle non-volatile memsets with a constant size.
798 if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
799 return false;
800
801 // If we're not allowed to hack on memset, we fail.
802 if (!HasMemset)
803 return false;
804
805 Value *Pointer = MSI->getDest();
806
807 // See if the pointer expression is an AddRec like {base,+,1} on the current
808 // loop, which indicates a strided store. If we have something else, it's a
809 // random store we can't handle.
810 const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
811 if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
812 return false;
813
814 // Reject memsets that are so large that they overflow an unsigned.
815 uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
816 if ((SizeInBytes >> 32) != 0)
817 return false;
818
819 // Check to see if the stride matches the size of the memset. If so, then we
820 // know that every byte is touched in the loop.
821 const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
822 if (!ConstStride)
823 return false;
824
825 APInt Stride = ConstStride->getAPInt();
826 if (SizeInBytes != Stride && SizeInBytes != -Stride)
827 return false;
828
829 // Verify that the memset value is loop invariant. If not, we can't promote
830 // the memset.
831 Value *SplatValue = MSI->getValue();
832 if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
833 return false;
834
835 SmallPtrSet<Instruction *, 1> MSIs;
836 MSIs.insert(MSI);
837 bool NegStride = SizeInBytes == -Stride;
838 return processLoopStridedStore(
839 Pointer, (unsigned)SizeInBytes, MaybeAlign(MSI->getDestAlignment()),
840 SplatValue, MSI, MSIs, Ev, BECount, NegStride, /*IsLoopMemset=*/true);
841}
842
843/// mayLoopAccessLocation - Return true if the specified loop might access the
844/// specified pointer location, which is a loop-strided access. The 'Access'
845/// argument specifies what the verboten forms of access are (read or write).
846static bool
847mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
848 const SCEV *BECount, unsigned StoreSize,
849 AliasAnalysis &AA,
850 SmallPtrSetImpl<Instruction *> &IgnoredStores) {
851 // Get the location that may be stored across the loop. Since the access is
852 // strided positively through memory, we say that the modified location starts
853 // at the pointer and has infinite size.
854 LocationSize AccessSize = LocationSize::afterPointer();
855
856 // If the loop iterates a fixed number of times, we can refine the access size
857 // to be exactly the size of the memset, which is (BECount+1)*StoreSize
858 if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
859 AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
860 StoreSize);
861
862 // TODO: For this to be really effective, we have to dive into the pointer
863 // operand in the store. Store to &A[i] of 100 will always return may alias
864 // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
865 // which will then no-alias a store to &A[100].
866 MemoryLocation StoreLoc(Ptr, AccessSize);
867
868 for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
869 ++BI)
870 for (Instruction &I : **BI)
871 if (IgnoredStores.count(&I) == 0 &&
872 isModOrRefSet(
873 intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
874 return true;
875
876 return false;
877}
878
879// If we have a negative stride, Start refers to the end of the memory location
880// we're trying to memset. Therefore, we need to recompute the base pointer,
881// which is just Start - BECount*Size.
882static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
883 Type *IntPtr, unsigned StoreSize,
884 ScalarEvolution *SE) {
885 const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
886 if (StoreSize != 1)
887 Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
888 SCEV::FlagNUW);
889 return SE->getMinusSCEV(Start, Index);
890}
891
892/// Compute the number of bytes as a SCEV from the backedge taken count.
893///
894/// This also maps the SCEV into the provided type and tries to handle the
895/// computation in a way that will fold cleanly.
896static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
897 unsigned StoreSize, Loop *CurLoop,
898 const DataLayout *DL, ScalarEvolution *SE) {
899 const SCEV *NumBytesS;
900 // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
901 // pointer size if it isn't already.
902 //
903 // If we're going to need to zero extend the BE count, check if we can add
904 // one to it prior to zero extending without overflow. Provided this is safe,
905 // it allows better simplification of the +1.
906 if (DL->getTypeSizeInBits(BECount->getType()).getFixedSize() <
907 DL->getTypeSizeInBits(IntPtr).getFixedSize() &&
908 SE->isLoopEntryGuardedByCond(
909 CurLoop, ICmpInst::ICMP_NE, BECount,
910 SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
911 NumBytesS = SE->getZeroExtendExpr(
912 SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
913 IntPtr);
914 } else {
915 NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
916 SE->getOne(IntPtr), SCEV::FlagNUW);
917 }
918
919 // And scale it based on the store size.
920 if (StoreSize != 1) {
921 NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
922 SCEV::FlagNUW);
923 }
924 return NumBytesS;
925}
926
927/// processLoopStridedStore - We see a strided store of some value. If we can
928/// transform this into a memset or memset_pattern in the loop preheader, do so.
929bool LoopIdiomRecognize::processLoopStridedStore(
930 Value *DestPtr, unsigned StoreSize, MaybeAlign StoreAlignment,
931 Value *StoredVal, Instruction *TheStore,
932 SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
933 const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
934 Value *SplatValue = isBytewiseValue(StoredVal, *DL);
935 Constant *PatternValue = nullptr;
936
937 if (!SplatValue)
938 PatternValue = getMemSetPatternValue(StoredVal, DL);
939
940 assert((SplatValue || PatternValue) &&(((SplatValue || PatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SplatValue || PatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 941, __PRETTY_FUNCTION__))
941 "Expected either splat value or pattern value.")(((SplatValue || PatternValue) && "Expected either splat value or pattern value."
) ? static_cast<void> (0) : __assert_fail ("(SplatValue || PatternValue) && \"Expected either splat value or pattern value.\""
, "/build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 941, __PRETTY_FUNCTION__))
;
942
943 // The trip count of the loop and the base pointer of the addrec SCEV is
944 // guaranteed to be loop invariant, which means that it should dominate the
945 // header. This allows us to insert code for it in the preheader.
946 unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
947 BasicBlock *Preheader = CurLoop->getLoopPreheader();
948 IRBuilder<> Builder(Preheader->getTerminator());
949 SCEVExpander Expander(*SE, *DL, "loop-idiom");
950 SCEVExpanderCleaner ExpCleaner(Expander, *DT);
951
952 Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
953 Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
954
955 bool Changed = false;
956 const SCEV *Start = Ev->getStart();
957 // Handle negative strided loops.
958 if (NegStride)
959 Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSize, SE);
960
961 // TODO: ideally we should still be able to generate memset if SCEV expander
962 // is taught to generate the dependencies at the latest point.
963 if (!isSafeToExpand(Start, *SE))
964 return Changed;
965
966 // Okay, we have a strided store "p[i]" of a splattable value. We can turn
967 // this into a memset in the loop preheader now if we want. However, this
968 // would be unsafe to do if there is anything else in the loop that may read
969 // or write to the aliased location. Check for any overlap by generating the
970 // base pointer and checking the region.
971 Value *BasePtr =
972 Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
973
974 // From here on out, conservatively report to the pass manager that we've
975 // changed the IR, even if we later clean up these added instructions. There
976 // may be structural differences e.g. in the order of use lists not accounted
977 // for in just a textual dump of the IR. This is written as a variable, even
978 // though statically all the places this dominates could be replaced with
979 // 'true', with the hope that anyone trying to be clever / "more precise" with
980 // the return value will read this comment, and leave them alone.
981 Changed = true;
982
983 if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
984 StoreSize, *AA, Stores))
985 return Changed;
986
987 if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
988 return Changed;
989
990 // Okay, everything looks good, insert the memset.
991
992 const SCEV *NumBytesS =
993 getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
994
995 // TODO: ideally we should still be able to generate memset if SCEV expander
996 // is taught to generate the dependencies at the latest point.
997 if (!isSafeToExpand(NumBytesS, *SE))
998 return Changed;
999
1000 Value *NumBytes =
1001 Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1002
1003 CallInst *NewCall;
1004 if (SplatValue) {
1005 NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes,
1006 MaybeAlign(StoreAlignment));
1007 } else {
1008 // Everything is emitted in default address space
1009 Type *Int8PtrTy = DestInt8PtrTy;
1010
1011 Module *M = TheStore->getModule();
1012 StringRef FuncName = "memset_pattern16";
1013 FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
1014 Int8PtrTy, Int8PtrTy, IntIdxTy);
1015 inferLibFuncAttributes(M, FuncName, *TLI);
1016
1017 // Otherwise we should form a memset_pattern16. PatternValue is known to be
1018 // an constant array of 16-bytes. Plop the value into a mergable global.
1019 GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
1020 GlobalValue::PrivateLinkage,
1021 PatternValue, ".memset_pattern");
1022 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
1023 GV->setAlignment(Align(16));
1024 Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
1025 NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
1026 }
1027 NewCall->setDebugLoc(TheStore->getDebugLoc());
1028
1029 if (MSSAU) {
1030 MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1031 NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1032 MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1033 }
1034
1035 LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memset: " <<
*NewCall << "\n" << " from store to: " <<
*Ev << " at: " << *TheStore << "\n"; } } while
(false)
1036 << " from store to: " << *Ev << " at: " << *TheStoredo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memset: " <<
*NewCall << "\n" << " from store to: " <<
*Ev << " at: " << *TheStore << "\n"; } } while
(false)
1037 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memset: " <<
*NewCall << "\n" << " from store to: " <<
*Ev << " at: " << *TheStore << "\n"; } } while
(false)
;
1038
1039 ORE.emit([&]() {
1040 return OptimizationRemark(DEBUG_TYPE"loop-idiom", "ProcessLoopStridedStore",
1041 NewCall->getDebugLoc(), Preheader)
1042 << "Transformed loop-strided store into a call to "
1043 << ore::NV("NewFunction", NewCall->getCalledFunction())
1044 << "() function";
1045 });
1046
1047 // Okay, the memset has been formed. Zap the original store and anything that
1048 // feeds into it.
1049 for (auto *I : Stores) {
1050 if (MSSAU)
1051 MSSAU->removeMemoryAccess(I, true);
1052 deleteDeadInstruction(I);
1053 }
1054 if (MSSAU && VerifyMemorySSA)
1055 MSSAU->getMemorySSA()->verifyMemorySSA();
1056 ++NumMemSet;
1057 ExpCleaner.markResultUsed();
1058 return true;
1059}
1060
1061/// If the stored value is a strided load in the same loop with the same stride
1062/// this may be transformable into a memcpy. This kicks in for stuff like
1063/// for (i) A[i] = B[i];
1064bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
1065 const SCEV *BECount) {
1066 assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.")((SI->isUnordered() && "Expected only non-volatile non-ordered stores."
) ? static_cast<void> (0) : __assert_fail ("SI->isUnordered() && \"Expected only non-volatile non-ordered stores.\""
, "/build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 1066, __PRETTY_FUNCTION__))
;
1067
1068 Value *StorePtr = SI->getPointerOperand();
1069 const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1070 APInt Stride = getStoreStride(StoreEv);
1071 unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
1072 bool NegStride = StoreSize == -Stride;
1073
1074 // The store must be feeding a non-volatile load.
1075 LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
1076 assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.")((LI->isUnordered() && "Expected only non-volatile non-ordered loads."
) ? static_cast<void> (0) : __assert_fail ("LI->isUnordered() && \"Expected only non-volatile non-ordered loads.\""
, "/build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 1076, __PRETTY_FUNCTION__))
;
1077
1078 // See if the pointer expression is an AddRec like {base,+,1} on the current
1079 // loop, which indicates a strided load. If we have something else, it's a
1080 // random load we can't handle.
1081 const SCEVAddRecExpr *LoadEv =
1082 cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
1083
1084 // The trip count of the loop and the base pointer of the addrec SCEV is
1085 // guaranteed to be loop invariant, which means that it should dominate the
1086 // header. This allows us to insert code for it in the preheader.
1087 BasicBlock *Preheader = CurLoop->getLoopPreheader();
1088 IRBuilder<> Builder(Preheader->getTerminator());
1089 SCEVExpander Expander(*SE, *DL, "loop-idiom");
1090
1091 SCEVExpanderCleaner ExpCleaner(Expander, *DT);
1092
1093 bool Changed = false;
1094 const SCEV *StrStart = StoreEv->getStart();
1095 unsigned StrAS = SI->getPointerAddressSpace();
1096 Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
1097
1098 // Handle negative strided loops.
1099 if (NegStride)
1100 StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE);
1101
1102 // Okay, we have a strided store "p[i]" of a loaded value. We can turn
1103 // this into a memcpy in the loop preheader now if we want. However, this
1104 // would be unsafe to do if there is anything else in the loop that may read
1105 // or write the memory region we're storing to. This includes the load that
1106 // feeds the stores. Check for an alias by generating the base address and
1107 // checking everything.
1108 Value *StoreBasePtr = Expander.expandCodeFor(
1109 StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
1110
1111 // From here on out, conservatively report to the pass manager that we've
1112 // changed the IR, even if we later clean up these added instructions. There
1113 // may be structural differences e.g. in the order of use lists not accounted
1114 // for in just a textual dump of the IR. This is written as a variable, even
1115 // though statically all the places this dominates could be replaced with
1116 // 'true', with the hope that anyone trying to be clever / "more precise" with
1117 // the return value will read this comment, and leave them alone.
1118 Changed = true;
1119
1120 SmallPtrSet<Instruction *, 1> Stores;
1121 Stores.insert(SI);
1122 if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1123 StoreSize, *AA, Stores))
1124 return Changed;
1125
1126 const SCEV *LdStart = LoadEv->getStart();
1127 unsigned LdAS = LI->getPointerAddressSpace();
1128
1129 // Handle negative strided loops.
1130 if (NegStride)
1131 LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE);
1132
1133 // For a memcpy, we have to make sure that the input array is not being
1134 // mutated by the loop.
1135 Value *LoadBasePtr = Expander.expandCodeFor(
1136 LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
1137
1138 if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
1139 StoreSize, *AA, Stores))
1140 return Changed;
1141
1142 if (avoidLIRForMultiBlockLoop())
1143 return Changed;
1144
1145 // Okay, everything is safe, we can transform this!
1146
1147 const SCEV *NumBytesS =
1148 getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
1149
1150 Value *NumBytes =
1151 Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1152
1153 CallInst *NewCall = nullptr;
1154 // Check whether to generate an unordered atomic memcpy:
1155 // If the load or store are atomic, then they must necessarily be unordered
1156 // by previous checks.
1157 if (!SI->isAtomic() && !LI->isAtomic())
1158 NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr,
1159 LI->getAlign(), NumBytes);
1160 else {
1161 // We cannot allow unaligned ops for unordered load/store, so reject
1162 // anything where the alignment isn't at least the element size.
1163 const Align StoreAlign = SI->getAlign();
1164 const Align LoadAlign = LI->getAlign();
1165 if (StoreAlign < StoreSize || LoadAlign < StoreSize)
1166 return Changed;
1167
1168 // If the element.atomic memcpy is not lowered into explicit
1169 // loads/stores later, then it will be lowered into an element-size
1170 // specific lib call. If the lib call doesn't exist for our store size, then
1171 // we shouldn't generate the memcpy.
1172 if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1173 return Changed;
1174
1175 // Create the call.
1176 // Note that unordered atomic loads/stores are *required* by the spec to
1177 // have an alignment but non-atomic loads/stores may not.
1178 NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1179 StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
1180 StoreSize);
1181 }
1182 NewCall->setDebugLoc(SI->getDebugLoc());
1183
1184 if (MSSAU) {
1185 MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1186 NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1187 MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1188 }
1189
1190 LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
1191 << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
1192 << " from store ptr=" << *StoreEv << " at: " << *SIdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
1193 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " Formed memcpy: " <<
*NewCall << "\n" << " from load ptr=" <<
*LoadEv << " at: " << *LI << "\n" <<
" from store ptr=" << *StoreEv << " at: " <<
*SI << "\n"; } } while (false)
;
1194
1195 ORE.emit([&]() {
1196 return OptimizationRemark(DEBUG_TYPE"loop-idiom", "ProcessLoopStoreOfLoopLoad",
1197 NewCall->getDebugLoc(), Preheader)
1198 << "Formed a call to "
1199 << ore::NV("NewFunction", NewCall->getCalledFunction())
1200 << "() function";
1201 });
1202
1203 // Okay, the memcpy has been formed. Zap the original store and anything that
1204 // feeds into it.
1205 if (MSSAU)
1206 MSSAU->removeMemoryAccess(SI, true);
1207 deleteDeadInstruction(SI);
1208 if (MSSAU && VerifyMemorySSA)
1209 MSSAU->getMemorySSA()->verifyMemorySSA();
1210 ++NumMemCpy;
1211 ExpCleaner.markResultUsed();
1212 return true;
1213}
1214
1215// When compiling for codesize we avoid idiom recognition for a multi-block loop
1216// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1217//
1218bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1219 bool IsLoopMemset) {
1220 if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1221 if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
1222 LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " " << CurLoop->getHeader
()->getParent()->getName() << " : LIR " << (
IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n"
; } } while (false)
1223 << " : LIR " << (IsMemset ? "Memset" : "Memcpy")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " " << CurLoop->getHeader
()->getParent()->getName() << " : LIR " << (
IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n"
; } } while (false)
1224 << " avoided: multi-block top-level loop\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << " " << CurLoop->getHeader
()->getParent()->getName() << " : LIR " << (
IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n"
; } } while (false)
;
1225 return true;
1226 }
1227 }
1228
1229 return false;
1230}
1231
1232bool LoopIdiomRecognize::runOnNoncountableLoop() {
1233 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
1234 << CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
1235 << "] Noncountable Loop %"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
1236 << CurLoop->getHeader()->getName() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F["
<< CurLoop->getHeader()->getParent()->getName
() << "] Noncountable Loop %" << CurLoop->getHeader
()->getName() << "\n"; } } while (false)
;
1237
1238 return recognizePopcount() || recognizeAndInsertFFS() ||
1239 recognizeShiftUntilBitTest();
1240}
1241
1242/// Check if the given conditional branch is based on the comparison between
1243/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
1244/// true), the control yields to the loop entry. If the branch matches the
1245/// behavior, the variable involved in the comparison is returned. This function
1246/// will be called to see if the precondition and postcondition of the loop are
1247/// in desirable form.
1248static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
1249 bool JmpOnZero = false) {
1250 if (!BI || !BI->isConditional())
1251 return nullptr;
1252
1253 ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1254 if (!Cond)
1255 return nullptr;
1256
1257 ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1258 if (!CmpZero || !CmpZero->isZero())
1259 return nullptr;
1260
1261 BasicBlock *TrueSucc = BI->getSuccessor(0);
1262 BasicBlock *FalseSucc = BI->getSuccessor(1);
1263 if (JmpOnZero)
1264 std::swap(TrueSucc, FalseSucc);
1265
1266 ICmpInst::Predicate Pred = Cond->getPredicate();
1267 if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
1268 (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1269 return Cond->getOperand(0);
1270
1271 return nullptr;
1272}
1273
1274// Check if the recurrence variable `VarX` is in the right form to create
1275// the idiom. Returns the value coerced to a PHINode if so.
1276static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
1277 BasicBlock *LoopEntry) {
1278 auto *PhiX = dyn_cast<PHINode>(VarX);
1279 if (PhiX && PhiX->getParent() == LoopEntry &&
1280 (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
1281 return PhiX;
1282 return nullptr;
1283}
1284
1285/// Return true iff the idiom is detected in the loop.
1286///
1287/// Additionally:
1288/// 1) \p CntInst is set to the instruction counting the population bit.
1289/// 2) \p CntPhi is set to the corresponding phi node.
1290/// 3) \p Var is set to the value whose population bits are being counted.
1291///
1292/// The core idiom we are trying to detect is:
1293/// \code
1294/// if (x0 != 0)
1295/// goto loop-exit // the precondition of the loop
1296/// cnt0 = init-val;
1297/// do {
1298/// x1 = phi (x0, x2);
1299/// cnt1 = phi(cnt0, cnt2);
1300///
1301/// cnt2 = cnt1 + 1;
1302/// ...
1303/// x2 = x1 & (x1 - 1);
1304/// ...
1305/// } while(x != 0);
1306///
1307/// loop-exit:
1308/// \endcode
1309static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
1310 Instruction *&CntInst, PHINode *&CntPhi,
1311 Value *&Var) {
1312 // step 1: Check to see if the look-back branch match this pattern:
1313 // "if (a!=0) goto loop-entry".
1314 BasicBlock *LoopEntry;
1315 Instruction *DefX2, *CountInst;
1316 Value *VarX1, *VarX0;
1317 PHINode *PhiX, *CountPhi;
1318
1319 DefX2 = CountInst = nullptr;
1320 VarX1 = VarX0 = nullptr;
1321 PhiX = CountPhi = nullptr;
1322 LoopEntry = *(CurLoop->block_begin());
1323
1324 // step 1: Check if the loop-back branch is in desirable form.
1325 {
1326 if (Value *T = matchCondition(
1327 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1328 DefX2 = dyn_cast<Instruction>(T);
1329 else
1330 return false;
1331 }
1332
1333 // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1334 {
1335 if (!DefX2 || DefX2->getOpcode() != Instruction::And)
1336 return false;
1337
1338 BinaryOperator *SubOneOp;
1339
1340 if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1341 VarX1 = DefX2->getOperand(1);
1342 else {
1343 VarX1 = DefX2->getOperand(0);
1344 SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1345 }
1346 if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
1347 return false;
1348
1349 ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1350 if (!Dec ||
1351 !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
1352 (SubOneOp->getOpcode() == Instruction::Add &&
1353 Dec->isMinusOne()))) {
1354 return false;
1355 }
1356 }
1357
1358 // step 3: Check the recurrence of variable X
1359 PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1360 if (!PhiX)
1361 return false;
1362
1363 // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
1364 {
1365 CountInst = nullptr;
1366 for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1367 IterE = LoopEntry->end();
1368 Iter != IterE; Iter++) {
1369 Instruction *Inst = &*Iter;
1370 if (Inst->getOpcode() != Instruction::Add)
1371 continue;
1372
1373 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1374 if (!Inc || !Inc->isOne())
1375 continue;
1376
1377 PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1378 if (!Phi)
1379 continue;
1380
1381 // Check if the result of the instruction is live of the loop.
1382 bool LiveOutLoop = false;
1383 for (User *U : Inst->users()) {
1384 if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1385 LiveOutLoop = true;
1386 break;
1387 }
1388 }
1389
1390 if (LiveOutLoop) {
1391 CountInst = Inst;
1392 CountPhi = Phi;
1393 break;
1394 }
1395 }
1396
1397 if (!CountInst)
1398 return false;
1399 }
1400
1401 // step 5: check if the precondition is in this form:
1402 // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1403 {
1404 auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1405 Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1406 if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1407 return false;
1408
1409 CntInst = CountInst;
1410 CntPhi = CountPhi;
1411 Var = T;
1412 }
1413
1414 return true;
1415}
1416
1417/// Return true if the idiom is detected in the loop.
1418///
1419/// Additionally:
1420/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1421/// or nullptr if there is no such.
1422/// 2) \p CntPhi is set to the corresponding phi node
1423/// or nullptr if there is no such.
1424/// 3) \p Var is set to the value whose CTLZ could be used.
1425/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1426///
1427/// The core idiom we are trying to detect is:
1428/// \code
1429/// if (x0 == 0)
1430/// goto loop-exit // the precondition of the loop
1431/// cnt0 = init-val;
1432/// do {
1433/// x = phi (x0, x.next); //PhiX
1434/// cnt = phi(cnt0, cnt.next);
1435///
1436/// cnt.next = cnt + 1;
1437/// ...
1438/// x.next = x >> 1; // DefX
1439/// ...
1440/// } while(x.next != 0);
1441///
1442/// loop-exit:
1443/// \endcode
1444static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1445 Intrinsic::ID &IntrinID, Value *&InitX,
1446 Instruction *&CntInst, PHINode *&CntPhi,
1447 Instruction *&DefX) {
1448 BasicBlock *LoopEntry;
1449 Value *VarX = nullptr;
1450
1451 DefX = nullptr;
1452 CntInst = nullptr;
1453 CntPhi = nullptr;
1454 LoopEntry = *(CurLoop->block_begin());
1455
1456 // step 1: Check if the loop-back branch is in desirable form.
1457 if (Value *T = matchCondition(
1458 dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1459 DefX = dyn_cast<Instruction>(T);
1460 else
1461 return false;
1462
1463 // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1464 if (!DefX || !DefX->isShift())
1465 return false;
1466 IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1467 Intrinsic::ctlz;
1468 ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1469 if (!Shft || !Shft->isOne())
1470 return false;
1471 VarX = DefX->getOperand(0);
1472
1473 // step 3: Check the recurrence of variable X
1474 PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1475 if (!PhiX)
1476 return false;
1477
1478 InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1479
1480 // Make sure the initial value can't be negative otherwise the ashr in the
1481 // loop might never reach zero which would make the loop infinite.
1482 if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1483 return false;
1484
1485 // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
1486 // or cnt.next = cnt + -1.
1487 // TODO: We can skip the step. If loop trip count is known (CTLZ),
1488 // then all uses of "cnt.next" could be optimized to the trip count
1489 // plus "cnt0". Currently it is not optimized.
1490 // This step could be used to detect POPCNT instruction:
1491 // cnt.next = cnt + (x.next & 1)
1492 for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1493 IterE = LoopEntry->end();
1494 Iter != IterE; Iter++) {
1495 Instruction *Inst = &*Iter;
1496 if (Inst->getOpcode() != Instruction::Add)
1497 continue;
1498
1499 ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1500 if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
1501 continue;
1502
1503 PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1504 if (!Phi)
1505 continue;
1506
1507 CntInst = Inst;
1508 CntPhi = Phi;
1509 break;
1510 }
1511 if (!CntInst)
1512 return false;
1513
1514 return true;
1515}
1516
1517/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1518/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
1519/// trip count returns true; otherwise, returns false.
1520bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1521 // Give up if the loop has multiple blocks or multiple backedges.
1522 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1523 return false;
1524
1525 Intrinsic::ID IntrinID;
1526 Value *InitX;
1527 Instruction *DefX = nullptr;
1528 PHINode *CntPhi = nullptr;
1529 Instruction *CntInst = nullptr;
1530 // Help decide if transformation is profitable. For ShiftUntilZero idiom,
1531 // this is always 6.
1532 size_t IdiomCanonicalSize = 6;
1533
1534 if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1535 CntInst, CntPhi, DefX))
1536 return false;
1537
1538 bool IsCntPhiUsedOutsideLoop = false;
1539 for (User *U : CntPhi->users())
1540 if (!CurLoop->contains(cast<Instruction>(U))) {
1541 IsCntPhiUsedOutsideLoop = true;
1542 break;
1543 }
1544 bool IsCntInstUsedOutsideLoop = false;
1545 for (User *U : CntInst->users())
1546 if (!CurLoop->contains(cast<Instruction>(U))) {
1547 IsCntInstUsedOutsideLoop = true;
1548 break;
1549 }
1550 // If both CntInst and CntPhi are used outside the loop the profitability
1551 // is questionable.
1552 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1553 return false;
1554
1555 // For some CPUs result of CTLZ(X) intrinsic is undefined
1556 // when X is 0. If we can not guarantee X != 0, we need to check this
1557 // when expand.
1558 bool ZeroCheck = false;
1559 // It is safe to assume Preheader exist as it was checked in
1560 // parent function RunOnLoop.
1561 BasicBlock *PH = CurLoop->getLoopPreheader();
1562
1563 // If we are using the count instruction outside the loop, make sure we
1564 // have a zero check as a precondition. Without the check the loop would run
1565 // one iteration for before any check of the input value. This means 0 and 1
1566 // would have identical behavior in the original loop and thus
1567 if (!IsCntPhiUsedOutsideLoop) {
1568 auto *PreCondBB = PH->getSinglePredecessor();
1569 if (!PreCondBB)
1570 return false;
1571 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1572 if (!PreCondBI)
1573 return false;
1574 if (matchCondition(PreCondBI, PH) != InitX)
1575 return false;
1576 ZeroCheck = true;
1577 }
1578
1579 // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1580 // profitable if we delete the loop.
1581
1582 // the loop has only 6 instructions:
1583 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1584 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1585 // %shr = ashr %n.addr.0, 1
1586 // %tobool = icmp eq %shr, 0
1587 // %inc = add nsw %i.0, 1
1588 // br i1 %tobool
1589
1590 const Value *Args[] = {
1591 InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
1592 : ConstantInt::getFalse(InitX->getContext())};
1593
1594 // @llvm.dbg doesn't count as they have no semantic effect.
1595 auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1596 uint32_t HeaderSize =
1597 std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1598
1599 IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1600 int Cost =
1601 TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
1602 if (HeaderSize != IdiomCanonicalSize &&
1603 Cost > TargetTransformInfo::TCC_Basic)
1604 return false;
1605
1606 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1607 DefX->getDebugLoc(), ZeroCheck,
1608 IsCntPhiUsedOutsideLoop);
1609 return true;
1610}
1611
1612/// Recognizes a population count idiom in a non-countable loop.
1613///
1614/// If detected, transforms the relevant code to issue the popcount intrinsic
1615/// function call, and returns true; otherwise, returns false.
1616bool LoopIdiomRecognize::recognizePopcount() {
1617 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1618 return false;
1619
1620 // Counting population are usually conducted by few arithmetic instructions.
1621 // Such instructions can be easily "absorbed" by vacant slots in a
1622 // non-compact loop. Therefore, recognizing popcount idiom only makes sense
1623 // in a compact loop.
1624
1625 // Give up if the loop has multiple blocks or multiple backedges.
1626 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1627 return false;
1628
1629 BasicBlock *LoopBody = *(CurLoop->block_begin());
1630 if (LoopBody->size() >= 20) {
1631 // The loop is too big, bail out.
1632 return false;
1633 }
1634
1635 // It should have a preheader containing nothing but an unconditional branch.
1636 BasicBlock *PH = CurLoop->getLoopPreheader();
1637 if (!PH || &PH->front() != PH->getTerminator())
1638 return false;
1639 auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1640 if (!EntryBI || EntryBI->isConditional())
1641 return false;
1642
1643 // It should have a precondition block where the generated popcount intrinsic
1644 // function can be inserted.
1645 auto *PreCondBB = PH->getSinglePredecessor();
1646 if (!PreCondBB)
1647 return false;
1648 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1649 if (!PreCondBI || PreCondBI->isUnconditional())
1650 return false;
1651
1652 Instruction *CntInst;
1653 PHINode *CntPhi;
1654 Value *Val;
1655 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1656 return false;
1657
1658 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1659 return true;
1660}
1661
1662static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1663 const DebugLoc &DL) {
1664 Value *Ops[] = {Val};
1665 Type *Tys[] = {Val->getType()};
1666
1667 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1668 Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1669 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1670 CI->setDebugLoc(DL);
1671
1672 return CI;
1673}
1674
1675static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
1676 const DebugLoc &DL, bool ZeroCheck,
1677 Intrinsic::ID IID) {
1678 Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
1679 Type *Tys[] = {Val->getType()};
1680
1681 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1682 Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
1683 CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1684 CI->setDebugLoc(DL);
1685
1686 return CI;
1687}
1688
1689/// Transform the following loop (Using CTLZ, CTTZ is similar):
1690/// loop:
1691/// CntPhi = PHI [Cnt0, CntInst]
1692/// PhiX = PHI [InitX, DefX]
1693/// CntInst = CntPhi + 1
1694/// DefX = PhiX >> 1
1695/// LOOP_BODY
1696/// Br: loop if (DefX != 0)
1697/// Use(CntPhi) or Use(CntInst)
1698///
1699/// Into:
1700/// If CntPhi used outside the loop:
1701/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
1702/// Count = CountPrev + 1
1703/// else
1704/// Count = BitWidth(InitX) - CTLZ(InitX)
1705/// loop:
1706/// CntPhi = PHI [Cnt0, CntInst]
1707/// PhiX = PHI [InitX, DefX]
1708/// PhiCount = PHI [Count, Dec]
1709/// CntInst = CntPhi + 1
1710/// DefX = PhiX >> 1
1711/// Dec = PhiCount - 1
1712/// LOOP_BODY
1713/// Br: loop if (Dec != 0)
1714/// Use(CountPrev + Cnt0) // Use(CntPhi)
1715/// or
1716/// Use(Count + Cnt0) // Use(CntInst)
1717///
1718/// If LOOP_BODY is empty the loop will be deleted.
1719/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
1720void LoopIdiomRecognize::transformLoopToCountable(
1721 Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
1722 PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
1723 bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
1724 BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
1725
1726 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
1727 IRBuilder<> Builder(PreheaderBr);
1728 Builder.SetCurrentDebugLocation(DL);
1729
1730 // Count = BitWidth - CTLZ(InitX);
1731 // NewCount = Count;
1732 // If there are uses of CntPhi create:
1733 // NewCount = BitWidth - CTLZ(InitX >> 1);
1734 // Count = NewCount + 1;
1735 Value *InitXNext;
1736 if (IsCntPhiUsedOutsideLoop) {
1737 if (DefX->getOpcode() == Instruction::AShr)
1738 InitXNext =
1739 Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1));
1740 else if (DefX->getOpcode() == Instruction::LShr)
1741 InitXNext =
1742 Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1));
1743 else if (DefX->getOpcode() == Instruction::Shl) // cttz
1744 InitXNext =
1745 Builder.CreateShl(InitX, ConstantInt::get(InitX->getType(), 1));
1746 else
1747 llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "/build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 1747)
;
1748 } else
1749 InitXNext = InitX;
1750 Value *FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
1751 Value *Count = Builder.CreateSub(
1752 ConstantInt::get(FFS->getType(), FFS->getType()->getIntegerBitWidth()),
1753 FFS);
1754 Value *NewCount = Count;
1755 if (IsCntPhiUsedOutsideLoop) {
1756 NewCount = Count;
1757 Count = Builder.CreateAdd(Count, ConstantInt::get(Count->getType(), 1));
1758 }
1759
1760 NewCount = Builder.CreateZExtOrTrunc(NewCount,
1761 cast<IntegerType>(CntInst->getType()));
1762
1763 Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
1764 if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
1765 // If the counter was being incremented in the loop, add NewCount to the
1766 // counter's initial value, but only if the initial value is not zero.
1767 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1768 if (!InitConst || !InitConst->isZero())
1769 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1770 } else {
1771 // If the count was being decremented in the loop, subtract NewCount from
1772 // the counter's initial value.
1773 NewCount = Builder.CreateSub(CntInitVal, NewCount);
1774 }
1775
1776 // Step 2: Insert new IV and loop condition:
1777 // loop:
1778 // ...
1779 // PhiCount = PHI [Count, Dec]
1780 // ...
1781 // Dec = PhiCount - 1
1782 // ...
1783 // Br: loop if (Dec != 0)
1784 BasicBlock *Body = *(CurLoop->block_begin());
1785 auto *LbBr = cast<BranchInst>(Body->getTerminator());
1786 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1787 Type *Ty = Count->getType();
1788
1789 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1790
1791 Builder.SetInsertPoint(LbCond);
1792 Instruction *TcDec = cast<Instruction>(
1793 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1794 "tcdec", false, true));
1795
1796 TcPhi->addIncoming(Count, Preheader);
1797 TcPhi->addIncoming(TcDec, Body);
1798
1799 CmpInst::Predicate Pred =
1800 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
1801 LbCond->setPredicate(Pred);
1802 LbCond->setOperand(0, TcDec);
1803 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1804
1805 // Step 3: All the references to the original counter outside
1806 // the loop are replaced with the NewCount
1807 if (IsCntPhiUsedOutsideLoop)
1808 CntPhi->replaceUsesOutsideBlock(NewCount, Body);
1809 else
1810 CntInst->replaceUsesOutsideBlock(NewCount, Body);
1811
1812 // step 4: Forget the "non-computable" trip-count SCEV associated with the
1813 // loop. The loop would otherwise not be deleted even if it becomes empty.
1814 SE->forgetLoop(CurLoop);
1815}
1816
1817void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
1818 Instruction *CntInst,
1819 PHINode *CntPhi, Value *Var) {
1820 BasicBlock *PreHead = CurLoop->getLoopPreheader();
1821 auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
1822 const DebugLoc &DL = CntInst->getDebugLoc();
1823
1824 // Assuming before transformation, the loop is following:
1825 // if (x) // the precondition
1826 // do { cnt++; x &= x - 1; } while(x);
1827
1828 // Step 1: Insert the ctpop instruction at the end of the precondition block
1829 IRBuilder<> Builder(PreCondBr);
1830 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
1831 {
1832 PopCnt = createPopcntIntrinsic(Builder, Var, DL);
1833 NewCount = PopCntZext =
1834 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
1835
1836 if (NewCount != PopCnt)
1837 (cast<Instruction>(NewCount))->setDebugLoc(DL);
1838
1839 // TripCnt is exactly the number of iterations the loop has
1840 TripCnt = NewCount;
1841
1842 // If the population counter's initial value is not zero, insert Add Inst.
1843 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
1844 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1845 if (!InitConst || !InitConst->isZero()) {
1846 NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1847 (cast<Instruction>(NewCount))->setDebugLoc(DL);
1848 }
1849 }
1850
1851 // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
1852 // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
1853 // function would be partial dead code, and downstream passes will drag
1854 // it back from the precondition block to the preheader.
1855 {
1856 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
1857
1858 Value *Opnd0 = PopCntZext;
1859 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
1860 if (PreCond->getOperand(0) != Var)
1861 std::swap(Opnd0, Opnd1);
1862
1863 ICmpInst *NewPreCond = cast<ICmpInst>(
1864 Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
1865 PreCondBr->setCondition(NewPreCond);
1866
1867 RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
1868 }
1869
1870 // Step 3: Note that the population count is exactly the trip count of the
1871 // loop in question, which enable us to convert the loop from noncountable
1872 // loop into a countable one. The benefit is twofold:
1873 //
1874 // - If the loop only counts population, the entire loop becomes dead after
1875 // the transformation. It is a lot easier to prove a countable loop dead
1876 // than to prove a noncountable one. (In some C dialects, an infinite loop
1877 // isn't dead even if it computes nothing useful. In general, DCE needs
1878 // to prove a noncountable loop finite before safely delete it.)
1879 //
1880 // - If the loop also performs something else, it remains alive.
1881 // Since it is transformed to countable form, it can be aggressively
1882 // optimized by some optimizations which are in general not applicable
1883 // to a noncountable loop.
1884 //
1885 // After this step, this loop (conceptually) would look like following:
1886 // newcnt = __builtin_ctpop(x);
1887 // t = newcnt;
1888 // if (x)
1889 // do { cnt++; x &= x-1; t--) } while (t > 0);
1890 BasicBlock *Body = *(CurLoop->block_begin());
1891 {
1892 auto *LbBr = cast<BranchInst>(Body->getTerminator());
1893 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1894 Type *Ty = TripCnt->getType();
1895
1896 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1897
1898 Builder.SetInsertPoint(LbCond);
1899 Instruction *TcDec = cast<Instruction>(
1900 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1901 "tcdec", false, true));
1902
1903 TcPhi->addIncoming(TripCnt, PreHead);
1904 TcPhi->addIncoming(TcDec, Body);
1905
1906 CmpInst::Predicate Pred =
1907 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
1908 LbCond->setPredicate(Pred);
1909 LbCond->setOperand(0, TcDec);
1910 LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1911 }
1912
1913 // Step 4: All the references to the original population counter outside
1914 // the loop are replaced with the NewCount -- the value returned from
1915 // __builtin_ctpop().
1916 CntInst->replaceUsesOutsideBlock(NewCount, Body);
1917
1918 // step 5: Forget the "non-computable" trip-count SCEV associated with the
1919 // loop. The loop would otherwise not be deleted even if it becomes empty.
1920 SE->forgetLoop(CurLoop);
1921}
1922
1923/// Match loop-invariant value.
1924template <typename SubPattern_t> struct match_LoopInvariant {
1925 SubPattern_t SubPattern;
1926 const Loop *L;
1927
1928 match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
1929 : SubPattern(SP), L(L) {}
1930
1931 template <typename ITy> bool match(ITy *V) {
1932 return L->isLoopInvariant(V) && SubPattern.match(V);
1933 }
1934};
1935
1936/// Matches if the value is loop-invariant.
1937template <typename Ty>
1938inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
1939 return match_LoopInvariant<Ty>(M, L);
1940}
1941
1942/// Return true if the idiom is detected in the loop.
1943///
1944/// The core idiom we are trying to detect is:
1945/// \code
1946/// entry:
1947/// <...>
1948/// %bitmask = shl i32 1, %bitpos
1949/// br label %loop
1950///
1951/// loop:
1952/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
1953/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
1954/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
1955/// %x.next = shl i32 %x.curr, 1
1956/// <...>
1957/// br i1 %x.curr.isbitunset, label %loop, label %end
1958///
1959/// end:
1960/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
1961/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
1962/// <...>
1963/// \endcode
1964static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
1965 Value *&BitMask, Value *&BitPos,
1966 Value *&CurrX, Instruction *&NextX) {
1967 LLVM_DEBUG(dbgs() << DEBUG_TYPEdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Performing shift-until-bittest idiom detection.\n"
; } } while (false)
1968 " Performing shift-until-bittest idiom detection.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Performing shift-until-bittest idiom detection.\n"
; } } while (false)
;
1969
1970 // Give up if the loop has multiple blocks or multiple backedges.
1971 if (CurLoop->getNumBlocks() != 1 || CurLoop->getNumBackEdges() != 1) {
1972 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Bad block/backedge count.\n"
; } } while (false)
;
1973 return false;
1974 }
1975
1976 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
1977 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
1978 assert(LoopPreheaderBB && "There is always a loop preheader.")((LoopPreheaderBB && "There is always a loop preheader."
) ? static_cast<void> (0) : __assert_fail ("LoopPreheaderBB && \"There is always a loop preheader.\""
, "/build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 1978, __PRETTY_FUNCTION__))
;
1979
1980 using namespace PatternMatch;
1981
1982 // Step 1: Check if the loop backedge is in desirable form.
1983
1984 ICmpInst::Predicate Pred;
1985 Value *CmpLHS, *CmpRHS;
1986 BasicBlock *TrueBB, *FalseBB;
1987 if (!match(LoopHeaderBB->getTerminator(),
1988 m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
1989 m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
1990 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Bad backedge structure.\n"
; } } while (false)
;
1991 return false;
1992 }
1993
1994 // Step 2: Check if the backedge's condition is in desirable form.
1995
1996 auto MatchVariableBitMask = [&]() {
1997 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
1998 match(CmpLHS,
1999 m_c_And(m_Value(CurrX),
2000 m_CombineAnd(
2001 m_Value(BitMask),
2002 m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
2003 CurLoop))));
2004 };
2005 auto MatchConstantBitMask = [&]() {
2006 return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2007 match(CmpLHS, m_And(m_Value(CurrX),
2008 m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
2009 (BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
2010 };
2011 auto MatchDecomposableConstantBitMask = [&]() {
2012 APInt Mask;
2013 return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
2014 ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
2015 (BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
2016 (BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
2017 };
2018
2019 if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
2020 !MatchDecomposableConstantBitMask()) {
2021 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Bad backedge comparison.\n"
; } } while (false)
;
2022 return false;
2023 }
2024
2025 // Step 3: Check if the recurrence is in desirable form.
2026 auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2027 if (!CurrXPN || CurrXPN->getParent() != LoopHeaderBB) {
2028 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Not an expected PHI node.\n"
; } } while (false)
;
2029 return false;
2030 }
2031
2032 BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2033 NextX =
2034 dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2035
2036 if (!NextX || !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2037 // FIXME: support right-shift?
2038 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Bad recurrence.\n"
; } } while (false)
;
2039 return false;
2040 }
2041
2042 // Step 4: Check if the backedge's destinations are in desirable form.
2043
2044 assert(ICmpInst::isEquality(Pred) &&((ICmpInst::isEquality(Pred) && "Should only get equality predicates here."
) ? static_cast<void> (0) : __assert_fail ("ICmpInst::isEquality(Pred) && \"Should only get equality predicates here.\""
, "/build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 2045, __PRETTY_FUNCTION__))
2045 "Should only get equality predicates here.")((ICmpInst::isEquality(Pred) && "Should only get equality predicates here."
) ? static_cast<void> (0) : __assert_fail ("ICmpInst::isEquality(Pred) && \"Should only get equality predicates here.\""
, "/build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 2045, __PRETTY_FUNCTION__))
;
2046
2047 // cmp-br is commutative, so canonicalize to a single variant.
2048 if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2049 Pred = ICmpInst::getInversePredicate(Pred);
2050 std::swap(TrueBB, FalseBB);
2051 }
2052
2053 // We expect to exit loop when comparison yields false,
2054 // so when it yields true we should branch back to loop header.
2055 if (TrueBB != LoopHeaderBB) {
2056 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Bad backedge flow.\n"
; } } while (false)
;
2057 return false;
2058 }
2059
2060 // Okay, idiom checks out.
2061 return true;
2062}
2063
2064/// Look for the following loop:
2065/// \code
2066/// entry:
2067/// <...>
2068/// %bitmask = shl i32 1, %bitpos
2069/// br label %loop
2070///
2071/// loop:
2072/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2073/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2074/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2075/// %x.next = shl i32 %x.curr, 1
2076/// <...>
2077/// br i1 %x.curr.isbitunset, label %loop, label %end
2078///
2079/// end:
2080/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2081/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2082/// <...>
2083/// \endcode
2084///
2085/// And transform it into:
2086/// \code
2087/// entry:
2088/// %bitmask = shl i32 1, %bitpos
2089/// %lowbitmask = add i32 %bitmask, -1
2090/// %mask = or i32 %lowbitmask, %bitmask
2091/// %x.masked = and i32 %x, %mask
2092/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
2093/// i1 true)
2094/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
2095/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
2096/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
2097/// %tripcount = add i32 %backedgetakencount, 1
2098/// %x.curr = shl i32 %x, %backedgetakencount
2099/// %x.next = shl i32 %x, %tripcount
2100/// br label %loop
2101///
2102/// loop:
2103/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
2104/// %loop.iv.next = add nuw i32 %loop.iv, 1
2105/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
2106/// <...>
2107/// br i1 %loop.ivcheck, label %end, label %loop
2108///
2109/// end:
2110/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2111/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2112/// <...>
2113/// \endcode
2114bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
2115 bool MadeChange = false;
2116
2117 Value *X, *BitMask, *BitPos, *XCurr;
2118 Instruction *XNext;
2119 if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
2120 XNext)) {
2121 LLVM_DEBUG(dbgs() << DEBUG_TYPEdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " shift-until-bittest idiom detection failed.\n"
; } } while (false)
2122 " shift-until-bittest idiom detection failed.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " shift-until-bittest idiom detection failed.\n"
; } } while (false)
;
2123 return MadeChange;
2124 }
2125 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " shift-until-bittest idiom detected!\n"
; } } while (false)
;
2126
2127 // Ok, it is the idiom we were looking for, we *could* transform this loop,
2128 // but is it profitable to transform?
2129
2130 BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2131 BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2132 assert(LoopPreheaderBB && "There is always a loop preheader.")((LoopPreheaderBB && "There is always a loop preheader."
) ? static_cast<void> (0) : __assert_fail ("LoopPreheaderBB && \"There is always a loop preheader.\""
, "/build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 2132, __PRETTY_FUNCTION__))
;
2133
2134 BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2135 assert(LoopPreheaderBB && "There is only a single successor.")((LoopPreheaderBB && "There is only a single successor."
) ? static_cast<void> (0) : __assert_fail ("LoopPreheaderBB && \"There is only a single successor.\""
, "/build/llvm-toolchain-snapshot-12~++20210124100612+2afaf072f5c1/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp"
, 2135, __PRETTY_FUNCTION__))
;
2136
2137 IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2138 Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
2139
2140 Intrinsic::ID IntrID = Intrinsic::ctlz;
2141 Type *Ty = X->getType();
2142
2143 TargetTransformInfo::TargetCostKind CostKind =
2144 TargetTransformInfo::TCK_SizeAndLatency;
2145
2146 // The rewrite is considered to be unprofitable iff and only iff the
2147 // intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
2148 // making the loop countable, even if nothing else changes.
2149 IntrinsicCostAttributes Attrs(
2150 IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()});
2151 int Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2152 if (Cost > TargetTransformInfo::TCC_Basic) {
2153 LLVM_DEBUG(dbgs() << DEBUG_TYPEdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Intrinsic is too costly, not beneficial\n"
; } } while (false)
2154 " Intrinsic is too costly, not beneficial\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Intrinsic is too costly, not beneficial\n"
; } } while (false)
;
2155 return MadeChange;
2156 }
2157 if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
2158 TargetTransformInfo::TCC_Basic) {
2159 LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " Shift is too costly, not beneficial\n"
; } } while (false)
;
2160 return MadeChange;
2161 }
2162
2163 // Ok, transform appears worthwhile.
2164 MadeChange = true;
2165
2166 // Step 1: Compute the loop trip count.
2167
2168 Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
2169 BitPos->getName() + ".lowbitmask");
2170 Value *Mask =
2171 Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
2172 Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
2173 CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
2174 IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()},
2175 /*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
2176 Value *XMaskedNumActiveBits = Builder.CreateSub(
2177 ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
2178 XMasked->getName() + ".numactivebits");
2179 Value *XMaskedLeadingOnePos =
2180 Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
2181 XMasked->getName() + ".leadingonepos");
2182
2183 Value *LoopBackedgeTakenCount = Builder.CreateSub(
2184 BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount");
2185 // We know loop's backedge-taken count, but what's loop's trip count?
2186 // Note that while NUW is always safe, while NSW is only for bitwidths != 2.
2187 Value *LoopTripCount =
2188 Builder.CreateNUWAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2189 CurLoop->getName() + ".tripcount");
2190
2191 // Step 2: Compute the recurrence's final value without a loop.
2192
2193 // NewX is always safe to compute, because `LoopBackedgeTakenCount`
2194 // will always be smaller than `bitwidth(X)`, i.e. we never get poison.
2195 Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
2196 NewX->takeName(XCurr);
2197 if (auto *I = dyn_cast<Instruction>(NewX))
2198 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2199
2200 Value *NewXNext;
2201 // Rewriting XNext is more complicated, however, because `X << LoopTripCount`
2202 // will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
2203 // iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
2204 // that isn't the case, we'll need to emit an alternative, safe IR.
2205 if (XNext->hasNoSignedWrap() || XNext->hasNoUnsignedWrap() ||
2206 PatternMatch::match(
2207 BitPos, PatternMatch::m_SpecificInt_ICMP(
2208 ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
2209 Ty->getScalarSizeInBits() - 1))))
2210 NewXNext = Builder.CreateShl(X, LoopTripCount);
2211 else {
2212 // Otherwise, just additionally shift by one. It's the smallest solution,
2213 // alternatively, we could check that NewX is INT_MIN (or BitPos is )
2214 // and select 0 instead.
2215 NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
2216 }
2217
2218 NewXNext->takeName(XNext);
2219 if (auto *I = dyn_cast<Instruction>(NewXNext))
2220 I->copyIRFlags(XNext, /*IncludeWrapFlags=*/true);
2221
2222 // Step 3: Adjust the successor basic block to recieve the computed
2223 // recurrence's final value instead of the recurrence itself.
2224
2225 XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
2226 XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
2227
2228 // Step 4: Rewrite the loop into a countable form, with canonical IV.
2229
2230 // The new canonical induction variable.
2231 Builder.SetInsertPoint(&LoopHeaderBB->front());
2232 auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2233
2234 // The induction itself.
2235 // Note that while NUW is always safe, while NSW is only for bitwidths != 2.
2236 Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2237 auto *IVNext = Builder.CreateNUWAdd(IV, ConstantInt::get(Ty, 1),
2238 IV->getName() + ".next");
2239
2240 // The loop trip count check.
2241 auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
2242 CurLoop->getName() + ".ivcheck");
2243 Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
2244 LoopHeaderBB->getTerminator()->eraseFromParent();
2245
2246 // Populate the IV PHI.
2247 IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2248 IV->addIncoming(IVNext, LoopHeaderBB);
2249
2250 // Step 5: Forget the "non-computable" trip-count SCEV associated with the
2251 // loop. The loop would otherwise not be deleted even if it becomes empty.
2252
2253 SE->forgetLoop(CurLoop);
2254
2255 // Other passes will take care of actually deleting the loop if possible.
2256
2257 LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-idiom")) { dbgs() << "loop-idiom" " shift-until-bittest idiom optimized!\n"
; } } while (false)
;
2258
2259 ++NumShiftUntilBitTest;
2260 return MadeChange;
2261}