/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp

Bug Summary

File:	llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
Warning:	line 521, column 15 Although the value stored to 'PatternValue' is used in the enclosing expression, the value is never actually read from 'PatternValue'

Annotated Source Code

Press '?' to see keyboard shortcuts

Show analyzer invocation

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopIdiomRecognize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/build-llvm/lib/Transforms/Scalar -resource-dir /usr/lib/llvm-13/lib/clang/13.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/build-llvm/lib/Transforms/Scalar -I /build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar -I /build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/build-llvm/include -I /build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/lib/llvm-13/lib/clang/13.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/build-llvm/lib/Transforms/Scalar -fdebug-prefix-map=/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f=. -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-04-14-063029-18377-1 -x c++ /build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp

1	//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This pass implements an idiom recognizer that transforms simple loops into a
10	// non-loop form. In cases that this kicks in, it can be a significant
11	// performance win.
12	//
13	// If compiling for code size we avoid idiom recognition if the resulting
14	// code could be larger than the code for the original loop. One way this could
15	// happen is if the loop is not removable after idiom recognition due to the
16	// presence of non-idiom instructions. The initial implementation of the
17	// heuristics applies to idioms in multi-block loops.
18	//
19	//===----------------------------------------------------------------------===//
20	//
21	// TODO List:
22	//
23	// Future loop memory idioms to recognize:
24	// memcmp, memmove, strlen, etc.
25	// Future floating point idioms to recognize in -ffast-math mode:
26	// fpowi
27	// Future integer operation idioms to recognize:
28	// ctpop
29	//
30	// Beware that isel's default lowering for ctpop is highly inefficient for
31	// i64 and larger types when i64 is legal and the value has few bits set. It
32	// would be good to enhance isel to emit a loop for ctpop in this case.
33	//
34	// This could recognize common matrix multiplies and dot product idioms and
35	// replace them with calls to BLAS (if linked in??).
36	//
37	//===----------------------------------------------------------------------===//
38
39	#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
40	#include "llvm/ADT/APInt.h"
41	#include "llvm/ADT/ArrayRef.h"
42	#include "llvm/ADT/DenseMap.h"
43	#include "llvm/ADT/MapVector.h"
44	#include "llvm/ADT/SetVector.h"
45	#include "llvm/ADT/SmallPtrSet.h"
46	#include "llvm/ADT/SmallVector.h"
47	#include "llvm/ADT/Statistic.h"
48	#include "llvm/ADT/StringRef.h"
49	#include "llvm/Analysis/AliasAnalysis.h"
50	#include "llvm/Analysis/CmpInstAnalysis.h"
51	#include "llvm/Analysis/LoopAccessAnalysis.h"
52	#include "llvm/Analysis/LoopInfo.h"
53	#include "llvm/Analysis/LoopPass.h"
54	#include "llvm/Analysis/MemoryLocation.h"
55	#include "llvm/Analysis/MemorySSA.h"
56	#include "llvm/Analysis/MemorySSAUpdater.h"
57	#include "llvm/Analysis/MustExecute.h"
58	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
59	#include "llvm/Analysis/ScalarEvolution.h"
60	#include "llvm/Analysis/ScalarEvolutionExpressions.h"
61	#include "llvm/Analysis/TargetLibraryInfo.h"
62	#include "llvm/Analysis/TargetTransformInfo.h"
63	#include "llvm/Analysis/ValueTracking.h"
64	#include "llvm/IR/Attributes.h"
65	#include "llvm/IR/BasicBlock.h"
66	#include "llvm/IR/Constant.h"
67	#include "llvm/IR/Constants.h"
68	#include "llvm/IR/DataLayout.h"
69	#include "llvm/IR/DebugLoc.h"
70	#include "llvm/IR/DerivedTypes.h"
71	#include "llvm/IR/Dominators.h"
72	#include "llvm/IR/GlobalValue.h"
73	#include "llvm/IR/GlobalVariable.h"
74	#include "llvm/IR/IRBuilder.h"
75	#include "llvm/IR/InstrTypes.h"
76	#include "llvm/IR/Instruction.h"
77	#include "llvm/IR/Instructions.h"
78	#include "llvm/IR/IntrinsicInst.h"
79	#include "llvm/IR/Intrinsics.h"
80	#include "llvm/IR/LLVMContext.h"
81	#include "llvm/IR/Module.h"
82	#include "llvm/IR/PassManager.h"
83	#include "llvm/IR/PatternMatch.h"
84	#include "llvm/IR/Type.h"
85	#include "llvm/IR/User.h"
86	#include "llvm/IR/Value.h"
87	#include "llvm/IR/ValueHandle.h"
88	#include "llvm/InitializePasses.h"
89	#include "llvm/Pass.h"
90	#include "llvm/Support/Casting.h"
91	#include "llvm/Support/CommandLine.h"
92	#include "llvm/Support/Debug.h"
93	#include "llvm/Support/InstructionCost.h"
94	#include "llvm/Support/raw_ostream.h"
95	#include "llvm/Transforms/Scalar.h"
96	#include "llvm/Transforms/Utils/BuildLibCalls.h"
97	#include "llvm/Transforms/Utils/Local.h"
98	#include "llvm/Transforms/Utils/LoopUtils.h"
99	#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
100	#include <algorithm>
101	#include <cassert>
102	#include <cstdint>
103	#include <utility>
104	#include <vector>
105
106	using namespace llvm;
107
108	#define DEBUG_TYPE"loop-idiom" "loop-idiom"
109
110	STATISTIC(NumMemSet, "Number of memset's formed from loop stores")static llvm::Statistic NumMemSet = {"loop-idiom", "NumMemSet" , "Number of memset's formed from loop stores"};
111	STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores")static llvm::Statistic NumMemCpy = {"loop-idiom", "NumMemCpy" , "Number of memcpy's formed from loop load+stores"};
112	STATISTIC(static llvm::Statistic NumShiftUntilBitTest = {"loop-idiom", "NumShiftUntilBitTest" , "Number of uncountable loops recognized as 'shift until bitttest' idiom" }
113	NumShiftUntilBitTest,static llvm::Statistic NumShiftUntilBitTest = {"loop-idiom", "NumShiftUntilBitTest" , "Number of uncountable loops recognized as 'shift until bitttest' idiom" }
114	"Number of uncountable loops recognized as 'shift until bitttest' idiom")static llvm::Statistic NumShiftUntilBitTest = {"loop-idiom", "NumShiftUntilBitTest" , "Number of uncountable loops recognized as 'shift until bitttest' idiom" };
115
116	bool DisableLIRP::All;
117	static cl::opt<bool, true>
118	DisableLIRPAll("disable-" DEBUG_TYPE"loop-idiom" "-all",
119	cl::desc("Options to disable Loop Idiom Recognize Pass."),
120	cl::location(DisableLIRP::All), cl::init(false),
121	cl::ReallyHidden);
122
123	bool DisableLIRP::Memset;
124	static cl::opt<bool, true>
125	DisableLIRPMemset("disable-" DEBUG_TYPE"loop-idiom" "-memset",
126	cl::desc("Proceed with loop idiom recognize pass, but do "
127	"not convert loop(s) to memset."),
128	cl::location(DisableLIRP::Memset), cl::init(false),
129	cl::ReallyHidden);
130
131	bool DisableLIRP::Memcpy;
132	static cl::opt<bool, true>
133	DisableLIRPMemcpy("disable-" DEBUG_TYPE"loop-idiom" "-memcpy",
134	cl::desc("Proceed with loop idiom recognize pass, but do "
135	"not convert loop(s) to memcpy."),
136	cl::location(DisableLIRP::Memcpy), cl::init(false),
137	cl::ReallyHidden);
138
139	static cl::opt<bool> UseLIRCodeSizeHeurs(
140	"use-lir-code-size-heurs",
141	cl::desc("Use loop idiom recognition code size heuristics when compiling"
142	"with -Os/-Oz"),
143	cl::init(true), cl::Hidden);
144
145	namespace {
146
147	class LoopIdiomRecognize {
148	Loop *CurLoop = nullptr;
149	AliasAnalysis *AA;
150	DominatorTree *DT;
151	LoopInfo *LI;
152	ScalarEvolution *SE;
153	TargetLibraryInfo *TLI;
154	const TargetTransformInfo *TTI;
155	const DataLayout *DL;
156	OptimizationRemarkEmitter &ORE;
157	bool ApplyCodeSizeHeuristics;
158	std::unique_ptr<MemorySSAUpdater> MSSAU;
159
160	public:
161	explicit LoopIdiomRecognize(AliasAnalysis AA, DominatorTree DT,
162	LoopInfo LI, ScalarEvolution SE,
163	TargetLibraryInfo *TLI,
164	const TargetTransformInfo TTI, MemorySSA MSSA,
165	const DataLayout *DL,
166	OptimizationRemarkEmitter &ORE)
167	: AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
168	if (MSSA)
169	MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
170	}
171
172	bool runOnLoop(Loop *L);
173
174	private:
175	using StoreList = SmallVector<StoreInst *, 8>;
176	using StoreListMap = MapVector<Value *, StoreList>;
177
178	StoreListMap StoreRefsForMemset;
179	StoreListMap StoreRefsForMemsetPattern;
180	StoreList StoreRefsForMemcpy;
181	bool HasMemset;
182	bool HasMemsetPattern;
183	bool HasMemcpy;
184
185	/// Return code for isLegalStore()
186	enum LegalStoreKind {
187	None = 0,
188	Memset,
189	MemsetPattern,
190	Memcpy,
191	UnorderedAtomicMemcpy,
192	DontUse // Dummy retval never to be used. Allows catching errors in retval
193	// handling.
194	};
195
196	/// \name Countable Loop Idiom Handling
197	/// @{
198
199	bool runOnCountableLoop();
200	bool runOnLoopBlock(BasicBlock BB, const SCEV BECount,
201	SmallVectorImpl<BasicBlock *> &ExitBlocks);
202
203	void collectStores(BasicBlock *BB);
204	LegalStoreKind isLegalStore(StoreInst *SI);
205	enum class ForMemset { No, Yes };
206	bool processLoopStores(SmallVectorImpl<StoreInst > &SL, const SCEV BECount,
207	ForMemset For);
208	bool processLoopMemSet(MemSetInst MSI, const SCEV BECount);
209
210	bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
211	MaybeAlign StoreAlignment, Value *StoredVal,
212	Instruction *TheStore,
213	SmallPtrSetImpl<Instruction *> &Stores,
214	const SCEVAddRecExpr Ev, const SCEV BECount,
215	bool NegStride, bool IsLoopMemset = false);
216	bool processLoopStoreOfLoopLoad(StoreInst SI, const SCEV BECount);
217	bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
218	bool IsLoopMemset = false);
219
220	/// @}
221	/// \name Noncountable Loop Idiom Handling
222	/// @{
223
224	bool runOnNoncountableLoop();
225
226	bool recognizePopcount();
227	void transformLoopToPopcount(BasicBlock PreCondBB, Instruction CntInst,
228	PHINode CntPhi, Value Var);
229	bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
230	void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
231	Instruction CntInst, PHINode CntPhi,
232	Value Var, Instruction DefX,
233	const DebugLoc &DL, bool ZeroCheck,
234	bool IsCntPhiUsedOutsideLoop);
235
236	bool recognizeShiftUntilBitTest();
237
238	/// @}
239	};
240
241	class LoopIdiomRecognizeLegacyPass : public LoopPass {
242	public:
243	static char ID;
244
245	explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
246	initializeLoopIdiomRecognizeLegacyPassPass(
247	*PassRegistry::getPassRegistry());
248	}
249
250	bool runOnLoop(Loop *L, LPPassManager &LPM) override {
251	if (DisableLIRP::All)
252	return false;
253
254	if (skipLoop(L))
255	return false;
256
257	AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
258	DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
259	LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
260	ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
261	TargetLibraryInfo *TLI =
262	&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
263	*L->getHeader()->getParent());
264	const TargetTransformInfo *TTI =
265	&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
266	*L->getHeader()->getParent());
267	const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
268	auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
269	MemorySSA *MSSA = nullptr;
270	if (MSSAAnalysis)
271	MSSA = &MSSAAnalysis->getMSSA();
272
273	// For the old PM, we can't use OptimizationRemarkEmitter as an analysis
274	// pass. Function analyses need to be preserved across loop transformations
275	// but ORE cannot be preserved (see comment before the pass definition).
276	OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
277
278	LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
279	return LIR.runOnLoop(L);
280	}
281
282	/// This transformation requires natural loop information & requires that
283	/// loop preheaders be inserted into the CFG.
284	void getAnalysisUsage(AnalysisUsage &AU) const override {
285	AU.addRequired<TargetLibraryInfoWrapperPass>();
286	AU.addRequired<TargetTransformInfoWrapperPass>();
287	AU.addPreserved<MemorySSAWrapperPass>();
288	getLoopAnalysisUsage(AU);
289	}
290	};
291
292	} // end anonymous namespace
293
294	char LoopIdiomRecognizeLegacyPass::ID = 0;
295
296	PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
297	LoopStandardAnalysisResults &AR,
298	LPMUpdater &) {
299	if (DisableLIRP::All)
300	return PreservedAnalyses::all();
301
302	const auto *DL = &L.getHeader()->getModule()->getDataLayout();
303
304	// For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
305	// pass. Function analyses need to be preserved across loop transformations
306	// but ORE cannot be preserved (see comment before the pass definition).
307	OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
308
309	LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
310	AR.MSSA, DL, ORE);
311	if (!LIR.runOnLoop(&L))
312	return PreservedAnalyses::all();
313
314	auto PA = getLoopPassPreservedAnalyses();
315	if (AR.MSSA)
316	PA.preserve<MemorySSAAnalysis>();
317	return PA;
318	}
319
320	INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",static void *initializeLoopIdiomRecognizeLegacyPassPassOnce(PassRegistry &Registry) {
321	"Recognize loop idioms", false, false)static void *initializeLoopIdiomRecognizeLegacyPassPassOnce(PassRegistry &Registry) {
322	INITIALIZE_PASS_DEPENDENCY(LoopPass)initializeLoopPassPass(Registry);
323	INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)initializeTargetLibraryInfoWrapperPassPass(Registry);
324	INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)initializeTargetTransformInfoWrapperPassPass(Registry);
325	INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",PassInfo PI = new PassInfo( "Recognize loop idioms", "loop-idiom" , &LoopIdiomRecognizeLegacyPass::ID, PassInfo::NormalCtor_t (callDefaultCtor<LoopIdiomRecognizeLegacyPass>), false, false); Registry.registerPass(PI, true); return PI; } static llvm::once_flag InitializeLoopIdiomRecognizeLegacyPassPassFlag ; void llvm::initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry &Registry) { llvm::call_once(InitializeLoopIdiomRecognizeLegacyPassPassFlag , initializeLoopIdiomRecognizeLegacyPassPassOnce, std::ref(Registry )); }
326	"Recognize loop idioms", false, false)PassInfo PI = new PassInfo( "Recognize loop idioms", "loop-idiom" , &LoopIdiomRecognizeLegacyPass::ID, PassInfo::NormalCtor_t (callDefaultCtor<LoopIdiomRecognizeLegacyPass>), false, false); Registry.registerPass(PI, true); return PI; } static llvm::once_flag InitializeLoopIdiomRecognizeLegacyPassPassFlag ; void llvm::initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry &Registry) { llvm::call_once(InitializeLoopIdiomRecognizeLegacyPassPassFlag , initializeLoopIdiomRecognizeLegacyPassPassOnce, std::ref(Registry )); }
327
328	Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
329
330	static void deleteDeadInstruction(Instruction *I) {
331	I->replaceAllUsesWith(UndefValue::get(I->getType()));
332	I->eraseFromParent();
333	}
334
335	//===----------------------------------------------------------------------===//
336	//
337	// Implementation of LoopIdiomRecognize
338	//
339	//===----------------------------------------------------------------------===//
340
341	bool LoopIdiomRecognize::runOnLoop(Loop *L) {
342	CurLoop = L;
343	// If the loop could not be converted to canonical form, it must have an
344	// indirectbr in it, just give up.
345	if (!L->getLoopPreheader())
346	return false;
347
348	// Disable loop idiom recognition if the function's name is a common idiom.
349	StringRef Name = L->getHeader()->getParent()->getName();
350	if (Name == "memset" \|\| Name == "memcpy")
351	return false;
352
353	// Determine if code size heuristics need to be applied.
354	ApplyCodeSizeHeuristics =
355	L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
356
357	HasMemset = TLI->has(LibFunc_memset);
358	HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
359	HasMemcpy = TLI->has(LibFunc_memcpy);
360
361	if (HasMemset \|\| HasMemsetPattern \|\| HasMemcpy)
362	if (SE->hasLoopInvariantBackedgeTakenCount(L))
363	return runOnCountableLoop();
364
365	return runOnNoncountableLoop();
366	}
367
368	bool LoopIdiomRecognize::runOnCountableLoop() {
369	const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
370	assert(!isa<SCEVCouldNotCompute>(BECount) &&((!isa<SCEVCouldNotCompute>(BECount) && "runOnCountableLoop() called on a loop without a predictable" "backedge-taken count") ? static_cast<void> (0) : __assert_fail ("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\"" , "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 372, __PRETTY_FUNCTION__))
371	"runOnCountableLoop() called on a loop without a predictable"((!isa<SCEVCouldNotCompute>(BECount) && "runOnCountableLoop() called on a loop without a predictable" "backedge-taken count") ? static_cast<void> (0) : __assert_fail ("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\"" , "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 372, __PRETTY_FUNCTION__))
372	"backedge-taken count")((!isa<SCEVCouldNotCompute>(BECount) && "runOnCountableLoop() called on a loop without a predictable" "backedge-taken count") ? static_cast<void> (0) : __assert_fail ("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\"" , "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 372, __PRETTY_FUNCTION__));
373
374	// If this loop executes exactly one time, then it should be peeled, not
375	// optimized by this pass.
376	if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
377	if (BECst->getAPInt() == 0)
378	return false;
379
380	SmallVector<BasicBlock *, 8> ExitBlocks;
381	CurLoop->getUniqueExitBlocks(ExitBlocks);
382
383	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F[" << CurLoop->getHeader()->getParent()->getName () << "] Countable Loop %" << CurLoop->getHeader ()->getName() << "\n"; } } while (false)
384	<< CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F[" << CurLoop->getHeader()->getParent()->getName () << "] Countable Loop %" << CurLoop->getHeader ()->getName() << "\n"; } } while (false)
385	<< "] Countable Loop %" << CurLoop->getHeader()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F[" << CurLoop->getHeader()->getParent()->getName () << "] Countable Loop %" << CurLoop->getHeader ()->getName() << "\n"; } } while (false)
386	<< "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F[" << CurLoop->getHeader()->getParent()->getName () << "] Countable Loop %" << CurLoop->getHeader ()->getName() << "\n"; } } while (false);
387
388	// The following transforms hoist stores/memsets into the loop pre-header.
389	// Give up if the loop has instructions that may throw.
390	SimpleLoopSafetyInfo SafetyInfo;
391	SafetyInfo.computeLoopSafetyInfo(CurLoop);
392	if (SafetyInfo.anyBlockMayThrow())
393	return false;
394
395	bool MadeChange = false;
396
397	// Scan all the blocks in the loop that are not in subloops.
398	for (auto *BB : CurLoop->getBlocks()) {
399	// Ignore blocks in subloops.
400	if (LI->getLoopFor(BB) != CurLoop)
401	continue;
402
403	MadeChange \|= runOnLoopBlock(BB, BECount, ExitBlocks);
404	}
405	return MadeChange;
406	}
407
408	static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
409	const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
410	return ConstStride->getAPInt();
411	}
412
413	/// getMemSetPatternValue - If a strided store of the specified value is safe to
414	/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
415	/// be passed in. Otherwise, return null.
416	///
417	/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
418	/// just replicate their input array and then pass on to memset_pattern16.
419	static Constant getMemSetPatternValue(Value V, const DataLayout *DL) {
420	// FIXME: This could check for UndefValue because it can be merged into any
421	// other valid pattern.
422
423	// If the value isn't a constant, we can't promote it to being in a constant
424	// array. We could theoretically do a store to an alloca or something, but
425	// that doesn't seem worthwhile.
426	Constant *C = dyn_cast<Constant>(V);
427	if (!C)
428	return nullptr;
429
430	// Only handle simple values that are a power of two bytes in size.
431	uint64_t Size = DL->getTypeSizeInBits(V->getType());
432	if (Size == 0 \|\| (Size & 7) \|\| (Size & (Size - 1)))
433	return nullptr;
434
435	// Don't care enough about darwin/ppc to implement this.
436	if (DL->isBigEndian())
437	return nullptr;
438
439	// Convert to size in bytes.
440	Size /= 8;
441
442	// TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
443	// if the top and bottom are the same (e.g. for vectors and large integers).
444	if (Size > 16)
445	return nullptr;
446
447	// If the constant is exactly 16 bytes, just use it.
448	if (Size == 16)
449	return C;
450
451	// Otherwise, we'll use an array of the constants.
452	unsigned ArraySize = 16 / Size;
453	ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
454	return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
455	}
456
457	LoopIdiomRecognize::LegalStoreKind
458	LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
459	// Don't touch volatile stores.
460	if (SI->isVolatile())
461	return LegalStoreKind::None;
462	// We only want simple or unordered-atomic stores.
463	if (!SI->isUnordered())
464	return LegalStoreKind::None;
465
466	// Avoid merging nontemporal stores.
467	if (SI->getMetadata(LLVMContext::MD_nontemporal))
468	return LegalStoreKind::None;
469
470	Value *StoredVal = SI->getValueOperand();
471	Value *StorePtr = SI->getPointerOperand();
472
473	// Don't convert stores of non-integral pointer types to memsets (which stores
474	// integers).
475	if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
476	return LegalStoreKind::None;
477
478	// Reject stores that are so large that they overflow an unsigned.
479	// When storing out scalable vectors we bail out for now, since the code
480	// below currently only works for constant strides.
481	TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
482	if (SizeInBits.isScalable() \|\| (SizeInBits.getFixedSize() & 7) \|\|
483	(SizeInBits.getFixedSize() >> 32) != 0)
484	return LegalStoreKind::None;
485
486	// See if the pointer expression is an AddRec like {base,+,1} on the current
487	// loop, which indicates a strided store. If we have something else, it's a
488	// random store we can't handle.
489	const SCEVAddRecExpr *StoreEv =
490	dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
491	if (!StoreEv \|\| StoreEv->getLoop() != CurLoop \|\| !StoreEv->isAffine())
492	return LegalStoreKind::None;
493
494	// Check to see if we have a constant stride.
495	if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
496	return LegalStoreKind::None;
497
498	// See if the store can be turned into a memset.
499
500	// If the stored value is a byte-wise value (like i32 -1), then it may be
501	// turned into a memset of i8 -1, assuming that all the consecutive bytes
502	// are stored. A store of i32 0x01020304 can never be turned into a memset,
503	// but it can be turned into memset_pattern if the target supports it.
504	Value SplatValue = isBytewiseValue(StoredVal, DL);
505	Constant *PatternValue = nullptr;
506
507	// Note: memset and memset_pattern on unordered-atomic is yet not supported
508	bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
509
510	// If we're allowed to form a memset, and the stored value would be
511	// acceptable for memset, use it.
512	if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
513	// Verify that the stored value is loop invariant. If not, we can't
514	// promote the memset.
515	CurLoop->isLoopInvariant(SplatValue)) {
516	// It looks like we can use SplatValue.
517	return LegalStoreKind::Memset;
518	} else if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
519	// Don't create memset_pattern16s with address spaces.
520	StorePtr->getType()->getPointerAddressSpace() == 0 &&
521	(PatternValue = getMemSetPatternValue(StoredVal, DL))) {
	Although the value stored to 'PatternValue' is used in the enclosing expression, the value is never actually read from 'PatternValue'
522	// It looks like we can use PatternValue!
523	return LegalStoreKind::MemsetPattern;
524	}
525
526	// Otherwise, see if the store can be turned into a memcpy.
527	if (HasMemcpy && !DisableLIRP::Memcpy) {
528	// Check to see if the stride matches the size of the store. If so, then we
529	// know that every byte is touched in the loop.
530	APInt Stride = getStoreStride(StoreEv);
531	unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
532	if (StoreSize != Stride && StoreSize != -Stride)
533	return LegalStoreKind::None;
534
535	// The store must be feeding a non-volatile load.
536	LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
537
538	// Only allow non-volatile loads
539	if (!LI \|\| LI->isVolatile())
540	return LegalStoreKind::None;
541	// Only allow simple or unordered-atomic loads
542	if (!LI->isUnordered())
543	return LegalStoreKind::None;
544
545	// See if the pointer expression is an AddRec like {base,+,1} on the current
546	// loop, which indicates a strided load. If we have something else, it's a
547	// random load we can't handle.
548	const SCEVAddRecExpr *LoadEv =
549	dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
550	if (!LoadEv \|\| LoadEv->getLoop() != CurLoop \|\| !LoadEv->isAffine())
551	return LegalStoreKind::None;
552
553	// The store and load must share the same stride.
554	if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
555	return LegalStoreKind::None;
556
557	// Success. This store can be converted into a memcpy.
558	UnorderedAtomic = UnorderedAtomic \|\| LI->isAtomic();
559	return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
560	: LegalStoreKind::Memcpy;
561	}
562	// This store can't be transformed into a memset/memcpy.
563	return LegalStoreKind::None;
564	}
565
566	void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
567	StoreRefsForMemset.clear();
568	StoreRefsForMemsetPattern.clear();
569	StoreRefsForMemcpy.clear();
570	for (Instruction &I : *BB) {
571	StoreInst *SI = dyn_cast<StoreInst>(&I);
572	if (!SI)
573	continue;
574
575	// Make sure this is a strided store with a constant stride.
576	switch (isLegalStore(SI)) {
577	case LegalStoreKind::None:
578	// Nothing to do
579	break;
580	case LegalStoreKind::Memset: {
581	// Find the base pointer.
582	Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
583	StoreRefsForMemset[Ptr].push_back(SI);
584	} break;
585	case LegalStoreKind::MemsetPattern: {
586	// Find the base pointer.
587	Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
588	StoreRefsForMemsetPattern[Ptr].push_back(SI);
589	} break;
590	case LegalStoreKind::Memcpy:
591	case LegalStoreKind::UnorderedAtomicMemcpy:
592	StoreRefsForMemcpy.push_back(SI);
593	break;
594	default:
595	assert(false && "unhandled return value")((false && "unhandled return value") ? static_cast< void> (0) : __assert_fail ("false && \"unhandled return value\"" , "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 595, __PRETTY_FUNCTION__));
596	break;
597	}
598	}
599	}
600
601	/// runOnLoopBlock - Process the specified block, which lives in a counted loop
602	/// with the specified backedge count. This block is known to be in the current
603	/// loop and not in any subloops.
604	bool LoopIdiomRecognize::runOnLoopBlock(
605	BasicBlock BB, const SCEV BECount,
606	SmallVectorImpl<BasicBlock *> &ExitBlocks) {
607	// We can only promote stores in this block if they are unconditionally
608	// executed in the loop. For a block to be unconditionally executed, it has
609	// to dominate all the exit blocks of the loop. Verify this now.
610	for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
611	if (!DT->dominates(BB, ExitBlocks[i]))
612	return false;
613
614	bool MadeChange = false;
615	// Look for store instructions, which may be optimized to memset/memcpy.
616	collectStores(BB);
617
618	// Look for a single store or sets of stores with a common base, which can be
619	// optimized into a memset (memset_pattern). The latter most commonly happens
620	// with structs and handunrolled loops.
621	for (auto &SL : StoreRefsForMemset)
622	MadeChange \|= processLoopStores(SL.second, BECount, ForMemset::Yes);
623
624	for (auto &SL : StoreRefsForMemsetPattern)
625	MadeChange \|= processLoopStores(SL.second, BECount, ForMemset::No);
626
627	// Optimize the store into a memcpy, if it feeds an similarly strided load.
628	for (auto &SI : StoreRefsForMemcpy)
629	MadeChange \|= processLoopStoreOfLoopLoad(SI, BECount);
630
631	for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
632	Instruction Inst = &I++;
633	// Look for memset instructions, which may be optimized to a larger memset.
634	if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
635	WeakTrackingVH InstPtr(&*I);
636	if (!processLoopMemSet(MSI, BECount))
637	continue;
638	MadeChange = true;
639
640	// If processing the memset invalidated our iterator, start over from the
641	// top of the block.
642	if (!InstPtr)
643	I = BB->begin();
644	continue;
645	}
646	}
647
648	return MadeChange;
649	}
650
651	/// See if this store(s) can be promoted to a memset.
652	bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
653	const SCEV *BECount, ForMemset For) {
654	// Try to find consecutive stores that can be transformed into memsets.
655	SetVector<StoreInst *> Heads, Tails;
656	SmallDenseMap<StoreInst , StoreInst > ConsecutiveChain;
657
658	// Do a quadratic search on all of the given stores and find
659	// all of the pairs of stores that follow each other.
660	SmallVector<unsigned, 16> IndexQueue;
661	for (unsigned i = 0, e = SL.size(); i < e; ++i) {
662	assert(SL[i]->isSimple() && "Expected only non-volatile stores.")((SL[i]->isSimple() && "Expected only non-volatile stores." ) ? static_cast<void> (0) : __assert_fail ("SL[i]->isSimple() && \"Expected only non-volatile stores.\"" , "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 662, __PRETTY_FUNCTION__));
663
664	Value *FirstStoredVal = SL[i]->getValueOperand();
665	Value *FirstStorePtr = SL[i]->getPointerOperand();
666	const SCEVAddRecExpr *FirstStoreEv =
667	cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
668	APInt FirstStride = getStoreStride(FirstStoreEv);
669	unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
670
671	// See if we can optimize just this store in isolation.
672	if (FirstStride == FirstStoreSize \|\| -FirstStride == FirstStoreSize) {
673	Heads.insert(SL[i]);
674	continue;
675	}
676
677	Value *FirstSplatValue = nullptr;
678	Constant *FirstPatternValue = nullptr;
679
680	if (For == ForMemset::Yes)
681	FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
682	else
683	FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
684
685	assert((FirstSplatValue \|\| FirstPatternValue) &&(((FirstSplatValue \|\| FirstPatternValue) && "Expected either splat value or pattern value." ) ? static_cast<void> (0) : __assert_fail ("(FirstSplatValue \|\| FirstPatternValue) && \"Expected either splat value or pattern value.\"" , "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 686, __PRETTY_FUNCTION__))
686	"Expected either splat value or pattern value.")(((FirstSplatValue \|\| FirstPatternValue) && "Expected either splat value or pattern value." ) ? static_cast<void> (0) : __assert_fail ("(FirstSplatValue \|\| FirstPatternValue) && \"Expected either splat value or pattern value.\"" , "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 686, __PRETTY_FUNCTION__));
687
688	IndexQueue.clear();
689	// If a store has multiple consecutive store candidates, search Stores
690	// array according to the sequence: from i+1 to e, then from i-1 to 0.
691	// This is because usually pairing with immediate succeeding or preceding
692	// candidate create the best chance to find memset opportunity.
693	unsigned j = 0;
694	for (j = i + 1; j < e; ++j)
695	IndexQueue.push_back(j);
696	for (j = i; j > 0; --j)
697	IndexQueue.push_back(j - 1);
698
699	for (auto &k : IndexQueue) {
700	assert(SL[k]->isSimple() && "Expected only non-volatile stores.")((SL[k]->isSimple() && "Expected only non-volatile stores." ) ? static_cast<void> (0) : __assert_fail ("SL[k]->isSimple() && \"Expected only non-volatile stores.\"" , "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 700, __PRETTY_FUNCTION__));
701	Value *SecondStorePtr = SL[k]->getPointerOperand();
702	const SCEVAddRecExpr *SecondStoreEv =
703	cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
704	APInt SecondStride = getStoreStride(SecondStoreEv);
705
706	if (FirstStride != SecondStride)
707	continue;
708
709	Value *SecondStoredVal = SL[k]->getValueOperand();
710	Value *SecondSplatValue = nullptr;
711	Constant *SecondPatternValue = nullptr;
712
713	if (For == ForMemset::Yes)
714	SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
715	else
716	SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
717
718	assert((SecondSplatValue \|\| SecondPatternValue) &&(((SecondSplatValue \|\| SecondPatternValue) && "Expected either splat value or pattern value." ) ? static_cast<void> (0) : __assert_fail ("(SecondSplatValue \|\| SecondPatternValue) && \"Expected either splat value or pattern value.\"" , "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 719, __PRETTY_FUNCTION__))
719	"Expected either splat value or pattern value.")(((SecondSplatValue \|\| SecondPatternValue) && "Expected either splat value or pattern value." ) ? static_cast<void> (0) : __assert_fail ("(SecondSplatValue \|\| SecondPatternValue) && \"Expected either splat value or pattern value.\"" , "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 719, __PRETTY_FUNCTION__));
720
721	if (isConsecutiveAccess(SL[i], SL[k], DL, SE, false)) {
722	if (For == ForMemset::Yes) {
723	if (isa<UndefValue>(FirstSplatValue))
724	FirstSplatValue = SecondSplatValue;
725	if (FirstSplatValue != SecondSplatValue)
726	continue;
727	} else {
728	if (isa<UndefValue>(FirstPatternValue))
729	FirstPatternValue = SecondPatternValue;
730	if (FirstPatternValue != SecondPatternValue)
731	continue;
732	}
733	Tails.insert(SL[k]);
734	Heads.insert(SL[i]);
735	ConsecutiveChain[SL[i]] = SL[k];
736	break;
737	}
738	}
739	}
740
741	// We may run into multiple chains that merge into a single chain. We mark the
742	// stores that we transformed so that we don't visit the same store twice.
743	SmallPtrSet<Value *, 16> TransformedStores;
744	bool Changed = false;
745
746	// For stores that start but don't end a link in the chain:
747	for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
748	it != e; ++it) {
749	if (Tails.count(*it))
750	continue;
751
752	// We found a store instr that starts a chain. Now follow the chain and try
753	// to transform it.
754	SmallPtrSet<Instruction *, 8> AdjacentStores;
755	StoreInst I = it;
756
757	StoreInst *HeadStore = I;
758	unsigned StoreSize = 0;
759
760	// Collect the chain into a list.
761	while (Tails.count(I) \|\| Heads.count(I)) {
762	if (TransformedStores.count(I))
763	break;
764	AdjacentStores.insert(I);
765
766	StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
767	// Move to the next value in the chain.
768	I = ConsecutiveChain[I];
769	}
770
771	Value *StoredVal = HeadStore->getValueOperand();
772	Value *StorePtr = HeadStore->getPointerOperand();
773	const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
774	APInt Stride = getStoreStride(StoreEv);
775
776	// Check to see if the stride matches the size of the stores. If so, then
777	// we know that every byte is touched in the loop.
778	if (StoreSize != Stride && StoreSize != -Stride)
779	continue;
780
781	bool NegStride = StoreSize == -Stride;
782
783	if (processLoopStridedStore(StorePtr, StoreSize,
784	MaybeAlign(HeadStore->getAlignment()),
785	StoredVal, HeadStore, AdjacentStores, StoreEv,
786	BECount, NegStride)) {
787	TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
788	Changed = true;
789	}
790	}
791
792	return Changed;
793	}
794
795	/// processLoopMemSet - See if this memset can be promoted to a large memset.
796	bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
797	const SCEV *BECount) {
798	// We can only handle non-volatile memsets with a constant size.
799	if (MSI->isVolatile() \|\| !isa<ConstantInt>(MSI->getLength()))
800	return false;
801
802	// If we're not allowed to hack on memset, we fail.
803	if (!HasMemset)
804	return false;
805
806	Value *Pointer = MSI->getDest();
807
808	// See if the pointer expression is an AddRec like {base,+,1} on the current
809	// loop, which indicates a strided store. If we have something else, it's a
810	// random store we can't handle.
811	const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
812	if (!Ev \|\| Ev->getLoop() != CurLoop \|\| !Ev->isAffine())
813	return false;
814
815	// Reject memsets that are so large that they overflow an unsigned.
816	uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
817	if ((SizeInBytes >> 32) != 0)
818	return false;
819
820	// Check to see if the stride matches the size of the memset. If so, then we
821	// know that every byte is touched in the loop.
822	const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
823	if (!ConstStride)
824	return false;
825
826	APInt Stride = ConstStride->getAPInt();
827	if (SizeInBytes != Stride && SizeInBytes != -Stride)
828	return false;
829
830	// Verify that the memset value is loop invariant. If not, we can't promote
831	// the memset.
832	Value *SplatValue = MSI->getValue();
833	if (!SplatValue \|\| !CurLoop->isLoopInvariant(SplatValue))
834	return false;
835
836	SmallPtrSet<Instruction *, 1> MSIs;
837	MSIs.insert(MSI);
838	bool NegStride = SizeInBytes == -Stride;
839	return processLoopStridedStore(
840	Pointer, (unsigned)SizeInBytes, MaybeAlign(MSI->getDestAlignment()),
841	SplatValue, MSI, MSIs, Ev, BECount, NegStride, /IsLoopMemset=/true);
842	}
843
844	/// mayLoopAccessLocation - Return true if the specified loop might access the
845	/// specified pointer location, which is a loop-strided access. The 'Access'
846	/// argument specifies what the verboten forms of access are (read or write).
847	static bool
848	mayLoopAccessLocation(Value Ptr, ModRefInfo Access, Loop L,
849	const SCEV *BECount, unsigned StoreSize,
850	AliasAnalysis &AA,
851	SmallPtrSetImpl<Instruction *> &IgnoredStores) {
852	// Get the location that may be stored across the loop. Since the access is
853	// strided positively through memory, we say that the modified location starts
854	// at the pointer and has infinite size.
855	LocationSize AccessSize = LocationSize::afterPointer();
856
857	// If the loop iterates a fixed number of times, we can refine the access size
858	// to be exactly the size of the memset, which is (BECount+1)*StoreSize
859	if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
860	AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
861	StoreSize);
862
863	// TODO: For this to be really effective, we have to dive into the pointer
864	// operand in the store. Store to &A[i] of 100 will always return may alias
865	// with store of &A[100], we need to StoreLoc to be "A" with size of 100,
866	// which will then no-alias a store to &A[100].
867	MemoryLocation StoreLoc(Ptr, AccessSize);
868
869	for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
870	++BI)
871	for (Instruction &I : **BI)
872	if (IgnoredStores.count(&I) == 0 &&
873	isModOrRefSet(
874	intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
875	return true;
876
877	return false;
878	}
879
880	// If we have a negative stride, Start refers to the end of the memory location
881	// we're trying to memset. Therefore, we need to recompute the base pointer,
882	// which is just Start - BECount*Size.
883	static const SCEV getStartForNegStride(const SCEV Start, const SCEV *BECount,
884	Type *IntPtr, unsigned StoreSize,
885	ScalarEvolution *SE) {
886	const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
887	if (StoreSize != 1)
888	Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
889	SCEV::FlagNUW);
890	return SE->getMinusSCEV(Start, Index);
891	}
892
893	/// Compute the number of bytes as a SCEV from the backedge taken count.
894	///
895	/// This also maps the SCEV into the provided type and tries to handle the
896	/// computation in a way that will fold cleanly.
897	static const SCEV getNumBytes(const SCEV BECount, Type *IntPtr,
898	unsigned StoreSize, Loop *CurLoop,
899	const DataLayout DL, ScalarEvolution SE) {
900	const SCEV *NumBytesS;
901	// The # stored bytes is (BECount+1)*Size. Expand the trip count out to
902	// pointer size if it isn't already.
903	//
904	// If we're going to need to zero extend the BE count, check if we can add
905	// one to it prior to zero extending without overflow. Provided this is safe,
906	// it allows better simplification of the +1.
907	if (DL->getTypeSizeInBits(BECount->getType()).getFixedSize() <
908	DL->getTypeSizeInBits(IntPtr).getFixedSize() &&
909	SE->isLoopEntryGuardedByCond(
910	CurLoop, ICmpInst::ICMP_NE, BECount,
911	SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
912	NumBytesS = SE->getZeroExtendExpr(
913	SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
914	IntPtr);
915	} else {
916	NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
917	SE->getOne(IntPtr), SCEV::FlagNUW);
918	}
919
920	// And scale it based on the store size.
921	if (StoreSize != 1) {
922	NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
923	SCEV::FlagNUW);
924	}
925	return NumBytesS;
926	}
927
928	/// processLoopStridedStore - We see a strided store of some value. If we can
929	/// transform this into a memset or memset_pattern in the loop preheader, do so.
930	bool LoopIdiomRecognize::processLoopStridedStore(
931	Value *DestPtr, unsigned StoreSize, MaybeAlign StoreAlignment,
932	Value StoredVal, Instruction TheStore,
933	SmallPtrSetImpl<Instruction > &Stores, const SCEVAddRecExpr Ev,
934	const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
935	Value SplatValue = isBytewiseValue(StoredVal, DL);
936	Constant *PatternValue = nullptr;
937
938	if (!SplatValue)
939	PatternValue = getMemSetPatternValue(StoredVal, DL);
940
941	assert((SplatValue \|\| PatternValue) &&(((SplatValue \|\| PatternValue) && "Expected either splat value or pattern value." ) ? static_cast<void> (0) : __assert_fail ("(SplatValue \|\| PatternValue) && \"Expected either splat value or pattern value.\"" , "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 942, __PRETTY_FUNCTION__))
942	"Expected either splat value or pattern value.")(((SplatValue \|\| PatternValue) && "Expected either splat value or pattern value." ) ? static_cast<void> (0) : __assert_fail ("(SplatValue \|\| PatternValue) && \"Expected either splat value or pattern value.\"" , "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 942, __PRETTY_FUNCTION__));
943
944	// The trip count of the loop and the base pointer of the addrec SCEV is
945	// guaranteed to be loop invariant, which means that it should dominate the
946	// header. This allows us to insert code for it in the preheader.
947	unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
948	BasicBlock *Preheader = CurLoop->getLoopPreheader();
949	IRBuilder<> Builder(Preheader->getTerminator());
950	SCEVExpander Expander(SE, DL, "loop-idiom");
951	SCEVExpanderCleaner ExpCleaner(Expander, *DT);
952
953	Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
954	Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
955
956	bool Changed = false;
957	const SCEV *Start = Ev->getStart();
958	// Handle negative strided loops.
959	if (NegStride)
960	Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSize, SE);
961
962	// TODO: ideally we should still be able to generate memset if SCEV expander
963	// is taught to generate the dependencies at the latest point.
964	if (!isSafeToExpand(Start, *SE))
965	return Changed;
966
967	// Okay, we have a strided store "p[i]" of a splattable value. We can turn
968	// this into a memset in the loop preheader now if we want. However, this
969	// would be unsafe to do if there is anything else in the loop that may read
970	// or write to the aliased location. Check for any overlap by generating the
971	// base pointer and checking the region.
972	Value *BasePtr =
973	Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
974
975	// From here on out, conservatively report to the pass manager that we've
976	// changed the IR, even if we later clean up these added instructions. There
977	// may be structural differences e.g. in the order of use lists not accounted
978	// for in just a textual dump of the IR. This is written as a variable, even
979	// though statically all the places this dominates could be replaced with
980	// 'true', with the hope that anyone trying to be clever / "more precise" with
981	// the return value will read this comment, and leave them alone.
982	Changed = true;
983
984	if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
985	StoreSize, *AA, Stores))
986	return Changed;
987
988	if (avoidLIRForMultiBlockLoop(/IsMemset=/true, IsLoopMemset))
989	return Changed;
990
991	// Okay, everything looks good, insert the memset.
992
993	const SCEV *NumBytesS =
994	getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
995
996	// TODO: ideally we should still be able to generate memset if SCEV expander
997	// is taught to generate the dependencies at the latest point.
998	if (!isSafeToExpand(NumBytesS, *SE))
999	return Changed;
1000
1001	Value *NumBytes =
1002	Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1003
1004	CallInst *NewCall;
1005	if (SplatValue) {
1006	NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes,
1007	MaybeAlign(StoreAlignment));
1008	} else {
1009	// Everything is emitted in default address space
1010	Type *Int8PtrTy = DestInt8PtrTy;
1011
1012	Module *M = TheStore->getModule();
1013	StringRef FuncName = "memset_pattern16";
1014	FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
1015	Int8PtrTy, Int8PtrTy, IntIdxTy);
1016	inferLibFuncAttributes(M, FuncName, *TLI);
1017
1018	// Otherwise we should form a memset_pattern16. PatternValue is known to be
1019	// an constant array of 16-bytes. Plop the value into a mergable global.
1020	GlobalVariable GV = new GlobalVariable(M, PatternValue->getType(), true,
1021	GlobalValue::PrivateLinkage,
1022	PatternValue, ".memset_pattern");
1023	GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
1024	GV->setAlignment(Align(16));
1025	Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
1026	NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
1027	}
1028	NewCall->setDebugLoc(TheStore->getDebugLoc());
1029
1030	if (MSSAU) {
1031	MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1032	NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1033	MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1034	}
1035
1036	LLVM_DEBUG(dbgs() << " Formed memset: " << NewCall << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " Formed memset: " << NewCall << "\n" << " from store to: " << Ev << " at: " << TheStore << "\n"; } } while (false)
1037	<< " from store to: " << Ev << " at: " << TheStoredo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " Formed memset: " << NewCall << "\n" << " from store to: " << Ev << " at: " << *TheStore << "\n"; } } while (false)
1038	<< "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " Formed memset: " << NewCall << "\n" << " from store to: " << Ev << " at: " << *TheStore << "\n"; } } while (false);
1039
1040	ORE.emit([&]() {
1041	return OptimizationRemark(DEBUG_TYPE"loop-idiom", "ProcessLoopStridedStore",
1042	NewCall->getDebugLoc(), Preheader)
1043	<< "Transformed loop-strided store into a call to "
1044	<< ore::NV("NewFunction", NewCall->getCalledFunction())
1045	<< "() function";
1046	});
1047
1048	// Okay, the memset has been formed. Zap the original store and anything that
1049	// feeds into it.
1050	for (auto *I : Stores) {
1051	if (MSSAU)
1052	MSSAU->removeMemoryAccess(I, true);
1053	deleteDeadInstruction(I);
1054	}
1055	if (MSSAU && VerifyMemorySSA)
1056	MSSAU->getMemorySSA()->verifyMemorySSA();
1057	++NumMemSet;
1058	ExpCleaner.markResultUsed();
1059	return true;
1060	}
1061
1062	/// If the stored value is a strided load in the same loop with the same stride
1063	/// this may be transformable into a memcpy. This kicks in for stuff like
1064	/// for (i) A[i] = B[i];
1065	bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
1066	const SCEV *BECount) {
1067	assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.")((SI->isUnordered() && "Expected only non-volatile non-ordered stores." ) ? static_cast<void> (0) : __assert_fail ("SI->isUnordered() && \"Expected only non-volatile non-ordered stores.\"" , "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 1067, __PRETTY_FUNCTION__));
1068
1069	Value *StorePtr = SI->getPointerOperand();
1070	const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1071	APInt Stride = getStoreStride(StoreEv);
1072	unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
1073	bool NegStride = StoreSize == -Stride;
1074
1075	// The store must be feeding a non-volatile load.
1076	LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
1077	assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.")((LI->isUnordered() && "Expected only non-volatile non-ordered loads." ) ? static_cast<void> (0) : __assert_fail ("LI->isUnordered() && \"Expected only non-volatile non-ordered loads.\"" , "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 1077, __PRETTY_FUNCTION__));
1078
1079	// See if the pointer expression is an AddRec like {base,+,1} on the current
1080	// loop, which indicates a strided load. If we have something else, it's a
1081	// random load we can't handle.
1082	const SCEVAddRecExpr *LoadEv =
1083	cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
1084
1085	// The trip count of the loop and the base pointer of the addrec SCEV is
1086	// guaranteed to be loop invariant, which means that it should dominate the
1087	// header. This allows us to insert code for it in the preheader.
1088	BasicBlock *Preheader = CurLoop->getLoopPreheader();
1089	IRBuilder<> Builder(Preheader->getTerminator());
1090	SCEVExpander Expander(SE, DL, "loop-idiom");
1091
1092	SCEVExpanderCleaner ExpCleaner(Expander, *DT);
1093
1094	bool Changed = false;
1095	const SCEV *StrStart = StoreEv->getStart();
1096	unsigned StrAS = SI->getPointerAddressSpace();
1097	Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
1098
1099	// Handle negative strided loops.
1100	if (NegStride)
1101	StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE);
1102
1103	// Okay, we have a strided store "p[i]" of a loaded value. We can turn
1104	// this into a memcpy in the loop preheader now if we want. However, this
1105	// would be unsafe to do if there is anything else in the loop that may read
1106	// or write the memory region we're storing to. This includes the load that
1107	// feeds the stores. Check for an alias by generating the base address and
1108	// checking everything.
1109	Value *StoreBasePtr = Expander.expandCodeFor(
1110	StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
1111
1112	// From here on out, conservatively report to the pass manager that we've
1113	// changed the IR, even if we later clean up these added instructions. There
1114	// may be structural differences e.g. in the order of use lists not accounted
1115	// for in just a textual dump of the IR. This is written as a variable, even
1116	// though statically all the places this dominates could be replaced with
1117	// 'true', with the hope that anyone trying to be clever / "more precise" with
1118	// the return value will read this comment, and leave them alone.
1119	Changed = true;
1120
1121	SmallPtrSet<Instruction *, 1> Stores;
1122	Stores.insert(SI);
1123	if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1124	StoreSize, *AA, Stores))
1125	return Changed;
1126
1127	const SCEV *LdStart = LoadEv->getStart();
1128	unsigned LdAS = LI->getPointerAddressSpace();
1129
1130	// Handle negative strided loops.
1131	if (NegStride)
1132	LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE);
1133
1134	// For a memcpy, we have to make sure that the input array is not being
1135	// mutated by the loop.
1136	Value *LoadBasePtr = Expander.expandCodeFor(
1137	LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
1138
1139	if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
1140	StoreSize, *AA, Stores))
1141	return Changed;
1142
1143	if (avoidLIRForMultiBlockLoop())
1144	return Changed;
1145
1146	// Okay, everything is safe, we can transform this!
1147
1148	const SCEV *NumBytesS =
1149	getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
1150
1151	Value *NumBytes =
1152	Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1153
1154	CallInst *NewCall = nullptr;
1155	// Check whether to generate an unordered atomic memcpy:
1156	// If the load or store are atomic, then they must necessarily be unordered
1157	// by previous checks.
1158	if (!SI->isAtomic() && !LI->isAtomic())
1159	NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr,
1160	LI->getAlign(), NumBytes);
1161	else {
1162	// We cannot allow unaligned ops for unordered load/store, so reject
1163	// anything where the alignment isn't at least the element size.
1164	const Align StoreAlign = SI->getAlign();
1165	const Align LoadAlign = LI->getAlign();
1166	if (StoreAlign < StoreSize \|\| LoadAlign < StoreSize)
1167	return Changed;
1168
1169	// If the element.atomic memcpy is not lowered into explicit
1170	// loads/stores later, then it will be lowered into an element-size
1171	// specific lib call. If the lib call doesn't exist for our store size, then
1172	// we shouldn't generate the memcpy.
1173	if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1174	return Changed;
1175
1176	// Create the call.
1177	// Note that unordered atomic loads/stores are required by the spec to
1178	// have an alignment but non-atomic loads/stores may not.
1179	NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1180	StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
1181	StoreSize);
1182	}
1183	NewCall->setDebugLoc(SI->getDebugLoc());
1184
1185	if (MSSAU) {
1186	MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1187	NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1188	MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1189	}
1190
1191	LLVM_DEBUG(dbgs() << " Formed memcpy: " << NewCall << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " Formed memcpy: " << NewCall << "\n" << " from load ptr=" << LoadEv << " at: " << LI << "\n" << " from store ptr=" << StoreEv << " at: " << SI << "\n"; } } while (false)
1192	<< " from load ptr=" << LoadEv << " at: " << LI << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " Formed memcpy: " << NewCall << "\n" << " from load ptr=" << LoadEv << " at: " << LI << "\n" << " from store ptr=" << StoreEv << " at: " << *SI << "\n"; } } while (false)
1193	<< " from store ptr=" << StoreEv << " at: " << SIdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " Formed memcpy: " << NewCall << "\n" << " from load ptr=" << LoadEv << " at: " << LI << "\n" << " from store ptr=" << StoreEv << " at: " << *SI << "\n"; } } while (false)
1194	<< "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " Formed memcpy: " << NewCall << "\n" << " from load ptr=" << LoadEv << " at: " << LI << "\n" << " from store ptr=" << StoreEv << " at: " << *SI << "\n"; } } while (false);
1195
1196	ORE.emit([&]() {
1197	return OptimizationRemark(DEBUG_TYPE"loop-idiom", "ProcessLoopStoreOfLoopLoad",
1198	NewCall->getDebugLoc(), Preheader)
1199	<< "Formed a call to "
1200	<< ore::NV("NewFunction", NewCall->getCalledFunction())
1201	<< "() function";
1202	});
1203
1204	// Okay, the memcpy has been formed. Zap the original store and anything that
1205	// feeds into it.
1206	if (MSSAU)
1207	MSSAU->removeMemoryAccess(SI, true);
1208	deleteDeadInstruction(SI);
1209	if (MSSAU && VerifyMemorySSA)
1210	MSSAU->getMemorySSA()->verifyMemorySSA();
1211	++NumMemCpy;
1212	ExpCleaner.markResultUsed();
1213	return true;
1214	}
1215
1216	// When compiling for codesize we avoid idiom recognition for a multi-block loop
1217	// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1218	//
1219	bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1220	bool IsLoopMemset) {
1221	if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1222	if (CurLoop->isOutermost() && (!IsMemset \|\| !IsLoopMemset)) {
1223	LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " " << CurLoop->getHeader ()->getParent()->getName() << " : LIR " << ( IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n" ; } } while (false)
1224	<< " : LIR " << (IsMemset ? "Memset" : "Memcpy")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " " << CurLoop->getHeader ()->getParent()->getName() << " : LIR " << ( IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n" ; } } while (false)
1225	<< " avoided: multi-block top-level loop\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " " << CurLoop->getHeader ()->getParent()->getName() << " : LIR " << ( IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n" ; } } while (false);
1226	return true;
1227	}
1228	}
1229
1230	return false;
1231	}
1232
1233	bool LoopIdiomRecognize::runOnNoncountableLoop() {
1234	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F[" << CurLoop->getHeader()->getParent()->getName () << "] Noncountable Loop %" << CurLoop->getHeader ()->getName() << "\n"; } } while (false)
1235	<< CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F[" << CurLoop->getHeader()->getParent()->getName () << "] Noncountable Loop %" << CurLoop->getHeader ()->getName() << "\n"; } } while (false)
1236	<< "] Noncountable Loop %"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F[" << CurLoop->getHeader()->getParent()->getName () << "] Noncountable Loop %" << CurLoop->getHeader ()->getName() << "\n"; } } while (false)
1237	<< CurLoop->getHeader()->getName() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F[" << CurLoop->getHeader()->getParent()->getName () << "] Noncountable Loop %" << CurLoop->getHeader ()->getName() << "\n"; } } while (false);
1238
1239	return recognizePopcount() \|\| recognizeAndInsertFFS() \|\|
1240	recognizeShiftUntilBitTest();
1241	}
1242
1243	/// Check if the given conditional branch is based on the comparison between
1244	/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
1245	/// true), the control yields to the loop entry. If the branch matches the
1246	/// behavior, the variable involved in the comparison is returned. This function
1247	/// will be called to see if the precondition and postcondition of the loop are
1248	/// in desirable form.
1249	static Value matchCondition(BranchInst BI, BasicBlock *LoopEntry,
1250	bool JmpOnZero = false) {
1251	if (!BI \|\| !BI->isConditional())
1252	return nullptr;
1253
1254	ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1255	if (!Cond)
1256	return nullptr;
1257
1258	ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1259	if (!CmpZero \|\| !CmpZero->isZero())
1260	return nullptr;
1261
1262	BasicBlock *TrueSucc = BI->getSuccessor(0);
1263	BasicBlock *FalseSucc = BI->getSuccessor(1);
1264	if (JmpOnZero)
1265	std::swap(TrueSucc, FalseSucc);
1266
1267	ICmpInst::Predicate Pred = Cond->getPredicate();
1268	if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) \|\|
1269	(Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1270	return Cond->getOperand(0);
1271
1272	return nullptr;
1273	}
1274
1275	// Check if the recurrence variable `VarX` is in the right form to create
1276	// the idiom. Returns the value coerced to a PHINode if so.
1277	static PHINode getRecurrenceVar(Value VarX, Instruction *DefX,
1278	BasicBlock *LoopEntry) {
1279	auto *PhiX = dyn_cast<PHINode>(VarX);
1280	if (PhiX && PhiX->getParent() == LoopEntry &&
1281	(PhiX->getOperand(0) == DefX \|\| PhiX->getOperand(1) == DefX))
1282	return PhiX;
1283	return nullptr;
1284	}
1285
1286	/// Return true iff the idiom is detected in the loop.
1287	///
1288	/// Additionally:
1289	/// 1) \p CntInst is set to the instruction counting the population bit.
1290	/// 2) \p CntPhi is set to the corresponding phi node.
1291	/// 3) \p Var is set to the value whose population bits are being counted.
1292	///
1293	/// The core idiom we are trying to detect is:
1294	/// \code
1295	/// if (x0 != 0)
1296	/// goto loop-exit // the precondition of the loop
1297	/// cnt0 = init-val;
1298	/// do {
1299	/// x1 = phi (x0, x2);
1300	/// cnt1 = phi(cnt0, cnt2);
1301	///
1302	/// cnt2 = cnt1 + 1;
1303	/// ...
1304	/// x2 = x1 & (x1 - 1);
1305	/// ...
1306	/// } while(x != 0);
1307	///
1308	/// loop-exit:
1309	/// \endcode
1310	static bool detectPopcountIdiom(Loop CurLoop, BasicBlock PreCondBB,
1311	Instruction &CntInst, PHINode &CntPhi,
1312	Value *&Var) {
1313	// step 1: Check to see if the look-back branch match this pattern:
1314	// "if (a!=0) goto loop-entry".
1315	BasicBlock *LoopEntry;
1316	Instruction DefX2, CountInst;
1317	Value VarX1, VarX0;
1318	PHINode PhiX, CountPhi;
1319
1320	DefX2 = CountInst = nullptr;
1321	VarX1 = VarX0 = nullptr;
1322	PhiX = CountPhi = nullptr;
1323	LoopEntry = *(CurLoop->block_begin());
1324
1325	// step 1: Check if the loop-back branch is in desirable form.
1326	{
1327	if (Value *T = matchCondition(
1328	dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1329	DefX2 = dyn_cast<Instruction>(T);
1330	else
1331	return false;
1332	}
1333
1334	// step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1335	{
1336	if (!DefX2 \|\| DefX2->getOpcode() != Instruction::And)
1337	return false;
1338
1339	BinaryOperator *SubOneOp;
1340
1341	if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1342	VarX1 = DefX2->getOperand(1);
1343	else {
1344	VarX1 = DefX2->getOperand(0);
1345	SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1346	}
1347	if (!SubOneOp \|\| SubOneOp->getOperand(0) != VarX1)
1348	return false;
1349
1350	ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1351	if (!Dec \|\|
1352	!((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) \|\|
1353	(SubOneOp->getOpcode() == Instruction::Add &&
1354	Dec->isMinusOne()))) {
1355	return false;
1356	}
1357	}
1358
1359	// step 3: Check the recurrence of variable X
1360	PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1361	if (!PhiX)
1362	return false;
1363
1364	// step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
1365	{
1366	CountInst = nullptr;
1367	for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1368	IterE = LoopEntry->end();
1369	Iter != IterE; Iter++) {
1370	Instruction Inst = &Iter;
1371	if (Inst->getOpcode() != Instruction::Add)
1372	continue;
1373
1374	ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1375	if (!Inc \|\| !Inc->isOne())
1376	continue;
1377
1378	PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1379	if (!Phi)
1380	continue;
1381
1382	// Check if the result of the instruction is live of the loop.
1383	bool LiveOutLoop = false;
1384	for (User *U : Inst->users()) {
1385	if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1386	LiveOutLoop = true;
1387	break;
1388	}
1389	}
1390
1391	if (LiveOutLoop) {
1392	CountInst = Inst;
1393	CountPhi = Phi;
1394	break;
1395	}
1396	}
1397
1398	if (!CountInst)
1399	return false;
1400	}
1401
1402	// step 5: check if the precondition is in this form:
1403	// "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1404	{
1405	auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1406	Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1407	if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1408	return false;
1409
1410	CntInst = CountInst;
1411	CntPhi = CountPhi;
1412	Var = T;
1413	}
1414
1415	return true;
1416	}
1417
1418	/// Return true if the idiom is detected in the loop.
1419	///
1420	/// Additionally:
1421	/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1422	/// or nullptr if there is no such.
1423	/// 2) \p CntPhi is set to the corresponding phi node
1424	/// or nullptr if there is no such.
1425	/// 3) \p Var is set to the value whose CTLZ could be used.
1426	/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1427	///
1428	/// The core idiom we are trying to detect is:
1429	/// \code
1430	/// if (x0 == 0)
1431	/// goto loop-exit // the precondition of the loop
1432	/// cnt0 = init-val;
1433	/// do {
1434	/// x = phi (x0, x.next); //PhiX
1435	/// cnt = phi(cnt0, cnt.next);
1436	///
1437	/// cnt.next = cnt + 1;
1438	/// ...
1439	/// x.next = x >> 1; // DefX
1440	/// ...
1441	/// } while(x.next != 0);
1442	///
1443	/// loop-exit:
1444	/// \endcode
1445	static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1446	Intrinsic::ID &IntrinID, Value *&InitX,
1447	Instruction &CntInst, PHINode &CntPhi,
1448	Instruction *&DefX) {
1449	BasicBlock *LoopEntry;
1450	Value *VarX = nullptr;
1451
1452	DefX = nullptr;
1453	CntInst = nullptr;
1454	CntPhi = nullptr;
1455	LoopEntry = *(CurLoop->block_begin());
1456
1457	// step 1: Check if the loop-back branch is in desirable form.
1458	if (Value *T = matchCondition(
1459	dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1460	DefX = dyn_cast<Instruction>(T);
1461	else
1462	return false;
1463
1464	// step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1465	if (!DefX \|\| !DefX->isShift())
1466	return false;
1467	IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1468	Intrinsic::ctlz;
1469	ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1470	if (!Shft \|\| !Shft->isOne())
1471	return false;
1472	VarX = DefX->getOperand(0);
1473
1474	// step 3: Check the recurrence of variable X
1475	PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1476	if (!PhiX)
1477	return false;
1478
1479	InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1480
1481	// Make sure the initial value can't be negative otherwise the ashr in the
1482	// loop might never reach zero which would make the loop infinite.
1483	if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1484	return false;
1485
1486	// step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
1487	// or cnt.next = cnt + -1.
1488	// TODO: We can skip the step. If loop trip count is known (CTLZ),
1489	// then all uses of "cnt.next" could be optimized to the trip count
1490	// plus "cnt0". Currently it is not optimized.
1491	// This step could be used to detect POPCNT instruction:
1492	// cnt.next = cnt + (x.next & 1)
1493	for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1494	IterE = LoopEntry->end();
1495	Iter != IterE; Iter++) {
1496	Instruction Inst = &Iter;
1497	if (Inst->getOpcode() != Instruction::Add)
1498	continue;
1499
1500	ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1501	if (!Inc \|\| (!Inc->isOne() && !Inc->isMinusOne()))
1502	continue;
1503
1504	PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1505	if (!Phi)
1506	continue;
1507
1508	CntInst = Inst;
1509	CntPhi = Phi;
1510	break;
1511	}
1512	if (!CntInst)
1513	return false;
1514
1515	return true;
1516	}
1517
1518	/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1519	/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
1520	/// trip count returns true; otherwise, returns false.
1521	bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1522	// Give up if the loop has multiple blocks or multiple backedges.
1523	if (CurLoop->getNumBackEdges() != 1 \|\| CurLoop->getNumBlocks() != 1)
1524	return false;
1525
1526	Intrinsic::ID IntrinID;
1527	Value *InitX;
1528	Instruction *DefX = nullptr;
1529	PHINode *CntPhi = nullptr;
1530	Instruction *CntInst = nullptr;
1531	// Help decide if transformation is profitable. For ShiftUntilZero idiom,
1532	// this is always 6.
1533	size_t IdiomCanonicalSize = 6;
1534
1535	if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1536	CntInst, CntPhi, DefX))
1537	return false;
1538
1539	bool IsCntPhiUsedOutsideLoop = false;
1540	for (User *U : CntPhi->users())
1541	if (!CurLoop->contains(cast<Instruction>(U))) {
1542	IsCntPhiUsedOutsideLoop = true;
1543	break;
1544	}
1545	bool IsCntInstUsedOutsideLoop = false;
1546	for (User *U : CntInst->users())
1547	if (!CurLoop->contains(cast<Instruction>(U))) {
1548	IsCntInstUsedOutsideLoop = true;
1549	break;
1550	}
1551	// If both CntInst and CntPhi are used outside the loop the profitability
1552	// is questionable.
1553	if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1554	return false;
1555
1556	// For some CPUs result of CTLZ(X) intrinsic is undefined
1557	// when X is 0. If we can not guarantee X != 0, we need to check this
1558	// when expand.
1559	bool ZeroCheck = false;
1560	// It is safe to assume Preheader exist as it was checked in
1561	// parent function RunOnLoop.
1562	BasicBlock *PH = CurLoop->getLoopPreheader();
1563
1564	// If we are using the count instruction outside the loop, make sure we
1565	// have a zero check as a precondition. Without the check the loop would run
1566	// one iteration for before any check of the input value. This means 0 and 1
1567	// would have identical behavior in the original loop and thus
1568	if (!IsCntPhiUsedOutsideLoop) {
1569	auto *PreCondBB = PH->getSinglePredecessor();
1570	if (!PreCondBB)
1571	return false;
1572	auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1573	if (!PreCondBI)
1574	return false;
1575	if (matchCondition(PreCondBI, PH) != InitX)
1576	return false;
1577	ZeroCheck = true;
1578	}
1579
1580	// Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1581	// profitable if we delete the loop.
1582
1583	// the loop has only 6 instructions:
1584	// %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1585	// %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1586	// %shr = ashr %n.addr.0, 1
1587	// %tobool = icmp eq %shr, 0
1588	// %inc = add nsw %i.0, 1
1589	// br i1 %tobool
1590
1591	const Value *Args[] = {InitX,
1592	ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
1593
1594	// @llvm.dbg doesn't count as they have no semantic effect.
1595	auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1596	uint32_t HeaderSize =
1597	std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1598
1599	IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1600	InstructionCost Cost =
1601	TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
1602	if (HeaderSize != IdiomCanonicalSize &&
1603	Cost > TargetTransformInfo::TCC_Basic)
1604	return false;
1605
1606	transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1607	DefX->getDebugLoc(), ZeroCheck,
1608	IsCntPhiUsedOutsideLoop);
1609	return true;
1610	}
1611
1612	/// Recognizes a population count idiom in a non-countable loop.
1613	///
1614	/// If detected, transforms the relevant code to issue the popcount intrinsic
1615	/// function call, and returns true; otherwise, returns false.
1616	bool LoopIdiomRecognize::recognizePopcount() {
1617	if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1618	return false;
1619
1620	// Counting population are usually conducted by few arithmetic instructions.
1621	// Such instructions can be easily "absorbed" by vacant slots in a
1622	// non-compact loop. Therefore, recognizing popcount idiom only makes sense
1623	// in a compact loop.
1624
1625	// Give up if the loop has multiple blocks or multiple backedges.
1626	if (CurLoop->getNumBackEdges() != 1 \|\| CurLoop->getNumBlocks() != 1)
1627	return false;
1628
1629	BasicBlock LoopBody = (CurLoop->block_begin());
1630	if (LoopBody->size() >= 20) {
1631	// The loop is too big, bail out.
1632	return false;
1633	}
1634
1635	// It should have a preheader containing nothing but an unconditional branch.
1636	BasicBlock *PH = CurLoop->getLoopPreheader();
1637	if (!PH \|\| &PH->front() != PH->getTerminator())
1638	return false;
1639	auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1640	if (!EntryBI \|\| EntryBI->isConditional())
1641	return false;
1642
1643	// It should have a precondition block where the generated popcount intrinsic
1644	// function can be inserted.
1645	auto *PreCondBB = PH->getSinglePredecessor();
1646	if (!PreCondBB)
1647	return false;
1648	auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1649	if (!PreCondBI \|\| PreCondBI->isUnconditional())
1650	return false;
1651
1652	Instruction *CntInst;
1653	PHINode *CntPhi;
1654	Value *Val;
1655	if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1656	return false;
1657
1658	transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1659	return true;
1660	}
1661
1662	static CallInst createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value Val,
1663	const DebugLoc &DL) {
1664	Value *Ops[] = {Val};
1665	Type *Tys[] = {Val->getType()};
1666
1667	Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1668	Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1669	CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1670	CI->setDebugLoc(DL);
1671
1672	return CI;
1673	}
1674
1675	static CallInst createFFSIntrinsic(IRBuilder<> &IRBuilder, Value Val,
1676	const DebugLoc &DL, bool ZeroCheck,
1677	Intrinsic::ID IID) {
1678	Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
1679	Type *Tys[] = {Val->getType()};
1680
1681	Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1682	Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
1683	CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1684	CI->setDebugLoc(DL);
1685
1686	return CI;
1687	}
1688
1689	/// Transform the following loop (Using CTLZ, CTTZ is similar):
1690	/// loop:
1691	/// CntPhi = PHI [Cnt0, CntInst]
1692	/// PhiX = PHI [InitX, DefX]
1693	/// CntInst = CntPhi + 1
1694	/// DefX = PhiX >> 1
1695	/// LOOP_BODY
1696	/// Br: loop if (DefX != 0)
1697	/// Use(CntPhi) or Use(CntInst)
1698	///
1699	/// Into:
1700	/// If CntPhi used outside the loop:
1701	/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
1702	/// Count = CountPrev + 1
1703	/// else
1704	/// Count = BitWidth(InitX) - CTLZ(InitX)
1705	/// loop:
1706	/// CntPhi = PHI [Cnt0, CntInst]
1707	/// PhiX = PHI [InitX, DefX]
1708	/// PhiCount = PHI [Count, Dec]
1709	/// CntInst = CntPhi + 1
1710	/// DefX = PhiX >> 1
1711	/// Dec = PhiCount - 1
1712	/// LOOP_BODY
1713	/// Br: loop if (Dec != 0)
1714	/// Use(CountPrev + Cnt0) // Use(CntPhi)
1715	/// or
1716	/// Use(Count + Cnt0) // Use(CntInst)
1717	///
1718	/// If LOOP_BODY is empty the loop will be deleted.
1719	/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
1720	void LoopIdiomRecognize::transformLoopToCountable(
1721	Intrinsic::ID IntrinID, BasicBlock Preheader, Instruction CntInst,
1722	PHINode CntPhi, Value InitX, Instruction *DefX, const DebugLoc &DL,
1723	bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
1724	BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
1725
1726	// Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
1727	IRBuilder<> Builder(PreheaderBr);
1728	Builder.SetCurrentDebugLocation(DL);
1729
1730	// If there are no uses of CntPhi crate:
1731	// Count = BitWidth - CTLZ(InitX);
1732	// NewCount = Count;
1733	// If there are uses of CntPhi create:
1734	// NewCount = BitWidth - CTLZ(InitX >> 1);
1735	// Count = NewCount + 1;
1736	Value *InitXNext;
1737	if (IsCntPhiUsedOutsideLoop) {
1738	if (DefX->getOpcode() == Instruction::AShr)
1739	InitXNext = Builder.CreateAShr(InitX, 1);
1740	else if (DefX->getOpcode() == Instruction::LShr)
1741	InitXNext = Builder.CreateLShr(InitX, 1);
1742	else if (DefX->getOpcode() == Instruction::Shl) // cttz
1743	InitXNext = Builder.CreateShl(InitX, 1);
1744	else
1745	llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 1745);
1746	} else
1747	InitXNext = InitX;
1748	Value *Count =
1749	createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
1750	Type *CountTy = Count->getType();
1751	Count = Builder.CreateSub(
1752	ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
1753	Value *NewCount = Count;
1754	if (IsCntPhiUsedOutsideLoop)
1755	Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
1756
1757	NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
1758
1759	Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
1760	if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
1761	// If the counter was being incremented in the loop, add NewCount to the
1762	// counter's initial value, but only if the initial value is not zero.
1763	ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1764	if (!InitConst \|\| !InitConst->isZero())
1765	NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1766	} else {
1767	// If the count was being decremented in the loop, subtract NewCount from
1768	// the counter's initial value.
1769	NewCount = Builder.CreateSub(CntInitVal, NewCount);
1770	}
1771
1772	// Step 2: Insert new IV and loop condition:
1773	// loop:
1774	// ...
1775	// PhiCount = PHI [Count, Dec]
1776	// ...
1777	// Dec = PhiCount - 1
1778	// ...
1779	// Br: loop if (Dec != 0)
1780	BasicBlock Body = (CurLoop->block_begin());
1781	auto *LbBr = cast<BranchInst>(Body->getTerminator());
1782	ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1783
1784	PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi", &Body->front());
1785
1786	Builder.SetInsertPoint(LbCond);
1787	Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
1788	TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
1789
1790	TcPhi->addIncoming(Count, Preheader);
1791	TcPhi->addIncoming(TcDec, Body);
1792
1793	CmpInst::Predicate Pred =
1794	(LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
1795	LbCond->setPredicate(Pred);
1796	LbCond->setOperand(0, TcDec);
1797	LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
1798
1799	// Step 3: All the references to the original counter outside
1800	// the loop are replaced with the NewCount
1801	if (IsCntPhiUsedOutsideLoop)
1802	CntPhi->replaceUsesOutsideBlock(NewCount, Body);
1803	else
1804	CntInst->replaceUsesOutsideBlock(NewCount, Body);
1805
1806	// step 4: Forget the "non-computable" trip-count SCEV associated with the
1807	// loop. The loop would otherwise not be deleted even if it becomes empty.
1808	SE->forgetLoop(CurLoop);
1809	}
1810
1811	void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
1812	Instruction *CntInst,
1813	PHINode CntPhi, Value Var) {
1814	BasicBlock *PreHead = CurLoop->getLoopPreheader();
1815	auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
1816	const DebugLoc &DL = CntInst->getDebugLoc();
1817
1818	// Assuming before transformation, the loop is following:
1819	// if (x) // the precondition
1820	// do { cnt++; x &= x - 1; } while(x);
1821
1822	// Step 1: Insert the ctpop instruction at the end of the precondition block
1823	IRBuilder<> Builder(PreCondBr);
1824	Value PopCnt, PopCntZext, NewCount, TripCnt;
1825	{
1826	PopCnt = createPopcntIntrinsic(Builder, Var, DL);
1827	NewCount = PopCntZext =
1828	Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
1829
1830	if (NewCount != PopCnt)
1831	(cast<Instruction>(NewCount))->setDebugLoc(DL);
1832
1833	// TripCnt is exactly the number of iterations the loop has
1834	TripCnt = NewCount;
1835
1836	// If the population counter's initial value is not zero, insert Add Inst.
1837	Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
1838	ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
1839	if (!InitConst \|\| !InitConst->isZero()) {
1840	NewCount = Builder.CreateAdd(NewCount, CntInitVal);
1841	(cast<Instruction>(NewCount))->setDebugLoc(DL);
1842	}
1843	}
1844
1845	// Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
1846	// "if (NewCount == 0) loop-exit". Without this change, the intrinsic
1847	// function would be partial dead code, and downstream passes will drag
1848	// it back from the precondition block to the preheader.
1849	{
1850	ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
1851
1852	Value *Opnd0 = PopCntZext;
1853	Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
1854	if (PreCond->getOperand(0) != Var)
1855	std::swap(Opnd0, Opnd1);
1856
1857	ICmpInst *NewPreCond = cast<ICmpInst>(
1858	Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
1859	PreCondBr->setCondition(NewPreCond);
1860
1861	RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
1862	}
1863
1864	// Step 3: Note that the population count is exactly the trip count of the
1865	// loop in question, which enable us to convert the loop from noncountable
1866	// loop into a countable one. The benefit is twofold:
1867	//
1868	// - If the loop only counts population, the entire loop becomes dead after
1869	// the transformation. It is a lot easier to prove a countable loop dead
1870	// than to prove a noncountable one. (In some C dialects, an infinite loop
1871	// isn't dead even if it computes nothing useful. In general, DCE needs
1872	// to prove a noncountable loop finite before safely delete it.)
1873	//
1874	// - If the loop also performs something else, it remains alive.
1875	// Since it is transformed to countable form, it can be aggressively
1876	// optimized by some optimizations which are in general not applicable
1877	// to a noncountable loop.
1878	//
1879	// After this step, this loop (conceptually) would look like following:
1880	// newcnt = __builtin_ctpop(x);
1881	// t = newcnt;
1882	// if (x)
1883	// do { cnt++; x &= x-1; t--) } while (t > 0);
1884	BasicBlock Body = (CurLoop->block_begin());
1885	{
1886	auto *LbBr = cast<BranchInst>(Body->getTerminator());
1887	ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
1888	Type *Ty = TripCnt->getType();
1889
1890	PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
1891
1892	Builder.SetInsertPoint(LbCond);
1893	Instruction *TcDec = cast<Instruction>(
1894	Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
1895	"tcdec", false, true));
1896
1897	TcPhi->addIncoming(TripCnt, PreHead);
1898	TcPhi->addIncoming(TcDec, Body);
1899
1900	CmpInst::Predicate Pred =
1901	(LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
1902	LbCond->setPredicate(Pred);
1903	LbCond->setOperand(0, TcDec);
1904	LbCond->setOperand(1, ConstantInt::get(Ty, 0));
1905	}
1906
1907	// Step 4: All the references to the original population counter outside
1908	// the loop are replaced with the NewCount -- the value returned from
1909	// __builtin_ctpop().
1910	CntInst->replaceUsesOutsideBlock(NewCount, Body);
1911
1912	// step 5: Forget the "non-computable" trip-count SCEV associated with the
1913	// loop. The loop would otherwise not be deleted even if it becomes empty.
1914	SE->forgetLoop(CurLoop);
1915	}
1916
1917	/// Match loop-invariant value.
1918	template <typename SubPattern_t> struct match_LoopInvariant {
1919	SubPattern_t SubPattern;
1920	const Loop *L;
1921
1922	match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
1923	: SubPattern(SP), L(L) {}
1924
1925	template <typename ITy> bool match(ITy *V) {
1926	return L->isLoopInvariant(V) && SubPattern.match(V);
1927	}
1928	};
1929
1930	/// Matches if the value is loop-invariant.
1931	template <typename Ty>
1932	inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
1933	return match_LoopInvariant<Ty>(M, L);
1934	}
1935
1936	/// Return true if the idiom is detected in the loop.
1937	///
1938	/// The core idiom we are trying to detect is:
1939	/// \code
1940	/// entry:
1941	/// <...>
1942	/// %bitmask = shl i32 1, %bitpos
1943	/// br label %loop
1944	///
1945	/// loop:
1946	/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
1947	/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
1948	/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
1949	/// %x.next = shl i32 %x.curr, 1
1950	/// <...>
1951	/// br i1 %x.curr.isbitunset, label %loop, label %end
1952	///
1953	/// end:
1954	/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
1955	/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
1956	/// <...>
1957	/// \endcode
1958	static bool detectShiftUntilBitTestIdiom(Loop CurLoop, Value &BaseX,
1959	Value &BitMask, Value &BitPos,
1960	Value &CurrX, Instruction &NextX) {
1961	LLVM_DEBUG(dbgs() << DEBUG_TYPEdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Performing shift-until-bittest idiom detection.\n" ; } } while (false)
1962	" Performing shift-until-bittest idiom detection.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Performing shift-until-bittest idiom detection.\n" ; } } while (false);
1963
1964	// Give up if the loop has multiple blocks or multiple backedges.
1965	if (CurLoop->getNumBlocks() != 1 \|\| CurLoop->getNumBackEdges() != 1) {
1966	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Bad block/backedge count.\n" ; } } while (false);
1967	return false;
1968	}
1969
1970	BasicBlock *LoopHeaderBB = CurLoop->getHeader();
1971	BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
1972	assert(LoopPreheaderBB && "There is always a loop preheader.")((LoopPreheaderBB && "There is always a loop preheader." ) ? static_cast<void> (0) : __assert_fail ("LoopPreheaderBB && \"There is always a loop preheader.\"" , "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 1972, __PRETTY_FUNCTION__));
1973
1974	using namespace PatternMatch;
1975
1976	// Step 1: Check if the loop backedge is in desirable form.
1977
1978	ICmpInst::Predicate Pred;
1979	Value CmpLHS, CmpRHS;
1980	BasicBlock TrueBB, FalseBB;
1981	if (!match(LoopHeaderBB->getTerminator(),
1982	m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
1983	m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
1984	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Bad backedge structure.\n" ; } } while (false);
1985	return false;
1986	}
1987
1988	// Step 2: Check if the backedge's condition is in desirable form.
1989
1990	auto MatchVariableBitMask = [&]() {
1991	return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
1992	match(CmpLHS,
1993	m_c_And(m_Value(CurrX),
1994	m_CombineAnd(
1995	m_Value(BitMask),
1996	m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
1997	CurLoop))));
1998	};
1999	auto MatchConstantBitMask = [&]() {
2000	return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2001	match(CmpLHS, m_And(m_Value(CurrX),
2002	m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
2003	(BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
2004	};
2005	auto MatchDecomposableConstantBitMask = [&]() {
2006	APInt Mask;
2007	return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
2008	ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
2009	(BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
2010	(BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
2011	};
2012
2013	if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
2014	!MatchDecomposableConstantBitMask()) {
2015	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Bad backedge comparison.\n" ; } } while (false);
2016	return false;
2017	}
2018
2019	// Step 3: Check if the recurrence is in desirable form.
2020	auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2021	if (!CurrXPN \|\| CurrXPN->getParent() != LoopHeaderBB) {
2022	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Not an expected PHI node.\n" ; } } while (false);
2023	return false;
2024	}
2025
2026	BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2027	NextX =
2028	dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2029
2030	if (!NextX \|\| !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2031	// FIXME: support right-shift?
2032	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Bad recurrence.\n" ; } } while (false);
2033	return false;
2034	}
2035
2036	// Step 4: Check if the backedge's destinations are in desirable form.
2037
2038	assert(ICmpInst::isEquality(Pred) &&((ICmpInst::isEquality(Pred) && "Should only get equality predicates here." ) ? static_cast<void> (0) : __assert_fail ("ICmpInst::isEquality(Pred) && \"Should only get equality predicates here.\"" , "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 2039, __PRETTY_FUNCTION__))
2039	"Should only get equality predicates here.")((ICmpInst::isEquality(Pred) && "Should only get equality predicates here." ) ? static_cast<void> (0) : __assert_fail ("ICmpInst::isEquality(Pred) && \"Should only get equality predicates here.\"" , "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 2039, __PRETTY_FUNCTION__));
2040
2041	// cmp-br is commutative, so canonicalize to a single variant.
2042	if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2043	Pred = ICmpInst::getInversePredicate(Pred);
2044	std::swap(TrueBB, FalseBB);
2045	}
2046
2047	// We expect to exit loop when comparison yields false,
2048	// so when it yields true we should branch back to loop header.
2049	if (TrueBB != LoopHeaderBB) {
2050	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Bad backedge flow.\n" ; } } while (false);
2051	return false;
2052	}
2053
2054	// Okay, idiom checks out.
2055	return true;
2056	}
2057
2058	/// Look for the following loop:
2059	/// \code
2060	/// entry:
2061	/// <...>
2062	/// %bitmask = shl i32 1, %bitpos
2063	/// br label %loop
2064	///
2065	/// loop:
2066	/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2067	/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2068	/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2069	/// %x.next = shl i32 %x.curr, 1
2070	/// <...>
2071	/// br i1 %x.curr.isbitunset, label %loop, label %end
2072	///
2073	/// end:
2074	/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2075	/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2076	/// <...>
2077	/// \endcode
2078	///
2079	/// And transform it into:
2080	/// \code
2081	/// entry:
2082	/// %bitmask = shl i32 1, %bitpos
2083	/// %lowbitmask = add i32 %bitmask, -1
2084	/// %mask = or i32 %lowbitmask, %bitmask
2085	/// %x.masked = and i32 %x, %mask
2086	/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
2087	/// i1 true)
2088	/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
2089	/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
2090	/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
2091	/// %tripcount = add i32 %backedgetakencount, 1
2092	/// %x.curr = shl i32 %x, %backedgetakencount
2093	/// %x.next = shl i32 %x, %tripcount
2094	/// br label %loop
2095	///
2096	/// loop:
2097	/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
2098	/// %loop.iv.next = add nuw i32 %loop.iv, 1
2099	/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
2100	/// <...>
2101	/// br i1 %loop.ivcheck, label %end, label %loop
2102	///
2103	/// end:
2104	/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2105	/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2106	/// <...>
2107	/// \endcode
2108	bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
2109	bool MadeChange = false;
2110
2111	Value X, BitMask, BitPos, XCurr;
2112	Instruction *XNext;
2113	if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
2114	XNext)) {
2115	LLVM_DEBUG(dbgs() << DEBUG_TYPEdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " shift-until-bittest idiom detection failed.\n" ; } } while (false)
2116	" shift-until-bittest idiom detection failed.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " shift-until-bittest idiom detection failed.\n" ; } } while (false);
2117	return MadeChange;
2118	}
2119	LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " shift-until-bittest idiom detected!\n" ; } } while (false);
2120
2121	// Ok, it is the idiom we were looking for, we could transform this loop,
2122	// but is it profitable to transform?
2123
2124	BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2125	BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2126	assert(LoopPreheaderBB && "There is always a loop preheader.")((LoopPreheaderBB && "There is always a loop preheader." ) ? static_cast<void> (0) : __assert_fail ("LoopPreheaderBB && \"There is always a loop preheader.\"" , "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 2126, __PRETTY_FUNCTION__));
2127
2128	BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2129	assert(LoopPreheaderBB && "There is only a single successor.")((LoopPreheaderBB && "There is only a single successor." ) ? static_cast<void> (0) : __assert_fail ("LoopPreheaderBB && \"There is only a single successor.\"" , "/build/llvm-toolchain-snapshot-13~++20210413100635+64c24f493e5f/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 2129, __PRETTY_FUNCTION__));
2130
2131	IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2132	Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
2133
2134	Intrinsic::ID IntrID = Intrinsic::ctlz;
2135	Type *Ty = X->getType();
2136	unsigned Bitwidth = Ty->getScalarSizeInBits();
2137
2138	TargetTransformInfo::TargetCostKind CostKind =
2139	TargetTransformInfo::TCK_SizeAndLatency;
2140
2141	// The rewrite is considered to be unprofitable iff and only iff the
2142	// intrinsic/shift we'll use are not cheap. Note that we are okay with just
2143	// making the loop countable, even if nothing else changes.
2144	IntrinsicCostAttributes Attrs(
2145	IntrID, Ty, {UndefValue::get(Ty), /is_zero_undef=/Builder.getTrue()});
2146	InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2147	if (Cost > TargetTransformInfo::TCC_Basic) {
2148	LLVM_DEBUG(dbgs() << DEBUG_TYPEdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Intrinsic is too costly, not beneficial\n" ; } } while (false)
2149	" Intrinsic is too costly, not beneficial\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Intrinsic is too costly, not beneficial\n" ; } } while (false);
2150	return MadeChange;
2151	}
2152	if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
2153	TargetTransformInfo::TCC_Basic) {
2154	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Shift is too costly, not beneficial\n" ; } } while (false);
2155	return MadeChange;
2156	}
2157
2158	// Ok, transform appears worthwhile.
2159	MadeChange = true;
2160
2161	// Step 1: Compute the loop trip count.
2162
2163	Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
2164	BitPos->getName() + ".lowbitmask");
2165	Value *Mask =
2166	Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
2167	Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
2168	CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
2169	IntrID, Ty, {XMasked, /is_zero_undef=/Builder.getTrue()},
2170	/FMFSource=/nullptr, XMasked->getName() + ".numleadingzeros");
2171	Value *XMaskedNumActiveBits = Builder.CreateSub(
2172	ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
2173	XMasked->getName() + ".numactivebits", /HasNUW=/true,
2174	/HasNSW=/Bitwidth != 2);
2175	Value *XMaskedLeadingOnePos =
2176	Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
2177	XMasked->getName() + ".leadingonepos", /HasNUW=/false,
2178	/HasNSW=/Bitwidth > 2);
2179
2180	Value *LoopBackedgeTakenCount = Builder.CreateSub(
2181	BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
2182	/HasNUW=/true, /HasNSW=/true);
2183	// We know loop's backedge-taken count, but what's loop's trip count?
2184	// Note that while NUW is always safe, while NSW is only for bitwidths != 2.
2185	Value *LoopTripCount =
2186	Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2187	CurLoop->getName() + ".tripcount", /HasNUW=/true,
2188	/HasNSW=/Bitwidth != 2);
2189
2190	// Step 2: Compute the recurrence's final value without a loop.
2191
2192	// NewX is always safe to compute, because `LoopBackedgeTakenCount`
2193	// will always be smaller than `bitwidth(X)`, i.e. we never get poison.
2194	Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
2195	NewX->takeName(XCurr);
2196	if (auto *I = dyn_cast<Instruction>(NewX))
2197	I->copyIRFlags(XNext, /IncludeWrapFlags=/true);
2198
2199	Value *NewXNext;
2200	// Rewriting XNext is more complicated, however, because `X << LoopTripCount`
2201	// will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
2202	// iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
2203	// that isn't the case, we'll need to emit an alternative, safe IR.
2204	if (XNext->hasNoSignedWrap() \|\| XNext->hasNoUnsignedWrap() \|\|
2205	PatternMatch::match(
2206	BitPos, PatternMatch::m_SpecificInt_ICMP(
2207	ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
2208	Ty->getScalarSizeInBits() - 1))))
2209	NewXNext = Builder.CreateShl(X, LoopTripCount);
2210	else {
2211	// Otherwise, just additionally shift by one. It's the smallest solution,
2212	// alternatively, we could check that NewX is INT_MIN (or BitPos is )
2213	// and select 0 instead.
2214	NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
2215	}
2216
2217	NewXNext->takeName(XNext);
2218	if (auto *I = dyn_cast<Instruction>(NewXNext))
2219	I->copyIRFlags(XNext, /IncludeWrapFlags=/true);
2220
2221	// Step 3: Adjust the successor basic block to recieve the computed
2222	// recurrence's final value instead of the recurrence itself.
2223
2224	XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
2225	XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
2226
2227	// Step 4: Rewrite the loop into a countable form, with canonical IV.
2228
2229	// The new canonical induction variable.
2230	Builder.SetInsertPoint(&LoopHeaderBB->front());
2231	auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2232
2233	// The induction itself.
2234	// Note that while NUW is always safe, while NSW is only for bitwidths != 2.
2235	Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2236	auto *IVNext =
2237	Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
2238	/HasNUW=/true, /HasNSW=/Bitwidth != 2);
2239
2240	// The loop trip count check.
2241	auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
2242	CurLoop->getName() + ".ivcheck");
2243	Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
2244	LoopHeaderBB->getTerminator()->eraseFromParent();
2245
2246	// Populate the IV PHI.
2247	IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2248	IV->addIncoming(IVNext, LoopHeaderBB);
2249
2250	// Step 5: Forget the "non-computable" trip-count SCEV associated with the
2251	// loop. The loop would otherwise not be deleted even if it becomes empty.
2252
2253	SE->forgetLoop(CurLoop);
2254
2255	// Other passes will take care of actually deleting the loop if possible.
2256
2257	LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " shift-until-bittest idiom optimized!\n" ; } } while (false);
2258
2259	++NumShiftUntilBitTest;
2260	return MadeChange;
2261	}