/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp

Bug Summary

File:	llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
Warning:	line 2638, column 5 Value stored to 'Pred' is never read

Annotated Source Code

Press '?' to see keyboard shortcuts

Show analyzer invocation

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopIdiomRecognize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/build-llvm/lib/Transforms/Scalar -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/build-llvm/lib/Transforms/Scalar -I /build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar -I /build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/build-llvm/include -I /build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/include -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem 
/usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/build-llvm/lib/Transforms/Scalar -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0=. -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-08-28-193554-24367-1 -x c++ /build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp

1	//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This pass implements an idiom recognizer that transforms simple loops into a
10	// non-loop form. In cases that this kicks in, it can be a significant
11	// performance win.
12	//
13	// If compiling for code size we avoid idiom recognition if the resulting
14	// code could be larger than the code for the original loop. One way this could
15	// happen is if the loop is not removable after idiom recognition due to the
16	// presence of non-idiom instructions. The initial implementation of the
17	// heuristics applies to idioms in multi-block loops.
18	//
19	//===----------------------------------------------------------------------===//
20	//
21	// TODO List:
22	//
23	// Future loop memory idioms to recognize:
24	// memcmp, strlen, etc.
25	// Future floating point idioms to recognize in -ffast-math mode:
26	// fpowi
27	// Future integer operation idioms to recognize:
28	// ctpop
29	//
30	// Beware that isel's default lowering for ctpop is highly inefficient for
31	// i64 and larger types when i64 is legal and the value has few bits set. It
32	// would be good to enhance isel to emit a loop for ctpop in this case.
33	//
34	// This could recognize common matrix multiplies and dot product idioms and
35	// replace them with calls to BLAS (if linked in??).
36	//
37	//===----------------------------------------------------------------------===//
38
39	#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
40	#include "llvm/ADT/APInt.h"
41	#include "llvm/ADT/ArrayRef.h"
42	#include "llvm/ADT/DenseMap.h"
43	#include "llvm/ADT/MapVector.h"
44	#include "llvm/ADT/SetVector.h"
45	#include "llvm/ADT/SmallPtrSet.h"
46	#include "llvm/ADT/SmallVector.h"
47	#include "llvm/ADT/Statistic.h"
48	#include "llvm/ADT/StringRef.h"
49	#include "llvm/Analysis/AliasAnalysis.h"
50	#include "llvm/Analysis/CmpInstAnalysis.h"
51	#include "llvm/Analysis/LoopAccessAnalysis.h"
52	#include "llvm/Analysis/LoopInfo.h"
53	#include "llvm/Analysis/LoopPass.h"
54	#include "llvm/Analysis/MemoryLocation.h"
55	#include "llvm/Analysis/MemorySSA.h"
56	#include "llvm/Analysis/MemorySSAUpdater.h"
57	#include "llvm/Analysis/MustExecute.h"
58	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
59	#include "llvm/Analysis/ScalarEvolution.h"
60	#include "llvm/Analysis/ScalarEvolutionExpressions.h"
61	#include "llvm/Analysis/TargetLibraryInfo.h"
62	#include "llvm/Analysis/TargetTransformInfo.h"
63	#include "llvm/Analysis/ValueTracking.h"
64	#include "llvm/IR/Attributes.h"
65	#include "llvm/IR/BasicBlock.h"
66	#include "llvm/IR/Constant.h"
67	#include "llvm/IR/Constants.h"
68	#include "llvm/IR/DataLayout.h"
69	#include "llvm/IR/DebugLoc.h"
70	#include "llvm/IR/DerivedTypes.h"
71	#include "llvm/IR/Dominators.h"
72	#include "llvm/IR/GlobalValue.h"
73	#include "llvm/IR/GlobalVariable.h"
74	#include "llvm/IR/IRBuilder.h"
75	#include "llvm/IR/InstrTypes.h"
76	#include "llvm/IR/Instruction.h"
77	#include "llvm/IR/Instructions.h"
78	#include "llvm/IR/IntrinsicInst.h"
79	#include "llvm/IR/Intrinsics.h"
80	#include "llvm/IR/LLVMContext.h"
81	#include "llvm/IR/Module.h"
82	#include "llvm/IR/PassManager.h"
83	#include "llvm/IR/PatternMatch.h"
84	#include "llvm/IR/Type.h"
85	#include "llvm/IR/User.h"
86	#include "llvm/IR/Value.h"
87	#include "llvm/IR/ValueHandle.h"
88	#include "llvm/InitializePasses.h"
89	#include "llvm/Pass.h"
90	#include "llvm/Support/Casting.h"
91	#include "llvm/Support/CommandLine.h"
92	#include "llvm/Support/Debug.h"
93	#include "llvm/Support/InstructionCost.h"
94	#include "llvm/Support/raw_ostream.h"
95	#include "llvm/Transforms/Scalar.h"
96	#include "llvm/Transforms/Utils/BuildLibCalls.h"
97	#include "llvm/Transforms/Utils/Local.h"
98	#include "llvm/Transforms/Utils/LoopUtils.h"
99	#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
100	#include <algorithm>
101	#include <cassert>
102	#include <cstdint>
103	#include <utility>
104	#include <vector>
105
106	using namespace llvm;
107
108	#define DEBUG_TYPE"loop-idiom" "loop-idiom"
109
110	STATISTIC(NumMemSet, "Number of memset's formed from loop stores")static llvm::Statistic NumMemSet = {"loop-idiom", "NumMemSet" , "Number of memset's formed from loop stores"};
111	STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores")static llvm::Statistic NumMemCpy = {"loop-idiom", "NumMemCpy" , "Number of memcpy's formed from loop load+stores"};
112	STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores")static llvm::Statistic NumMemMove = {"loop-idiom", "NumMemMove" , "Number of memmove's formed from loop load+stores"};
113	STATISTIC(static llvm::Statistic NumShiftUntilBitTest = {"loop-idiom", "NumShiftUntilBitTest" , "Number of uncountable loops recognized as 'shift until bitttest' idiom" }
114	NumShiftUntilBitTest,static llvm::Statistic NumShiftUntilBitTest = {"loop-idiom", "NumShiftUntilBitTest" , "Number of uncountable loops recognized as 'shift until bitttest' idiom" }
115	"Number of uncountable loops recognized as 'shift until bitttest' idiom")static llvm::Statistic NumShiftUntilBitTest = {"loop-idiom", "NumShiftUntilBitTest" , "Number of uncountable loops recognized as 'shift until bitttest' idiom" };
116	STATISTIC(NumShiftUntilZero,static llvm::Statistic NumShiftUntilZero = {"loop-idiom", "NumShiftUntilZero" , "Number of uncountable loops recognized as 'shift until zero' idiom" }
117	"Number of uncountable loops recognized as 'shift until zero' idiom")static llvm::Statistic NumShiftUntilZero = {"loop-idiom", "NumShiftUntilZero" , "Number of uncountable loops recognized as 'shift until zero' idiom" };
118
119	bool DisableLIRP::All;
120	static cl::opt<bool, true>
121	DisableLIRPAll("disable-" DEBUG_TYPE"loop-idiom" "-all",
122	cl::desc("Options to disable Loop Idiom Recognize Pass."),
123	cl::location(DisableLIRP::All), cl::init(false),
124	cl::ReallyHidden);
125
126	bool DisableLIRP::Memset;
127	static cl::opt<bool, true>
128	DisableLIRPMemset("disable-" DEBUG_TYPE"loop-idiom" "-memset",
129	cl::desc("Proceed with loop idiom recognize pass, but do "
130	"not convert loop(s) to memset."),
131	cl::location(DisableLIRP::Memset), cl::init(false),
132	cl::ReallyHidden);
133
134	bool DisableLIRP::Memcpy;
135	static cl::opt<bool, true>
136	DisableLIRPMemcpy("disable-" DEBUG_TYPE"loop-idiom" "-memcpy",
137	cl::desc("Proceed with loop idiom recognize pass, but do "
138	"not convert loop(s) to memcpy."),
139	cl::location(DisableLIRP::Memcpy), cl::init(false),
140	cl::ReallyHidden);
141
142	static cl::opt<bool> UseLIRCodeSizeHeurs(
143	"use-lir-code-size-heurs",
144	cl::desc("Use loop idiom recognition code size heuristics when compiling"
145	"with -Os/-Oz"),
146	cl::init(true), cl::Hidden);
147
148	namespace {
149
150	class LoopIdiomRecognize {
151	Loop *CurLoop = nullptr;
152	AliasAnalysis *AA;
153	DominatorTree *DT;
154	LoopInfo *LI;
155	ScalarEvolution *SE;
156	TargetLibraryInfo *TLI;
157	const TargetTransformInfo *TTI;
158	const DataLayout *DL;
159	OptimizationRemarkEmitter &ORE;
160	bool ApplyCodeSizeHeuristics;
161	std::unique_ptr<MemorySSAUpdater> MSSAU;
162
163	public:
164	explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
165	LoopInfo *LI, ScalarEvolution *SE,
166	TargetLibraryInfo *TLI,
167	const TargetTransformInfo *TTI, MemorySSA *MSSA,
168	const DataLayout *DL,
169	OptimizationRemarkEmitter &ORE)
170	: AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
171	if (MSSA)
172	MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
173	}
174
175	bool runOnLoop(Loop *L);
176
177	private:
178	using StoreList = SmallVector<StoreInst *, 8>;
179	using StoreListMap = MapVector<Value *, StoreList>;
180
181	StoreListMap StoreRefsForMemset;
182	StoreListMap StoreRefsForMemsetPattern;
183	StoreList StoreRefsForMemcpy;
184	bool HasMemset;
185	bool HasMemsetPattern;
186	bool HasMemcpy;
187
188	/// Return code for isLegalStore()
189	enum LegalStoreKind {
190	None = 0,
191	Memset,
192	MemsetPattern,
193	Memcpy,
194	UnorderedAtomicMemcpy,
195	DontUse // Dummy retval never to be used. Allows catching errors in retval
196	// handling.
197	};
198
199	/// \name Countable Loop Idiom Handling
200	/// @{
201
202	bool runOnCountableLoop();
203	bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
204	SmallVectorImpl<BasicBlock *> &ExitBlocks);
205
206	void collectStores(BasicBlock *BB);
207	LegalStoreKind isLegalStore(StoreInst *SI);
208	enum class ForMemset { No, Yes };
209	bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
210	ForMemset For);
211
212	template <typename MemInst>
213	bool processLoopMemIntrinsic(
214	BasicBlock *BB,
215	bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
216	const SCEV *BECount);
217	bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
218	bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
219
220	bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
221	MaybeAlign StoreAlignment, Value *StoredVal,
222	Instruction *TheStore,
223	SmallPtrSetImpl<Instruction *> &Stores,
224	const SCEVAddRecExpr *Ev, const SCEV *BECount,
225	bool IsNegStride, bool IsLoopMemset = false);
226	bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
227	bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
228	const SCEV *StoreSize, MaybeAlign StoreAlign,
229	MaybeAlign LoadAlign, Instruction *TheStore,
230	Instruction *TheLoad,
231	const SCEVAddRecExpr *StoreEv,
232	const SCEVAddRecExpr *LoadEv,
233	const SCEV *BECount);
234	bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
235	bool IsLoopMemset = false);
236
237	/// @}
238	/// \name Noncountable Loop Idiom Handling
239	/// @{
240
241	bool runOnNoncountableLoop();
242
243	bool recognizePopcount();
244	void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
245	PHINode *CntPhi, Value *Var);
246	bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
247	void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
248	Instruction *CntInst, PHINode *CntPhi,
249	Value *Var, Instruction *DefX,
250	const DebugLoc &DL, bool ZeroCheck,
251	bool IsCntPhiUsedOutsideLoop);
252
253	bool recognizeShiftUntilBitTest();
254	bool recognizeShiftUntilZero();
255
256	/// @}
257	};
258
259	class LoopIdiomRecognizeLegacyPass : public LoopPass {
260	public:
261	static char ID;
262
263	explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
264	initializeLoopIdiomRecognizeLegacyPassPass(
265	*PassRegistry::getPassRegistry());
266	}
267
268	bool runOnLoop(Loop *L, LPPassManager &LPM) override {
269	if (DisableLIRP::All)
270	return false;
271
272	if (skipLoop(L))
273	return false;
274
275	AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
276	DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
277	LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
278	ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
279	TargetLibraryInfo *TLI =
280	&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
281	*L->getHeader()->getParent());
282	const TargetTransformInfo *TTI =
283	&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
284	*L->getHeader()->getParent());
285	const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
286	auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
287	MemorySSA *MSSA = nullptr;
288	if (MSSAAnalysis)
289	MSSA = &MSSAAnalysis->getMSSA();
290
291	// For the old PM, we can't use OptimizationRemarkEmitter as an analysis
292	// pass. Function analyses need to be preserved across loop transformations
293	// but ORE cannot be preserved (see comment before the pass definition).
294	OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
295
296	LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
297	return LIR.runOnLoop(L);
298	}
299
300	/// This transformation requires natural loop information & requires that
301	/// loop preheaders be inserted into the CFG.
302	void getAnalysisUsage(AnalysisUsage &AU) const override {
303	AU.addRequired<TargetLibraryInfoWrapperPass>();
304	AU.addRequired<TargetTransformInfoWrapperPass>();
305	AU.addPreserved<MemorySSAWrapperPass>();
306	getLoopAnalysisUsage(AU);
307	}
308	};
309
310	} // end anonymous namespace
311
312	char LoopIdiomRecognizeLegacyPass::ID = 0;
313
314	PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
315	LoopStandardAnalysisResults &AR,
316	LPMUpdater &) {
317	if (DisableLIRP::All)
318	return PreservedAnalyses::all();
319
320	const auto *DL = &L.getHeader()->getModule()->getDataLayout();
321
322	// For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
323	// pass. Function analyses need to be preserved across loop transformations
324	// but ORE cannot be preserved (see comment before the pass definition).
325	OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
326
327	LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
328	AR.MSSA, DL, ORE);
329	if (!LIR.runOnLoop(&L))
330	return PreservedAnalyses::all();
331
332	auto PA = getLoopPassPreservedAnalyses();
333	if (AR.MSSA)
334	PA.preserve<MemorySSAAnalysis>();
335	return PA;
336	}
337
338	INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",static void *initializeLoopIdiomRecognizeLegacyPassPassOnce(PassRegistry &Registry) {
339	"Recognize loop idioms", false, false)static void *initializeLoopIdiomRecognizeLegacyPassPassOnce(PassRegistry &Registry) {
340	INITIALIZE_PASS_DEPENDENCY(LoopPass)initializeLoopPassPass(Registry);
341	INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)initializeTargetLibraryInfoWrapperPassPass(Registry);
342	INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)initializeTargetTransformInfoWrapperPassPass(Registry);
343	INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",PassInfo *PI = new PassInfo( "Recognize loop idioms", "loop-idiom" , &LoopIdiomRecognizeLegacyPass::ID, PassInfo::NormalCtor_t (callDefaultCtor<LoopIdiomRecognizeLegacyPass>), false, false); Registry.registerPass(*PI, true); return PI; } static llvm::once_flag InitializeLoopIdiomRecognizeLegacyPassPassFlag ; void llvm::initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry &Registry) { llvm::call_once(InitializeLoopIdiomRecognizeLegacyPassPassFlag , initializeLoopIdiomRecognizeLegacyPassPassOnce, std::ref(Registry )); }
344	"Recognize loop idioms", false, false)PassInfo *PI = new PassInfo( "Recognize loop idioms", "loop-idiom" , &LoopIdiomRecognizeLegacyPass::ID, PassInfo::NormalCtor_t (callDefaultCtor<LoopIdiomRecognizeLegacyPass>), false, false); Registry.registerPass(*PI, true); return PI; } static llvm::once_flag InitializeLoopIdiomRecognizeLegacyPassPassFlag ; void llvm::initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry &Registry) { llvm::call_once(InitializeLoopIdiomRecognizeLegacyPassPassFlag , initializeLoopIdiomRecognizeLegacyPassPassOnce, std::ref(Registry )); }
345
346	Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
347
348	static void deleteDeadInstruction(Instruction *I) {
349	I->replaceAllUsesWith(UndefValue::get(I->getType()));
350	I->eraseFromParent();
351	}
352
353	//===----------------------------------------------------------------------===//
354	//
355	// Implementation of LoopIdiomRecognize
356	//
357	//===----------------------------------------------------------------------===//
358
359	bool LoopIdiomRecognize::runOnLoop(Loop *L) {
360	CurLoop = L;
361	// If the loop could not be converted to canonical form, it must have an
362	// indirectbr in it, just give up.
363	if (!L->getLoopPreheader())
364	return false;
365
366	// Disable loop idiom recognition if the function's name is a common idiom.
367	StringRef Name = L->getHeader()->getParent()->getName();
368	if (Name == "memset" || Name == "memcpy")
369	return false;
370
371	// Determine if code size heuristics need to be applied.
372	ApplyCodeSizeHeuristics =
373	L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
374
375	HasMemset = TLI->has(LibFunc_memset);
376	HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
377	HasMemcpy = TLI->has(LibFunc_memcpy);
378
379	if (HasMemset || HasMemsetPattern || HasMemcpy)
380	if (SE->hasLoopInvariantBackedgeTakenCount(L))
381	return runOnCountableLoop();
382
383	return runOnNoncountableLoop();
384	}
385
386	bool LoopIdiomRecognize::runOnCountableLoop() {
387	const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
388	assert(!isa<SCEVCouldNotCompute>(BECount) &&(static_cast <bool> (!isa<SCEVCouldNotCompute>(BECount ) && "runOnCountableLoop() called on a loop without a predictable" "backedge-taken count") ? void (0) : __assert_fail ("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 390, __extension__ __PRETTY_FUNCTION__))
389	"runOnCountableLoop() called on a loop without a predictable"(static_cast <bool> (!isa<SCEVCouldNotCompute>(BECount ) && "runOnCountableLoop() called on a loop without a predictable" "backedge-taken count") ? void (0) : __assert_fail ("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 390, __extension__ __PRETTY_FUNCTION__))
390	"backedge-taken count")(static_cast <bool> (!isa<SCEVCouldNotCompute>(BECount ) && "runOnCountableLoop() called on a loop without a predictable" "backedge-taken count") ? void (0) : __assert_fail ("!isa<SCEVCouldNotCompute>(BECount) && \"runOnCountableLoop() called on a loop without a predictable\" \"backedge-taken count\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 390, __extension__ __PRETTY_FUNCTION__));
391
392	// If this loop executes exactly one time, then it should be peeled, not
393	// optimized by this pass.
394	if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
395	if (BECst->getAPInt() == 0)
396	return false;
397
398	SmallVector<BasicBlock *, 8> ExitBlocks;
399	CurLoop->getUniqueExitBlocks(ExitBlocks);
400
401	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F[" << CurLoop->getHeader()->getParent()->getName () << "] Countable Loop %" << CurLoop->getHeader ()->getName() << "\n"; } } while (false)
402	<< CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F[" << CurLoop->getHeader()->getParent()->getName () << "] Countable Loop %" << CurLoop->getHeader ()->getName() << "\n"; } } while (false)
403	<< "] Countable Loop %" << CurLoop->getHeader()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F[" << CurLoop->getHeader()->getParent()->getName () << "] Countable Loop %" << CurLoop->getHeader ()->getName() << "\n"; } } while (false)
404	<< "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F[" << CurLoop->getHeader()->getParent()->getName () << "] Countable Loop %" << CurLoop->getHeader ()->getName() << "\n"; } } while (false);
405
406	// The following transforms hoist stores/memsets into the loop pre-header.
407	// Give up if the loop has instructions that may throw.
408	SimpleLoopSafetyInfo SafetyInfo;
409	SafetyInfo.computeLoopSafetyInfo(CurLoop);
410	if (SafetyInfo.anyBlockMayThrow())
411	return false;
412
413	bool MadeChange = false;
414
415	// Scan all the blocks in the loop that are not in subloops.
416	for (auto *BB : CurLoop->getBlocks()) {
417	// Ignore blocks in subloops.
418	if (LI->getLoopFor(BB) != CurLoop)
419	continue;
420
421	MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
422	}
423	return MadeChange;
424	}
425
426	static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
427	const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
428	return ConstStride->getAPInt();
429	}
430
431	/// getMemSetPatternValue - If a strided store of the specified value is safe to
432	/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
433	/// be passed in. Otherwise, return null.
434	///
435	/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
436	/// just replicate their input array and then pass on to memset_pattern16.
437	static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
438	// FIXME: This could check for UndefValue because it can be merged into any
439	// other valid pattern.
440
441	// If the value isn't a constant, we can't promote it to being in a constant
442	// array. We could theoretically do a store to an alloca or something, but
443	// that doesn't seem worthwhile.
444	Constant *C = dyn_cast<Constant>(V);
445	if (!C)
446	return nullptr;
447
448	// Only handle simple values that are a power of two bytes in size.
449	uint64_t Size = DL->getTypeSizeInBits(V->getType());
450	if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
451	return nullptr;
452
453	// Don't care enough about darwin/ppc to implement this.
454	if (DL->isBigEndian())
455	return nullptr;
456
457	// Convert to size in bytes.
458	Size /= 8;
459
460	// TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
461	// if the top and bottom are the same (e.g. for vectors and large integers).
462	if (Size > 16)
463	return nullptr;
464
465	// If the constant is exactly 16 bytes, just use it.
466	if (Size == 16)
467	return C;
468
469	// Otherwise, we'll use an array of the constants.
470	unsigned ArraySize = 16 / Size;
471	ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
472	return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
473	}
474
475	LoopIdiomRecognize::LegalStoreKind
476	LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
477	// Don't touch volatile stores.
478	if (SI->isVolatile())
479	return LegalStoreKind::None;
480	// We only want simple or unordered-atomic stores.
481	if (!SI->isUnordered())
482	return LegalStoreKind::None;
483
484	// Avoid merging nontemporal stores.
485	if (SI->getMetadata(LLVMContext::MD_nontemporal))
486	return LegalStoreKind::None;
487
488	Value *StoredVal = SI->getValueOperand();
489	Value *StorePtr = SI->getPointerOperand();
490
491	// Don't convert stores of non-integral pointer types to memsets (which stores
492	// integers).
493	if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
494	return LegalStoreKind::None;
495
496	// Reject stores that are so large that they overflow an unsigned.
497	// When storing out scalable vectors we bail out for now, since the code
498	// below currently only works for constant strides.
499	TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
500	if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) ||
501	(SizeInBits.getFixedSize() >> 32) != 0)
502	return LegalStoreKind::None;
503
504	// See if the pointer expression is an AddRec like {base,+,1} on the current
505	// loop, which indicates a strided store. If we have something else, it's a
506	// random store we can't handle.
507	const SCEVAddRecExpr *StoreEv =
508	dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
509	if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
510	return LegalStoreKind::None;
511
512	// Check to see if we have a constant stride.
513	if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
514	return LegalStoreKind::None;
515
516	// See if the store can be turned into a memset.
517
518	// If the stored value is a byte-wise value (like i32 -1), then it may be
519	// turned into a memset of i8 -1, assuming that all the consecutive bytes
520	// are stored. A store of i32 0x01020304 can never be turned into a memset,
521	// but it can be turned into memset_pattern if the target supports it.
522	Value *SplatValue = isBytewiseValue(StoredVal, *DL);
523
524	// Note: memset and memset_pattern on unordered-atomic is yet not supported
525	bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
526
527	// If we're allowed to form a memset, and the stored value would be
528	// acceptable for memset, use it.
529	if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
530	// Verify that the stored value is loop invariant. If not, we can't
531	// promote the memset.
532	CurLoop->isLoopInvariant(SplatValue)) {
533	// It looks like we can use SplatValue.
534	return LegalStoreKind::Memset;
535	}
536	if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
537	// Don't create memset_pattern16s with address spaces.
538	StorePtr->getType()->getPointerAddressSpace() == 0 &&
539	getMemSetPatternValue(StoredVal, DL)) {
540	// It looks like we can use PatternValue!
541	return LegalStoreKind::MemsetPattern;
542	}
543
544	// Otherwise, see if the store can be turned into a memcpy.
545	if (HasMemcpy && !DisableLIRP::Memcpy) {
546	// Check to see if the stride matches the size of the store. If so, then we
547	// know that every byte is touched in the loop.
548	APInt Stride = getStoreStride(StoreEv);
549	unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
550	if (StoreSize != Stride && StoreSize != -Stride)
551	return LegalStoreKind::None;
552
553	// The store must be feeding a non-volatile load.
554	LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
555
556	// Only allow non-volatile loads
557	if (!LI || LI->isVolatile())
558	return LegalStoreKind::None;
559	// Only allow simple or unordered-atomic loads
560	if (!LI->isUnordered())
561	return LegalStoreKind::None;
562
563	// See if the pointer expression is an AddRec like {base,+,1} on the current
564	// loop, which indicates a strided load. If we have something else, it's a
565	// random load we can't handle.
566	const SCEVAddRecExpr *LoadEv =
567	dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
568	if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
569	return LegalStoreKind::None;
570
571	// The store and load must share the same stride.
572	if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
573	return LegalStoreKind::None;
574
575	// Success. This store can be converted into a memcpy.
576	UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
577	return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
578	: LegalStoreKind::Memcpy;
579	}
580	// This store can't be transformed into a memset/memcpy.
581	return LegalStoreKind::None;
582	}
583
584	void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
585	StoreRefsForMemset.clear();
586	StoreRefsForMemsetPattern.clear();
587	StoreRefsForMemcpy.clear();
588	for (Instruction &I : *BB) {
589	StoreInst *SI = dyn_cast<StoreInst>(&I);
590	if (!SI)
591	continue;
592
593	// Make sure this is a strided store with a constant stride.
594	switch (isLegalStore(SI)) {
595	case LegalStoreKind::None:
596	// Nothing to do
597	break;
598	case LegalStoreKind::Memset: {
599	// Find the base pointer.
600	Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
601	StoreRefsForMemset[Ptr].push_back(SI);
602	} break;
603	case LegalStoreKind::MemsetPattern: {
604	// Find the base pointer.
605	Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
606	StoreRefsForMemsetPattern[Ptr].push_back(SI);
607	} break;
608	case LegalStoreKind::Memcpy:
609	case LegalStoreKind::UnorderedAtomicMemcpy:
610	StoreRefsForMemcpy.push_back(SI);
611	break;
612	default:
613	assert(false && "unhandled return value")(static_cast <bool> (false && "unhandled return value" ) ? void (0) : __assert_fail ("false && \"unhandled return value\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 613, __extension__ __PRETTY_FUNCTION__));
614	break;
615	}
616	}
617	}
618
619	/// runOnLoopBlock - Process the specified block, which lives in a counted loop
620	/// with the specified backedge count. This block is known to be in the current
621	/// loop and not in any subloops.
622	bool LoopIdiomRecognize::runOnLoopBlock(
623	BasicBlock BB, const SCEV BECount,
624	SmallVectorImpl<BasicBlock *> &ExitBlocks) {
625	// We can only promote stores in this block if they are unconditionally
626	// executed in the loop. For a block to be unconditionally executed, it has
627	// to dominate all the exit blocks of the loop. Verify this now.
628	for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
629	if (!DT->dominates(BB, ExitBlocks[i]))
630	return false;
631
632	bool MadeChange = false;
633	// Look for store instructions, which may be optimized to memset/memcpy.
634	collectStores(BB);
635
636	// Look for a single store or sets of stores with a common base, which can be
637	// optimized into a memset (memset_pattern). The latter most commonly happens
638	// with structs and handunrolled loops.
639	for (auto &SL : StoreRefsForMemset)
640	MadeChange \|= processLoopStores(SL.second, BECount, ForMemset::Yes);
641
642	for (auto &SL : StoreRefsForMemsetPattern)
643	MadeChange \|= processLoopStores(SL.second, BECount, ForMemset::No);
644
645	// Optimize the store into a memcpy, if it feeds an similarly strided load.
646	for (auto &SI : StoreRefsForMemcpy)
647	MadeChange \|= processLoopStoreOfLoopLoad(SI, BECount);
648
649	MadeChange \|= processLoopMemIntrinsic<MemCpyInst>(
650	BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
651	MadeChange \|= processLoopMemIntrinsic<MemSetInst>(
652	BB, &LoopIdiomRecognize::processLoopMemSet, BECount);
653
654	return MadeChange;
655	}
656
657	/// See if this store(s) can be promoted to a memset.
658	bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
659	const SCEV *BECount, ForMemset For) {
660	// Try to find consecutive stores that can be transformed into memsets.
661	SetVector<StoreInst *> Heads, Tails;
662	SmallDenseMap<StoreInst , StoreInst > ConsecutiveChain;
663
664	// Do a quadratic search on all of the given stores and find
665	// all of the pairs of stores that follow each other.
666	SmallVector<unsigned, 16> IndexQueue;
667	for (unsigned i = 0, e = SL.size(); i < e; ++i) {
668	assert(SL[i]->isSimple() && "Expected only non-volatile stores.")(static_cast <bool> (SL[i]->isSimple() && "Expected only non-volatile stores." ) ? void (0) : __assert_fail ("SL[i]->isSimple() && \"Expected only non-volatile stores.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 668, __extension__ __PRETTY_FUNCTION__));
669
670	Value *FirstStoredVal = SL[i]->getValueOperand();
671	Value *FirstStorePtr = SL[i]->getPointerOperand();
672	const SCEVAddRecExpr *FirstStoreEv =
673	cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
674	APInt FirstStride = getStoreStride(FirstStoreEv);
675	unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
676
677	// See if we can optimize just this store in isolation.
678	if (FirstStride == FirstStoreSize \|\| -FirstStride == FirstStoreSize) {
679	Heads.insert(SL[i]);
680	continue;
681	}
682
683	Value *FirstSplatValue = nullptr;
684	Constant *FirstPatternValue = nullptr;
685
686	if (For == ForMemset::Yes)
687	FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
688	else
689	FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
690
691	assert((FirstSplatValue \|\| FirstPatternValue) &&(static_cast <bool> ((FirstSplatValue \|\| FirstPatternValue ) && "Expected either splat value or pattern value.") ? void (0) : __assert_fail ("(FirstSplatValue \|\| FirstPatternValue) && \"Expected either splat value or pattern value.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 692, __extension__ __PRETTY_FUNCTION__))
692	"Expected either splat value or pattern value.")(static_cast <bool> ((FirstSplatValue \|\| FirstPatternValue ) && "Expected either splat value or pattern value.") ? void (0) : __assert_fail ("(FirstSplatValue \|\| FirstPatternValue) && \"Expected either splat value or pattern value.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 692, __extension__ __PRETTY_FUNCTION__));
693
694	IndexQueue.clear();
695	// If a store has multiple consecutive store candidates, search Stores
696	// array according to the sequence: from i+1 to e, then from i-1 to 0.
697	// This is because usually pairing with immediate succeeding or preceding
698	// candidate create the best chance to find memset opportunity.
699	unsigned j = 0;
700	for (j = i + 1; j < e; ++j)
701	IndexQueue.push_back(j);
702	for (j = i; j > 0; --j)
703	IndexQueue.push_back(j - 1);
704
705	for (auto &k : IndexQueue) {
706	assert(SL[k]->isSimple() && "Expected only non-volatile stores.")(static_cast <bool> (SL[k]->isSimple() && "Expected only non-volatile stores." ) ? void (0) : __assert_fail ("SL[k]->isSimple() && \"Expected only non-volatile stores.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 706, __extension__ __PRETTY_FUNCTION__));
707	Value *SecondStorePtr = SL[k]->getPointerOperand();
708	const SCEVAddRecExpr *SecondStoreEv =
709	cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
710	APInt SecondStride = getStoreStride(SecondStoreEv);
711
712	if (FirstStride != SecondStride)
713	continue;
714
715	Value *SecondStoredVal = SL[k]->getValueOperand();
716	Value *SecondSplatValue = nullptr;
717	Constant *SecondPatternValue = nullptr;
718
719	if (For == ForMemset::Yes)
720	SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
721	else
722	SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
723
724	assert((SecondSplatValue \|\| SecondPatternValue) &&(static_cast <bool> ((SecondSplatValue \|\| SecondPatternValue ) && "Expected either splat value or pattern value.") ? void (0) : __assert_fail ("(SecondSplatValue \|\| SecondPatternValue) && \"Expected either splat value or pattern value.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 725, __extension__ __PRETTY_FUNCTION__))
725	"Expected either splat value or pattern value.")(static_cast <bool> ((SecondSplatValue \|\| SecondPatternValue ) && "Expected either splat value or pattern value.") ? void (0) : __assert_fail ("(SecondSplatValue \|\| SecondPatternValue) && \"Expected either splat value or pattern value.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 725, __extension__ __PRETTY_FUNCTION__));
726
727	if (isConsecutiveAccess(SL[i], SL[k], DL, SE, false)) {
728	if (For == ForMemset::Yes) {
729	if (isa<UndefValue>(FirstSplatValue))
730	FirstSplatValue = SecondSplatValue;
731	if (FirstSplatValue != SecondSplatValue)
732	continue;
733	} else {
734	if (isa<UndefValue>(FirstPatternValue))
735	FirstPatternValue = SecondPatternValue;
736	if (FirstPatternValue != SecondPatternValue)
737	continue;
738	}
739	Tails.insert(SL[k]);
740	Heads.insert(SL[i]);
741	ConsecutiveChain[SL[i]] = SL[k];
742	break;
743	}
744	}
745	}
746
747	// We may run into multiple chains that merge into a single chain. We mark the
748	// stores that we transformed so that we don't visit the same store twice.
749	SmallPtrSet<Value *, 16> TransformedStores;
750	bool Changed = false;
751
752	// For stores that start but don't end a link in the chain:
753	for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
754	it != e; ++it) {
755	if (Tails.count(*it))
756	continue;
757
758	// We found a store instr that starts a chain. Now follow the chain and try
759	// to transform it.
760	SmallPtrSet<Instruction *, 8> AdjacentStores;
761	StoreInst I = it;
762
763	StoreInst *HeadStore = I;
764	unsigned StoreSize = 0;
765
766	// Collect the chain into a list.
767	while (Tails.count(I) \|\| Heads.count(I)) {
768	if (TransformedStores.count(I))
769	break;
770	AdjacentStores.insert(I);
771
772	StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
773	// Move to the next value in the chain.
774	I = ConsecutiveChain[I];
775	}
776
777	Value *StoredVal = HeadStore->getValueOperand();
778	Value *StorePtr = HeadStore->getPointerOperand();
779	const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
780	APInt Stride = getStoreStride(StoreEv);
781
782	// Check to see if the stride matches the size of the stores. If so, then
783	// we know that every byte is touched in the loop.
784	if (StoreSize != Stride && StoreSize != -Stride)
785	continue;
786
787	bool IsNegStride = StoreSize == -Stride;
788
789	const SCEV *StoreSizeSCEV = SE->getConstant(BECount->getType(), StoreSize);
790	if (processLoopStridedStore(StorePtr, StoreSizeSCEV,
791	MaybeAlign(HeadStore->getAlignment()),
792	StoredVal, HeadStore, AdjacentStores, StoreEv,
793	BECount, IsNegStride)) {
794	TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
795	Changed = true;
796	}
797	}
798
799	return Changed;
800	}
801
802	/// processLoopMemIntrinsic - Template function for calling different processor
803	/// functions based on mem instrinsic type.
804	template <typename MemInst>
805	bool LoopIdiomRecognize::processLoopMemIntrinsic(
806	BasicBlock *BB,
807	bool (LoopIdiomRecognize::Processor)(MemInst , const SCEV *),
808	const SCEV *BECount) {
809	bool MadeChange = false;
810	for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
811	Instruction Inst = &I++;
812	// Look for memory instructions, which may be optimized to a larger one.
813	if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
814	WeakTrackingVH InstPtr(&*I);
815	if (!(this->*Processor)(MI, BECount))
816	continue;
817	MadeChange = true;
818
819	// If processing the instruction invalidated our iterator, start over from
820	// the top of the block.
821	if (!InstPtr)
822	I = BB->begin();
823	}
824	}
825	return MadeChange;
826	}
827
828	/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy
829	bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
830	const SCEV *BECount) {
831	// We can only handle non-volatile memcpys with a constant size.
832	if (MCI->isVolatile() \|\| !isa<ConstantInt>(MCI->getLength()))
833	return false;
834
835	// If we're not allowed to hack on memcpy, we fail.
836	if ((!HasMemcpy && !isa<MemCpyInlineInst>(MCI)) \|\| DisableLIRP::Memcpy)
837	return false;
838
839	Value *Dest = MCI->getDest();
840	Value *Source = MCI->getSource();
841	if (!Dest \|\| !Source)
842	return false;
843
844	// See if the load and store pointer expressions are AddRec like {base,+,1} on
845	// the current loop, which indicates a strided load and store. If we have
846	// something else, it's a random load or store we can't handle.
847	const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
848	if (!StoreEv \|\| StoreEv->getLoop() != CurLoop \|\| !StoreEv->isAffine())
849	return false;
850	const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
851	if (!LoadEv \|\| LoadEv->getLoop() != CurLoop \|\| !LoadEv->isAffine())
852	return false;
853
854	// Reject memcpys that are so large that they overflow an unsigned.
855	uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
856	if ((SizeInBytes >> 32) != 0)
857	return false;
858
859	// Check if the stride matches the size of the memcpy. If so, then we know
860	// that every byte is touched in the loop.
861	const SCEVConstant *ConstStoreStride =
862	dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
863	const SCEVConstant *ConstLoadStride =
864	dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
865	if (!ConstStoreStride \|\| !ConstLoadStride)
866	return false;
867
868	APInt StoreStrideValue = ConstStoreStride->getAPInt();
869	APInt LoadStrideValue = ConstLoadStride->getAPInt();
870	// Huge stride value - give up
871	if (StoreStrideValue.getBitWidth() > 64 \|\| LoadStrideValue.getBitWidth() > 64)
872	return false;
873
874	if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
875	ORE.emit([&]() {
876	return OptimizationRemarkMissed(DEBUG_TYPE"loop-idiom", "SizeStrideUnequal", MCI)
877	<< ore::NV("Inst", "memcpy") << " in "
878	<< ore::NV("Function", MCI->getFunction())
879	<< " function will not be hoisted: "
880	<< ore::NV("Reason", "memcpy size is not equal to stride");
881	});
882	return false;
883	}
884
885	int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
886	int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
887	// Check if the load stride matches the store stride.
888	if (StoreStrideInt != LoadStrideInt)
889	return false;
890
891	return processLoopStoreOfLoopLoad(
892	Dest, Source, SE->getConstant(Dest->getType(), SizeInBytes),
893	MCI->getDestAlign(), MCI->getSourceAlign(), MCI, MCI, StoreEv, LoadEv,
894	BECount);
895	}
896
897	/// processLoopMemSet - See if this memset can be promoted to a large memset.
898	bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
899	const SCEV *BECount) {
900	// We can only handle non-volatile memsets.
901	if (MSI->isVolatile())
902	return false;
903
904	// If we're not allowed to hack on memset, we fail.
905	if (!HasMemset \|\| DisableLIRP::Memset)
906	return false;
907
908	Value *Pointer = MSI->getDest();
909
910	// See if the pointer expression is an AddRec like {base,+,1} on the current
911	// loop, which indicates a strided store. If we have something else, it's a
912	// random store we can't handle.
913	const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
914	if (!Ev \|\| Ev->getLoop() != CurLoop)
915	return false;
916	if (!Ev->isAffine()) {
917	LLVM_DEBUG(dbgs() << " Pointer is not affine, abort\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " Pointer is not affine, abort\n" ; } } while (false);
918	return false;
919	}
920
921	const SCEV *PointerStrideSCEV = Ev->getOperand(1);
922	const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength());
923	if (!PointerStrideSCEV \|\| !MemsetSizeSCEV)
924	return false;
925
926	bool IsNegStride = false;
927	const bool IsConstantSize = isa<ConstantInt>(MSI->getLength());
928
929	if (IsConstantSize) {
930	// Memset size is constant.
931	// Check if the pointer stride matches the memset size. If so, then
932	// we know that every byte is touched in the loop.
933	LLVM_DEBUG(dbgs() << " memset size is constant\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " memset size is constant\n" ; } } while (false);
934	uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
935	const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
936	if (!ConstStride)
937	return false;
938
939	APInt Stride = ConstStride->getAPInt();
940	if (SizeInBytes != Stride && SizeInBytes != -Stride)
941	return false;
942
943	IsNegStride = SizeInBytes == -Stride;
944	} else {
945	// Memset size is non-constant.
946	// Check if the pointer stride matches the memset size.
947	// To be conservative, the pass would not promote pointers that aren't in
948	// address space zero. Also, the pass only handles memset length and stride
949	// that are invariant for the top level loop.
950	LLVM_DEBUG(dbgs() << " memset size is non-constant\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " memset size is non-constant\n" ; } } while (false);
951	if (Pointer->getType()->getPointerAddressSpace() != 0) {
952	LLVM_DEBUG(dbgs() << " pointer is not in address space zero, "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " pointer is not in address space zero, " << "abort\n"; } } while (false)
953	<< "abort\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " pointer is not in address space zero, " << "abort\n"; } } while (false);
954	return false;
955	}
956	if (!SE->isLoopInvariant(MemsetSizeSCEV, CurLoop)) {
957	LLVM_DEBUG(dbgs() << " memset size is not a loop-invariant, "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " memset size is not a loop-invariant, " << "abort\n"; } } while (false)
958	<< "abort\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " memset size is not a loop-invariant, " << "abort\n"; } } while (false);
959	return false;
960	}
961
962	// Compare positive direction PointerStrideSCEV with MemsetSizeSCEV
963	IsNegStride = PointerStrideSCEV->isNonConstantNegative();
964	const SCEV *PositiveStrideSCEV =
965	IsNegStride ? SE->getNegativeSCEV(PointerStrideSCEV)
966	: PointerStrideSCEV;
967	LLVM_DEBUG(dbgs() << " MemsetSizeSCEV: " << MemsetSizeSCEV << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " MemsetSizeSCEV: " << MemsetSizeSCEV << "\n" << " PositiveStrideSCEV: " << *PositiveStrideSCEV << "\n"; } } while (false )
968	<< " PositiveStrideSCEV: " << PositiveStrideSCEVdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " MemsetSizeSCEV: " << MemsetSizeSCEV << "\n" << " PositiveStrideSCEV: " << *PositiveStrideSCEV << "\n"; } } while (false )
969	<< "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " MemsetSizeSCEV: " << MemsetSizeSCEV << "\n" << " PositiveStrideSCEV: " << PositiveStrideSCEV << "\n"; } } while (false );
970
971	if (PositiveStrideSCEV != MemsetSizeSCEV) {
972	// TODO: folding can be done to the SCEVs
973	// The folding is to fold expressions that is covered by the loop guard
974	// at loop entry. After the folding, compare again and proceed
975	// optimization if equal.
976	LLVM_DEBUG(dbgs() << " SCEV don't match, abort\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " SCEV don't match, abort\n" ; } } while (false);
977	return false;
978	}
979	}
980
981	// Verify that the memset value is loop invariant. If not, we can't promote
982	// the memset.
983	Value *SplatValue = MSI->getValue();
984	if (!SplatValue \|\| !CurLoop->isLoopInvariant(SplatValue))
985	return false;
986
987	SmallPtrSet<Instruction *, 1> MSIs;
988	MSIs.insert(MSI);
989	return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
990	MaybeAlign(MSI->getDestAlignment()),
991	SplatValue, MSI, MSIs, Ev, BECount,
992	IsNegStride, /IsLoopMemset=/true);
993	}
994
995	/// mayLoopAccessLocation - Return true if the specified loop might access the
996	/// specified pointer location, which is a loop-strided access. The 'Access'
997	/// argument specifies what the verboten forms of access are (read or write).
998	static bool
999	mayLoopAccessLocation(Value Ptr, ModRefInfo Access, Loop L,
1000	const SCEV BECount, const SCEV StoreSizeSCEV,
1001	AliasAnalysis &AA,
1002	SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
1003	// Get the location that may be stored across the loop. Since the access is
1004	// strided positively through memory, we say that the modified location starts
1005	// at the pointer and has infinite size.
1006	LocationSize AccessSize = LocationSize::afterPointer();
1007
1008	// If the loop iterates a fixed number of times, we can refine the access size
1009	// to be exactly the size of the memset, which is (BECount+1)*StoreSize
1010	const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount);
1011	const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
1012	if (BECst && ConstSize)
1013	AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
1014	ConstSize->getValue()->getZExtValue());
1015
1016	// TODO: For this to be really effective, we have to dive into the pointer
1017	// operand in the store. Store to &A[i] of 100 will always return may alias
1018	// with store of &A[100], we need to StoreLoc to be "A" with size of 100,
1019	// which will then no-alias a store to &A[100].
1020	MemoryLocation StoreLoc(Ptr, AccessSize);
1021
1022	for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
1023	++BI)
1024	for (Instruction &I : **BI)
1025	if (IgnoredInsts.count(&I) == 0 &&
1026	isModOrRefSet(
1027	intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
1028	return true;
1029	return false;
1030	}
1031
1032	// If we have a negative stride, Start refers to the end of the memory location
1033	// we're trying to memset. Therefore, we need to recompute the base pointer,
1034	// which is just Start - BECount*Size.
1035	static const SCEV getStartForNegStride(const SCEV Start, const SCEV *BECount,
1036	Type IntPtr, const SCEV StoreSizeSCEV,
1037	ScalarEvolution *SE) {
1038	const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
1039	if (!StoreSizeSCEV->isOne()) {
1040	// index = back edge count * store size
1041	Index = SE->getMulExpr(Index,
1042	SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
1043	SCEV::FlagNUW);
1044	}
1045	// base pointer = start - index * store size
1046	return SE->getMinusSCEV(Start, Index);
1047	}
1048
1049	/// Compute trip count from the backedge taken count.
1050	static const SCEV getTripCount(const SCEV BECount, Type *IntPtr,
1051	Loop CurLoop, const DataLayout DL,
1052	ScalarEvolution *SE) {
1053	const SCEV *TripCountS = nullptr;
1054	// The # stored bytes is (BECount+1). Expand the trip count out to
1055	// pointer size if it isn't already.
1056	//
1057	// If we're going to need to zero extend the BE count, check if we can add
1058	// one to it prior to zero extending without overflow. Provided this is safe,
1059	// it allows better simplification of the +1.
1060	if (DL->getTypeSizeInBits(BECount->getType()) <
1061	DL->getTypeSizeInBits(IntPtr) &&
1062	SE->isLoopEntryGuardedByCond(
1063	CurLoop, ICmpInst::ICMP_NE, BECount,
1064	SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
1065	TripCountS = SE->getZeroExtendExpr(
1066	SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
1067	IntPtr);
1068	} else {
1069	TripCountS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
1070	SE->getOne(IntPtr), SCEV::FlagNUW);
1071	}
1072
1073	return TripCountS;
1074	}
1075
1076	/// Compute the number of bytes as a SCEV from the backedge taken count.
1077	///
1078	/// This also maps the SCEV into the provided type and tries to handle the
1079	/// computation in a way that will fold cleanly.
1080	static const SCEV getNumBytes(const SCEV BECount, Type *IntPtr,
1081	const SCEV StoreSizeSCEV, Loop CurLoop,
1082	const DataLayout DL, ScalarEvolution SE) {
1083	const SCEV *TripCountSCEV = getTripCount(BECount, IntPtr, CurLoop, DL, SE);
1084
1085	return SE->getMulExpr(TripCountSCEV,
1086	SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
1087	SCEV::FlagNUW);
1088	}
1089
1090	/// processLoopStridedStore - We see a strided store of some value. If we can
1091	/// transform this into a memset or memset_pattern in the loop preheader, do so.
1092	bool LoopIdiomRecognize::processLoopStridedStore(
1093	Value DestPtr, const SCEV StoreSizeSCEV, MaybeAlign StoreAlignment,
1094	Value StoredVal, Instruction TheStore,
1095	SmallPtrSetImpl<Instruction > &Stores, const SCEVAddRecExpr Ev,
1096	const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
1097	Value SplatValue = isBytewiseValue(StoredVal, DL);
1098	Constant *PatternValue = nullptr;
1099
1100	if (!SplatValue)
1101	PatternValue = getMemSetPatternValue(StoredVal, DL);
1102
1103	assert((SplatValue \|\| PatternValue) &&(static_cast <bool> ((SplatValue \|\| PatternValue) && "Expected either splat value or pattern value.") ? void (0) : __assert_fail ("(SplatValue \|\| PatternValue) && \"Expected either splat value or pattern value.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 1104, __extension__ __PRETTY_FUNCTION__))
1104	"Expected either splat value or pattern value.")(static_cast <bool> ((SplatValue \|\| PatternValue) && "Expected either splat value or pattern value.") ? void (0) : __assert_fail ("(SplatValue \|\| PatternValue) && \"Expected either splat value or pattern value.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 1104, __extension__ __PRETTY_FUNCTION__));
1105
1106	// The trip count of the loop and the base pointer of the addrec SCEV is
1107	// guaranteed to be loop invariant, which means that it should dominate the
1108	// header. This allows us to insert code for it in the preheader.
1109	unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
1110	BasicBlock *Preheader = CurLoop->getLoopPreheader();
1111	IRBuilder<> Builder(Preheader->getTerminator());
1112	SCEVExpander Expander(SE, DL, "loop-idiom");
1113	SCEVExpanderCleaner ExpCleaner(Expander, *DT);
1114
1115	Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
1116	Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
1117
1118	bool Changed = false;
1119	const SCEV *Start = Ev->getStart();
1120	// Handle negative strided loops.
1121	if (IsNegStride)
1122	Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);
1123
1124	// TODO: ideally we should still be able to generate memset if SCEV expander
1125	// is taught to generate the dependencies at the latest point.
1126	if (!isSafeToExpand(Start, *SE))
1127	return Changed;
1128
1129	// Okay, we have a strided store "p[i]" of a splattable value. We can turn
1130	// this into a memset in the loop preheader now if we want. However, this
1131	// would be unsafe to do if there is anything else in the loop that may read
1132	// or write to the aliased location. Check for any overlap by generating the
1133	// base pointer and checking the region.
1134	Value *BasePtr =
1135	Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
1136
1137	// From here on out, conservatively report to the pass manager that we've
1138	// changed the IR, even if we later clean up these added instructions. There
1139	// may be structural differences e.g. in the order of use lists not accounted
1140	// for in just a textual dump of the IR. This is written as a variable, even
1141	// though statically all the places this dominates could be replaced with
1142	// 'true', with the hope that anyone trying to be clever / "more precise" with
1143	// the return value will read this comment, and leave them alone.
1144	Changed = true;
1145
1146	if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1147	StoreSizeSCEV, *AA, Stores))
1148	return Changed;
1149
1150	if (avoidLIRForMultiBlockLoop(/IsMemset=/true, IsLoopMemset))
1151	return Changed;
1152
1153	// Okay, everything looks good, insert the memset.
1154
1155	const SCEV *NumBytesS =
1156	getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
1157
1158	// TODO: ideally we should still be able to generate memset if SCEV expander
1159	// is taught to generate the dependencies at the latest point.
1160	if (!isSafeToExpand(NumBytesS, *SE))
1161	return Changed;
1162
1163	Value *NumBytes =
1164	Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1165
1166	CallInst *NewCall;
1167	if (SplatValue) {
1168	NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes,
1169	MaybeAlign(StoreAlignment));
1170	} else {
1171	// Everything is emitted in default address space
1172	Type *Int8PtrTy = DestInt8PtrTy;
1173
1174	Module *M = TheStore->getModule();
1175	StringRef FuncName = "memset_pattern16";
1176	FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
1177	Int8PtrTy, Int8PtrTy, IntIdxTy);
1178	inferLibFuncAttributes(M, FuncName, *TLI);
1179
1180	// Otherwise we should form a memset_pattern16. PatternValue is known to be
1181	// an constant array of 16-bytes. Plop the value into a mergable global.
1182	GlobalVariable GV = new GlobalVariable(M, PatternValue->getType(), true,
1183	GlobalValue::PrivateLinkage,
1184	PatternValue, ".memset_pattern");
1185	GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
1186	GV->setAlignment(Align(16));
1187	Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
1188	NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
1189	}
1190	NewCall->setDebugLoc(TheStore->getDebugLoc());
1191
1192	if (MSSAU) {
1193	MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1194	NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1195	MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1196	}
1197
1198	LLVM_DEBUG(dbgs() << " Formed memset: " << NewCall << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " Formed memset: " << NewCall << "\n" << " from store to: " << Ev << " at: " << TheStore << "\n"; } } while (false)
1199	<< " from store to: " << Ev << " at: " << TheStoredo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " Formed memset: " << NewCall << "\n" << " from store to: " << Ev << " at: " << *TheStore << "\n"; } } while (false)
1200	<< "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " Formed memset: " << NewCall << "\n" << " from store to: " << Ev << " at: " << *TheStore << "\n"; } } while (false);
1201
1202	ORE.emit([&]() {
1203	return OptimizationRemark(DEBUG_TYPE"loop-idiom", "ProcessLoopStridedStore",
1204	NewCall->getDebugLoc(), Preheader)
1205	<< "Transformed loop-strided store in "
1206	<< ore::NV("Function", TheStore->getFunction())
1207	<< " function into a call to "
1208	<< ore::NV("NewFunction", NewCall->getCalledFunction())
1209	<< "() intrinsic";
1210	});
1211
1212	// Okay, the memset has been formed. Zap the original store and anything that
1213	// feeds into it.
1214	for (auto *I : Stores) {
1215	if (MSSAU)
1216	MSSAU->removeMemoryAccess(I, true);
1217	deleteDeadInstruction(I);
1218	}
1219	if (MSSAU && VerifyMemorySSA)
1220	MSSAU->getMemorySSA()->verifyMemorySSA();
1221	++NumMemSet;
1222	ExpCleaner.markResultUsed();
1223	return true;
1224	}
1225
1226	/// If the stored value is a strided load in the same loop with the same stride
1227	/// this may be transformable into a memcpy. This kicks in for stuff like
1228	/// for (i) A[i] = B[i];
1229	bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
1230	const SCEV *BECount) {
1231	assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.")(static_cast <bool> (SI->isUnordered() && "Expected only non-volatile non-ordered stores." ) ? void (0) : __assert_fail ("SI->isUnordered() && \"Expected only non-volatile non-ordered stores.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 1231, __extension__ __PRETTY_FUNCTION__));
1232
1233	Value *StorePtr = SI->getPointerOperand();
1234	const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1235	unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
1236
1237	// The store must be feeding a non-volatile load.
1238	LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
1239	assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.")(static_cast <bool> (LI->isUnordered() && "Expected only non-volatile non-ordered loads." ) ? void (0) : __assert_fail ("LI->isUnordered() && \"Expected only non-volatile non-ordered loads.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 1239, __extension__ __PRETTY_FUNCTION__));
1240
1241	// See if the pointer expression is an AddRec like {base,+,1} on the current
1242	// loop, which indicates a strided load. If we have something else, it's a
1243	// random load we can't handle.
1244	Value *LoadPtr = LI->getPointerOperand();
1245	const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
1246
1247	const SCEV *StoreSizeSCEV = SE->getConstant(StorePtr->getType(), StoreSize);
1248	return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSizeSCEV,
1249	SI->getAlign(), LI->getAlign(), SI, LI,
1250	StoreEv, LoadEv, BECount);
1251	}
1252
1253	bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
1254	Value DestPtr, Value SourcePtr, const SCEV *StoreSizeSCEV,
1255	MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
1256	Instruction TheLoad, const SCEVAddRecExpr StoreEv,
1257	const SCEVAddRecExpr LoadEv, const SCEV BECount) {
1258
1259	// FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
1260	// conservatively bail here, since otherwise we may have to transform
1261	// llvm.memcpy.inline into llvm.memcpy which is illegal.
1262	if (isa<MemCpyInlineInst>(TheStore))
1263	return false;
1264
1265	// The trip count of the loop and the base pointer of the addrec SCEV is
1266	// guaranteed to be loop invariant, which means that it should dominate the
1267	// header. This allows us to insert code for it in the preheader.
1268	BasicBlock *Preheader = CurLoop->getLoopPreheader();
1269	IRBuilder<> Builder(Preheader->getTerminator());
1270	SCEVExpander Expander(SE, DL, "loop-idiom");
1271
1272	SCEVExpanderCleaner ExpCleaner(Expander, *DT);
1273
1274	bool Changed = false;
1275	const SCEV *StrStart = StoreEv->getStart();
1276	unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
1277	Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
1278
1279	APInt Stride = getStoreStride(StoreEv);
1280	const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
1281
1282	// TODO: Deal with non-constant size; Currently expect constant store size
1283	assert(ConstStoreSize && "store size is expected to be a constant")(static_cast <bool> (ConstStoreSize && "store size is expected to be a constant" ) ? void (0) : __assert_fail ("ConstStoreSize && \"store size is expected to be a constant\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 1283, __extension__ __PRETTY_FUNCTION__));
1284
1285	int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
1286	bool IsNegStride = StoreSize == -Stride;
1287
1288	// Handle negative strided loops.
1289	if (IsNegStride)
1290	StrStart =
1291	getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
1292
1293	// Okay, we have a strided store "p[i]" of a loaded value. We can turn
1294	// this into a memcpy in the loop preheader now if we want. However, this
1295	// would be unsafe to do if there is anything else in the loop that may read
1296	// or write the memory region we're storing to. This includes the load that
1297	// feeds the stores. Check for an alias by generating the base address and
1298	// checking everything.
1299	Value *StoreBasePtr = Expander.expandCodeFor(
1300	StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
1301
1302	// From here on out, conservatively report to the pass manager that we've
1303	// changed the IR, even if we later clean up these added instructions. There
1304	// may be structural differences e.g. in the order of use lists not accounted
1305	// for in just a textual dump of the IR. This is written as a variable, even
1306	// though statically all the places this dominates could be replaced with
1307	// 'true', with the hope that anyone trying to be clever / "more precise" with
1308	// the return value will read this comment, and leave them alone.
1309	Changed = true;
1310
1311	SmallPtrSet<Instruction *, 2> IgnoredInsts;
1312	IgnoredInsts.insert(TheStore);
1313
1314	bool IsMemCpy = isa<MemCpyInst>(TheStore);
1315	const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";
1316
1317	bool UseMemMove =
1318	mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1319	StoreSizeSCEV, *AA, IgnoredInsts);
1320	if (UseMemMove) {
1321	// For memmove case it's not enough to guarantee that loop doesn't access
1322	// TheStore and TheLoad. Additionally we need to make sure that TheStore is
1323	// the only user of TheLoad.
1324	if (!TheLoad->hasOneUse())
1325	return Changed;
1326	IgnoredInsts.insert(TheLoad);
1327	if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
1328	BECount, StoreSizeSCEV, *AA, IgnoredInsts)) {
1329	ORE.emit([&]() {
1330	return OptimizationRemarkMissed(DEBUG_TYPE"loop-idiom", "LoopMayAccessStore",
1331	TheStore)
1332	<< ore::NV("Inst", InstRemark) << " in "
1333	<< ore::NV("Function", TheStore->getFunction())
1334	<< " function will not be hoisted: "
1335	<< ore::NV("Reason", "The loop may access store location");
1336	});
1337	return Changed;
1338	}
1339	IgnoredInsts.erase(TheLoad);
1340	}
1341
1342	const SCEV *LdStart = LoadEv->getStart();
1343	unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
1344
1345	// Handle negative strided loops.
1346	if (IsNegStride)
1347	LdStart =
1348	getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSizeSCEV, SE);
1349
1350	// For a memcpy, we have to make sure that the input array is not being
1351	// mutated by the loop.
1352	Value *LoadBasePtr = Expander.expandCodeFor(
1353	LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
1354
1355	// If the store is a memcpy instruction, we must check if it will write to
1356	// the load memory locations. So remove it from the ignored stores.
1357	if (IsMemCpy)
1358	IgnoredInsts.erase(TheStore);
1359	if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
1360	StoreSizeSCEV, *AA, IgnoredInsts)) {
1361	ORE.emit([&]() {
1362	return OptimizationRemarkMissed(DEBUG_TYPE"loop-idiom", "LoopMayAccessLoad", TheLoad)
1363	<< ore::NV("Inst", InstRemark) << " in "
1364	<< ore::NV("Function", TheStore->getFunction())
1365	<< " function will not be hoisted: "
1366	<< ore::NV("Reason", "The loop may access load location");
1367	});
1368	return Changed;
1369	}
1370	if (UseMemMove) {
1371	// Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr for
1372	// negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
1373	int64_t LoadOff = 0, StoreOff = 0;
1374	const Value *BP1 = llvm::GetPointerBaseWithConstantOffset(
1375	LoadBasePtr->stripPointerCasts(), LoadOff, *DL);
1376	const Value *BP2 = llvm::GetPointerBaseWithConstantOffset(
1377	StoreBasePtr->stripPointerCasts(), StoreOff, *DL);
1378	int64_t LoadSize =
1379	DL->getTypeSizeInBits(TheLoad->getType()).getFixedSize() / 8;
1380	if (BP1 != BP2 \|\| LoadSize != int64_t(StoreSize))
1381	return Changed;
1382	if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) \|\|
1383	(IsNegStride && LoadOff + LoadSize > StoreOff))
1384	return Changed;
1385	}
1386
1387	if (avoidLIRForMultiBlockLoop())
1388	return Changed;
1389
1390	// Okay, everything is safe, we can transform this!
1391
1392	const SCEV *NumBytesS =
1393	getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
1394
1395	Value *NumBytes =
1396	Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
1397
1398	CallInst *NewCall = nullptr;
1399	// Check whether to generate an unordered atomic memcpy:
1400	// If the load or store are atomic, then they must necessarily be unordered
1401	// by previous checks.
1402	if (!TheStore->isAtomic() && !TheLoad->isAtomic()) {
1403	if (UseMemMove)
1404	NewCall = Builder.CreateMemMove(StoreBasePtr, StoreAlign, LoadBasePtr,
1405	LoadAlign, NumBytes);
1406	else
1407	NewCall = Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr,
1408	LoadAlign, NumBytes);
1409	} else {
1410	// For now don't support unordered atomic memmove.
1411	if (UseMemMove)
1412	return Changed;
1413	// We cannot allow unaligned ops for unordered load/store, so reject
1414	// anything where the alignment isn't at least the element size.
1415	assert((StoreAlign.hasValue() && LoadAlign.hasValue()) &&(static_cast <bool> ((StoreAlign.hasValue() && LoadAlign .hasValue()) && "Expect unordered load/store to have align." ) ? void (0) : __assert_fail ("(StoreAlign.hasValue() && LoadAlign.hasValue()) && \"Expect unordered load/store to have align.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 1416, __extension__ __PRETTY_FUNCTION__))
1416	"Expect unordered load/store to have align.")(static_cast <bool> ((StoreAlign.hasValue() && LoadAlign .hasValue()) && "Expect unordered load/store to have align." ) ? void (0) : __assert_fail ("(StoreAlign.hasValue() && LoadAlign.hasValue()) && \"Expect unordered load/store to have align.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 1416, __extension__ __PRETTY_FUNCTION__));
1417	if (StoreAlign.getValue() < StoreSize \|\| LoadAlign.getValue() < StoreSize)
1418	return Changed;
1419
1420	// If the element.atomic memcpy is not lowered into explicit
1421	// loads/stores later, then it will be lowered into an element-size
1422	// specific lib call. If the lib call doesn't exist for our store size, then
1423	// we shouldn't generate the memcpy.
1424	if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1425	return Changed;
1426
1427	// Create the call.
1428	// Note that unordered atomic loads/stores are required by the spec to
1429	// have an alignment but non-atomic loads/stores may not.
1430	NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1431	StoreBasePtr, StoreAlign.getValue(), LoadBasePtr, LoadAlign.getValue(),
1432	NumBytes, StoreSize);
1433	}
1434	NewCall->setDebugLoc(TheStore->getDebugLoc());
1435
1436	if (MSSAU) {
1437	MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
1438	NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
1439	MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
1440	}
1441
1442	LLVM_DEBUG(dbgs() << " Formed new call: " << NewCall << "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " Formed new call: " << NewCall << "\n" << " from load ptr=" << LoadEv << " at: " << TheLoad << "\n" << " from store ptr=" << StoreEv << " at: " << TheStore << "\n"; } } while (false)
1443	<< " from load ptr=" << LoadEv << " at: " << TheLoaddo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " Formed new call: " << NewCall << "\n" << " from load ptr=" << LoadEv << " at: " << TheLoad << "\n" << " from store ptr=" << StoreEv << " at: " << *TheStore << "\n"; } } while (false)
1444	<< "\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " Formed new call: " << NewCall << "\n" << " from load ptr=" << LoadEv << " at: " << TheLoad << "\n" << " from store ptr=" << StoreEv << " at: " << *TheStore << "\n"; } } while (false)
1445	<< " from store ptr=" << StoreEv << " at: " << TheStoredo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " Formed new call: " << NewCall << "\n" << " from load ptr=" << LoadEv << " at: " << TheLoad << "\n" << " from store ptr=" << StoreEv << " at: " << *TheStore << "\n"; } } while (false)
1446	<< "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " Formed new call: " << NewCall << "\n" << " from load ptr=" << LoadEv << " at: " << TheLoad << "\n" << " from store ptr=" << StoreEv << " at: " << *TheStore << "\n"; } } while (false);
1447
1448	ORE.emit([&]() {
1449	return OptimizationRemark(DEBUG_TYPE"loop-idiom", "ProcessLoopStoreOfLoopLoad",
1450	NewCall->getDebugLoc(), Preheader)
1451	<< "Formed a call to "
1452	<< ore::NV("NewFunction", NewCall->getCalledFunction())
1453	<< "() intrinsic from " << ore::NV("Inst", InstRemark)
1454	<< " instruction in " << ore::NV("Function", TheStore->getFunction())
1455	<< " function";
1456	});
1457
1458	// Okay, a new call to memcpy/memmove has been formed. Zap the original store
1459	// and anything that feeds into it.
1460	if (MSSAU)
1461	MSSAU->removeMemoryAccess(TheStore, true);
1462	deleteDeadInstruction(TheStore);
1463	if (MSSAU && VerifyMemorySSA)
1464	MSSAU->getMemorySSA()->verifyMemorySSA();
1465	if (UseMemMove)
1466	++NumMemMove;
1467	else
1468	++NumMemCpy;
1469	ExpCleaner.markResultUsed();
1470	return true;
1471	}
1472
1473	// When compiling for codesize we avoid idiom recognition for a multi-block loop
1474	// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1475	//
1476	bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1477	bool IsLoopMemset) {
1478	if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
1479	if (CurLoop->isOutermost() && (!IsMemset \|\| !IsLoopMemset)) {
1480	LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " " << CurLoop->getHeader ()->getParent()->getName() << " : LIR " << ( IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n" ; } } while (false)
1481	<< " : LIR " << (IsMemset ? "Memset" : "Memcpy")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " " << CurLoop->getHeader ()->getParent()->getName() << " : LIR " << ( IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n" ; } } while (false)
1482	<< " avoided: multi-block top-level loop\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << " " << CurLoop->getHeader ()->getParent()->getName() << " : LIR " << ( IsMemset ? "Memset" : "Memcpy") << " avoided: multi-block top-level loop\n" ; } } while (false);
1483	return true;
1484	}
1485	}
1486
1487	return false;
1488	}
1489
1490	bool LoopIdiomRecognize::runOnNoncountableLoop() {
1491	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F[" << CurLoop->getHeader()->getParent()->getName () << "] Noncountable Loop %" << CurLoop->getHeader ()->getName() << "\n"; } } while (false)
1492	<< CurLoop->getHeader()->getParent()->getName()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F[" << CurLoop->getHeader()->getParent()->getName () << "] Noncountable Loop %" << CurLoop->getHeader ()->getName() << "\n"; } } while (false)
1493	<< "] Noncountable Loop %"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F[" << CurLoop->getHeader()->getParent()->getName () << "] Noncountable Loop %" << CurLoop->getHeader ()->getName() << "\n"; } } while (false)
1494	<< CurLoop->getHeader()->getName() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Scanning: F[" << CurLoop->getHeader()->getParent()->getName () << "] Noncountable Loop %" << CurLoop->getHeader ()->getName() << "\n"; } } while (false);
1495
1496	return recognizePopcount() \|\| recognizeAndInsertFFS() \|\|
1497	recognizeShiftUntilBitTest() \|\| recognizeShiftUntilZero();
1498	}
1499
1500	/// Check if the given conditional branch is based on the comparison between
1501	/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
1502	/// true), the control yields to the loop entry. If the branch matches the
1503	/// behavior, the variable involved in the comparison is returned. This function
1504	/// will be called to see if the precondition and postcondition of the loop are
1505	/// in desirable form.
1506	static Value matchCondition(BranchInst BI, BasicBlock *LoopEntry,
1507	bool JmpOnZero = false) {
1508	if (!BI \|\| !BI->isConditional())
1509	return nullptr;
1510
1511	ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1512	if (!Cond)
1513	return nullptr;
1514
1515	ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
1516	if (!CmpZero \|\| !CmpZero->isZero())
1517	return nullptr;
1518
1519	BasicBlock *TrueSucc = BI->getSuccessor(0);
1520	BasicBlock *FalseSucc = BI->getSuccessor(1);
1521	if (JmpOnZero)
1522	std::swap(TrueSucc, FalseSucc);
1523
1524	ICmpInst::Predicate Pred = Cond->getPredicate();
1525	if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) \|\|
1526	(Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1527	return Cond->getOperand(0);
1528
1529	return nullptr;
1530	}
1531
1532	// Check if the recurrence variable `VarX` is in the right form to create
1533	// the idiom. Returns the value coerced to a PHINode if so.
1534	static PHINode getRecurrenceVar(Value VarX, Instruction *DefX,
1535	BasicBlock *LoopEntry) {
1536	auto *PhiX = dyn_cast<PHINode>(VarX);
1537	if (PhiX && PhiX->getParent() == LoopEntry &&
1538	(PhiX->getOperand(0) == DefX \|\| PhiX->getOperand(1) == DefX))
1539	return PhiX;
1540	return nullptr;
1541	}
1542
1543	/// Return true iff the idiom is detected in the loop.
1544	///
1545	/// Additionally:
1546	/// 1) \p CntInst is set to the instruction counting the population bit.
1547	/// 2) \p CntPhi is set to the corresponding phi node.
1548	/// 3) \p Var is set to the value whose population bits are being counted.
1549	///
1550	/// The core idiom we are trying to detect is:
1551	/// \code
1552	/// if (x0 != 0)
1553	/// goto loop-exit // the precondition of the loop
1554	/// cnt0 = init-val;
1555	/// do {
1556	/// x1 = phi (x0, x2);
1557	/// cnt1 = phi(cnt0, cnt2);
1558	///
1559	/// cnt2 = cnt1 + 1;
1560	/// ...
1561	/// x2 = x1 & (x1 - 1);
1562	/// ...
1563	/// } while(x != 0);
1564	///
1565	/// loop-exit:
1566	/// \endcode
1567	static bool detectPopcountIdiom(Loop CurLoop, BasicBlock PreCondBB,
1568	Instruction &CntInst, PHINode &CntPhi,
1569	Value *&Var) {
1570	// step 1: Check to see if the look-back branch match this pattern:
1571	// "if (a!=0) goto loop-entry".
1572	BasicBlock *LoopEntry;
1573	Instruction DefX2, CountInst;
1574	Value VarX1, VarX0;
1575	PHINode PhiX, CountPhi;
1576
1577	DefX2 = CountInst = nullptr;
1578	VarX1 = VarX0 = nullptr;
1579	PhiX = CountPhi = nullptr;
1580	LoopEntry = *(CurLoop->block_begin());
1581
1582	// step 1: Check if the loop-back branch is in desirable form.
1583	{
1584	if (Value *T = matchCondition(
1585	dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1586	DefX2 = dyn_cast<Instruction>(T);
1587	else
1588	return false;
1589	}
1590
1591	// step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
1592	{
1593	if (!DefX2 \|\| DefX2->getOpcode() != Instruction::And)
1594	return false;
1595
1596	BinaryOperator *SubOneOp;
1597
1598	if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
1599	VarX1 = DefX2->getOperand(1);
1600	else {
1601	VarX1 = DefX2->getOperand(0);
1602	SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
1603	}
1604	if (!SubOneOp \|\| SubOneOp->getOperand(0) != VarX1)
1605	return false;
1606
1607	ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
1608	if (!Dec \|\|
1609	!((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) \|\|
1610	(SubOneOp->getOpcode() == Instruction::Add &&
1611	Dec->isMinusOne()))) {
1612	return false;
1613	}
1614	}
1615
1616	// step 3: Check the recurrence of variable X
1617	PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
1618	if (!PhiX)
1619	return false;
1620
1621	// step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
1622	{
1623	CountInst = nullptr;
1624	for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1625	IterE = LoopEntry->end();
1626	Iter != IterE; Iter++) {
1627	Instruction Inst = &Iter;
1628	if (Inst->getOpcode() != Instruction::Add)
1629	continue;
1630
1631	ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1632	if (!Inc \|\| !Inc->isOne())
1633	continue;
1634
1635	PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1636	if (!Phi)
1637	continue;
1638
1639	// Check if the result of the instruction is live of the loop.
1640	bool LiveOutLoop = false;
1641	for (User *U : Inst->users()) {
1642	if ((cast<Instruction>(U))->getParent() != LoopEntry) {
1643	LiveOutLoop = true;
1644	break;
1645	}
1646	}
1647
1648	if (LiveOutLoop) {
1649	CountInst = Inst;
1650	CountPhi = Phi;
1651	break;
1652	}
1653	}
1654
1655	if (!CountInst)
1656	return false;
1657	}
1658
1659	// step 5: check if the precondition is in this form:
1660	// "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
1661	{
1662	auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1663	Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
1664	if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
1665	return false;
1666
1667	CntInst = CountInst;
1668	CntPhi = CountPhi;
1669	Var = T;
1670	}
1671
1672	return true;
1673	}
1674
1675	/// Return true if the idiom is detected in the loop.
1676	///
1677	/// Additionally:
1678	/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1679	/// or nullptr if there is no such.
1680	/// 2) \p CntPhi is set to the corresponding phi node
1681	/// or nullptr if there is no such.
1682	/// 3) \p Var is set to the value whose CTLZ could be used.
1683	/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1684	///
1685	/// The core idiom we are trying to detect is:
1686	/// \code
1687	/// if (x0 == 0)
1688	/// goto loop-exit // the precondition of the loop
1689	/// cnt0 = init-val;
1690	/// do {
1691	/// x = phi (x0, x.next); //PhiX
1692	/// cnt = phi(cnt0, cnt.next);
1693	///
1694	/// cnt.next = cnt + 1;
1695	/// ...
1696	/// x.next = x >> 1; // DefX
1697	/// ...
1698	/// } while(x.next != 0);
1699	///
1700	/// loop-exit:
1701	/// \endcode
1702	static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
1703	Intrinsic::ID &IntrinID, Value *&InitX,
1704	Instruction &CntInst, PHINode &CntPhi,
1705	Instruction *&DefX) {
1706	BasicBlock *LoopEntry;
1707	Value *VarX = nullptr;
1708
1709	DefX = nullptr;
1710	CntInst = nullptr;
1711	CntPhi = nullptr;
1712	LoopEntry = *(CurLoop->block_begin());
1713
1714	// step 1: Check if the loop-back branch is in desirable form.
1715	if (Value *T = matchCondition(
1716	dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
1717	DefX = dyn_cast<Instruction>(T);
1718	else
1719	return false;
1720
1721	// step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
1722	if (!DefX \|\| !DefX->isShift())
1723	return false;
1724	IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
1725	Intrinsic::ctlz;
1726	ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1727	if (!Shft \|\| !Shft->isOne())
1728	return false;
1729	VarX = DefX->getOperand(0);
1730
1731	// step 3: Check the recurrence of variable X
1732	PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
1733	if (!PhiX)
1734	return false;
1735
1736	InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1737
1738	// Make sure the initial value can't be negative otherwise the ashr in the
1739	// loop might never reach zero which would make the loop infinite.
1740	if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
1741	return false;
1742
1743	// step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
1744	// or cnt.next = cnt + -1.
1745	// TODO: We can skip the step. If loop trip count is known (CTLZ),
1746	// then all uses of "cnt.next" could be optimized to the trip count
1747	// plus "cnt0". Currently it is not optimized.
1748	// This step could be used to detect POPCNT instruction:
1749	// cnt.next = cnt + (x.next & 1)
1750	for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
1751	IterE = LoopEntry->end();
1752	Iter != IterE; Iter++) {
1753	Instruction Inst = &Iter;
1754	if (Inst->getOpcode() != Instruction::Add)
1755	continue;
1756
1757	ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
1758	if (!Inc \|\| (!Inc->isOne() && !Inc->isMinusOne()))
1759	continue;
1760
1761	PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
1762	if (!Phi)
1763	continue;
1764
1765	CntInst = Inst;
1766	CntPhi = Phi;
1767	break;
1768	}
1769	if (!CntInst)
1770	return false;
1771
1772	return true;
1773	}
1774
1775	/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1776	/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
1777	/// trip count returns true; otherwise, returns false.
1778	bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1779	// Give up if the loop has multiple blocks or multiple backedges.
1780	if (CurLoop->getNumBackEdges() != 1 \|\| CurLoop->getNumBlocks() != 1)
1781	return false;
1782
1783	Intrinsic::ID IntrinID;
1784	Value *InitX;
1785	Instruction *DefX = nullptr;
1786	PHINode *CntPhi = nullptr;
1787	Instruction *CntInst = nullptr;
1788	// Help decide if transformation is profitable. For ShiftUntilZero idiom,
1789	// this is always 6.
1790	size_t IdiomCanonicalSize = 6;
1791
1792	if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1793	CntInst, CntPhi, DefX))
1794	return false;
1795
1796	bool IsCntPhiUsedOutsideLoop = false;
1797	for (User *U : CntPhi->users())
1798	if (!CurLoop->contains(cast<Instruction>(U))) {
1799	IsCntPhiUsedOutsideLoop = true;
1800	break;
1801	}
1802	bool IsCntInstUsedOutsideLoop = false;
1803	for (User *U : CntInst->users())
1804	if (!CurLoop->contains(cast<Instruction>(U))) {
1805	IsCntInstUsedOutsideLoop = true;
1806	break;
1807	}
1808	// If both CntInst and CntPhi are used outside the loop the profitability
1809	// is questionable.
1810	if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
1811	return false;
1812
1813	// For some CPUs result of CTLZ(X) intrinsic is undefined
1814	// when X is 0. If we can not guarantee X != 0, we need to check this
1815	// when expand.
1816	bool ZeroCheck = false;
1817	// It is safe to assume Preheader exist as it was checked in
1818	// parent function RunOnLoop.
1819	BasicBlock *PH = CurLoop->getLoopPreheader();
1820
1821	// If we are using the count instruction outside the loop, make sure we
1822	// have a zero check as a precondition. Without the check the loop would run
1823	// one iteration for before any check of the input value. This means 0 and 1
1824	// would have identical behavior in the original loop and thus
1825	if (!IsCntPhiUsedOutsideLoop) {
1826	auto *PreCondBB = PH->getSinglePredecessor();
1827	if (!PreCondBB)
1828	return false;
1829	auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1830	if (!PreCondBI)
1831	return false;
1832	if (matchCondition(PreCondBI, PH) != InitX)
1833	return false;
1834	ZeroCheck = true;
1835	}
1836
1837	// Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1838	// profitable if we delete the loop.
1839
1840	// the loop has only 6 instructions:
1841	// %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
1842	// %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
1843	// %shr = ashr %n.addr.0, 1
1844	// %tobool = icmp eq %shr, 0
1845	// %inc = add nsw %i.0, 1
1846	// br i1 %tobool
1847
1848	const Value *Args[] = {InitX,
1849	ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
1850
1851	// @llvm.dbg doesn't count as they have no semantic effect.
1852	auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1853	uint32_t HeaderSize =
1854	std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1855
1856	IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1857	InstructionCost Cost =
1858	TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
1859	if (HeaderSize != IdiomCanonicalSize &&
1860	Cost > TargetTransformInfo::TCC_Basic)
1861	return false;
1862
1863	transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1864	DefX->getDebugLoc(), ZeroCheck,
1865	IsCntPhiUsedOutsideLoop);
1866	return true;
1867	}
1868
1869	/// Recognizes a population count idiom in a non-countable loop.
1870	///
1871	/// If detected, transforms the relevant code to issue the popcount intrinsic
1872	/// function call, and returns true; otherwise, returns false.
1873	bool LoopIdiomRecognize::recognizePopcount() {
1874	if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
1875	return false;
1876
1877	// Counting population are usually conducted by few arithmetic instructions.
1878	// Such instructions can be easily "absorbed" by vacant slots in a
1879	// non-compact loop. Therefore, recognizing popcount idiom only makes sense
1880	// in a compact loop.
1881
1882	// Give up if the loop has multiple blocks or multiple backedges.
1883	if (CurLoop->getNumBackEdges() != 1 \|\| CurLoop->getNumBlocks() != 1)
1884	return false;
1885
1886	BasicBlock LoopBody = (CurLoop->block_begin());
1887	if (LoopBody->size() >= 20) {
1888	// The loop is too big, bail out.
1889	return false;
1890	}
1891
1892	// It should have a preheader containing nothing but an unconditional branch.
1893	BasicBlock *PH = CurLoop->getLoopPreheader();
1894	if (!PH \|\| &PH->front() != PH->getTerminator())
1895	return false;
1896	auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
1897	if (!EntryBI \|\| EntryBI->isConditional())
1898	return false;
1899
1900	// It should have a precondition block where the generated popcount intrinsic
1901	// function can be inserted.
1902	auto *PreCondBB = PH->getSinglePredecessor();
1903	if (!PreCondBB)
1904	return false;
1905	auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
1906	if (!PreCondBI \|\| PreCondBI->isUnconditional())
1907	return false;
1908
1909	Instruction *CntInst;
1910	PHINode *CntPhi;
1911	Value *Val;
1912	if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
1913	return false;
1914
1915	transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
1916	return true;
1917	}
1918
1919	static CallInst createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value Val,
1920	const DebugLoc &DL) {
1921	Value *Ops[] = {Val};
1922	Type *Tys[] = {Val->getType()};
1923
1924	Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1925	Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
1926	CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1927	CI->setDebugLoc(DL);
1928
1929	return CI;
1930	}
1931
1932	static CallInst createFFSIntrinsic(IRBuilder<> &IRBuilder, Value Val,
1933	const DebugLoc &DL, bool ZeroCheck,
1934	Intrinsic::ID IID) {
1935	Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
1936	Type *Tys[] = {Val->getType()};
1937
1938	Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
1939	Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
1940	CallInst *CI = IRBuilder.CreateCall(Func, Ops);
1941	CI->setDebugLoc(DL);
1942
1943	return CI;
1944	}
1945
1946	/// Transform the following loop (Using CTLZ, CTTZ is similar):
1947	/// loop:
1948	/// CntPhi = PHI [Cnt0, CntInst]
1949	/// PhiX = PHI [InitX, DefX]
1950	/// CntInst = CntPhi + 1
1951	/// DefX = PhiX >> 1
1952	/// LOOP_BODY
1953	/// Br: loop if (DefX != 0)
1954	/// Use(CntPhi) or Use(CntInst)
1955	///
1956	/// Into:
1957	/// If CntPhi used outside the loop:
1958	/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
1959	/// Count = CountPrev + 1
1960	/// else
1961	/// Count = BitWidth(InitX) - CTLZ(InitX)
1962	/// loop:
1963	/// CntPhi = PHI [Cnt0, CntInst]
1964	/// PhiX = PHI [InitX, DefX]
1965	/// PhiCount = PHI [Count, Dec]
1966	/// CntInst = CntPhi + 1
1967	/// DefX = PhiX >> 1
1968	/// Dec = PhiCount - 1
1969	/// LOOP_BODY
1970	/// Br: loop if (Dec != 0)
1971	/// Use(CountPrev + Cnt0) // Use(CntPhi)
1972	/// or
1973	/// Use(Count + Cnt0) // Use(CntInst)
1974	///
1975	/// If LOOP_BODY is empty the loop will be deleted.
1976	/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
1977	void LoopIdiomRecognize::transformLoopToCountable(
1978	Intrinsic::ID IntrinID, BasicBlock Preheader, Instruction CntInst,
1979	PHINode CntPhi, Value InitX, Instruction *DefX, const DebugLoc &DL,
1980	bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
1981	BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
1982
1983	// Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
1984	IRBuilder<> Builder(PreheaderBr);
1985	Builder.SetCurrentDebugLocation(DL);
1986
1987	// If there are no uses of CntPhi crate:
1988	// Count = BitWidth - CTLZ(InitX);
1989	// NewCount = Count;
1990	// If there are uses of CntPhi create:
1991	// NewCount = BitWidth - CTLZ(InitX >> 1);
1992	// Count = NewCount + 1;
1993	Value *InitXNext;
1994	if (IsCntPhiUsedOutsideLoop) {
1995	if (DefX->getOpcode() == Instruction::AShr)
1996	InitXNext = Builder.CreateAShr(InitX, 1);
1997	else if (DefX->getOpcode() == Instruction::LShr)
1998	InitXNext = Builder.CreateLShr(InitX, 1);
1999	else if (DefX->getOpcode() == Instruction::Shl) // cttz
2000	InitXNext = Builder.CreateShl(InitX, 1);
2001	else
2002	llvm_unreachable("Unexpected opcode!")::llvm::llvm_unreachable_internal("Unexpected opcode!", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 2002);
2003	} else
2004	InitXNext = InitX;
2005	Value *Count =
2006	createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
2007	Type *CountTy = Count->getType();
2008	Count = Builder.CreateSub(
2009	ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
2010	Value *NewCount = Count;
2011	if (IsCntPhiUsedOutsideLoop)
2012	Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
2013
2014	NewCount = Builder.CreateZExtOrTrunc(NewCount, CntInst->getType());
2015
2016	Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
2017	if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
2018	// If the counter was being incremented in the loop, add NewCount to the
2019	// counter's initial value, but only if the initial value is not zero.
2020	ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2021	if (!InitConst \|\| !InitConst->isZero())
2022	NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2023	} else {
2024	// If the count was being decremented in the loop, subtract NewCount from
2025	// the counter's initial value.
2026	NewCount = Builder.CreateSub(CntInitVal, NewCount);
2027	}
2028
2029	// Step 2: Insert new IV and loop condition:
2030	// loop:
2031	// ...
2032	// PhiCount = PHI [Count, Dec]
2033	// ...
2034	// Dec = PhiCount - 1
2035	// ...
2036	// Br: loop if (Dec != 0)
2037	BasicBlock Body = (CurLoop->block_begin());
2038	auto *LbBr = cast<BranchInst>(Body->getTerminator());
2039	ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2040
2041	PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi", &Body->front());
2042
2043	Builder.SetInsertPoint(LbCond);
2044	Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
2045	TcPhi, ConstantInt::get(CountTy, 1), "tcdec", false, true));
2046
2047	TcPhi->addIncoming(Count, Preheader);
2048	TcPhi->addIncoming(TcDec, Body);
2049
2050	CmpInst::Predicate Pred =
2051	(LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
2052	LbCond->setPredicate(Pred);
2053	LbCond->setOperand(0, TcDec);
2054	LbCond->setOperand(1, ConstantInt::get(CountTy, 0));
2055
2056	// Step 3: All the references to the original counter outside
2057	// the loop are replaced with the NewCount
2058	if (IsCntPhiUsedOutsideLoop)
2059	CntPhi->replaceUsesOutsideBlock(NewCount, Body);
2060	else
2061	CntInst->replaceUsesOutsideBlock(NewCount, Body);
2062
2063	// step 4: Forget the "non-computable" trip-count SCEV associated with the
2064	// loop. The loop would otherwise not be deleted even if it becomes empty.
2065	SE->forgetLoop(CurLoop);
2066	}
2067
2068	void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
2069	Instruction *CntInst,
2070	PHINode CntPhi, Value Var) {
2071	BasicBlock *PreHead = CurLoop->getLoopPreheader();
2072	auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
2073	const DebugLoc &DL = CntInst->getDebugLoc();
2074
2075	// Assuming before transformation, the loop is following:
2076	// if (x) // the precondition
2077	// do { cnt++; x &= x - 1; } while(x);
2078
2079	// Step 1: Insert the ctpop instruction at the end of the precondition block
2080	IRBuilder<> Builder(PreCondBr);
2081	Value PopCnt, PopCntZext, NewCount, TripCnt;
2082	{
2083	PopCnt = createPopcntIntrinsic(Builder, Var, DL);
2084	NewCount = PopCntZext =
2085	Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
2086
2087	if (NewCount != PopCnt)
2088	(cast<Instruction>(NewCount))->setDebugLoc(DL);
2089
2090	// TripCnt is exactly the number of iterations the loop has
2091	TripCnt = NewCount;
2092
2093	// If the population counter's initial value is not zero, insert Add Inst.
2094	Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
2095	ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
2096	if (!InitConst \|\| !InitConst->isZero()) {
2097	NewCount = Builder.CreateAdd(NewCount, CntInitVal);
2098	(cast<Instruction>(NewCount))->setDebugLoc(DL);
2099	}
2100	}
2101
2102	// Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
2103	// "if (NewCount == 0) loop-exit". Without this change, the intrinsic
2104	// function would be partial dead code, and downstream passes will drag
2105	// it back from the precondition block to the preheader.
2106	{
2107	ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
2108
2109	Value *Opnd0 = PopCntZext;
2110	Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
2111	if (PreCond->getOperand(0) != Var)
2112	std::swap(Opnd0, Opnd1);
2113
2114	ICmpInst *NewPreCond = cast<ICmpInst>(
2115	Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
2116	PreCondBr->setCondition(NewPreCond);
2117
2118	RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
2119	}
2120
2121	// Step 3: Note that the population count is exactly the trip count of the
2122	// loop in question, which enable us to convert the loop from noncountable
2123	// loop into a countable one. The benefit is twofold:
2124	//
2125	// - If the loop only counts population, the entire loop becomes dead after
2126	// the transformation. It is a lot easier to prove a countable loop dead
2127	// than to prove a noncountable one. (In some C dialects, an infinite loop
2128	// isn't dead even if it computes nothing useful. In general, DCE needs
2129	// to prove a noncountable loop finite before safely delete it.)
2130	//
2131	// - If the loop also performs something else, it remains alive.
2132	// Since it is transformed to countable form, it can be aggressively
2133	// optimized by some optimizations which are in general not applicable
2134	// to a noncountable loop.
2135	//
2136	// After this step, this loop (conceptually) would look like following:
2137	// newcnt = __builtin_ctpop(x);
2138	// t = newcnt;
2139	// if (x)
2140	// do { cnt++; x &= x-1; t--) } while (t > 0);
2141	BasicBlock Body = (CurLoop->block_begin());
2142	{
2143	auto *LbBr = cast<BranchInst>(Body->getTerminator());
2144	ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
2145	Type *Ty = TripCnt->getType();
2146
2147	PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
2148
2149	Builder.SetInsertPoint(LbCond);
2150	Instruction *TcDec = cast<Instruction>(
2151	Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
2152	"tcdec", false, true));
2153
2154	TcPhi->addIncoming(TripCnt, PreHead);
2155	TcPhi->addIncoming(TcDec, Body);
2156
2157	CmpInst::Predicate Pred =
2158	(LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
2159	LbCond->setPredicate(Pred);
2160	LbCond->setOperand(0, TcDec);
2161	LbCond->setOperand(1, ConstantInt::get(Ty, 0));
2162	}
2163
2164	// Step 4: All the references to the original population counter outside
2165	// the loop are replaced with the NewCount -- the value returned from
2166	// __builtin_ctpop().
2167	CntInst->replaceUsesOutsideBlock(NewCount, Body);
2168
2169	// step 5: Forget the "non-computable" trip-count SCEV associated with the
2170	// loop. The loop would otherwise not be deleted even if it becomes empty.
2171	SE->forgetLoop(CurLoop);
2172	}
2173
2174	/// Match loop-invariant value.
2175	template <typename SubPattern_t> struct match_LoopInvariant {
2176	SubPattern_t SubPattern;
2177	const Loop *L;
2178
2179	match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
2180	: SubPattern(SP), L(L) {}
2181
2182	template <typename ITy> bool match(ITy *V) {
2183	return L->isLoopInvariant(V) && SubPattern.match(V);
2184	}
2185	};
2186
2187	/// Matches if the value is loop-invariant.
2188	template <typename Ty>
2189	inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
2190	return match_LoopInvariant<Ty>(M, L);
2191	}
2192
2193	/// Return true if the idiom is detected in the loop.
2194	///
2195	/// The core idiom we are trying to detect is:
2196	/// \code
2197	/// entry:
2198	/// <...>
2199	/// %bitmask = shl i32 1, %bitpos
2200	/// br label %loop
2201	///
2202	/// loop:
2203	/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2204	/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2205	/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2206	/// %x.next = shl i32 %x.curr, 1
2207	/// <...>
2208	/// br i1 %x.curr.isbitunset, label %loop, label %end
2209	///
2210	/// end:
2211	/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2212	/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2213	/// <...>
2214	/// \endcode
2215	static bool detectShiftUntilBitTestIdiom(Loop CurLoop, Value &BaseX,
2216	Value &BitMask, Value &BitPos,
2217	Value &CurrX, Instruction &NextX) {
2218	LLVM_DEBUG(dbgs() << DEBUG_TYPEdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Performing shift-until-bittest idiom detection.\n" ; } } while (false)
2219	" Performing shift-until-bittest idiom detection.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Performing shift-until-bittest idiom detection.\n" ; } } while (false);
2220
2221	// Give up if the loop has multiple blocks or multiple backedges.
2222	if (CurLoop->getNumBlocks() != 1 \|\| CurLoop->getNumBackEdges() != 1) {
2223	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Bad block/backedge count.\n" ; } } while (false);
2224	return false;
2225	}
2226
2227	BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2228	BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2229	assert(LoopPreheaderBB && "There is always a loop preheader.")(static_cast <bool> (LoopPreheaderBB && "There is always a loop preheader." ) ? void (0) : __assert_fail ("LoopPreheaderBB && \"There is always a loop preheader.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 2229, __extension__ __PRETTY_FUNCTION__));
2230
2231	using namespace PatternMatch;
2232
2233	// Step 1: Check if the loop backedge is in desirable form.
2234
2235	ICmpInst::Predicate Pred;
2236	Value CmpLHS, CmpRHS;
2237	BasicBlock TrueBB, FalseBB;
2238	if (!match(LoopHeaderBB->getTerminator(),
2239	m_Br(m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)),
2240	m_BasicBlock(TrueBB), m_BasicBlock(FalseBB)))) {
2241	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Bad backedge structure.\n" ; } } while (false);
2242	return false;
2243	}
2244
2245	// Step 2: Check if the backedge's condition is in desirable form.
2246
2247	auto MatchVariableBitMask = [&]() {
2248	return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2249	match(CmpLHS,
2250	m_c_And(m_Value(CurrX),
2251	m_CombineAnd(
2252	m_Value(BitMask),
2253	m_LoopInvariant(m_Shl(m_One(), m_Value(BitPos)),
2254	CurLoop))));
2255	};
2256	auto MatchConstantBitMask = [&]() {
2257	return ICmpInst::isEquality(Pred) && match(CmpRHS, m_Zero()) &&
2258	match(CmpLHS, m_And(m_Value(CurrX),
2259	m_CombineAnd(m_Value(BitMask), m_Power2()))) &&
2260	(BitPos = ConstantExpr::getExactLogBase2(cast<Constant>(BitMask)));
2261	};
2262	auto MatchDecomposableConstantBitMask = [&]() {
2263	APInt Mask;
2264	return llvm::decomposeBitTestICmp(CmpLHS, CmpRHS, Pred, CurrX, Mask) &&
2265	ICmpInst::isEquality(Pred) && Mask.isPowerOf2() &&
2266	(BitMask = ConstantInt::get(CurrX->getType(), Mask)) &&
2267	(BitPos = ConstantInt::get(CurrX->getType(), Mask.logBase2()));
2268	};
2269
2270	if (!MatchVariableBitMask() && !MatchConstantBitMask() &&
2271	!MatchDecomposableConstantBitMask()) {
2272	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Bad backedge comparison.\n" ; } } while (false);
2273	return false;
2274	}
2275
2276	// Step 3: Check if the recurrence is in desirable form.
2277	auto *CurrXPN = dyn_cast<PHINode>(CurrX);
2278	if (!CurrXPN \|\| CurrXPN->getParent() != LoopHeaderBB) {
2279	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Not an expected PHI node.\n" ; } } while (false);
2280	return false;
2281	}
2282
2283	BaseX = CurrXPN->getIncomingValueForBlock(LoopPreheaderBB);
2284	NextX =
2285	dyn_cast<Instruction>(CurrXPN->getIncomingValueForBlock(LoopHeaderBB));
2286
2287	assert(CurLoop->isLoopInvariant(BaseX) &&(static_cast <bool> (CurLoop->isLoopInvariant(BaseX) && "Expected BaseX to be avaliable in the preheader!" ) ? void (0) : __assert_fail ("CurLoop->isLoopInvariant(BaseX) && \"Expected BaseX to be avaliable in the preheader!\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 2288, __extension__ __PRETTY_FUNCTION__))
2288	"Expected BaseX to be avaliable in the preheader!")(static_cast <bool> (CurLoop->isLoopInvariant(BaseX) && "Expected BaseX to be avaliable in the preheader!" ) ? void (0) : __assert_fail ("CurLoop->isLoopInvariant(BaseX) && \"Expected BaseX to be avaliable in the preheader!\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 2288, __extension__ __PRETTY_FUNCTION__));
2289
2290	if (!NextX \|\| !match(NextX, m_Shl(m_Specific(CurrX), m_One()))) {
2291	// FIXME: support right-shift?
2292	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Bad recurrence.\n" ; } } while (false);
2293	return false;
2294	}
2295
2296	// Step 4: Check if the backedge's destinations are in desirable form.
2297
2298	assert(ICmpInst::isEquality(Pred) &&(static_cast <bool> (ICmpInst::isEquality(Pred) && "Should only get equality predicates here.") ? void (0) : __assert_fail ("ICmpInst::isEquality(Pred) && \"Should only get equality predicates here.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 2299, __extension__ __PRETTY_FUNCTION__))
2299	"Should only get equality predicates here.")(static_cast <bool> (ICmpInst::isEquality(Pred) && "Should only get equality predicates here.") ? void (0) : __assert_fail ("ICmpInst::isEquality(Pred) && \"Should only get equality predicates here.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 2299, __extension__ __PRETTY_FUNCTION__));
2300
2301	// cmp-br is commutative, so canonicalize to a single variant.
2302	if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2303	Pred = ICmpInst::getInversePredicate(Pred);
2304	std::swap(TrueBB, FalseBB);
2305	}
2306
2307	// We expect to exit loop when comparison yields false,
2308	// so when it yields true we should branch back to loop header.
2309	if (TrueBB != LoopHeaderBB) {
2310	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Bad backedge flow.\n" ; } } while (false);
2311	return false;
2312	}
2313
2314	// Okay, idiom checks out.
2315	return true;
2316	}
2317
2318	/// Look for the following loop:
2319	/// \code
2320	/// entry:
2321	/// <...>
2322	/// %bitmask = shl i32 1, %bitpos
2323	/// br label %loop
2324	///
2325	/// loop:
2326	/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2327	/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2328	/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2329	/// %x.next = shl i32 %x.curr, 1
2330	/// <...>
2331	/// br i1 %x.curr.isbitunset, label %loop, label %end
2332	///
2333	/// end:
2334	/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2335	/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2336	/// <...>
2337	/// \endcode
2338	///
2339	/// And transform it into:
2340	/// \code
2341	/// entry:
2342	/// %bitmask = shl i32 1, %bitpos
2343	/// %lowbitmask = add i32 %bitmask, -1
2344	/// %mask = or i32 %lowbitmask, %bitmask
2345	/// %x.masked = and i32 %x, %mask
2346	/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
2347	/// i1 true)
2348	/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
2349	/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
2350	/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
2351	/// %tripcount = add i32 %backedgetakencount, 1
2352	/// %x.curr = shl i32 %x, %backedgetakencount
2353	/// %x.next = shl i32 %x, %tripcount
2354	/// br label %loop
2355	///
2356	/// loop:
2357	/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
2358	/// %loop.iv.next = add nuw i32 %loop.iv, 1
2359	/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
2360	/// <...>
2361	/// br i1 %loop.ivcheck, label %end, label %loop
2362	///
2363	/// end:
2364	/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2365	/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2366	/// <...>
2367	/// \endcode
2368	bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
2369	bool MadeChange = false;
2370
2371	Value X, BitMask, BitPos, XCurr;
2372	Instruction *XNext;
2373	if (!detectShiftUntilBitTestIdiom(CurLoop, X, BitMask, BitPos, XCurr,
2374	XNext)) {
2375	LLVM_DEBUG(dbgs() << DEBUG_TYPEdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " shift-until-bittest idiom detection failed.\n" ; } } while (false)
2376	" shift-until-bittest idiom detection failed.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " shift-until-bittest idiom detection failed.\n" ; } } while (false);
2377	return MadeChange;
2378	}
2379	LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " shift-until-bittest idiom detected!\n" ; } } while (false);
2380
2381	// Ok, it is the idiom we were looking for, we could transform this loop,
2382	// but is it profitable to transform?
2383
2384	BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2385	BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2386	assert(LoopPreheaderBB && "There is always a loop preheader.")(static_cast <bool> (LoopPreheaderBB && "There is always a loop preheader." ) ? void (0) : __assert_fail ("LoopPreheaderBB && \"There is always a loop preheader.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 2386, __extension__ __PRETTY_FUNCTION__));
2387
2388	BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2389	assert(SuccessorBB && "There is only a single successor.")(static_cast <bool> (SuccessorBB && "There is only a single successor." ) ? void (0) : __assert_fail ("SuccessorBB && \"There is only a single successor.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 2389, __extension__ __PRETTY_FUNCTION__));
2390
2391	IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2392	Builder.SetCurrentDebugLocation(cast<Instruction>(XCurr)->getDebugLoc());
2393
2394	Intrinsic::ID IntrID = Intrinsic::ctlz;
2395	Type *Ty = X->getType();
2396	unsigned Bitwidth = Ty->getScalarSizeInBits();
2397
2398	TargetTransformInfo::TargetCostKind CostKind =
2399	TargetTransformInfo::TCK_SizeAndLatency;
2400
2401	// The rewrite is considered to be unprofitable iff and only iff the
2402	// intrinsic/shift we'll use are not cheap. Note that we are okay with just
2403	// making the loop countable, even if nothing else changes.
2404	IntrinsicCostAttributes Attrs(
2405	IntrID, Ty, {UndefValue::get(Ty), /is_zero_undef=/Builder.getTrue()});
2406	InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2407	if (Cost > TargetTransformInfo::TCC_Basic) {
2408	LLVM_DEBUG(dbgs() << DEBUG_TYPEdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Intrinsic is too costly, not beneficial\n" ; } } while (false)
2409	" Intrinsic is too costly, not beneficial\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Intrinsic is too costly, not beneficial\n" ; } } while (false);
2410	return MadeChange;
2411	}
2412	if (TTI->getArithmeticInstrCost(Instruction::Shl, Ty, CostKind) >
2413	TargetTransformInfo::TCC_Basic) {
2414	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Shift is too costly, not beneficial\n" ; } } while (false);
2415	return MadeChange;
2416	}
2417
2418	// Ok, transform appears worthwhile.
2419	MadeChange = true;
2420
2421	// Step 1: Compute the loop trip count.
2422
2423	Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
2424	BitPos->getName() + ".lowbitmask");
2425	Value *Mask =
2426	Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
2427	Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
2428	CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
2429	IntrID, Ty, {XMasked, /is_zero_undef=/Builder.getTrue()},
2430	/FMFSource=/nullptr, XMasked->getName() + ".numleadingzeros");
2431	Value *XMaskedNumActiveBits = Builder.CreateSub(
2432	ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
2433	XMasked->getName() + ".numactivebits", /HasNUW=/true,
2434	/HasNSW=/Bitwidth != 2);
2435	Value *XMaskedLeadingOnePos =
2436	Builder.CreateAdd(XMaskedNumActiveBits, Constant::getAllOnesValue(Ty),
2437	XMasked->getName() + ".leadingonepos", /HasNUW=/false,
2438	/HasNSW=/Bitwidth > 2);
2439
2440	Value *LoopBackedgeTakenCount = Builder.CreateSub(
2441	BitPos, XMaskedLeadingOnePos, CurLoop->getName() + ".backedgetakencount",
2442	/HasNUW=/true, /HasNSW=/true);
2443	// We know loop's backedge-taken count, but what's loop's trip count?
2444	// Note that while NUW is always safe, while NSW is only for bitwidths != 2.
2445	Value *LoopTripCount =
2446	Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2447	CurLoop->getName() + ".tripcount", /HasNUW=/true,
2448	/HasNSW=/Bitwidth != 2);
2449
2450	// Step 2: Compute the recurrence's final value without a loop.
2451
2452	// NewX is always safe to compute, because `LoopBackedgeTakenCount`
2453	// will always be smaller than `bitwidth(X)`, i.e. we never get poison.
2454	Value *NewX = Builder.CreateShl(X, LoopBackedgeTakenCount);
2455	NewX->takeName(XCurr);
2456	if (auto *I = dyn_cast<Instruction>(NewX))
2457	I->copyIRFlags(XNext, /IncludeWrapFlags=/true);
2458
2459	Value *NewXNext;
2460	// Rewriting XNext is more complicated, however, because `X << LoopTripCount`
2461	// will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
2462	// iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
2463	// that isn't the case, we'll need to emit an alternative, safe IR.
2464	if (XNext->hasNoSignedWrap() \|\| XNext->hasNoUnsignedWrap() \|\|
2465	PatternMatch::match(
2466	BitPos, PatternMatch::m_SpecificInt_ICMP(
2467	ICmpInst::ICMP_NE, APInt(Ty->getScalarSizeInBits(),
2468	Ty->getScalarSizeInBits() - 1))))
2469	NewXNext = Builder.CreateShl(X, LoopTripCount);
2470	else {
2471	// Otherwise, just additionally shift by one. It's the smallest solution,
2472	// alternatively, we could check that NewX is INT_MIN (or BitPos is )
2473	// and select 0 instead.
2474	NewXNext = Builder.CreateShl(NewX, ConstantInt::get(Ty, 1));
2475	}
2476
2477	NewXNext->takeName(XNext);
2478	if (auto *I = dyn_cast<Instruction>(NewXNext))
2479	I->copyIRFlags(XNext, /IncludeWrapFlags=/true);
2480
2481	// Step 3: Adjust the successor basic block to recieve the computed
2482	// recurrence's final value instead of the recurrence itself.
2483
2484	XCurr->replaceUsesOutsideBlock(NewX, LoopHeaderBB);
2485	XNext->replaceUsesOutsideBlock(NewXNext, LoopHeaderBB);
2486
2487	// Step 4: Rewrite the loop into a countable form, with canonical IV.
2488
2489	// The new canonical induction variable.
2490	Builder.SetInsertPoint(&LoopHeaderBB->front());
2491	auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2492
2493	// The induction itself.
2494	// Note that while NUW is always safe, while NSW is only for bitwidths != 2.
2495	Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2496	auto *IVNext =
2497	Builder.CreateAdd(IV, ConstantInt::get(Ty, 1), IV->getName() + ".next",
2498	/HasNUW=/true, /HasNSW=/Bitwidth != 2);
2499
2500	// The loop trip count check.
2501	auto *IVCheck = Builder.CreateICmpEQ(IVNext, LoopTripCount,
2502	CurLoop->getName() + ".ivcheck");
2503	Builder.CreateCondBr(IVCheck, SuccessorBB, LoopHeaderBB);
2504	LoopHeaderBB->getTerminator()->eraseFromParent();
2505
2506	// Populate the IV PHI.
2507	IV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2508	IV->addIncoming(IVNext, LoopHeaderBB);
2509
2510	// Step 5: Forget the "non-computable" trip-count SCEV associated with the
2511	// loop. The loop would otherwise not be deleted even if it becomes empty.
2512
2513	SE->forgetLoop(CurLoop);
2514
2515	// Other passes will take care of actually deleting the loop if possible.
2516
2517	LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " shift-until-bittest idiom optimized!\n" ; } } while (false);
2518
2519	++NumShiftUntilBitTest;
2520	return MadeChange;
2521	}
2522
2523	/// Return true if the idiom is detected in the loop.
2524	///
2525	/// The core idiom we are trying to detect is:
2526	/// \code
2527	/// entry:
2528	/// <...>
2529	/// %start = <...>
2530	/// %extraoffset = <...>
2531	/// <...>
2532	/// br label %for.cond
2533	///
2534	/// loop:
2535	/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2536	/// %nbits = add nsw i8 %iv, %extraoffset
2537	/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2538	/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2539	/// %iv.next = add i8 %iv, 1
2540	/// <...>
2541	/// br i1 %val.shifted.iszero, label %end, label %loop
2542	///
2543	/// end:
2544	/// %iv.res = phi i8 [ %iv, %loop ] <...>
2545	/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2546	/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2547	/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2548	/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2549	/// <...>
2550	/// \endcode
2551	static bool detectShiftUntilZeroIdiom(Loop CurLoop, ScalarEvolution SE,
2552	Instruction *&ValShiftedIsZero,
2553	Intrinsic::ID &IntrinID, Instruction *&IV,
2554	Value &Start, Value &Val,
2555	const SCEV *&ExtraOffsetExpr,
2556	bool &InvertedCond) {
2557	LLVM_DEBUG(dbgs() << DEBUG_TYPEdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Performing shift-until-zero idiom detection.\n" ; } } while (false)
2558	" Performing shift-until-zero idiom detection.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Performing shift-until-zero idiom detection.\n" ; } } while (false);
2559
2560	// Give up if the loop has multiple blocks or multiple backedges.
2561	if (CurLoop->getNumBlocks() != 1 \|\| CurLoop->getNumBackEdges() != 1) {
2562	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Bad block/backedge count.\n" ; } } while (false);
2563	return false;
2564	}
2565
2566	Instruction ValShifted, NBits, *IVNext;
2567	Value *ExtraOffset;
2568
2569	BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2570	BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2571	assert(LoopPreheaderBB && "There is always a loop preheader.")(static_cast <bool> (LoopPreheaderBB && "There is always a loop preheader." ) ? void (0) : __assert_fail ("LoopPreheaderBB && \"There is always a loop preheader.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 2571, __extension__ __PRETTY_FUNCTION__));
2572
2573	using namespace PatternMatch;
2574
2575	// Step 1: Check if the loop backedge, condition is in desirable form.
2576
2577	ICmpInst::Predicate Pred;
2578	BasicBlock TrueBB, FalseBB;
2579	if (!match(LoopHeaderBB->getTerminator(),
2580	m_Br(m_Instruction(ValShiftedIsZero), m_BasicBlock(TrueBB),
2581	m_BasicBlock(FalseBB))) \|\|
2582	!match(ValShiftedIsZero,
2583	m_ICmp(Pred, m_Instruction(ValShifted), m_Zero())) \|\|
2584	!ICmpInst::isEquality(Pred)) {
2585	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Bad backedge structure.\n" ; } } while (false);
2586	return false;
2587	}
2588
2589	// Step 2: Check if the comparison's operand is in desirable form.
2590	// FIXME: Val could be a one-input PHI node, which we should look past.
2591	if (!match(ValShifted, m_Shift(m_LoopInvariant(m_Value(Val), CurLoop),
2592	m_Instruction(NBits)))) {
2593	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparisons value computation.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Bad comparisons value computation.\n" ; } } while (false);
2594	return false;
2595	}
2596	IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
2597	: Intrinsic::ctlz;
2598
2599	// Step 3: Check if the shift amount is in desirable form.
2600
2601	if (match(NBits, m_c_Add(m_Instruction(IV),
2602	m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2603	(NBits->hasNoSignedWrap() \|\| NBits->hasNoUnsignedWrap()))
2604	ExtraOffsetExpr = SE->getNegativeSCEV(SE->getSCEV(ExtraOffset));
2605	else if (match(NBits,
2606	m_Sub(m_Instruction(IV),
2607	m_LoopInvariant(m_Value(ExtraOffset), CurLoop))) &&
2608	NBits->hasNoSignedWrap())
2609	ExtraOffsetExpr = SE->getSCEV(ExtraOffset);
2610	else {
2611	IV = NBits;
2612	ExtraOffsetExpr = SE->getZero(NBits->getType());
2613	}
2614
2615	// Step 4: Check if the recurrence is in desirable form.
2616	auto *IVPN = dyn_cast<PHINode>(IV);
2617	if (!IVPN \|\| IVPN->getParent() != LoopHeaderBB) {
2618	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Not an expected PHI node.\n" ; } } while (false);
2619	return false;
2620	}
2621
2622	Start = IVPN->getIncomingValueForBlock(LoopPreheaderBB);
2623	IVNext = dyn_cast<Instruction>(IVPN->getIncomingValueForBlock(LoopHeaderBB));
2624
2625	if (!IVNext \|\| !match(IVNext, m_Add(m_Specific(IVPN), m_One()))) {
2626	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Bad recurrence.\n" ; } } while (false);
2627	return false;
2628	}
2629
2630	// Step 4: Check if the backedge's destinations are in desirable form.
2631
2632	assert(ICmpInst::isEquality(Pred) &&(static_cast <bool> (ICmpInst::isEquality(Pred) && "Should only get equality predicates here.") ? void (0) : __assert_fail ("ICmpInst::isEquality(Pred) && \"Should only get equality predicates here.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 2633, __extension__ __PRETTY_FUNCTION__))
2633	"Should only get equality predicates here.")(static_cast <bool> (ICmpInst::isEquality(Pred) && "Should only get equality predicates here.") ? void (0) : __assert_fail ("ICmpInst::isEquality(Pred) && \"Should only get equality predicates here.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 2633, __extension__ __PRETTY_FUNCTION__));
2634
2635	// cmp-br is commutative, so canonicalize to a single variant.
2636	InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
2637	if (InvertedCond) {
2638	Pred = ICmpInst::getInversePredicate(Pred);
	Value stored to 'Pred' is never read
2639	std::swap(TrueBB, FalseBB);
2640	}
2641
2642	// We expect to exit loop when comparison yields true,
2643	// so when it yields false we should branch back to loop header.
2644	if (FalseBB != LoopHeaderBB) {
2645	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Bad backedge flow.\n" ; } } while (false);
2646	return false;
2647	}
2648
2649	// The new, countable, loop will certainly only run a known number of
2650	// iterations, It won't be infinite. But the old loop might be infinite
2651	// under certain conditions. For logical shifts, the value will become zero
2652	// after at most bitwidth(%Val) loop iterations. However, for arithmetic
2653	// right-shift, iff the sign bit was set, the value will never become zero,
2654	// and the loop may never finish.
2655	if (ValShifted->getOpcode() == Instruction::AShr &&
2656	!isMustProgress(CurLoop) && !SE->isKnownNonNegative(SE->getSCEV(Val))) {
2657	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Can not prove the loop is finite.\n" ; } } while (false);
2658	return false;
2659	}
2660
2661	// Okay, idiom checks out.
2662	return true;
2663	}
2664
2665	/// Look for the following loop:
2666	/// \code
2667	/// entry:
2668	/// <...>
2669	/// %start = <...>
2670	/// %extraoffset = <...>
2671	/// <...>
2672	/// br label %for.cond
2673	///
2674	/// loop:
2675	/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
2676	/// %nbits = add nsw i8 %iv, %extraoffset
2677	/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
2678	/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
2679	/// %iv.next = add i8 %iv, 1
2680	/// <...>
2681	/// br i1 %val.shifted.iszero, label %end, label %loop
2682	///
2683	/// end:
2684	/// %iv.res = phi i8 [ %iv, %loop ] <...>
2685	/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
2686	/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
2687	/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
2688	/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
2689	/// <...>
2690	/// \endcode
2691	///
2692	/// And transform it into:
2693	/// \code
2694	/// entry:
2695	/// <...>
2696	/// %start = <...>
2697	/// %extraoffset = <...>
2698	/// <...>
2699	/// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
2700	/// %val.numactivebits = sub i8 8, %val.numleadingzeros
2701	/// %extraoffset.neg = sub i8 0, %extraoffset
2702	/// %tmp = add i8 %val.numactivebits, %extraoffset.neg
2703	/// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
2704	/// %loop.tripcount = sub i8 %iv.final, %start
2705	/// br label %loop
2706	///
2707	/// loop:
2708	/// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
2709	/// %loop.iv.next = add i8 %loop.iv, 1
2710	/// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
2711	/// %iv = add i8 %loop.iv, %start
2712	/// <...>
2713	/// br i1 %loop.ivcheck, label %end, label %loop
2714	///
2715	/// end:
2716	/// %iv.res = phi i8 [ %iv.final, %loop ] <...>
2717	/// <...>
2718	/// \endcode
2719	bool LoopIdiomRecognize::recognizeShiftUntilZero() {
2720	bool MadeChange = false;
2721
2722	Instruction *ValShiftedIsZero;
2723	Intrinsic::ID IntrID;
2724	Instruction *IV;
2725	Value Start, Val;
2726	const SCEV *ExtraOffsetExpr;
2727	bool InvertedCond;
2728	if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrID, IV,
2729	Start, Val, ExtraOffsetExpr, InvertedCond)) {
2730	LLVM_DEBUG(dbgs() << DEBUG_TYPEdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " shift-until-zero idiom detection failed.\n" ; } } while (false)
2731	" shift-until-zero idiom detection failed.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " shift-until-zero idiom detection failed.\n" ; } } while (false);
2732	return MadeChange;
2733	}
2734	LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " shift-until-zero idiom detected!\n" ; } } while (false);
2735
2736	// Ok, it is the idiom we were looking for, we could transform this loop,
2737	// but is it profitable to transform?
2738
2739	BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2740	BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2741	assert(LoopPreheaderBB && "There is always a loop preheader.")(static_cast <bool> (LoopPreheaderBB && "There is always a loop preheader." ) ? void (0) : __assert_fail ("LoopPreheaderBB && \"There is always a loop preheader.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 2741, __extension__ __PRETTY_FUNCTION__));
2742
2743	BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2744	assert(SuccessorBB && "There is only a single successor.")(static_cast <bool> (SuccessorBB && "There is only a single successor." ) ? void (0) : __assert_fail ("SuccessorBB && \"There is only a single successor.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp" , 2744, __extension__ __PRETTY_FUNCTION__));
2745
2746	IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2747	Builder.SetCurrentDebugLocation(IV->getDebugLoc());
2748
2749	Type *Ty = Val->getType();
2750	unsigned Bitwidth = Ty->getScalarSizeInBits();
2751
2752	TargetTransformInfo::TargetCostKind CostKind =
2753	TargetTransformInfo::TCK_SizeAndLatency;
2754
2755	// The rewrite is considered to be unprofitable iff and only iff the
2756	// intrinsic we'll use are not cheap. Note that we are okay with just
2757	// making the loop countable, even if nothing else changes.
2758	IntrinsicCostAttributes Attrs(
2759	IntrID, Ty, {UndefValue::get(Ty), /is_zero_undef=/Builder.getFalse()});
2760	InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
2761	if (Cost > TargetTransformInfo::TCC_Basic) {
2762	LLVM_DEBUG(dbgs() << DEBUG_TYPEdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Intrinsic is too costly, not beneficial\n" ; } } while (false)
2763	" Intrinsic is too costly, not beneficial\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " Intrinsic is too costly, not beneficial\n" ; } } while (false);
2764	return MadeChange;
2765	}
2766
2767	// Ok, transform appears worthwhile.
2768	MadeChange = true;
2769
2770	bool OffsetIsZero = false;
2771	if (auto *ExtraOffsetExprC = dyn_cast<SCEVConstant>(ExtraOffsetExpr))
2772	OffsetIsZero = ExtraOffsetExprC->isZero();
2773
2774	// Step 1: Compute the loop's final IV value / trip count.
2775
2776	CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
2777	IntrID, Ty, {Val, /is_zero_undef=/Builder.getFalse()},
2778	/FMFSource=/nullptr, Val->getName() + ".numleadingzeros");
2779	Value *ValNumActiveBits = Builder.CreateSub(
2780	ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
2781	Val->getName() + ".numactivebits", /HasNUW=/true,
2782	/HasNSW=/Bitwidth != 2);
2783
2784	SCEVExpander Expander(SE, DL, "loop-idiom");
2785	Expander.setInsertPoint(&*Builder.GetInsertPoint());
2786	Value *ExtraOffset = Expander.expandCodeFor(ExtraOffsetExpr);
2787
2788	Value *ValNumActiveBitsOffset = Builder.CreateAdd(
2789	ValNumActiveBits, ExtraOffset, ValNumActiveBits->getName() + ".offset",
2790	/HasNUW=/OffsetIsZero, /HasNSW=/true);
2791	Value *IVFinal = Builder.CreateIntrinsic(Intrinsic::smax, {Ty},
2792	{ValNumActiveBitsOffset, Start},
2793	/FMFSource=/nullptr, "iv.final");
2794
2795	auto *LoopBackedgeTakenCount = cast<Instruction>(Builder.CreateSub(
2796	IVFinal, Start, CurLoop->getName() + ".backedgetakencount",
2797	/HasNUW=/OffsetIsZero, /HasNSW=/true));
2798	// FIXME: or when the offset was `add nuw`
2799
2800	// We know loop's backedge-taken count, but what's loop's trip count?
2801	Value *LoopTripCount =
2802	Builder.CreateAdd(LoopBackedgeTakenCount, ConstantInt::get(Ty, 1),
2803	CurLoop->getName() + ".tripcount", /HasNUW=/true,
2804	/HasNSW=/Bitwidth != 2);
2805
2806	// Step 2: Adjust the successor basic block to recieve the original
2807	// induction variable's final value instead of the orig. IV itself.
2808
2809	IV->replaceUsesOutsideBlock(IVFinal, LoopHeaderBB);
2810
2811	// Step 3: Rewrite the loop into a countable form, with canonical IV.
2812
2813	// The new canonical induction variable.
2814	Builder.SetInsertPoint(&LoopHeaderBB->front());
2815	auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
2816
2817	// The induction itself.
2818	Builder.SetInsertPoint(LoopHeaderBB->getFirstNonPHI());
2819	auto *CIVNext =
2820	Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
2821	/HasNUW=/true, /HasNSW=/Bitwidth != 2);
2822
2823	// The loop trip count check.
2824	auto *CIVCheck = Builder.CreateICmpEQ(CIVNext, LoopTripCount,
2825	CurLoop->getName() + ".ivcheck");
2826	auto *NewIVCheck = CIVCheck;
2827	if (InvertedCond) {
2828	NewIVCheck = Builder.CreateNot(CIVCheck);
2829	NewIVCheck->takeName(ValShiftedIsZero);
2830	}
2831
2832	// The original IV, but rebased to be an offset to the CIV.
2833	auto IVDePHId = Builder.CreateAdd(CIV, Start, "", /HasNUW=*/false,
2834	/HasNSW=/true); // FIXME: what about NUW?
2835	IVDePHId->takeName(IV);
2836
2837	// The loop terminator.
2838	Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
2839	Builder.CreateCondBr(CIVCheck, SuccessorBB, LoopHeaderBB);
2840	LoopHeaderBB->getTerminator()->eraseFromParent();
2841
2842	// Populate the IV PHI.
2843	CIV->addIncoming(ConstantInt::get(Ty, 0), LoopPreheaderBB);
2844	CIV->addIncoming(CIVNext, LoopHeaderBB);
2845
2846	// Step 4: Forget the "non-computable" trip-count SCEV associated with the
2847	// loop. The loop would otherwise not be deleted even if it becomes empty.
2848
2849	SE->forgetLoop(CurLoop);
2850
2851	// Step 5: Try to cleanup the loop's body somewhat.
2852	IV->replaceAllUsesWith(IVDePHId);
2853	IV->eraseFromParent();
2854
2855	ValShiftedIsZero->replaceAllUsesWith(NewIVCheck);
2856	ValShiftedIsZero->eraseFromParent();
2857
2858	// Other passes will take care of actually deleting the loop if possible.
2859
2860	LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("loop-idiom")) { dbgs() << "loop-idiom" " shift-until-zero idiom optimized!\n" ; } } while (false);
2861
2862	++NumShiftUntilZero;
2863	return MadeChange;
2864	}