Bug Summary

File: /build/source/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Warning: line 8987, column 3
Use of memory after it is freed

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name LoopVectorize.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm -resource-dir /usr/lib/llvm-16/lib/clang/16 -I lib/Transforms/Vectorize -I /build/source/llvm/lib/Transforms/Vectorize -I include -I /build/source/llvm/include -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-16/lib/clang/16/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm=build-llvm -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm=build-llvm -fcoverage-prefix-map=/build/source/= -source-date-epoch 1671833309 -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-12-24-002659-1137794-1 -x c++ /build/source/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

/build/source/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/Proposal/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanHCFGBuilder.h"
61#include "VPlanTransforms.h"
62#include "llvm/ADT/APInt.h"
63#include "llvm/ADT/ArrayRef.h"
64#include "llvm/ADT/DenseMap.h"
65#include "llvm/ADT/DenseMapInfo.h"
66#include "llvm/ADT/Hashing.h"
67#include "llvm/ADT/MapVector.h"
68#include "llvm/ADT/STLExtras.h"
69#include "llvm/ADT/SmallPtrSet.h"
70#include "llvm/ADT/SmallSet.h"
71#include "llvm/ADT/SmallVector.h"
72#include "llvm/ADT/Statistic.h"
73#include "llvm/ADT/StringRef.h"
74#include "llvm/ADT/Twine.h"
75#include "llvm/ADT/iterator_range.h"
76#include "llvm/Analysis/AssumptionCache.h"
77#include "llvm/Analysis/BasicAliasAnalysis.h"
78#include "llvm/Analysis/BlockFrequencyInfo.h"
79#include "llvm/Analysis/CFG.h"
80#include "llvm/Analysis/CodeMetrics.h"
81#include "llvm/Analysis/DemandedBits.h"
82#include "llvm/Analysis/GlobalsModRef.h"
83#include "llvm/Analysis/LoopAccessAnalysis.h"
84#include "llvm/Analysis/LoopAnalysisManager.h"
85#include "llvm/Analysis/LoopInfo.h"
86#include "llvm/Analysis/LoopIterator.h"
87#include "llvm/Analysis/OptimizationRemarkEmitter.h"
88#include "llvm/Analysis/ProfileSummaryInfo.h"
89#include "llvm/Analysis/ScalarEvolution.h"
90#include "llvm/Analysis/ScalarEvolutionExpressions.h"
91#include "llvm/Analysis/TargetLibraryInfo.h"
92#include "llvm/Analysis/TargetTransformInfo.h"
93#include "llvm/Analysis/ValueTracking.h"
94#include "llvm/Analysis/VectorUtils.h"
95#include "llvm/IR/Attributes.h"
96#include "llvm/IR/BasicBlock.h"
97#include "llvm/IR/CFG.h"
98#include "llvm/IR/Constant.h"
99#include "llvm/IR/Constants.h"
100#include "llvm/IR/DataLayout.h"
101#include "llvm/IR/DebugInfoMetadata.h"
102#include "llvm/IR/DebugLoc.h"
103#include "llvm/IR/DerivedTypes.h"
104#include "llvm/IR/DiagnosticInfo.h"
105#include "llvm/IR/Dominators.h"
106#include "llvm/IR/Function.h"
107#include "llvm/IR/IRBuilder.h"
108#include "llvm/IR/InstrTypes.h"
109#include "llvm/IR/Instruction.h"
110#include "llvm/IR/Instructions.h"
111#include "llvm/IR/IntrinsicInst.h"
112#include "llvm/IR/Intrinsics.h"
113#include "llvm/IR/Metadata.h"
114#include "llvm/IR/Module.h"
115#include "llvm/IR/Operator.h"
116#include "llvm/IR/PatternMatch.h"
117#include "llvm/IR/Type.h"
118#include "llvm/IR/Use.h"
119#include "llvm/IR/User.h"
120#include "llvm/IR/Value.h"
121#include "llvm/IR/ValueHandle.h"
122#include "llvm/IR/Verifier.h"
123#include "llvm/InitializePasses.h"
124#include "llvm/Pass.h"
125#include "llvm/Support/Casting.h"
126#include "llvm/Support/CommandLine.h"
127#include "llvm/Support/Compiler.h"
128#include "llvm/Support/Debug.h"
129#include "llvm/Support/ErrorHandling.h"
130#include "llvm/Support/InstructionCost.h"
131#include "llvm/Support/MathExtras.h"
132#include "llvm/Support/raw_ostream.h"
133#include "llvm/Transforms/Utils/BasicBlockUtils.h"
134#include "llvm/Transforms/Utils/InjectTLIMappings.h"
135#include "llvm/Transforms/Utils/LoopSimplify.h"
136#include "llvm/Transforms/Utils/LoopUtils.h"
137#include "llvm/Transforms/Utils/LoopVersioning.h"
138#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
139#include "llvm/Transforms/Utils/SizeOpts.h"
140#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
141#include <algorithm>
142#include <cassert>
143#include <cmath>
144#include <cstdint>
145#include <functional>
146#include <iterator>
147#include <limits>
148#include <map>
149#include <memory>
150#include <string>
151#include <tuple>
152#include <utility>
153
154using namespace llvm;
155
156#define LV_NAME "loop-vectorize"
157#define DEBUG_TYPE LV_NAME
158
159#ifndef NDEBUG
160const char VerboseDebug[] = DEBUG_TYPE "-verbose";
161#endif
162
163/// @{
164/// Metadata attribute names
165const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
166const char LLVMLoopVectorizeFollowupVectorized[] =
167 "llvm.loop.vectorize.followup_vectorized";
168const char LLVMLoopVectorizeFollowupEpilogue[] =
169 "llvm.loop.vectorize.followup_epilogue";
170/// @}
171
172STATISTIC(LoopsVectorized, "Number of loops vectorized");
173STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
174STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
175
176static cl::opt<bool> EnableEpilogueVectorization(
177 "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
178 cl::desc("Enable vectorization of epilogue loops."));
179
180static cl::opt<unsigned> EpilogueVectorizationForceVF(
181 "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
182 cl::desc("When epilogue vectorization is enabled, and a value greater than "
183 "1 is specified, forces the given VF for all applicable epilogue "
184 "loops."));
185
186static cl::opt<unsigned> EpilogueVectorizationMinVF(
187 "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
188 cl::desc("Only loops with vectorization factor equal to or larger than "
189 "the specified value are considered for epilogue vectorization."));
190
191/// Loops with a known constant trip count below this number are vectorized only
192/// if no scalar iteration overheads are incurred.
193static cl::opt<unsigned> TinyTripCountVectorThreshold(
194 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
195 cl::desc("Loops with a constant trip count that is smaller than this "
196 "value are vectorized only if no scalar iteration overheads "
197 "are incurred."));
198
199static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
200 "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
201 cl::desc("The maximum allowed number of runtime memory checks"));
202
203// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
204// that predication is preferred, and this lists all options. I.e., the
205// vectorizer will try to fold the tail-loop (epilogue) into the vector body
206// and predicate the instructions accordingly. If tail-folding fails, there are
207// different fallback strategies depending on these values:
208namespace PreferPredicateTy {
209 enum Option {
210 ScalarEpilogue = 0,
211 PredicateElseScalarEpilogue,
212 PredicateOrDontVectorize
213 };
214} // namespace PreferPredicateTy
215
216static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
217 "prefer-predicate-over-epilogue",
218 cl::init(PreferPredicateTy::ScalarEpilogue),
219 cl::Hidden,
220 cl::desc("Tail-folding and predication preferences over creating a scalar "
221 "epilogue loop."),
222 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
223 "scalar-epilogue",
224 "Don't tail-predicate loops, create scalar epilogue"),
225 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
226 "predicate-else-scalar-epilogue",
227 "prefer tail-folding, create scalar epilogue if tail "
228 "folding fails."),
229 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
230 "predicate-dont-vectorize",
231 "prefers tail-folding, don't attempt vectorization if "
232 "tail-folding fails.")));
233
234static cl::opt<bool> MaximizeBandwidth(
235 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
236 cl::desc("Maximize bandwidth when selecting vectorization factor which "
237 "will be determined by the smallest type in the loop."));
238
239static cl::opt<bool> EnableInterleavedMemAccesses(
240 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
241 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
242
243/// An interleave-group may need masking if it resides in a block that needs
244/// predication, or in order to mask away gaps.
245static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
246 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
247 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
248
249static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
250 "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
251 cl::desc("We don't interleave loops with an estimated constant trip count "
252 "below this number"));
253
254static cl::opt<unsigned> ForceTargetNumScalarRegs(
255 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
256 cl::desc("A flag that overrides the target's number of scalar registers."));
257
258static cl::opt<unsigned> ForceTargetNumVectorRegs(
259 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
260 cl::desc("A flag that overrides the target's number of vector registers."));
261
262static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
263 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
264 cl::desc("A flag that overrides the target's max interleave factor for "
265 "scalar loops."));
266
267static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
268 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
269 cl::desc("A flag that overrides the target's max interleave factor for "
270 "vectorized loops."));
271
272static cl::opt<unsigned> ForceTargetInstructionCost(
273 "force-target-instruction-cost", cl::init(0), cl::Hidden,
274 cl::desc("A flag that overrides the target's expected cost for "
275 "an instruction to a single constant value. Mostly "
276 "useful for getting consistent testing."));
277
278static cl::opt<bool> ForceTargetSupportsScalableVectors(
279 "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
280 cl::desc(
281 "Pretend that scalable vectors are supported, even if the target does "
282 "not support them. This flag should only be used for testing."));
283
284static cl::opt<unsigned> SmallLoopCost(
285 "small-loop-cost", cl::init(20), cl::Hidden,
286 cl::desc(
287 "The cost of a loop that is considered 'small' by the interleaver."));
288
289static cl::opt<bool> LoopVectorizeWithBlockFrequency(
290 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
291 cl::desc("Enable the use of the block frequency analysis to access PGO "
292 "heuristics minimizing code growth in cold regions and being more "
293 "aggressive in hot regions."));
294
295// Runtime interleave loops for load/store throughput.
296static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
297 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
298 cl::desc(
299 "Enable runtime interleaving until load/store ports are saturated"));
300
301/// Interleave small loops with scalar reductions.
302static cl::opt<bool> InterleaveSmallLoopScalarReduction(
303 "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
304 cl::desc("Enable interleaving for loops with small iteration counts that "
305 "contain scalar reductions to expose ILP."));
306
307/// The number of stores in a loop that are allowed to need predication.
308static cl::opt<unsigned> NumberOfStoresToPredicate(
309 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
310 cl::desc("Max number of stores to be predicated behind an if."));
311
312static cl::opt<bool> EnableIndVarRegisterHeur(
313 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
314 cl::desc("Count the induction variable only once when interleaving"));
315
316static cl::opt<bool> EnableCondStoresVectorization(
317 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
318 cl::desc("Enable if predication of stores during vectorization."));
319
320static cl::opt<unsigned> MaxNestedScalarReductionIC(
321 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
322 cl::desc("The maximum interleave count to use when interleaving a scalar "
323 "reduction in a nested loop."));
324
325static cl::opt<bool>
326 PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
327 cl::Hidden,
328 cl::desc("Prefer in-loop vector reductions, "
329 "overriding the target's preference."));
330
331static cl::opt<bool> ForceOrderedReductions(
332 "force-ordered-reductions", cl::init(false), cl::Hidden,
333 cl::desc("Enable the vectorisation of loops with in-order (strict) "
334 "FP reductions"));
335
336static cl::opt<bool> PreferPredicatedReductionSelect(
337 "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
338 cl::desc(
339 "Prefer predicating a reduction operation over an after-loop select."));
340
341cl::opt<bool> EnableVPlanNativePath(
342 "enable-vplan-native-path", cl::init(false), cl::Hidden,
343 cl::desc("Enable VPlan-native vectorization path with "
344 "support for outer loop vectorization."));
345
346// This flag enables the stress testing of the VPlan H-CFG construction in the
347// VPlan-native vectorization path. It must be used in conjunction with
348// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
349// verification of the H-CFGs built.
350static cl::opt<bool> VPlanBuildStressTest(
351 "vplan-build-stress-test", cl::init(false), cl::Hidden,
352 cl::desc(
353 "Build VPlan for every supported loop nest in the function and bail "
354 "out right after the build (stress test the VPlan H-CFG construction "
355 "in the VPlan-native vectorization path)."));
356
357cl::opt<bool> llvm::EnableLoopInterleaving(
358 "interleave-loops", cl::init(true), cl::Hidden,
359 cl::desc("Enable loop interleaving in Loop vectorization passes"));
360cl::opt<bool> llvm::EnableLoopVectorization(
361 "vectorize-loops", cl::init(true), cl::Hidden,
362 cl::desc("Run the Loop vectorization passes"));
363
364static cl::opt<bool> PrintVPlansInDotFormat(
365 "vplan-print-in-dot-format", cl::Hidden,
366 cl::desc("Use dot format instead of plain text when dumping VPlans"));
367
368static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
369 "force-widen-divrem-via-safe-divisor", cl::Hidden,
370 cl::desc(
371 "Override cost based safe divisor widening for div/rem instructions"));
372
373/// A helper function that returns true if the given type is irregular. The
374/// type is irregular if its allocated size doesn't equal the store size of an
375/// element of the corresponding vector type.
376static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
377 // Determine if an array of N elements of type Ty is "bitcast compatible"
378 // with a <N x Ty> vector.
379 // This is only true if there is no padding between the array elements.
380 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
381}
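
For intuition, x86_fp80 is a classic irregular type under a typical x86-64 data layout: it stores 80 bits but is allocated in 128-bit slots, so an array of x86_fp80 has padding between elements and is not bitcast-compatible with a vector of x86_fp80. A standalone sketch (hypothetical driver code, not part of this file) that checks the same condition:

  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Type.h"

  // Sketch: with an explicit f80:128 entry in the layout string, the alloc
  // size of x86_fp80 is 128 bits while its store size is 80 bits, so
  // hasIrregularType would return true for it.
  static bool demoIrregularX86FP80() {
    llvm::LLVMContext Ctx;
    llvm::DataLayout DL("e-m:e-f80:128-n8:16:32:64-S128");
    llvm::Type *F80 = llvm::Type::getX86_FP80Ty(Ctx);
    return DL.getTypeAllocSizeInBits(F80) != DL.getTypeSizeInBits(F80);
  }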
382
383/// A helper function that returns the reciprocal of the block probability of
384/// predicated blocks. If we return X, we are assuming the predicated block
385/// will execute once for every X iterations of the loop header.
386///
387/// TODO: We should use actual block probability here, if available. Currently,
388/// we always assume predicated blocks have a 50% chance of executing.
389static unsigned getReciprocalPredBlockProb() { return 2; }
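
Concretely, the cost model divides a predicated block's cost by this value: with the hard-coded 2, a block is assumed to execute on every other header iteration. A sketch with hypothetical costs, as if written in this file:

  // Sketch: a predicated block whose body costs 8 contributes 8 / 2 = 4 to
  // the per-iteration loop cost under the assumed 50% execution probability.
  static unsigned scaledPredBlockCostDemo(unsigned BlockCost) {
    return BlockCost / getReciprocalPredBlockProb();
  }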
390
391/// A helper function that returns an integer or floating-point constant with
392/// value C.
393static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
394 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
395 : ConstantFP::get(Ty, C);
396}
397
398/// Returns "best known" trip count for the specified loop \p L as defined by
399/// the following procedure:
400/// 1) Returns exact trip count if it is known.
401/// 2) Returns expected trip count according to profile data if any.
402/// 3) Returns upper bound estimate if it is known.
403/// 4) Returns std::nullopt if all of the above failed.
404static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
405 Loop *L) {
406 // Check if exact trip count is known.
407 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
408 return ExpectedTC;
409
410 // Check if there is an expected trip count available from profile data.
411 if (LoopVectorizeWithBlockFrequency)
412 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
413 return *EstimatedTC;
414
415 // Check if upper bound estimate is known.
416 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
417 return ExpectedTC;
418
419 return std::nullopt;
420}
421
422// Forward declare GeneratedRTChecks.
423class GeneratedRTChecks;
424
425namespace llvm {
426
427AnalysisKey ShouldRunExtraVectorPasses::Key;
428
429/// InnerLoopVectorizer vectorizes loops which contain only one basic
430/// block to a specified vectorization factor (VF).
431/// This class performs the widening of scalars into vectors, or multiple
432/// scalars. This class also implements the following features:
433/// * It inserts an epilogue loop for handling loops that don't have iteration
434/// counts that are known to be a multiple of the vectorization factor.
435/// * It handles the code generation for reduction variables.
436/// * Scalarization (implementation using scalars) of un-vectorizable
437/// instructions.
438/// InnerLoopVectorizer does not perform any vectorization-legality
439/// checks, and relies on the caller to check for the different legality
440/// aspects. The InnerLoopVectorizer relies on the
441/// LoopVectorizationLegality class to provide information about the induction
442/// and reduction variables that were found to a given vectorization factor.
443class InnerLoopVectorizer {
444public:
445 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
446 LoopInfo *LI, DominatorTree *DT,
447 const TargetLibraryInfo *TLI,
448 const TargetTransformInfo *TTI, AssumptionCache *AC,
449 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
450 ElementCount MinProfitableTripCount,
451 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
452 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
453 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
454 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
455 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
456 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
457 PSI(PSI), RTChecks(RTChecks) {
458 // Query this against the original loop and save it here because the profile
459 // of the original loop header may change as the transformation happens.
460 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
461 OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
462
463 if (MinProfitableTripCount.isZero())
464 this->MinProfitableTripCount = VecWidth;
465 else
466 this->MinProfitableTripCount = MinProfitableTripCount;
467 }
468
469 virtual ~InnerLoopVectorizer() = default;
470
471 /// Create a new empty loop that will contain vectorized instructions later
472 /// on, while the old loop will be used as the scalar remainder. Control flow
473 /// is generated around the vectorized (and scalar epilogue) loops consisting
474 /// of various checks and bypasses. Return the pre-header block of the new
475 /// loop and the start value for the canonical induction, if it is != 0. The
476 /// latter is the case when vectorizing the epilogue loop. In the case of
477 /// epilogue vectorization, this function is overridden to handle the more
478 /// complex control flow around the loops.
479 virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
480
481 /// Fix the vectorized code, taking care of header phis, live-outs, and more.
482 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
483
484 // Return true if any runtime check is added.
485 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
486
487 /// A type for vectorized values in the new loop. Each value from the
488 /// original loop, when vectorized, is represented by UF vector values in the
489 /// new unrolled loop, where UF is the unroll factor.
490 using VectorParts = SmallVector<Value *, 2>;
491
492 /// A helper function to scalarize a single Instruction in the innermost loop.
493 /// Generates a sequence of scalar instances for each lane between \p MinLane
494 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
495 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
496 /// Instr's operands.
497 void scalarizeInstruction(const Instruction *Instr,
498 VPReplicateRecipe *RepRecipe,
499 const VPIteration &Instance, bool IfPredicateInstr,
500 VPTransformState &State);
501
502 /// Construct the vector value of a scalarized value \p V one lane at a time.
503 void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
504 VPTransformState &State);
505
506 /// Try to vectorize interleaved access group \p Group with the base address
507 /// given in \p Addr, optionally masking the vector operations if \p
508 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
509 /// values in the vectorized loop.
510 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
511 ArrayRef<VPValue *> VPDefs,
512 VPTransformState &State, VPValue *Addr,
513 ArrayRef<VPValue *> StoredValues,
514 VPValue *BlockInMask = nullptr);
515
516 /// Fix the non-induction PHIs in \p Plan.
517 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
518
519 /// Returns true if the reordering of FP operations is not allowed, but we are
520 /// able to vectorize with strict in-order reductions for the given RdxDesc.
521 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
522
523 /// Create a broadcast instruction. This method generates a broadcast
524 /// instruction (shuffle) for loop invariant values and for the induction
525 /// value. If this is the induction variable then we extend it to N, N+1, ...
526 /// this is needed because each iteration in the loop corresponds to a SIMD
527 /// element.
528 virtual Value *getBroadcastInstrs(Value *V);
529
530 // Returns the resume value (bc.merge.rdx) for a reduction as
531 // generated by fixReduction.
532 PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
533
534 /// Create a new phi node for the induction variable \p OrigPhi to resume
535 /// iteration count in the scalar epilogue, from where the vectorized loop
536 /// left off. In cases where the loop skeleton is more complicated (eg.
537 /// epilogue vectorization) and the resume values can come from an additional
538 /// bypass block, the \p AdditionalBypass pair provides information about the
539 /// bypass block and the end value on the edge from bypass to this loop.
540 PHINode *createInductionResumeValue(
541 PHINode *OrigPhi, const InductionDescriptor &ID,
542 ArrayRef<BasicBlock *> BypassBlocks,
543 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
544
545protected:
546 friend class LoopVectorizationPlanner;
547
548 /// A small list of PHINodes.
549 using PhiVector = SmallVector<PHINode *, 4>;
550
551 /// A type for scalarized values in the new loop. Each value from the
552 /// original loop, when scalarized, is represented by UF x VF scalar values
553 /// in the new unrolled loop, where UF is the unroll factor and VF is the
554 /// vectorization factor.
555 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
556
557 /// Set up the values of the IVs correctly when exiting the vector loop.
558 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
559 Value *VectorTripCount, Value *EndValue,
560 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
561 VPlan &Plan);
562
563 /// Handle all cross-iteration phis in the header.
564 void fixCrossIterationPHIs(VPTransformState &State);
565
566 /// Create the exit value of first order recurrences in the middle block and
567 /// update their users.
568 void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
569 VPTransformState &State);
570
571 /// Create code for the loop exit value of the reduction.
572 void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
573
574 /// Clear NSW/NUW flags from reduction instructions if necessary.
575 void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
576 VPTransformState &State);
577
578 /// Iteratively sink the scalarized operands of a predicated instruction into
579 /// the block that was created for it.
580 void sinkScalarOperands(Instruction *PredInst);
581
582 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
583 /// represented as.
584 void truncateToMinimalBitwidths(VPTransformState &State);
585
586 /// Returns (and creates if needed) the original loop trip count.
587 Value *getOrCreateTripCount(BasicBlock *InsertBlock);
588
589 /// Returns (and creates if needed) the trip count of the widened loop.
590 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
591
592 /// Returns a bitcasted value to the requested vector type.
593 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
594 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
595 const DataLayout &DL);
596
597 /// Emit a bypass check to see if the vector trip count is zero, including if
598 /// it overflows.
599 void emitIterationCountCheck(BasicBlock *Bypass);
600
601 /// Emit a bypass check to see if all of the SCEV assumptions we've
602 /// had to make are correct. Returns the block containing the checks or
603 /// nullptr if no checks have been added.
604 BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
605
606 /// Emit bypass checks to check any memory assumptions we may have made.
607 /// Returns the block containing the checks or nullptr if no checks have been
608 /// added.
609 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
610
611 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
612 /// vector loop preheader, middle block and scalar preheader.
613 void createVectorLoopSkeleton(StringRef Prefix);
614
615 /// Create new phi nodes for the induction variables to resume iteration count
616 /// in the scalar epilogue, from where the vectorized loop left off.
617 /// In cases where the loop skeleton is more complicated (eg. epilogue
618 /// vectorization) and the resume values can come from an additional bypass
619 /// block, the \p AdditionalBypass pair provides information about the bypass
620 /// block and the end value on the edge from bypass to this loop.
621 void createInductionResumeValues(
622 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
623
624 /// Complete the loop skeleton by adding debug MDs, creating appropriate
625 /// conditional branches in the middle block, preparing the builder and
626 /// running the verifier. Return the preheader of the completed vector loop.
627 BasicBlock *completeLoopSkeleton();
628
629 /// Collect poison-generating recipes that may generate a poison value that is
630 /// used after vectorization, even when their operands are not poison. Those
631 /// recipes meet the following conditions:
632 /// * Contribute to the address computation of a recipe generating a widen
633 /// memory load/store (VPWidenMemoryInstructionRecipe or
634 /// VPInterleaveRecipe).
635 /// * Such a widen memory load/store has at least one underlying Instruction
636 /// that is in a basic block that needs predication and after vectorization
637 /// the generated instruction won't be predicated.
638 void collectPoisonGeneratingRecipes(VPTransformState &State);
639
640 /// Allow subclasses to override and print debug traces before/after vplan
641 /// execution, when trace information is requested.
642 virtual void printDebugTracesAtStart() {}
643 virtual void printDebugTracesAtEnd() {}
644
645 /// The original loop.
646 Loop *OrigLoop;
647
648 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
649 /// dynamic knowledge to simplify SCEV expressions and converts them to a
650 /// more usable form.
651 PredicatedScalarEvolution &PSE;
652
653 /// Loop Info.
654 LoopInfo *LI;
655
656 /// Dominator Tree.
657 DominatorTree *DT;
658
659 /// Target Library Info.
660 const TargetLibraryInfo *TLI;
661
662 /// Target Transform Info.
663 const TargetTransformInfo *TTI;
664
665 /// Assumption Cache.
666 AssumptionCache *AC;
667
668 /// Interface to emit optimization remarks.
669 OptimizationRemarkEmitter *ORE;
670
671 /// The vectorization SIMD factor to use. Each vector will have this many
672 /// vector elements.
673 ElementCount VF;
674
675 ElementCount MinProfitableTripCount;
676
677 /// The vectorization unroll factor to use. Each scalar is vectorized to this
678 /// many different vector instructions.
679 unsigned UF;
680
681 /// The builder that we use
682 IRBuilder<> Builder;
683
684 // --- Vectorization state ---
685
686 /// The vector-loop preheader.
687 BasicBlock *LoopVectorPreHeader;
688
689 /// The scalar-loop preheader.
690 BasicBlock *LoopScalarPreHeader;
691
692 /// Middle Block between the vector and the scalar.
693 BasicBlock *LoopMiddleBlock;
694
695 /// The unique ExitBlock of the scalar loop if one exists. Note that
696 /// there can be multiple exiting edges reaching this block.
697 BasicBlock *LoopExitBlock;
698
699 /// The scalar loop body.
700 BasicBlock *LoopScalarBody;
701
702 /// A list of all bypass blocks. The first block is the entry of the loop.
703 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
704
705 /// Store instructions that were predicated.
706 SmallVector<Instruction *, 4> PredicatedInstructions;
707
708 /// Trip count of the original loop.
709 Value *TripCount = nullptr;
710
711 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
712 Value *VectorTripCount = nullptr;
713
714 /// The legality analysis.
715 LoopVectorizationLegality *Legal;
716
717 /// The profitablity analysis.
718 LoopVectorizationCostModel *Cost;
719
720 // Record whether runtime checks are added.
721 bool AddedSafetyChecks = false;
722
723 // Holds the end values for each induction variable. We save the end values
724 // so we can later fix up the external users of the induction variables.
725 DenseMap<PHINode *, Value *> IVEndValues;
726
727 /// BFI and PSI are used to check for profile guided size optimizations.
728 BlockFrequencyInfo *BFI;
729 ProfileSummaryInfo *PSI;
730
731 // Whether this loop should be optimized for size based on profile guided size
732 // optimizations.
733 bool OptForSizeBasedOnProfile;
734
735 /// Structure to hold information about generated runtime checks, responsible
736 /// for cleaning the checks, if vectorization turns out unprofitable.
737 GeneratedRTChecks &RTChecks;
738
739 // Holds the resume values for reductions in the loops, used to set the
740 // correct start value of reduction PHIs when vectorizing the epilogue.
741 SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
742 ReductionResumeValues;
743};
744
745class InnerLoopUnroller : public InnerLoopVectorizer {
746public:
747 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
748 LoopInfo *LI, DominatorTree *DT,
749 const TargetLibraryInfo *TLI,
750 const TargetTransformInfo *TTI, AssumptionCache *AC,
751 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
752 LoopVectorizationLegality *LVL,
753 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
754 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
755 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
756 ElementCount::getFixed(1),
757 ElementCount::getFixed(1), UnrollFactor, LVL, CM,
758 BFI, PSI, Check) {}
759
760private:
761 Value *getBroadcastInstrs(Value *V) override;
762};
763
764/// Encapsulate information regarding vectorization of a loop and its epilogue.
765/// This information is meant to be updated and used across two stages of
766/// epilogue vectorization.
767struct EpilogueLoopVectorizationInfo {
768 ElementCount MainLoopVF = ElementCount::getFixed(0);
769 unsigned MainLoopUF = 0;
770 ElementCount EpilogueVF = ElementCount::getFixed(0);
771 unsigned EpilogueUF = 0;
772 BasicBlock *MainLoopIterationCountCheck = nullptr;
773 BasicBlock *EpilogueIterationCountCheck = nullptr;
774 BasicBlock *SCEVSafetyCheck = nullptr;
775 BasicBlock *MemSafetyCheck = nullptr;
776 Value *TripCount = nullptr;
777 Value *VectorTripCount = nullptr;
778
779 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
780 ElementCount EVF, unsigned EUF)
781 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
782 assert(EUF == 1 &&
783 "A high UF for the epilogue loop is likely not beneficial.");
784 }
785};
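
A hypothetical instantiation for illustration (the factors are made up): a main loop vectorized at VF=8 with UF=2 and an epilogue vectorized at VF=4; EUF must be 1, per the assert in the constructor above.

  // Sketch (hypothetical factors, not part of this file):
  static EpilogueLoopVectorizationInfo makeDemoEPI() {
    return EpilogueLoopVectorizationInfo(ElementCount::getFixed(8), /*MUF=*/2,
                                         ElementCount::getFixed(4), /*EUF=*/1);
  }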
786
787/// An extension of the inner loop vectorizer that creates a skeleton for a
788/// vectorized loop that has its epilogue (residual) also vectorized.
789/// The idea is to run the vplan on a given loop twice, first to set up the
790/// skeleton and vectorize the main loop, and second to complete the skeleton
791/// from the first step and vectorize the epilogue. This is achieved by
792/// deriving two concrete strategy classes from this base class and invoking
793/// them in succession from the loop vectorizer planner.
794class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
795public:
796 InnerLoopAndEpilogueVectorizer(
797 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
798 DominatorTree *DT, const TargetLibraryInfo *TLI,
799 const TargetTransformInfo *TTI, AssumptionCache *AC,
800 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
801 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
802 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
803 GeneratedRTChecks &Checks)
804 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
805 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
806 CM, BFI, PSI, Checks),
807 EPI(EPI) {}
808
809 // Override this function to handle the more complex control flow around the
810 // three loops.
811 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton() final {
812 return createEpilogueVectorizedLoopSkeleton();
813 }
814
815 /// The interface for creating a vectorized skeleton using one of two
816 /// different strategies, each corresponding to one execution of the vplan
817 /// as described above.
818 virtual std::pair<BasicBlock *, Value *>
819 createEpilogueVectorizedLoopSkeleton() = 0;
820
821 /// Holds and updates state information required to vectorize the main loop
822 /// and its epilogue in two separate passes. This setup helps us avoid
823 /// regenerating and recomputing runtime safety checks. It also helps us to
824 /// shorten the iteration-count-check path length for the cases where the
825 /// iteration count of the loop is so small that the main vector loop is
826 /// completely skipped.
827 EpilogueLoopVectorizationInfo &EPI;
828};
829
830/// A specialized derived class of inner loop vectorizer that performs
831/// vectorization of *main* loops in the process of vectorizing loops and their
832/// epilogues.
833class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
834public:
835 EpilogueVectorizerMainLoop(
836 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
837 DominatorTree *DT, const TargetLibraryInfo *TLI,
838 const TargetTransformInfo *TTI, AssumptionCache *AC,
839 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
840 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
841 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
842 GeneratedRTChecks &Check)
843 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
844 EPI, LVL, CM, BFI, PSI, Check) {}
845 /// Implements the interface for creating a vectorized skeleton using the
846 /// *main loop* strategy (i.e. the first pass of vplan execution).
847 std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
848
849protected:
850 /// Emits an iteration count bypass check once for the main loop (when \p
851 /// ForEpilogue is false) and once for the epilogue loop (when \p
852 /// ForEpilogue is true).
853 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
854 void printDebugTracesAtStart() override;
855 void printDebugTracesAtEnd() override;
856};
857
858// A specialized derived class of inner loop vectorizer that performs
859// vectorization of *epilogue* loops in the process of vectorizing loops and
860// their epilogues.
861class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
862public:
863 EpilogueVectorizerEpilogueLoop(
864 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
865 DominatorTree *DT, const TargetLibraryInfo *TLI,
866 const TargetTransformInfo *TTI, AssumptionCache *AC,
867 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
868 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
869 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
870 GeneratedRTChecks &Checks)
871 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
872 EPI, LVL, CM, BFI, PSI, Checks) {
873 TripCount = EPI.TripCount;
874 }
875 /// Implements the interface for creating a vectorized skeleton using the
876 /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
877 std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
878
879protected:
880 /// Emits an iteration count bypass check after the main vector loop has
881 /// finished to see if there are any iterations left to execute by either
882 /// the vector epilogue or the scalar epilogue.
883 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
884 BasicBlock *Bypass,
885 BasicBlock *Insert);
886 void printDebugTracesAtStart() override;
887 void printDebugTracesAtEnd() override;
888};
889} // end namespace llvm
890
891/// Look for a meaningful debug location on the instruction or its
892/// operands.
893static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
894 if (!I)
895 return I;
896
897 DebugLoc Empty;
898 if (I->getDebugLoc() != Empty)
899 return I;
900
901 for (Use &Op : I->operands()) {
902 if (Instruction *OpInst = dyn_cast<Instruction>(Op))
903 if (OpInst->getDebugLoc() != Empty)
904 return OpInst;
905 }
906
907 return I;
908}
909
910/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
911/// is passed, the message relates to that particular instruction.
912#ifndef NDEBUG
913static void debugVectorizationMessage(const StringRef Prefix,
914 const StringRef DebugMsg,
915 Instruction *I) {
916 dbgs() << "LV: " << Prefix << DebugMsg;
917 if (I != nullptr)
918 dbgs() << " " << *I;
919 else
920 dbgs() << '.';
921 dbgs() << '\n';
922}
923#endif
924
925/// Create an analysis remark that explains why vectorization failed
926///
927/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
928/// RemarkName is the identifier for the remark. If \p I is passed it is an
929/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
930/// the location of the remark. \return the remark object that can be
931/// streamed to.
932static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
933 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
934 Value *CodeRegion = TheLoop->getHeader();
935 DebugLoc DL = TheLoop->getStartLoc();
936
937 if (I) {
938 CodeRegion = I->getParent();
939 // If there is no debug location attached to the instruction, fall back
940 // to using the loop's.
941 if (I->getDebugLoc())
942 DL = I->getDebugLoc();
943 }
944
945 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
946}
947
948namespace llvm {
949
950/// Return a value for Step multiplied by VF.
951Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
952 int64_t Step) {
953 assert(Ty->isIntegerTy() && "Expected an integer step");
954 Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
955 return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
956}
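
For example, with Ty = i64, a fixed VF of 4 and Step = 2 this returns the constant i64 8; with a scalable VF of <vscale x 4> it returns vscale * 8 via CreateVScale. A usage sketch (hypothetical helper, not part of this file):

  // Sketch: build both the fixed and the scalable form of a step value.
  static Value *demoVectorStep(IRBuilderBase &B) {
    Type *I64 = B.getInt64Ty();
    // Fixed VF = 4, Step = 2 -> constant i64 8.
    Value *FixedStep = createStepForVF(B, I64, ElementCount::getFixed(4), 2);
    // Scalable VF = <vscale x 4>, Step = 2 -> vscale * 8.
    Value *ScalableStep =
        createStepForVF(B, I64, ElementCount::getScalable(4), 2);
    return B.CreateAdd(FixedStep, ScalableStep); // arbitrary use of both
  }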
957
958/// Return the runtime value for VF.
959Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
960 Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
961 return VF.isScalable() ? B.CreateVScale(EC) : EC;
962}
963
964const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE) {
965 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
966 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
967
968 ScalarEvolution &SE = *PSE.getSE();
969
970 // The exit count might have the type of i64 while the phi is i32. This can
971 // happen if we have an induction variable that is sign extended before the
972 // compare. The only way that we get a backedge taken count is that the
973 // induction variable was signed and as such will not overflow. In such a case
974 // truncation is legal.
975 if (SE.getTypeSizeInBits(BackedgeTakenCount->getType()) >
976 IdxTy->getPrimitiveSizeInBits())
977 BackedgeTakenCount = SE.getTruncateOrNoop(BackedgeTakenCount, IdxTy);
978 BackedgeTakenCount = SE.getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
979
980 // Get the total trip count from the count by adding 1.
981 return SE.getAddExpr(BackedgeTakenCount,
982 SE.getOne(BackedgeTakenCount->getType()));
983}
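
For intuition (hypothetical loop): for (uint64_t i = 0; i < n; ++i) has a backedge-taken count of n - 1, so the trip count computed here is (n - 1) + 1 = n. Extending the backedge-taken count before adding 1 also keeps the +1 from wrapping when the count is the maximum value of the narrower type.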
984
985static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
986 ElementCount VF) {
987 assert(FTy->isFloatingPointTy() && "Expected floating point type!");
988 Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
989 Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
990 return B.CreateUIToFP(RuntimeVF, FTy);
991}
992
993void reportVectorizationFailure(const StringRef DebugMsg,
994 const StringRef OREMsg, const StringRef ORETag,
995 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
996 Instruction *I) {
997 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
998 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
999 ORE->emit(
1000 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1001 << "loop not vectorized: " << OREMsg);
1002}
1003
1004void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1005 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1006 Instruction *I) {
1007 LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1008 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1009 ORE->emit(
1010 createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1011 << Msg);
1012}
1013
1014} // end namespace llvm
1015
1016#ifndef NDEBUG
1017/// \return string containing a file name and a line # for the given loop.
1018static std::string getDebugLocString(const Loop *L) {
1019 std::string Result;
1020 if (L) {
1021 raw_string_ostream OS(Result);
1022 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1023 LoopDbgLoc.print(OS);
1024 else
1025 // Just print the module name.
1026 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1027 OS.flush();
1028 }
1029 return Result;
1030}
1031#endif
1032
1033void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1034 VPTransformState &State) {
1035
1036 // Collect recipes in the backward slice of `Root` that may generate a poison
1037 // value that is used after vectorization.
1038 SmallPtrSet<VPRecipeBase *, 16> Visited;
1039 auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1040 SmallVector<VPRecipeBase *, 16> Worklist;
1041 Worklist.push_back(Root);
1042
1043 // Traverse the backward slice of Root through its use-def chain.
1044 while (!Worklist.empty()) {
1045 VPRecipeBase *CurRec = Worklist.back();
1046 Worklist.pop_back();
1047
1048 if (!Visited.insert(CurRec).second)
1049 continue;
1050
1051 // Prune search if we find another recipe generating a widen memory
1052 // instruction. Widen memory instructions involved in address computation
1053 // will lead to gather/scatter instructions, which don't need to be
1054 // handled.
1055 if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1056 isa<VPInterleaveRecipe>(CurRec) ||
1057 isa<VPScalarIVStepsRecipe>(CurRec) ||
1058 isa<VPCanonicalIVPHIRecipe>(CurRec) ||
1059 isa<VPActiveLaneMaskPHIRecipe>(CurRec))
1060 continue;
1061
1062 // This recipe contributes to the address computation of a widen
1063 // load/store. Collect recipe if its underlying instruction has
1064 // poison-generating flags.
1065 Instruction *Instr = CurRec->getUnderlyingInstr();
1066 if (Instr && Instr->hasPoisonGeneratingFlags())
1067 State.MayGeneratePoisonRecipes.insert(CurRec);
1068
1069 // Add new definitions to the worklist.
1070 for (VPValue *operand : CurRec->operands())
1071 if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
1072 Worklist.push_back(OpDef);
1073 }
1074 });
1075
1076 // Traverse all the recipes in the VPlan and collect the poison-generating
1077 // recipes in the backward slice starting at the address of a
1078 // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
1079 auto Iter = depth_first(
1080 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
1081 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1082 for (VPRecipeBase &Recipe : *VPBB) {
1083 if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1084 Instruction &UnderlyingInstr = WidenRec->getIngredient();
1085 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
1086 if (AddrDef && WidenRec->isConsecutive() &&
1087 Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1088 collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1089 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1090 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
1091 if (AddrDef) {
1092 // Check if any member of the interleave group needs predication.
1093 const InterleaveGroup<Instruction> *InterGroup =
1094 InterleaveRec->getInterleaveGroup();
1095 bool NeedPredication = false;
1096 for (int I = 0, NumMembers = InterGroup->getNumMembers();
1097 I < NumMembers; ++I) {
1098 Instruction *Member = InterGroup->getMember(I);
1099 if (Member)
1100 NeedPredication |=
1101 Legal->blockNeedsPredication(Member->getParent());
1102 }
1103
1104 if (NeedPredication)
1105 collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1106 }
1107 }
1108 }
1109 }
1110}
1111
1112PHINode *InnerLoopVectorizer::getReductionResumeValue(
1113 const RecurrenceDescriptor &RdxDesc) {
1114 auto It = ReductionResumeValues.find(&RdxDesc);
1115 assert(It != ReductionResumeValues.end() &&
1116 "Expected to find a resume value for the reduction.");
1117 return It->second;
1118}
1119
1120namespace llvm {
1121
1122// Hints for the loop vectorization cost model about how the scalar epilogue
1123// loop should be lowered.
1124enum ScalarEpilogueLowering {
1125
1126 // The default: allowing scalar epilogues.
1127 CM_ScalarEpilogueAllowed,
1128
1129 // Vectorization with OptForSize: don't allow epilogues.
1130 CM_ScalarEpilogueNotAllowedOptSize,
1131
1132 // A special case of vectorization with OptForSize: loops with a very small
1133 // trip count are considered for vectorization under OptForSize, thereby
1134 // making sure the cost of their loop body is dominant, free of runtime
1135 // guards and scalar iteration overheads.
1136 CM_ScalarEpilogueNotAllowedLowTripLoop,
1137
1138 // Loop hint predicate indicating an epilogue is undesired.
1139 CM_ScalarEpilogueNotNeededUsePredicate,
1140
1141 // Directive indicating we must either tail-fold or not vectorize.
1142 CM_ScalarEpilogueNotAllowedUsePredicate
1143};
1144
1145/// ElementCountComparator creates a total ordering for ElementCount
1146/// for the purposes of using it in a set structure.
1147struct ElementCountComparator {
1148 bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1149 return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1150 std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1151 }
1152};
1153using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1154
1155/// LoopVectorizationCostModel - estimates the expected speedups due to
1156/// vectorization.
1157/// In many cases vectorization is not profitable. This can happen because of
1158/// a number of reasons. In this class we mainly attempt to predict the
1159/// expected speedup/slowdowns due to the supported instruction set. We use the
1160/// TargetTransformInfo to query the different backends for the cost of
1161/// different operations.
1162class LoopVectorizationCostModel {
1163public:
1164 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1165 PredicatedScalarEvolution &PSE, LoopInfo *LI,
1166 LoopVectorizationLegality *Legal,
1167 const TargetTransformInfo &TTI,
1168 const TargetLibraryInfo *TLI, DemandedBits *DB,
1169 AssumptionCache *AC,
1170 OptimizationRemarkEmitter *ORE, const Function *F,
1171 const LoopVectorizeHints *Hints,
1172 InterleavedAccessInfo &IAI)
1173 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1174 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1175 Hints(Hints), InterleaveInfo(IAI) {}
1176
1177 /// \return An upper bound for the vectorization factors (both fixed and
1178 /// scalable). If the factors are 0, vectorization and interleaving should be
1179 /// avoided up front.
1180 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1181
1182 /// \return True if runtime checks are required for vectorization, and false
1183 /// otherwise.
1184 bool runtimeChecksRequired();
1185
1186 /// \return The most profitable vectorization factor and the cost of that VF.
1187 /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1188 /// then this vectorization factor will be selected if vectorization is
1189 /// possible.
1190 VectorizationFactor
1191 selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1192
1193 VectorizationFactor
1194 selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1195 const LoopVectorizationPlanner &LVP);
1196
1197 /// Setup cost-based decisions for user vectorization factor.
1198 /// \return true if the UserVF is a feasible VF to be chosen.
1199 bool selectUserVectorizationFactor(ElementCount UserVF) {
1200 collectUniformsAndScalars(UserVF);
1201 collectInstsToScalarize(UserVF);
1202 return expectedCost(UserVF).first.isValid();
1203 }
1204
1205 /// \return The size (in bits) of the smallest and widest types in the code
1206 /// that needs to be vectorized. We ignore values that remain scalar such as
1207 /// 64 bit loop indices.
1208 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1209
1210 /// \return The desired interleave count.
1211 /// If interleave count has been specified by metadata it will be returned.
1212 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1213 /// are the selected vectorization factor and the cost of the selected VF.
1214 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1215
1216 /// A memory access instruction may be vectorized in more than one way.
1217 /// The form of the instruction after vectorization depends on cost.
1218 /// This function takes cost-based decisions for Load/Store instructions
1219 /// and collects them in a map. This decisions map is used for building
1220 /// the lists of loop-uniform and loop-scalar instructions.
1221 /// The calculated cost is saved with the widening decision in order to
1222 /// avoid redundant calculations.
1223 void setCostBasedWideningDecision(ElementCount VF);
1224
1225 /// A struct that represents some properties of the register usage
1226 /// of a loop.
1227 struct RegisterUsage {
1228 /// Holds the number of loop invariant values that are used in the loop.
1229 /// The key is ClassID of target-provided register class.
1230 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1231 /// Holds the maximum number of concurrent live intervals in the loop.
1232 /// The key is ClassID of target-provided register class.
1233 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1234 };
1235
1236 /// \return Returns information about the register usages of the loop for the
1237 /// given vectorization factors.
1238 SmallVector<RegisterUsage, 8>
1239 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1240
1241 /// Collect values we want to ignore in the cost model.
1242 void collectValuesToIgnore();
1243
1244 /// Collect all element types in the loop for which widening is needed.
1245 void collectElementTypesForWidening();
1246
1247 /// Split reductions into those that happen in the loop, and those that happen
1248 /// outside. In-loop reductions are collected into InLoopReductionChains.
1249 void collectInLoopReductions();
1250
1251 /// Returns true if we should use strict in-order reductions for the given
1252 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1253 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1254 /// of FP operations.
1255 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1256 return !Hints->allowReordering() && RdxDesc.isOrdered();
1257 }
1258
1259 /// \returns The smallest bitwidth each instruction can be represented with.
1260 /// The vector equivalents of these instructions should be truncated to this
1261 /// type.
1262 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1263 return MinBWs;
1264 }
1265
1266 /// \returns True if it is more profitable to scalarize instruction \p I for
1267 /// vectorization factor \p VF.
1268 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1269 assert(VF.isVector() &&
1270 "Profitable to scalarize relevant only for VF > 1.");
1271
1272 // Cost model is not run in the VPlan-native path - return conservative
1273 // result until this changes.
1274 if (EnableVPlanNativePath)
1275 return false;
1276
1277 auto Scalars = InstsToScalarize.find(VF);
1278 assert(Scalars != InstsToScalarize.end() &&
1279 "VF not yet analyzed for scalarization profitability");
1280 return Scalars->second.find(I) != Scalars->second.end();
1281 }
1282
1283 /// Returns true if \p I is known to be uniform after vectorization.
1284 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1285 if (VF.isScalar())
1286 return true;
1287
1288 // Cost model is not run in the VPlan-native path - return conservative
1289 // result until this changes.
1290 if (EnableVPlanNativePath)
1291 return false;
1292
1293 auto UniformsPerVF = Uniforms.find(VF);
1294 assert(UniformsPerVF != Uniforms.end() &&
1295 "VF not yet analyzed for uniformity");
1296 return UniformsPerVF->second.count(I);
1297 }
1298
1299 /// Returns true if \p I is known to be scalar after vectorization.
1300 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1301 if (VF.isScalar())
1302 return true;
1303
1304 // Cost model is not run in the VPlan-native path - return conservative
1305 // result until this changes.
1306 if (EnableVPlanNativePath)
1307 return false;
1308
1309 auto ScalarsPerVF = Scalars.find(VF);
1310 assert(ScalarsPerVF != Scalars.end() &&
1311 "Scalar values are not calculated for VF");
1312 return ScalarsPerVF->second.count(I);
1313 }
1314
1315 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1316 /// for vectorization factor \p VF.
1317 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1318 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1319 !isProfitableToScalarize(I, VF) &&
1320 !isScalarAfterVectorization(I, VF);
1321 }
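// A sketch (assumed example, not from this file) of a minimal-bitwidth
// candidate: C promotes the i8 operands to i32, but only the low 8 bits of
// the add are demanded, so MinBWs could map the add to 8 and the vector
// body can operate on <VF x i8> rather than <VF x i32>.
void narrow_add(unsigned char *d, const unsigned char *a,
                const unsigned char *b, int n) {
  for (int i = 0; i < n; ++i)
    d[i] = (unsigned char)(a[i] + b[i]); // i32 add, 8 demanded bits
}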
1322
1323 /// Decision that was taken during cost calculation for memory instruction.
1324 enum InstWidening {
1325 CM_Unknown,
1326 CM_Widen, // For consecutive accesses with stride +1.
1327 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1328 CM_Interleave,
1329 CM_GatherScatter,
1330 CM_Scalarize
1331 };
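// Rough intuition (illustrative only) for how access patterns map to these
// decisions for a load of a[f(i)] at vector width VF:
//   f(i) = i      -> CM_Widen          one wide consecutive load
//   f(i) = n - i  -> CM_Widen_Reverse  wide load plus a reverse shuffle
//   f(i) = 2 * i  -> CM_Interleave     wide loads plus de-interleaving shuffles
//   f(i) = idx[i] -> CM_GatherScatter  masked gather intrinsic
//   otherwise     -> CM_Scalarize      VF scalar loads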
1332
1333 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1334 /// instruction \p I and vector width \p VF.
1335 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1336 InstructionCost Cost) {
1337 assert(VF.isVector() && "Expected VF >=2");
1338 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1339 }
1340
1341 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1342 /// interleaving group \p Grp and vector width \p VF.
1343 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1344 ElementCount VF, InstWidening W,
1345 InstructionCost Cost) {
1346 assert(VF.isVector() && "Expected VF >=2");
1347 // Broadcast this decision to all instructions inside the group.
1348 // But the cost will be assigned to one instruction only.
1349 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1350 if (auto *I = Grp->getMember(i)) {
1351 if (Grp->getInsertPos() == I)
1352 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1353 else
1354 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1355 }
1356 }
1357 }
1358
1359 /// Return the cost model decision for the given instruction \p I and vector
1360 /// width \p VF. Return CM_Unknown if this instruction did not pass
1361 /// through the cost modeling.
1362 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1363 assert(VF.isVector() && "Expected VF to be a vector VF");
1364 // Cost model is not run in the VPlan-native path - return conservative
1365 // result until this changes.
1366 if (EnableVPlanNativePath)
1367 return CM_GatherScatter;
1368
1369 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1370 auto Itr = WideningDecisions.find(InstOnVF);
1371 if (Itr == WideningDecisions.end())
1372 return CM_Unknown;
1373 return Itr->second.first;
1374 }
1375
1376 /// Return the vectorization cost for the given instruction \p I and vector
1377 /// width \p VF.
1378 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1379 assert(VF.isVector() && "Expected VF >=2");
1380 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1381 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1382 "The cost is not calculated");
1383 return WideningDecisions[InstOnVF].second;
1384 }
1385
1386 /// Return True if instruction \p I is an optimizable truncate whose operand
1387 /// is an induction variable. Such a truncate will be removed by adding a new
1388 /// induction variable with the destination type.
1389 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1390 // If the instruction is not a truncate, return false.
1391 auto *Trunc = dyn_cast<TruncInst>(I);
1392 if (!Trunc)
1393 return false;
1394
1395 // Get the source and destination types of the truncate.
1396 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1397 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1398
1399 // If the truncate is free for the given types, return false. Replacing a
1400 // free truncate with an induction variable would add an induction variable
1401 // update instruction to each iteration of the loop. We exclude from this
1402 // check the primary induction variable since it will need an update
1403 // instruction regardless.
1404 Value *Op = Trunc->getOperand(0);
1405 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1406 return false;
1407
1408 // If the truncated value is not an induction variable, return false.
1409 return Legal->isInductionPhi(Op);
1410 }
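// A sketch of the source pattern this targets: the truncate of the wide
// induction variable below can be replaced by a second, narrower induction
// variable instead of truncating the wide IV on every iteration.
void trunc_iv(short *dst, long n) {
  for (long i = 0; i < n; ++i)
    dst[i] = (short)i; // trunc i64 %i to i16 -> optimizable IV truncate
}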
1411
1412 /// Collects the instructions to scalarize for each predicated instruction in
1413 /// the loop.
1414 void collectInstsToScalarize(ElementCount VF);
1415
1416 /// Collect Uniform and Scalar values for the given \p VF.
1417 /// The sets depend on CM decision for Load/Store instructions
1418 /// that may be vectorized as interleave, gather-scatter or scalarized.
1419 void collectUniformsAndScalars(ElementCount VF) {
1420 // Do the analysis once.
1421 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1422 return;
1423 setCostBasedWideningDecision(VF);
1424 collectLoopUniforms(VF);
1425 collectLoopScalars(VF);
1426 }
1427
1428 /// Returns true if the target machine supports masked store operation
1429 /// for the given \p DataType and kind of access to \p Ptr.
1430 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1431 return Legal->isConsecutivePtr(DataType, Ptr) &&
1432 TTI.isLegalMaskedStore(DataType, Alignment);
1433 }
1434
1435 /// Returns true if the target machine supports masked load operation
1436 /// for the given \p DataType and kind of access to \p Ptr.
1437 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1438 return Legal->isConsecutivePtr(DataType, Ptr) &&
1439 TTI.isLegalMaskedLoad(DataType, Alignment);
1440 }
1441
1442 /// Returns true if the target machine can represent \p V as a masked gather
1443 /// or scatter operation.
1444 bool isLegalGatherOrScatter(Value *V,
1445 ElementCount VF = ElementCount::getFixed(1)) {
1446 bool LI = isa<LoadInst>(V);
1447 bool SI = isa<StoreInst>(V);
1448 if (!LI && !SI)
1449 return false;
1450 auto *Ty = getLoadStoreType(V);
1451 Align Align = getLoadStoreAlignment(V);
1452 if (VF.isVector())
1453 Ty = VectorType::get(Ty, VF);
1454 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1455 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1456 }
1457
1458 /// Returns true if the target machine supports all of the reduction
1459 /// variables found for the given VF.
1460 bool canVectorizeReductions(ElementCount VF) const {
1461 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1462 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1463 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1464 }));
1465 }
1466
1467 /// Given costs for both strategies, return true if the scalar predication
1468 /// lowering should be used for div/rem. This incorporates an override
1469 /// option so it is not simply a cost comparison.
1470 bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1471 InstructionCost SafeDivisorCost) const {
1472 switch (ForceSafeDivisor) {
1473 case cl::BOU_UNSET:
1474 return ScalarCost < SafeDivisorCost;
1475 case cl::BOU_TRUE:
1476 return false;
1477 case cl::BOU_FALSE:
1478 return true;
1479 };
1480 llvm_unreachable("impossible case value");
1481 }
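// The two strategies being compared, sketched per lane for a predicated
// division (scalar pseudo-form; the safe-divisor variant never divides by
// zero, so the division itself no longer needs a branch):
//   scalarization: if (mask[i]) r[i] = a[i] / b[i];
//   safe divisor:  d = mask[i] ? b[i] : 1;
//                  q = a[i] / d;
//                  if (mask[i]) r[i] = q;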
1482
1483 /// Returns true if \p I is an instruction which requires predication and
1484 /// for which our chosen predication strategy is scalarization (i.e. we
1485 /// don't have an alternate strategy such as masking available).
1486 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1487 bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1488
1489 /// Returns true if \p I is an instruction that needs to be predicated
1490 /// at runtime. The result is independent of the predication mechanism.
1491 /// Superset of instructions that return true for isScalarWithPredication.
1492 bool isPredicatedInst(Instruction *I) const;
1493
1494 /// Return the costs for our two available strategies for lowering a
1495 /// div/rem operation which requires speculating at least one lane.
1496 /// First result is for scalarization (will be invalid for scalable
1497 /// vectors); second is for the safe-divisor strategy.
1498 std::pair<InstructionCost, InstructionCost>
1499 getDivRemSpeculationCost(Instruction *I,
1500 ElementCount VF) const;
1501
1502 /// Returns true if \p I is a memory instruction with consecutive memory
1503 /// access that can be widened.
1504 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1505
1506 /// Returns true if \p I is a memory instruction in an interleaved-group
1507 /// of memory accesses that can be vectorized with wide vector loads/stores
1508 /// and shuffles.
1509 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
1510
1511 /// Check if \p Instr belongs to any interleaved access group.
1512 bool isAccessInterleaved(Instruction *Instr) {
1513 return InterleaveInfo.isInterleaved(Instr);
1514 }
1515
1516 /// Get the interleaved access group that \p Instr belongs to.
1517 const InterleaveGroup<Instruction> *
1518 getInterleavedAccessGroup(Instruction *Instr) {
1519 return InterleaveInfo.getInterleaveGroup(Instr);
1520 }
1521
1522 /// Returns true if we're required to use a scalar epilogue for at least
1523 /// the final iteration of the original loop.
1524 bool requiresScalarEpilogue(ElementCount VF) const {
1525 if (!isScalarEpilogueAllowed())
1526 return false;
1527 // If we might exit from anywhere but the latch, must run the exiting
1528 // iteration in scalar form.
1529 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1530 return true;
1531 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1532 }
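// A sketch of one case that forces a scalar epilogue via InterleaveInfo: the
// stride-2 interleave group below has a gap (member b is never read), so the
// vector body must not speculatively load past the end of the array and the
// final iteration(s) run in the scalar loop.
struct Pair { int a, b; };
int sum_a(const Pair *p, int n) {
  int r = 0;
  for (int i = 0; i < n; ++i)
    r += p[i].a; // interleave group with a gap at member b
  return r;
}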
1533
1534 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1535 /// loop hint annotation.
1536 bool isScalarEpilogueAllowed() const {
1537 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1538 }
1539
1540 /// Returns true if all loop blocks should be masked to fold tail loop.
1541 bool foldTailByMasking() const { return FoldTailByMasking; }
1542
1543 /// Returns true if we're tail-folding and want to use the active lane mask
1544 /// for vector loop control flow.
1545 bool useActiveLaneMaskForControlFlow() const {
1546 return FoldTailByMasking &&
1547 TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
1548 }
1549
1550 /// Returns true if the instructions in this block requires predication
1551 /// for any reason, e.g. because tail folding now requires a predicate
1552 /// or because the block in the original loop was predicated.
1553 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1554 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1555 }
1556
1557 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1558 /// nodes to the chain of instructions representing the reductions. Uses a
1559 /// MapVector to ensure deterministic iteration order.
1560 using ReductionChainMap =
1561 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1562
1563 /// Return the chain of instructions representing an inloop reduction.
1564 const ReductionChainMap &getInLoopReductionChains() const {
1565 return InLoopReductionChains;
1566 }
1567
1568 /// Returns true if the Phi is part of an inloop reduction.
1569 bool isInLoopReduction(PHINode *Phi) const {
1570 return InLoopReductionChains.count(Phi);
1571 }
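// A sketch of a reduction whose PHI would appear in InLoopReductionChains if
// the cost model chooses to reduce inside the loop (e.g. via a horizontal
// add each iteration) instead of keeping a vector accumulator and reducing
// once after the loop:
int sum(const int *a, int n) {
  int s = 0;                 // reduction PHI
  for (int i = 0; i < n; ++i)
    s += a[i];               // the chain: the add feeding the PHI
  return s;
}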
1572
1573 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1574 /// with factor VF. Return the cost of the instruction, including
1575 /// scalarization overhead if it's needed.
1576 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1577
1578 /// Estimate cost of a call instruction CI if it were vectorized with factor
1579 /// VF. Return the cost of the instruction, including scalarization overhead
1580 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1581 /// scalarized, i.e. either a vector version isn't available or it is too
1582 /// expensive.
1583 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1584 bool &NeedToScalarize) const;
1585
1586 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1587 /// that of B.
1588 bool isMoreProfitable(const VectorizationFactor &A,
1589 const VectorizationFactor &B) const;
1590
1591 /// Invalidates decisions already taken by the cost model.
1592 void invalidateCostModelingDecisions() {
1593 WideningDecisions.clear();
1594 Uniforms.clear();
1595 Scalars.clear();
1596 }
1597
1598 /// Convenience function that returns the value of vscale_range iff
1599 /// vscale_range.min == vscale_range.max or otherwise returns the value
1600 /// returned by the corresponding TLI method.
1601 std::optional<unsigned> getVScaleForTuning() const;
1602
1603private:
1604 unsigned NumPredStores = 0;
1605
1606 /// \return An upper bound for the vectorization factors for both
1607 /// fixed and scalable vectorization, where the minimum-known number of
1608 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1609 /// disabled or unsupported, then the scalable part will be equal to
1610 /// ElementCount::getScalable(0).
1611 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1612 ElementCount UserVF,
1613 bool FoldTailByMasking);
1614
1615 /// \return the maximized element count based on the target's vector
1616 /// registers and the loop trip-count, but limited to a maximum safe VF.
1617 /// This is a helper function of computeFeasibleMaxVF.
1618 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1619 unsigned SmallestType,
1620 unsigned WidestType,
1621 ElementCount MaxSafeVF,
1622 bool FoldTailByMasking);
1623
1624 /// \return the maximum legal scalable VF, based on the safe max number
1625 /// of elements.
1626 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1627
1628 /// The vectorization cost is a combination of the cost itself and a boolean
1629 /// indicating whether any of the contributing operations will actually
1630 /// operate on vector values after type legalization in the backend. If this
1631 /// latter value is false, then all operations will be scalarized (i.e. no
1632 /// vectorization has actually taken place).
1633 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1634
1635 /// Returns the expected execution cost. The unit of the cost does
1636 /// not matter because we use the 'cost' units to compare different
1637 /// vector widths. The cost that is returned is *not* normalized by
1638 /// the factor width. If \p Invalid is not nullptr, this function
1639 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1640 /// each instruction that has an Invalid cost for the given VF.
1641 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1642 VectorizationCostTy
1643 expectedCost(ElementCount VF,
1644 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1645
1646 /// Returns the execution time cost of an instruction for a given vector
1647 /// width. Vector width of one means scalar.
1648 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1649
1650 /// The cost-computation logic from getInstructionCost which provides
1651 /// the vector type as an output parameter.
1652 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1653 Type *&VectorTy);
1654
1655 /// Return the cost of instructions in an inloop reduction pattern, if I is
1656 /// part of that pattern.
1657 std::optional<InstructionCost>
1658 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1659 TTI::TargetCostKind CostKind);
1660
1661 /// Calculate vectorization cost of memory instruction \p I.
1662 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1663
1664 /// The cost computation for scalarized memory instruction.
1665 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1666
1667 /// The cost computation for interleaving group of memory instructions.
1668 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1669
1670 /// The cost computation for Gather/Scatter instruction.
1671 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1672
1673 /// The cost computation for widening instruction \p I with consecutive
1674 /// memory access.
1675 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1676
1677 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1678 /// Load: scalar load + broadcast.
1679 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1680 /// element)
1681 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1682
1683 /// Estimate the overhead of scalarizing an instruction. This is a
1684 /// convenience wrapper for the type-based getScalarizationOverhead API.
1685 InstructionCost getScalarizationOverhead(Instruction *I,
1686 ElementCount VF) const;
1687
1688 /// Returns true if an artificially high cost for emulated masked memrefs
1689 /// should be used.
1690 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1691
1692 /// Map of scalar integer values to the smallest bitwidth they can be legally
1693 /// represented as. The vector equivalents of these values should be truncated
1694 /// to this type.
1695 MapVector<Instruction *, uint64_t> MinBWs;
1696
1697 /// A type representing the costs for instructions if they were to be
1698 /// scalarized rather than vectorized. The entries are Instruction-Cost
1699 /// pairs.
1700 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1701
1702 /// A map from VF to the set of BasicBlocks that are known to be present
1703 /// after vectorization as predicated blocks.
1704 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1705 PredicatedBBsAfterVectorization;
1706
1707 /// Records whether it is allowed to have the original scalar loop execute at
1708 /// least once. This may be needed as a fallback loop in case runtime
1709 /// aliasing/dependence checks fail, or to handle the tail/remainder
1710 /// iterations when the trip count is unknown or doesn't divide by the VF,
1711 /// or as a peel-loop to handle gaps in interleave-groups.
1712 /// Under optsize and when the trip count is very small we don't allow any
1713 /// iterations to execute in the scalar loop.
1714 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1715
1716 /// All blocks of the loop are to be masked to fold the tail of the scalar iterations.
1717 bool FoldTailByMasking = false;
1718
1719 /// A map holding scalar costs for different vectorization factors. The
1720 /// presence of a cost for an instruction in the mapping indicates that the
1721 /// instruction will be scalarized when vectorizing with the associated
1722 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1723 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1724
1725 /// Holds the instructions known to be uniform after vectorization.
1726 /// The data is collected per VF.
1727 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1728
1729 /// Holds the instructions known to be scalar after vectorization.
1730 /// The data is collected per VF.
1731 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1732
1733 /// Holds the instructions (address computations) that are forced to be
1734 /// scalarized.
1735 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1736
1737 /// PHINodes of the reductions that should be expanded in-loop along with
1738 /// their associated chains of reduction operations, in program order from top
1739 /// (PHI) to bottom.
1740 ReductionChainMap InLoopReductionChains;
1741
1742 /// A Map of inloop reduction operations and their immediate chain operand.
1743 /// FIXME: This can be removed once reductions can be costed correctly in
1744 /// vplan. This was added to allow quick lookup to the inloop operations,
1745 /// without having to loop through InLoopReductionChains.
1746 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1747
1748 /// Returns the expected difference in cost from scalarizing the expression
1749 /// feeding a predicated instruction \p PredInst. The instructions to
1750 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1751 /// non-negative return value implies the expression will be scalarized.
1752 /// Currently, only single-use chains are considered for scalarization.
1753 InstructionCost computePredInstDiscount(Instruction *PredInst,
1754 ScalarCostsTy &ScalarCosts,
1755 ElementCount VF);
1756
1757 /// Collect the instructions that are uniform after vectorization. An
1758 /// instruction is uniform if we represent it with a single scalar value in
1759 /// the vectorized loop corresponding to each vector iteration. Examples of
1760 /// uniform instructions include pointer operands of consecutive or
1761 /// interleaved memory accesses. Note that although uniformity implies an
1762 /// instruction will be scalar, the reverse is not true. In general, a
1763 /// scalarized instruction will be represented by VF scalar values in the
1764 /// vectorized loop, each corresponding to an iteration of the original
1765 /// scalar loop.
1766 void collectLoopUniforms(ElementCount VF);
1767
1768 /// Collect the instructions that are scalar after vectorization. An
1769 /// instruction is scalar if it is known to be uniform or will be scalarized
1770 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1771 /// to the list if they are used by a load/store instruction that is marked as
1772 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1773 /// VF values in the vectorized loop, each corresponding to an iteration of
1774 /// the original scalar loop.
1775 void collectLoopScalars(ElementCount VF);
1776
1777 /// Keeps cost model vectorization decision and cost for instructions.
1778 /// Right now it is used for memory instructions only.
1779 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1780 std::pair<InstWidening, InstructionCost>>;
1781
1782 DecisionList WideningDecisions;
1783
1784 /// Returns true if \p V is expected to be vectorized and it needs to be
1785 /// extracted.
1786 bool needsExtract(Value *V, ElementCount VF) const {
1787 Instruction *I = dyn_cast<Instruction>(V);
1788 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1789 TheLoop->isLoopInvariant(I))
1790 return false;
1791
1792 // Assume we can vectorize V (and hence we need extraction) if the
1793 // scalars are not computed yet. This can happen, because it is called
1794 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1795 // the scalars are collected. That should be a safe assumption in most
1796 // cases, because we check if the operands have vectorizable types
1797 // beforehand in LoopVectorizationLegality.
1798 return Scalars.find(VF) == Scalars.end() ||
1799 !isScalarAfterVectorization(I, VF);
1800 };
1801
1802 /// Returns a range containing only operands needing to be extracted.
1803 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1804 ElementCount VF) const {
1805 return SmallVector<Value *, 4>(make_filter_range(
1806 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1807 }
1808
1809 /// Determines if we have the infrastructure to vectorize loop \p L and its
1810 /// epilogue, assuming the main loop is vectorized by \p VF.
1811 bool isCandidateForEpilogueVectorization(const Loop &L,
1812 const ElementCount VF) const;
1813
1814 /// Returns true if epilogue vectorization is considered profitable, and
1815 /// false otherwise.
1816 /// \p VF is the vectorization factor chosen for the original loop.
1817 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1818
1819public:
1820 /// The loop that we evaluate.
1821 Loop *TheLoop;
1822
1823 /// Predicated scalar evolution analysis.
1824 PredicatedScalarEvolution &PSE;
1825
1826 /// Loop Info analysis.
1827 LoopInfo *LI;
1828
1829 /// Vectorization legality.
1830 LoopVectorizationLegality *Legal;
1831
1832 /// Vector target information.
1833 const TargetTransformInfo &TTI;
1834
1835 /// Target Library Info.
1836 const TargetLibraryInfo *TLI;
1837
1838 /// Demanded bits analysis.
1839 DemandedBits *DB;
1840
1841 /// Assumption cache.
1842 AssumptionCache *AC;
1843
1844 /// Interface to emit optimization remarks.
1845 OptimizationRemarkEmitter *ORE;
1846
1847 const Function *TheFunction;
1848
1849 /// Loop Vectorize Hint.
1850 const LoopVectorizeHints *Hints;
1851
1852 /// The interleave access information contains groups of interleaved accesses
1853 /// with the same stride and close to each other.
1854 InterleavedAccessInfo &InterleaveInfo;
1855
1856 /// Values to ignore in the cost model.
1857 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1858
1859 /// Values to ignore in the cost model when VF > 1.
1860 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1861
1862 /// All element types found in the loop.
1863 SmallPtrSet<Type *, 16> ElementTypesInLoop;
1864
1865 /// Profitable vector factors.
1866 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1867};
1868} // end namespace llvm
1869
1870/// Helper struct to manage generating runtime checks for vectorization.
1871///
1872 /// The runtime checks are created up-front in temporary blocks, un-linked
1873 /// from the existing IR, to allow better cost estimation. After deciding to
1874/// vectorize, the checks are moved back. If deciding not to vectorize, the
1875/// temporary blocks are completely removed.
1876class GeneratedRTChecks {
1877 /// Basic block which contains the generated SCEV checks, if any.
1878 BasicBlock *SCEVCheckBlock = nullptr;
1879
1880 /// The value representing the result of the generated SCEV checks. If it is
1881 /// nullptr, either no SCEV checks have been generated or they have been used.
1882 Value *SCEVCheckCond = nullptr;
1883
1884 /// Basic block which contains the generated memory runtime checks, if any.
1885 BasicBlock *MemCheckBlock = nullptr;
1886
1887 /// The value representing the result of the generated memory runtime checks.
1888 /// If it is nullptr, either no memory runtime checks have been generated or
1889 /// they have been used.
1890 Value *MemRuntimeCheckCond = nullptr;
1891
1892 DominatorTree *DT;
1893 LoopInfo *LI;
1894 TargetTransformInfo *TTI;
1895
1896 SCEVExpander SCEVExp;
1897 SCEVExpander MemCheckExp;
1898
1899 bool CostTooHigh = false;
1900
1901public:
1902 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1903 TargetTransformInfo *TTI, const DataLayout &DL)
1904 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1905 MemCheckExp(SE, DL, "scev.check") {}
1906
1907 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1908 /// accurately estimate the cost of the runtime checks. The blocks are
1909 /// un-linked from the IR and are added back during vector code generation. If
1910 /// there is no vector code generation, the check blocks are removed
1911 /// completely.
1912 void Create(Loop *L, const LoopAccessInfo &LAI,
1913 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1914
1915 // Hard cutoff to limit compile-time increase in case a very large number of
1916 // runtime checks needs to be generated.
1917 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1918 // profile info.
1919 CostTooHigh =
1920 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1921 if (CostTooHigh)
1922 return;
1923
1924 BasicBlock *LoopHeader = L->getHeader();
1925 BasicBlock *Preheader = L->getLoopPreheader();
1926
1927 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1928 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1929 // may be used by SCEVExpander. The blocks will be un-linked from their
1930 // predecessors and removed from LI & DT at the end of the function.
1931 if (!UnionPred.isAlwaysTrue()) {
1932 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1933 nullptr, "vector.scevcheck");
1934
1935 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1936 &UnionPred, SCEVCheckBlock->getTerminator());
1937 }
1938
1939 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1940 if (RtPtrChecking.Need) {
1941 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1942 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1943 "vector.memcheck");
1944
1945 auto DiffChecks = RtPtrChecking.getDiffChecks();
1946 if (DiffChecks) {
1947 Value *RuntimeVF = nullptr;
1948 MemRuntimeCheckCond = addDiffRuntimeChecks(
1949 MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1950 [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1951 if (!RuntimeVF)
1952 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1953 return RuntimeVF;
1954 },
1955 IC);
1956 } else {
1957 MemRuntimeCheckCond =
1958 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1959 RtPtrChecking.getChecks(), MemCheckExp);
1960 }
1961 assert(MemRuntimeCheckCond &&
1962 "no RT checks generated although RtPtrChecking "
1963 "claimed checks are required");
1964 }
1965
1966 if (!MemCheckBlock && !SCEVCheckBlock)
1967 return;
1968
1969 // Unhook the temporary block with the checks, update various places
1970 // accordingly.
1971 if (SCEVCheckBlock)
1972 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1973 if (MemCheckBlock)
1974 MemCheckBlock->replaceAllUsesWith(Preheader);
1975
1976 if (SCEVCheckBlock) {
1977 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1978 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1979 Preheader->getTerminator()->eraseFromParent();
1980 }
1981 if (MemCheckBlock) {
1982 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1983 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1984 Preheader->getTerminator()->eraseFromParent();
1985 }
1986
1987 DT->changeImmediateDominator(LoopHeader, Preheader);
1988 if (MemCheckBlock) {
1989 DT->eraseNode(MemCheckBlock);
1990 LI->removeBlock(MemCheckBlock);
1991 }
1992 if (SCEVCheckBlock) {
1993 DT->eraseNode(SCEVCheckBlock);
1994 LI->removeBlock(SCEVCheckBlock);
1995 }
1996 }
1997
1998 InstructionCost getCost() {
1999 if (SCEVCheckBlock || MemCheckBlock)
2000 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
2001
2002 if (CostTooHigh) {
2003 InstructionCost Cost;
2004 Cost.setInvalid();
2005 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
2006 return Cost;
2007 }
2008
2009 InstructionCost RTCheckCost = 0;
2010 if (SCEVCheckBlock)
2011 for (Instruction &I : *SCEVCheckBlock) {
2012 if (SCEVCheckBlock->getTerminator() == &I)
2013 continue;
2014 InstructionCost C =
2015 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2016 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2017 RTCheckCost += C;
2018 }
2019 if (MemCheckBlock)
2020 for (Instruction &I : *MemCheckBlock) {
2021 if (MemCheckBlock->getTerminator() == &I)
2022 continue;
2023 InstructionCost C =
2024 TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2025 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2026 RTCheckCost += C;
2027 }
2028
2029 if (SCEVCheckBlock || MemCheckBlock)
2030 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2031 << "\n");
2032
2033 return RTCheckCost;
2034 }
2035
2036 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2037 /// unused.
2038 ~GeneratedRTChecks() {
2039 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2040 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2041 if (!SCEVCheckCond)
2042 SCEVCleaner.markResultUsed();
2043
2044 if (!MemRuntimeCheckCond)
2045 MemCheckCleaner.markResultUsed();
2046
2047 if (MemRuntimeCheckCond) {
2048 auto &SE = *MemCheckExp.getSE();
2049 // Memory runtime check generation creates compares that use expanded
2050 // values. Remove them before running the SCEVExpanderCleaners.
2051 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2052 if (MemCheckExp.isInsertedInstruction(&I))
2053 continue;
2054 SE.forgetValue(&I);
2055 I.eraseFromParent();
2056 }
2057 }
2058 MemCheckCleaner.cleanup();
2059 SCEVCleaner.cleanup();
2060
2061 if (SCEVCheckCond)
2062 SCEVCheckBlock->eraseFromParent();
2063 if (MemRuntimeCheckCond)
2064 MemCheckBlock->eraseFromParent();
2065 }
2066
2067 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2068 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2069 /// depending on the generated condition.
2070 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2071 BasicBlock *LoopVectorPreHeader,
2072 BasicBlock *LoopExitBlock) {
2073 if (!SCEVCheckCond)
2074 return nullptr;
2075
2076 Value *Cond = SCEVCheckCond;
2077 // Mark the check as used, to prevent it from being removed during cleanup.
2078 SCEVCheckCond = nullptr;
2079 if (auto *C = dyn_cast<ConstantInt>(Cond))
2080 if (C->isZero())
2081 return nullptr;
2082
2083 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2084
2085 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2086 // Create new preheader for vector loop.
2087 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2088 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2089
2090 SCEVCheckBlock->getTerminator()->eraseFromParent();
2091 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2092 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2093 SCEVCheckBlock);
2094
2095 DT->addNewBlock(SCEVCheckBlock, Pred);
2096 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2097
2098 ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
2099 BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
2100 return SCEVCheckBlock;
2101 }
2102
2103 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2104 /// the branches to branch to the vector preheader or \p Bypass, depending on
2105 /// the generated condition.
2106 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2107 BasicBlock *LoopVectorPreHeader) {
2108 // Check if we generated code that checks in runtime if arrays overlap.
2109 if (!MemRuntimeCheckCond)
2110 return nullptr;
2111
2112 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2113 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2114 MemCheckBlock);
2115
2116 DT->addNewBlock(MemCheckBlock, Pred);
2117 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2118 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2119
2120 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2121 PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2122
2123 ReplaceInstWithInst(
2124 MemCheckBlock->getTerminator(),
2125 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2126 MemCheckBlock->getTerminator()->setDebugLoc(
2127 Pred->getTerminator()->getDebugLoc());
2128
2129 // Mark the check as used, to prevent it from being removed during cleanup.
2130 MemRuntimeCheckCond = nullptr;
2131 return MemCheckBlock;
2132 }
2133};
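// A sketch of the intended lifecycle, using only the members declared above
// (the actual driver code presumably appears later in this file):
//   GeneratedRTChecks Checks(SE, DT, LI, TTI, DL);
//   Checks.Create(L, LAI, UnionPred, VF, IC); // build checks off to the side
//   InstructionCost C = Checks.getCost();     // folded into the cost decision
//   // If vectorizing, wire the blocks back in:
//   Checks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
//   Checks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
//   // Otherwise the destructor erases the unused check blocks.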
2134
2135// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2136// vectorization. The loop needs to be annotated with #pragma omp simd
2137// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2138// vector length information is not provided, vectorization is not considered
2139// explicit. Interleave hints are not allowed either. These limitations will be
2140// relaxed in the future.
2141 // Please note that we are currently forced to abuse the pragma 'clang
2142// vectorize' semantics. This pragma provides *auto-vectorization hints*
2143// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2144// provides *explicit vectorization hints* (LV can bypass legal checks and
2145// assume that vectorization is legal). However, both hints are implemented
2146// using the same metadata (llvm.loop.vectorize, processed by
2147// LoopVectorizeHints). This will be fixed in the future when the native IR
2148// representation for pragma 'omp simd' is introduced.
2149static bool isExplicitVecOuterLoop(Loop *OuterLp,
2150 OptimizationRemarkEmitter *ORE) {
2151 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2152 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2153
2154 // Only outer loops with an explicit vectorization hint are supported.
2155 // Unannotated outer loops are ignored.
2156 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2157 return false;
2158
2159 Function *Fn = OuterLp->getHeader()->getParent();
2160 if (!Hints.allowVectorization(Fn, OuterLp,
2161 true /*VectorizeOnlyWhenForced*/)) {
2162 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2163 return false;
2164 }
2165
2166 if (Hints.getInterleave() > 1) {
2167 // TODO: Interleave support is future work.
2168 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2169 "outer loops.\n");
2170 Hints.emitRemarkWithHints();
2171 return false;
2172 }
2173
2174 return true;
2175}
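// An outer loop shaped like the ones this accepts (an explicit vector length
// is required; illustrative example, not from this file):
void scale_rows(float **rows, int m, int n) {
  #pragma clang loop vectorize(enable) vectorize_width(4)
  for (int i = 0; i < m; ++i)   // annotated outer loop, handled by VPlan
    for (int j = 0; j < n; ++j) // inner loop
      rows[i][j] *= 2.0f;
}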
2176
2177static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2178 OptimizationRemarkEmitter *ORE,
2179 SmallVectorImpl<Loop *> &V) {
2180 // Collect inner loops and outer loops without irreducible control flow. For
2181 // now, only collect outer loops that have explicit vectorization hints. If we
2182 // are stress testing the VPlan H-CFG construction, we collect the outermost
2183 // loop of every loop nest.
2184 if (L.isInnermost() || VPlanBuildStressTest ||
2185 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2186 LoopBlocksRPO RPOT(&L);
2187 RPOT.perform(LI);
2188 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2189 V.push_back(&L);
2190 // TODO: Collect inner loops inside marked outer loops in case
2191 // vectorization fails for the outer loop. Do not invoke
2192 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2193 // already known to be reducible. We can use an inherited attribute for
2194 // that.
2195 return;
2196 }
2197 }
2198 for (Loop *InnerL : L)
2199 collectSupportedLoops(*InnerL, LI, ORE, V);
2200}
2201
2202namespace {
2203
2204/// The LoopVectorize Pass.
2205struct LoopVectorize : public FunctionPass {
2206 /// Pass identification, replacement for typeid
2207 static char ID;
2208
2209 LoopVectorizePass Impl;
2210
2211 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2212 bool VectorizeOnlyWhenForced = false)
2213 : FunctionPass(ID),
2214 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2215 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2216 }
2217
2218 bool runOnFunction(Function &F) override {
2219 if (skipFunction(F))
2220 return false;
2221
2222 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2223 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2224 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2225 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2226 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2227 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2228 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2229 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2230 auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs();
2231 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2232 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2233 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2234
2235 return Impl
2236 .runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AC, LAIs, *ORE, PSI)
2237 .MadeAnyChange;
2238 }
2239
2240 void getAnalysisUsage(AnalysisUsage &AU) const override {
2241 AU.addRequired<AssumptionCacheTracker>();
2242 AU.addRequired<BlockFrequencyInfoWrapperPass>();
2243 AU.addRequired<DominatorTreeWrapperPass>();
2244 AU.addRequired<LoopInfoWrapperPass>();
2245 AU.addRequired<ScalarEvolutionWrapperPass>();
2246 AU.addRequired<TargetTransformInfoWrapperPass>();
2247 AU.addRequired<LoopAccessLegacyAnalysis>();
2248 AU.addRequired<DemandedBitsWrapperPass>();
2249 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2250 AU.addRequired<InjectTLIMappingsLegacy>();
2251
2252 // We currently do not preserve loopinfo/dominator analyses with outer loop
2253 // vectorization. Until this is addressed, mark these analyses as preserved
2254 // only for non-VPlan-native path.
2255 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2256 if (!EnableVPlanNativePath) {
2257 AU.addPreserved<LoopInfoWrapperPass>();
2258 AU.addPreserved<DominatorTreeWrapperPass>();
2259 }
2260
2261 AU.addPreserved<BasicAAWrapperPass>();
2262 AU.addPreserved<GlobalsAAWrapperPass>();
2263 AU.addRequired<ProfileSummaryInfoWrapperPass>();
2264 }
2265};
2266
2267} // end anonymous namespace
2268
2269//===----------------------------------------------------------------------===//
2270// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2271// LoopVectorizationCostModel and LoopVectorizationPlanner.
2272//===----------------------------------------------------------------------===//
2273
2274Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2275 // We need to place the broadcast of invariant variables outside the loop,
2276 // but only if it's proven safe to do so. Otherwise, the broadcast will be
2277 // inside the vector loop body.
2278 Instruction *Instr = dyn_cast<Instruction>(V);
2279 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2280 (!Instr ||
2281 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2282 // Place the code for broadcasting invariant variables in the new preheader.
2283 IRBuilder<>::InsertPointGuard Guard(Builder);
2284 if (SafeToHoist)
2285 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2286
2287 // Broadcast the scalar into all locations in the vector.
2288 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2289
2290 return Shuf;
2291}
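
// Illustrative model, not part of LoopVectorize.cpp: the splat created above
// yields VF copies of the invariant scalar, hoisted into the preheader when
// safe. A standalone sketch on plain std::vector with a hypothetical fixed VF:
#include <vector>
static std::vector<int> broadcastScalarModel(int V, unsigned VF) {
  return std::vector<int>(VF, V); // one copy of V per vector lane
}
// e.g. broadcastScalarModel(7, 4) == {7, 7, 7, 7}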
2292
2293/// This function adds
2294/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
2295 /// to each vector element of Val. The sequence starts at StartIdx.
2296 /// \p BinOp is relevant for FP induction variables.
2297static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2298 Instruction::BinaryOps BinOp, ElementCount VF,
2299 IRBuilderBase &Builder) {
2300 assert(VF.isVector() && "only vector VFs are supported");
2301
2302 // Create and check the types.
2303 auto *ValVTy = cast<VectorType>(Val->getType());
2304 ElementCount VLen = ValVTy->getElementCount();
2305
2306 Type *STy = Val->getType()->getScalarType();
2307 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2308 "Induction Step must be an integer or FP");
2309 assert(Step->getType() == STy && "Step has wrong type");
2310
2311 SmallVector<Constant *, 8> Indices;
2312
2313 // Create a vector of consecutive numbers from zero to VF.
2314 VectorType *InitVecValVTy = ValVTy;
2315 if (STy->isFloatingPointTy()) {
2316 Type *InitVecValSTy =
2317 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2318 InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2319 }
2320 Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2321
2322 // Splat the StartIdx
2323 Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2324
2325 if (STy->isIntegerTy()) {
2326 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2327 Step = Builder.CreateVectorSplat(VLen, Step);
2328 assert(Step->getType() == Val->getType() && "Invalid step vec");
2329 // FIXME: The newly created binary instructions should contain nsw/nuw
2330 // flags, which can be found from the original scalar operations.
2331 Step = Builder.CreateMul(InitVec, Step);
2332 return Builder.CreateAdd(Val, Step, "induction");
2333 }
2334
2335 // Floating point induction.
2336 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2337 "Binary Opcode should be specified for FP induction");
2338 InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2339 InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2340
2341 Step = Builder.CreateVectorSplat(VLen, Step);
2342 Value *MulOp = Builder.CreateFMul(InitVec, Step);
2343 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2344}
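
// Illustrative model, not part of LoopVectorize.cpp: a standalone scalar
// sketch of the integer path above. Lane L of the result is
// Val[L] + (StartIdx + L) * Step, mirroring the step-vector, splat, mul and
// add sequence built with IRBuilder.
#include <cstdint>
#include <vector>
static std::vector<int64_t> stepVectorModel(const std::vector<int64_t> &Val,
                                            int64_t StartIdx, int64_t Step) {
  std::vector<int64_t> Induction(Val.size());
  for (size_t Lane = 0; Lane < Val.size(); ++Lane)
    Induction[Lane] = Val[Lane] + (StartIdx + (int64_t)Lane) * Step;
  return Induction;
}
// e.g. stepVectorModel({0, 0, 0, 0}, /*StartIdx=*/4, /*Step=*/2)
//      == {8, 10, 12, 14}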
2345
2346/// Compute scalar induction steps. \p ScalarIV is the scalar induction
2347/// variable on which to base the steps, \p Step is the size of the step.
2348static void buildScalarSteps(Value *ScalarIV, Value *Step,
2349 const InductionDescriptor &ID, VPValue *Def,
2350 VPTransformState &State) {
2351 IRBuilderBase &Builder = State.Builder;
2352
2353 // Ensure step has the same type as that of scalar IV.
2354 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2355 if (ScalarIVTy != Step->getType()) {
2356 // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
2357 // avoid separate truncate here.
2358 assert(Step->getType()->isIntegerTy() &&
2359 "Truncation requires an integer step");
2360 Step = State.Builder.CreateTrunc(Step, ScalarIVTy);
2361 }
2362
2363 // We build scalar steps for both integer and floating-point induction
2364 // variables. Here, we determine the kind of arithmetic we will perform.
2365 Instruction::BinaryOps AddOp;
2366 Instruction::BinaryOps MulOp;
2367 if (ScalarIVTy->isIntegerTy()) {
2368 AddOp = Instruction::Add;
2369 MulOp = Instruction::Mul;
2370 } else {
2371 AddOp = ID.getInductionOpcode();
2372 MulOp = Instruction::FMul;
2373 }
2374
2375 // Determine the number of scalars we need to generate for each unroll
2376 // iteration.
2377 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2378 // Compute the scalar steps and save the results in State.
2379 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2380 ScalarIVTy->getScalarSizeInBits());
2381 Type *VecIVTy = nullptr;
2382 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2383 if (!FirstLaneOnly && State.VF.isScalable()) {
2384 VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2385 UnitStepVec =
2386 Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2387 SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2388 SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2389 }
2390
2391 unsigned StartPart = 0;
2392 unsigned EndPart = State.UF;
2393 unsigned StartLane = 0;
2394 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2395 if (State.Instance) {
2396 StartPart = State.Instance->Part;
2397 EndPart = StartPart + 1;
2398 StartLane = State.Instance->Lane.getKnownLane();
2399 EndLane = StartLane + 1;
2400 }
2401 for (unsigned Part = StartPart; Part < EndPart; ++Part) {
2402 Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2403
2404 if (!FirstLaneOnly && State.VF.isScalable()) {
2405 auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2406 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2407 if (ScalarIVTy->isFloatingPointTy())
2408 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2409 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2410 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2411 State.set(Def, Add, Part);
2412 // It's useful to record the lane values too for the known minimum number
2413 // of elements so we do those below. This improves the code quality when
2414 // trying to extract the first element, for example.
2415 }
2416
2417 if (ScalarIVTy->isFloatingPointTy())
2418 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2419
2420 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
2421 Value *StartIdx = Builder.CreateBinOp(
2422 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2423 // The step returned by `createStepForVF` is a runtime-evaluated value
2424 // when VF is scalable. Otherwise, it should be folded into a Constant.
2425 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2426 "Expected StartIdx to be folded to a constant when VF is not "
2427 "scalable");
2428 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2429 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2430 State.set(Def, Add, VPIteration(Part, Lane));
2431 }
2432 }
2433}
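
// Illustrative model, not part of LoopVectorize.cpp: the values the loop
// above produces for a fixed (non-scalable) VF. Part P, lane L receives
// ScalarIV + (P * VF + L) * Step, where P * VF mirrors createStepForVF.
#include <cstdint>
#include <vector>
static std::vector<std::vector<int64_t>>
scalarStepsModel(int64_t ScalarIV, int64_t Step, unsigned VF, unsigned UF) {
  std::vector<std::vector<int64_t>> Parts(UF, std::vector<int64_t>(VF));
  for (unsigned Part = 0; Part < UF; ++Part) {
    int64_t StartIdx0 = (int64_t)Part * VF; // createStepForVF for a fixed VF
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      Parts[Part][Lane] = ScalarIV + (StartIdx0 + Lane) * Step;
  }
  return Parts;
}
// e.g. scalarStepsModel(100, 3, /*VF=*/2, /*UF=*/2) == {{100, 103}, {106, 109}}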
2434
2435// Generate code for the induction step. Note that induction steps are
2436 // required to be loop-invariant.
2437static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2438 Instruction *InsertBefore,
2439 Loop *OrigLoop = nullptr) {
2440 const DataLayout &DL = SE.getDataLayout();
2441 assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
2442 "Induction step should be loop invariant");
2443 if (auto *E = dyn_cast<SCEVUnknown>(Step))
2444 return E->getValue();
2445
2446 SCEVExpander Exp(SE, DL, "induction");
2447 return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2448}
2449
2450/// Compute the transformed value of Index at offset StartValue using step
2451/// StepValue.
2452/// For integer induction, returns StartValue + Index * StepValue.
2453/// For pointer induction, returns StartValue[Index * StepValue].
2454/// FIXME: The newly created binary instructions should contain nsw/nuw
2455/// flags, which can be found from the original scalar operations.
2456static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
2457 Value *StartValue, Value *Step,
2458 const InductionDescriptor &ID) {
2459 Type *StepTy = Step->getType();
2460 Value *CastedIndex = StepTy->isIntegerTy()
2461 ? B.CreateSExtOrTrunc(Index, StepTy)
2462 : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2463 if (CastedIndex != Index) {
2464 CastedIndex->setName(CastedIndex->getName() + ".cast");
2465 Index = CastedIndex;
2466 }
2467
2468 // Note: the IR at this point is broken. We cannot use SE to create any new
2469 // SCEV and then expand it, hoping that SCEV's simplification will give us
2470 // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2471 // lead to various SCEV crashes. So all we can do is to use builder and rely
2472 // on InstCombine for future simplifications. Here we handle some trivial
2473 // cases only.
2474 auto CreateAdd = [&B](Value *X, Value *Y) {
2475 assert(X->getType() == Y->getType() && "Types don't match!");
2476 if (auto *CX = dyn_cast<ConstantInt>(X))
2477 if (CX->isZero())
2478 return Y;
2479 if (auto *CY = dyn_cast<ConstantInt>(Y))
2480 if (CY->isZero())
2481 return X;
2482 return B.CreateAdd(X, Y);
2483 };
2484
2485 // We allow X to be a vector type, in which case Y will potentially be
2486 // splatted into a vector with the same element count.
2487 auto CreateMul = [&B](Value *X, Value *Y) {
2488 assert(X->getType()->getScalarType() == Y->getType() &&
2489 "Types don't match!");
2490 if (auto *CX = dyn_cast<ConstantInt>(X))
2491 if (CX->isOne())
2492 return Y;
2493 if (auto *CY = dyn_cast<ConstantInt>(Y))
2494 if (CY->isOne())
2495 return X;
2496 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2497 if (XVTy && !isa<VectorType>(Y->getType()))
2498 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2499 return B.CreateMul(X, Y);
2500 };
2501
2502 switch (ID.getKind()) {
2503 case InductionDescriptor::IK_IntInduction: {
2504 assert(!isa<VectorType>(Index->getType()) &&
2505 "Vector indices not supported for integer inductions yet");
2506 assert(Index->getType() == StartValue->getType() &&
2507 "Index type does not match StartValue type");
2508 if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2509 return B.CreateSub(StartValue, Index);
2510 auto *Offset = CreateMul(Index, Step);
2511 return CreateAdd(StartValue, Offset);
2512 }
2513 case InductionDescriptor::IK_PtrInduction: {
2514 assert(isa<Constant>(Step) &&
2515 "Expected constant step for pointer induction");
2516 return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2517 }
2518 case InductionDescriptor::IK_FpInduction: {
2519 assert(!isa<VectorType>(Index->getType()) &&
2520 "Vector indices not supported for FP inductions yet");
2521 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2522 auto InductionBinOp = ID.getInductionBinOp();
2523 assert(InductionBinOp &&
2524 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2525 InductionBinOp->getOpcode() == Instruction::FSub) &&
2526 "Original bin op should be defined for FP induction");
2527
2528 Value *MulExp = B.CreateFMul(Step, Index);
2529 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2530 "induction");
2531 }
2532 case InductionDescriptor::IK_NoInduction:
2533 return nullptr;
2534 }
2535 llvm_unreachable("invalid enum")::llvm::llvm_unreachable_internal("invalid enum", "llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2535)
;
2536}
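
// Illustrative model, not part of LoopVectorize.cpp: the two most common
// cases above, as a standalone sketch with the CreateAdd/CreateMul constant
// folding elided.
#include <cstdint>
static int64_t transformedIntIndexModel(int64_t StartValue, int64_t Index,
                                        int64_t Step) {
  return StartValue + Index * Step; // IK_IntInduction
}
static const int *transformedPtrIndexModel(const int *StartValue,
                                           int64_t Index, int64_t Step) {
  return StartValue + Index * Step; // IK_PtrInduction: a GEP over ElementType
}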
2537
2538void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2539 const VPIteration &Instance,
2540 VPTransformState &State) {
2541 Value *ScalarInst = State.get(Def, Instance);
2542 Value *VectorValue = State.get(Def, Instance.Part);
2543 VectorValue = Builder.CreateInsertElement(
2544 VectorValue, ScalarInst,
2545 Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2546 State.set(Def, VectorValue, Instance.Part);
2547}
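
// Illustrative model, not part of LoopVectorize.cpp: the function above is a
// single insertelement, writing one scalar into one lane of the per-part
// vector. A standalone sketch:
#include <vector>
static void packScalarModel(std::vector<int> &VectorValue, int ScalarInst,
                            unsigned Lane) {
  VectorValue[Lane] = ScalarInst; // CreateInsertElement at the runtime lane
}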
2548
2549// Return whether we allow using masked interleave-groups (for dealing with
2550// strided loads/stores that reside in predicated blocks, or for dealing
2551// with gaps).
2552static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2553 // If an override option has been passed in for interleaved accesses, use it.
2554 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2555 return EnableMaskedInterleavedMemAccesses;
2556
2557 return TTI.enableMaskedInterleavedAccessVectorization();
2558}
2559
2560// Try to vectorize the interleave group that \p Instr belongs to.
2561//
2562// E.g. Translate following interleaved load group (factor = 3):
2563// for (i = 0; i < N; i+=3) {
2564// R = Pic[i]; // Member of index 0
2565// G = Pic[i+1]; // Member of index 1
2566// B = Pic[i+2]; // Member of index 2
2567// ... // do something to R, G, B
2568// }
2569// To:
2570// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2571// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2572// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2573// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2574//
2575// Or translate following interleaved store group (factor = 3):
2576// for (i = 0; i < N; i+=3) {
2577// ... do something to R, G, B
2578// Pic[i] = R; // Member of index 0
2579// Pic[i+1] = G; // Member of index 1
2580// Pic[i+2] = B; // Member of index 2
2581// }
2582// To:
2583// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2584// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2585// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2586// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2587// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2588void InnerLoopVectorizer::vectorizeInterleaveGroup(
2589 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2590 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2591 VPValue *BlockInMask) {
2592 Instruction *Instr = Group->getInsertPos();
2593 const DataLayout &DL = Instr->getModule()->getDataLayout();
2594
2595 // Prepare the vector type of the interleaved load/store.
2596 Type *ScalarTy = getLoadStoreType(Instr);
2597 unsigned InterleaveFactor = Group->getFactor();
2598 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2599 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2600
2601 // Prepare for the new pointers.
2602 SmallVector<Value *, 2> AddrParts;
2603 unsigned Index = Group->getIndex(Instr);
2604
2605 // TODO: extend the masked interleaved-group support to reversed access.
2606 assert((!BlockInMask || !Group->isReverse()) &&
2607 "Reversed masked interleave-group not supported.");
2608
2609 // If the group is reverse, adjust the index to refer to the last vector lane
2610 // instead of the first. We adjust the index from the first vector lane,
2611 // rather than directly getting the pointer for lane VF - 1, because the
2612 // pointer operand of the interleaved access is supposed to be uniform. For
2613 // uniform instructions, we're only required to generate a value for the
2614 // first vector lane in each unroll iteration.
2615 if (Group->isReverse())
2616 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2617
2618 for (unsigned Part = 0; Part < UF; Part++) {
2619 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2620 State.setDebugLocFromInst(AddrPart);
2621
2622 // Note that the current instruction could be at any index of the group, so
2623 // the address needs to be adjusted to the member of index 0.
2624 //
2625 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2626 // b = A[i]; // Member of index 0
2627 // The current pointer points to A[i+1]; adjust it to A[i].
2628 //
2629 // E.g. A[i+1] = a; // Member of index 1
2630 // A[i] = b; // Member of index 0
2631 // A[i+2] = c; // Member of index 2 (Current instruction)
2632 // The current pointer points to A[i+2]; adjust it to A[i].
2633
2634 bool InBounds = false;
2635 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2636 InBounds = gep->isInBounds();
2637 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2638 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2639
2640 // Cast to the vector pointer type.
2641 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2642 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2643 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2644 }
2645
2646 State.setDebugLocFromInst(Instr);
2647 Value *PoisonVec = PoisonValue::get(VecTy);
2648
2649 Value *MaskForGaps = nullptr;
2650 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2651 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2652 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2653 }
2654
2655 // Vectorize the interleaved load group.
2656 if (isa<LoadInst>(Instr)) {
2657 // For each unroll part, create a wide load for the group.
2658 SmallVector<Value *, 2> NewLoads;
2659 for (unsigned Part = 0; Part < UF; Part++) {
2660 Instruction *NewLoad;
2661 if (BlockInMask || MaskForGaps) {
2662 assert(useMaskedInterleavedAccesses(*TTI) &&
2663 "masked interleaved groups are not allowed.");
2664 Value *GroupMask = MaskForGaps;
2665 if (BlockInMask) {
2666 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2667 Value *ShuffledMask = Builder.CreateShuffleVector(
2668 BlockInMaskPart,
2669 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2670 "interleaved.mask");
2671 GroupMask = MaskForGaps
2672 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2673 MaskForGaps)
2674 : ShuffledMask;
2675 }
2676 NewLoad =
2677 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2678 GroupMask, PoisonVec, "wide.masked.vec");
2679 }
2680 else
2681 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2682 Group->getAlign(), "wide.vec");
2683 Group->addMetadata(NewLoad);
2684 NewLoads.push_back(NewLoad);
2685 }
2686
2687 // For each member in the group, shuffle out the appropriate data from the
2688 // wide loads.
2689 unsigned J = 0;
2690 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2691 Instruction *Member = Group->getMember(I);
2692
2693 // Skip the gaps in the group.
2694 if (!Member)
2695 continue;
2696
2697 auto StrideMask =
2698 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2699 for (unsigned Part = 0; Part < UF; Part++) {
2700 Value *StridedVec = Builder.CreateShuffleVector(
2701 NewLoads[Part], StrideMask, "strided.vec");
2702
2703 // If this member has a different type, cast the result to that type.
2704 if (Member->getType() != ScalarTy) {
2705 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2706 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2707 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2708 }
2709
2710 if (Group->isReverse())
2711 StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2712
2713 State.set(VPDefs[J], StridedVec, Part);
2714 }
2715 ++J;
2716 }
2717 return;
2718 }
2719
2720 // The subvector type for the current instruction.
2721 auto *SubVT = VectorType::get(ScalarTy, VF);
2722
2723 // Vectorize the interleaved store group.
2724 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2725 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2726 "masked interleaved groups are not allowed.");
2727 assert((!MaskForGaps || !VF.isScalable()) &&
2728 "masking gaps for scalable vectors is not yet supported.");
2729 for (unsigned Part = 0; Part < UF; Part++) {
2730 // Collect the stored vector from each member.
2731 SmallVector<Value *, 4> StoredVecs;
2732 unsigned StoredIdx = 0;
2733 for (unsigned i = 0; i < InterleaveFactor; i++) {
2734 assert((Group->getMember(i) || MaskForGaps) &&
2735 "Fail to get a member from an interleaved store group");
2736 Instruction *Member = Group->getMember(i);
2737
2738 // Skip the gaps in the group.
2739 if (!Member) {
2740 Value *Undef = PoisonValue::get(SubVT);
2741 StoredVecs.push_back(Undef);
2742 continue;
2743 }
2744
2745 Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2746 ++StoredIdx;
2747
2748 if (Group->isReverse())
2749 StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2750
2751 // If this member has a different type, cast it to a unified type.
2752
2753 if (StoredVec->getType() != SubVT)
2754 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2755
2756 StoredVecs.push_back(StoredVec);
2757 }
2758
2759 // Concatenate all vectors into a wide vector.
2760 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2761
2762 // Interleave the elements in the wide vector.
2763 Value *IVec = Builder.CreateShuffleVector(
2764 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2765 "interleaved.vec");
2766
2767 Instruction *NewStoreInstr;
2768 if (BlockInMask || MaskForGaps) {
2769 Value *GroupMask = MaskForGaps;
2770 if (BlockInMask) {
2771 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2772 Value *ShuffledMask = Builder.CreateShuffleVector(
2773 BlockInMaskPart,
2774 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2775 "interleaved.mask");
2776 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2777 ShuffledMask, MaskForGaps)
2778 : ShuffledMask;
2779 }
2780 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2781 Group->getAlign(), GroupMask);
2782 } else
2783 NewStoreInstr =
2784 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2785
2786 Group->addMetadata(NewStoreInstr);
2787 }
2788}
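
// Illustrative models, not part of LoopVectorize.cpp: standalone sketches of
// the three shuffle masks the function above relies on, written as plain
// index vectors for interleave factor IF and a fixed VF.
#include <vector>
// createStrideMask: selects member I of the wide load, <I, I+IF, I+2*IF, ...>.
static std::vector<int> strideMaskModel(unsigned I, unsigned IF, unsigned VF) {
  std::vector<int> Mask;
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    Mask.push_back(I + Lane * IF);
  return Mask;
}
// createReplicatedMask: repeats each block-mask lane IF times, <0,0,0,1,1,1,...>.
static std::vector<int> replicatedMaskModel(unsigned IF, unsigned VF) {
  std::vector<int> Mask;
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    for (unsigned Rep = 0; Rep < IF; ++Rep)
      Mask.push_back(Lane);
  return Mask;
}
// createInterleaveMask: interleaves IF concatenated member vectors.
static std::vector<int> interleaveMaskModel(unsigned VF, unsigned IF) {
  std::vector<int> Mask;
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    for (unsigned Member = 0; Member < IF; ++Member)
      Mask.push_back(Member * VF + Lane);
  return Mask;
}
// e.g. strideMaskModel(0, 3, 4) == {0, 3, 6, 9}, the %R.vec mask in the
// comment preceding vectorizeInterleaveGroup.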
2789
2790void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2791 VPReplicateRecipe *RepRecipe,
2792 const VPIteration &Instance,
2793 bool IfPredicateInstr,
2794 VPTransformState &State) {
2795 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2796
2797 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2798 // the first lane and part.
2799 if (isa<NoAliasScopeDeclInst>(Instr))
2800 if (!Instance.isFirstIteration())
2801 return;
2802
2803 // Does this instruction return a value?
2804 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2805
2806 Instruction *Cloned = Instr->clone();
2807 if (!IsVoidRetTy)
2808 Cloned->setName(Instr->getName() + ".cloned");
2809
2810 // If the scalarized instruction contributes to the address computation of a
2811 // widened masked load/store which was in a basic block that needed predication
2812 // and is not predicated after vectorization, we can't propagate
2813 // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
2814 // instruction could feed a poison value to the base address of the widened
2815 // load/store.
2816 if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2817 Cloned->dropPoisonGeneratingFlags();
2818
2819 if (Instr->getDebugLoc())
2820 State.setDebugLocFromInst(Instr);
2821
2822 // Replace the operands of the cloned instructions with their scalar
2823 // equivalents in the new loop.
2824 for (const auto &I : enumerate(RepRecipe->operands())) {
2825 auto InputInstance = Instance;
2826 VPValue *Operand = I.value();
2827 if (vputils::isUniformAfterVectorization(Operand))
2828 InputInstance.Lane = VPLane::getFirstLane();
2829 Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2830 }
2831 State.addNewMetadata(Cloned, Instr);
2832
2833 // Place the cloned scalar in the new loop.
2834 State.Builder.Insert(Cloned);
2835
2836 State.set(RepRecipe, Cloned, Instance);
2837
2838 // If we just cloned a new assumption, add it to the assumption cache.
2839 if (auto *II = dyn_cast<AssumeInst>(Cloned))
2840 AC->registerAssumption(II);
2841
2842 // End if-block.
2843 if (IfPredicateInstr)
2844 PredicatedInstructions.push_back(Cloned);
2845}
2846
2847Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
2848 if (TripCount)
2849 return TripCount;
2850
2851 assert(InsertBlock);
2852 IRBuilder<> Builder(InsertBlock->getTerminator());
2853 // Find the loop boundaries.
2854 Type *IdxTy = Legal->getWidestInductionType();
2855 assert(IdxTy && "No type for induction");
2856 const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE);
2857
2858 const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
2859
2860 // Expand the trip count and place the new instructions in the preheader.
2861 // Notice that the pre-header does not change, only the loop body.
2862 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2863
2864 // Count holds the overall loop count (N).
2865 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2866 InsertBlock->getTerminator());
2867
2868 if (TripCount->getType()->isPointerTy())
2869 TripCount =
2870 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2871 InsertBlock->getTerminator());
2872
2873 return TripCount;
2874}
2875
2876Value *
2877InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2878 if (VectorTripCount)
2879 return VectorTripCount;
2880
2881 Value *TC = getOrCreateTripCount(InsertBlock);
2882 IRBuilder<> Builder(InsertBlock->getTerminator());
2883
2884 Type *Ty = TC->getType();
2885 // This is where we can make the step a runtime constant.
2886 Value *Step = createStepForVF(Builder, Ty, VF, UF);
2887
2888 // If the tail is to be folded by masking, round the number of iterations N
2889 // up to a multiple of Step instead of rounding down. This is done by first
2890 // adding Step-1 and then rounding down. Note that it's ok if this addition
2891 // overflows: the vector induction variable will eventually wrap to zero given
2892 // that it starts at zero and its Step is a power of two; the loop will then
2893 // exit, with the last early-exit vector comparison also producing all-true.
2894 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2895 // is accounted for in emitIterationCountCheck that adds an overflow check.
2896 if (Cost->foldTailByMasking()) {
2897 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2898 "VF*UF must be a power of 2 when folding tail by masking");
2899 Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2900 TC = Builder.CreateAdd(
2901 TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2902 }
2903
2904 // Now we need to generate the expression for the part of the loop that the
2905 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2906 // iterations are not required for correctness, or N - Step, otherwise. Step
2907 // is equal to the vectorization factor (number of SIMD elements) times the
2908 // unroll factor (number of SIMD instructions).
2909 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2910
2911 // There are cases where we *must* run at least one iteration in the remainder
2912 // loop. See the cost model for when this can happen. If the step evenly
2913 // divides the trip count, we set the remainder to be equal to the step. If
2914 // the step does not evenly divide the trip count, no adjustment is necessary
2915 // since there will already be scalar iterations. Note that the minimum
2916 // iterations check ensures that N >= Step.
2917 if (Cost->requiresScalarEpilogue(VF)) {
2918 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2919 R = Builder.CreateSelect(IsZero, Step, R);
2920 }
2921
2922 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2923
2924 return VectorTripCount;
2925}
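
// Illustrative model, not part of LoopVectorize.cpp: the trip-count
// arithmetic above as a standalone sketch for a fixed VF, where
// Step = VF * UF.
#include <cstdint>
static uint64_t vectorTripCountModel(uint64_t TC, uint64_t Step, bool FoldTail,
                                     bool RequiresScalarEpilogue) {
  if (FoldTail)
    TC += Step - 1;         // n.rnd.up: round the iteration count up
  uint64_t R = TC % Step;   // n.mod.vf
  if (RequiresScalarEpilogue && R == 0)
    R = Step;               // force at least one remainder iteration
  return TC - R;            // n.vec
}
// e.g. vectorTripCountModel(10, 4, false, false) == 8: two vector iterations,
// leaving two scalar remainder iterations.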
2926
2927Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2928 const DataLayout &DL) {
2929 // Verify that V is a vector type with the same number of elements as DstVTy.
2930 auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2931 unsigned VF = DstFVTy->getNumElements();
2932 auto *SrcVecTy = cast<FixedVectorType>(V->getType());
2933 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2934 Type *SrcElemTy = SrcVecTy->getElementType();
2935 Type *DstElemTy = DstFVTy->getElementType();
2936 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2937 "Vector elements must have same size");
2938
2939 // Do a direct cast if element types are castable.
2940 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2941 return Builder.CreateBitOrPointerCast(V, DstFVTy);
2942 }
2943 // V cannot be directly cast to the desired vector type.
2944 // May happen when V is a floating point vector but DstVTy is a vector of
2945 // pointers or vice-versa. Handle this using a two-step bitcast using an
2946 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2947 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2948 "Only one type should be a pointer type");
2949 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2950 "Only one type should be a floating point type");
2951 Type *IntTy =
2952 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2953 auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2954 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2955 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2956}
2957
2958void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2959 Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
2960 // Reuse existing vector loop preheader for TC checks.
2961 // Note that a new preheader block is generated for the vector loop.
2962 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2963 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2964
2965 // Generate code to check if the loop's trip count is less than VF * UF, or
2966 // equal to it in case a scalar epilogue is required; this implies that the
2967 // vector trip count is zero. This check also covers the case where adding one
2968 // to the backedge-taken count overflowed leading to an incorrect trip count
2969 // of zero. In this case we will also jump to the scalar loop.
2970 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
2971 : ICmpInst::ICMP_ULT;
2972
2973 // If tail is to be folded, vector loop takes care of all iterations.
2974 Type *CountTy = Count->getType();
2975 Value *CheckMinIters = Builder.getFalse();
2976 auto CreateStep = [&]() -> Value * {
2977 // Create step with max(MinProTripCount, UF * VF).
2978 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2979 return createStepForVF(Builder, CountTy, VF, UF);
2980
2981 Value *MinProfTC =
2982 createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2983 if (!VF.isScalable())
2984 return MinProfTC;
2985 return Builder.CreateBinaryIntrinsic(
2986 Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2987 };
2988
2989 if (!Cost->foldTailByMasking())
2990 CheckMinIters =
2991 Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2992 else if (VF.isScalable()) {
2993 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2994 // an overflow to zero when updating induction variables and so an
2995 // additional overflow check is required before entering the vector loop.
2996
2997 // Get the maximum unsigned value for the type.
2998 Value *MaxUIntTripCount =
2999 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
3000 Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
3001
3002 // Don't execute the vector loop if (UMax - n) < (VF * UF).
3003 CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
3004 }
3005
3006 // Create new preheader for vector loop.
3007 LoopVectorPreHeader =
3008 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3009 "vector.ph");
3010
3011 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3012 DT->getNode(Bypass)->getIDom()) &&
3013 "TC check is expected to dominate Bypass");
3014
3015 // Update dominator for Bypass & LoopExit (if needed).
3016 DT->changeImmediateDominator(Bypass, TCCheckBlock);
3017 if (!Cost->requiresScalarEpilogue(VF))
3018 // If there is an epilogue which must run, there's no edge from the
3019 // middle block to exit blocks and thus no need to update the immediate
3020 // dominator of the exit blocks.
3021 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3022
3023 ReplaceInstWithInst(
3024 TCCheckBlock->getTerminator(),
3025 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3026 LoopBypassBlocks.push_back(TCCheckBlock);
3027}
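
// Illustrative model, not part of LoopVectorize.cpp: the guard above as a
// standalone sketch for a fixed (non-scalable) VF without tail folding; when
// it returns true, control branches to the scalar loop instead of the vector
// preheader.
#include <algorithm>
#include <cstdint>
static bool minItersCheckModel(uint64_t Count, uint64_t VF, uint64_t UF,
                               uint64_t MinProfitableTripCount,
                               bool RequiresScalarEpilogue) {
  uint64_t Step = std::max(VF * UF, MinProfitableTripCount); // CreateStep()
  return RequiresScalarEpilogue ? Count <= Step : Count < Step;
}
// e.g. minItersCheckModel(5, 4, 2, 0, false) is true: five iterations cannot
// fill one VF * UF = 8 wide vector step.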
3028
3029BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
3030 BasicBlock *const SCEVCheckBlock =
3031 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
3032 if (!SCEVCheckBlock)
3033 return nullptr;
3034
3035 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3036 (OptForSizeBasedOnProfile &&
3037 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3038 "Cannot SCEV check stride or overflow when optimizing for size");
3039
3040
3041 // Update dominator only if this is first RT check.
3042 if (LoopBypassBlocks.empty()) {
3043 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3044 if (!Cost->requiresScalarEpilogue(VF))
3045 // If there is an epilogue which must run, there's no edge from the
3046 // middle block to exit blocks and thus no need to update the immediate
3047 // dominator of the exit blocks.
3048 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3049 }
3050
3051 LoopBypassBlocks.push_back(SCEVCheckBlock);
3052 AddedSafetyChecks = true;
3053 return SCEVCheckBlock;
3054}
3055
3056BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
3057 // VPlan-native path does not do any analysis for runtime checks currently.
3058 if (EnableVPlanNativePath)
3059 return nullptr;
3060
3061 BasicBlock *const MemCheckBlock =
3062 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
3063
3064 // Check if we generated code that checks at runtime whether arrays overlap.
3065 // We put the checks into a separate block to make the common case of few
3066 // elements faster.
3067 if (!MemCheckBlock)
3068 return nullptr;
3069
3070 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3071 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3072 "Cannot emit memory checks when optimizing for size, unless forced "
3073 "to vectorize.");
3074 ORE->emit([&]() {
3075       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3076 OrigLoop->getStartLoc(),
3077 OrigLoop->getHeader())
3078 << "Code-size may be reduced by not forcing "
3079 "vectorization, or by source-code modifications "
3080 "eliminating the need for runtime checks "
3081 "(e.g., adding 'restrict').";
3082 });
3083 }
3084
3085 LoopBypassBlocks.push_back(MemCheckBlock);
3086
3087 AddedSafetyChecks = true;
3088
3089 return MemCheckBlock;
3090}
3091
3092void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3093 LoopScalarBody = OrigLoop->getHeader();
3094 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3095   assert(LoopVectorPreHeader && "Invalid loop structure");
3096 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3097   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3098          "multiple exit loop without required epilogue?");
3099
3100 LoopMiddleBlock =
3101 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3102 LI, nullptr, Twine(Prefix) + "middle.block");
3103 LoopScalarPreHeader =
3104 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3105 nullptr, Twine(Prefix) + "scalar.ph");
3106
3107 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3108
3109 // Set up the middle block terminator. Two cases:
3110 // 1) If we know that we must execute the scalar epilogue, emit an
3111 // unconditional branch.
3112 // 2) Otherwise, we must have a single unique exit block (due to how we
3113 // implement the multiple exit case). In this case, set up a conditional
3114 // branch from the middle block to the loop scalar preheader, and the
3115 // exit block. completeLoopSkeleton will update the condition to use an
3116 // iteration check, if required to decide whether to execute the remainder.
3117 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3118 BranchInst::Create(LoopScalarPreHeader) :
3119 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3120 Builder.getTrue());
3121 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3122 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3123
3124 // Update dominator for loop exit. During skeleton creation, only the vector
3125 // pre-header and the middle block are created. The vector loop is entirely
3126   // created during VPlan execution.
3127 if (!Cost->requiresScalarEpilogue(VF))
3128 // If there is an epilogue which must run, there's no edge from the
3129 // middle block to exit blocks and thus no need to update the immediate
3130 // dominator of the exit blocks.
3131 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3132}
3133
3134PHINode *InnerLoopVectorizer::createInductionResumeValue(
3135 PHINode *OrigPhi, const InductionDescriptor &II,
3136 ArrayRef<BasicBlock *> BypassBlocks,
3137 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3138 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3139   assert(VectorTripCount && "Expected valid arguments");
3140
3141 Instruction *OldInduction = Legal->getPrimaryInduction();
3142 Value *&EndValue = IVEndValues[OrigPhi];
3143 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3144 if (OrigPhi == OldInduction) {
3145 // We know what the end value is.
3146 EndValue = VectorTripCount;
3147 } else {
3148 IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3149
3150 // Fast-math-flags propagate from the original induction instruction.
3151 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3152 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3153
3154 Value *Step =
3155 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3156 EndValue =
3157 emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II);
3158 EndValue->setName("ind.end");
3159
3160 // Compute the end value for the additional bypass (if applicable).
3161 if (AdditionalBypass.first) {
3162 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3163 Value *Step =
3164 CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3165 EndValueFromAdditionalBypass = emitTransformedIndex(
3166 B, AdditionalBypass.second, II.getStartValue(), Step, II);
3167 EndValueFromAdditionalBypass->setName("ind.end");
3168 }
3169 }
3170
3171 // Create phi nodes to merge from the backedge-taken check block.
3172 PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3173 LoopScalarPreHeader->getTerminator());
3174 // Copy original phi DL over to the new one.
3175 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3176
3177 // The new PHI merges the original incoming value, in case of a bypass,
3178 // or the value at the end of the vectorized loop.
3179 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3180
3181 // Fix the scalar body counter (PHI node).
3182 // The old induction's phi node in the scalar body needs the truncated
3183 // value.
3184 for (BasicBlock *BB : BypassBlocks)
3185 BCResumeVal->addIncoming(II.getStartValue(), BB);
3186
3187 if (AdditionalBypass.first)
3188 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3189 EndValueFromAdditionalBypass);
3190 return BCResumeVal;
3191}
3192
3193void InnerLoopVectorizer::createInductionResumeValues(
3194 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3195   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3196           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3197          "Inconsistent information about additional bypass.");
3198 // We are going to resume the execution of the scalar loop.
3199 // Go over all of the induction variables that we found and fix the
3200 // PHIs that are left in the scalar version of the loop.
3201 // The starting values of PHI nodes depend on the counter of the last
3202 // iteration in the vectorized loop.
3203 // If we come from a bypass edge then we need to start from the original
3204 // start value.
3205 for (const auto &InductionEntry : Legal->getInductionVars()) {
3206 PHINode *OrigPhi = InductionEntry.first;
3207 const InductionDescriptor &II = InductionEntry.second;
3208 PHINode *BCResumeVal = createInductionResumeValue(
3209 OrigPhi, II, LoopBypassBlocks, AdditionalBypass);
3210 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3211 }
3212}
3213
3214BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
3215 // The trip counts should be cached by now.
3216 Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
3217 Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3218
3219 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3220
3221 // Add a check in the middle block to see if we have completed
3222 // all of the iterations in the first vector loop. Three cases:
3223 // 1) If we require a scalar epilogue, there is no conditional branch as
3224 // we unconditionally branch to the scalar preheader. Do nothing.
3225 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3226   //    Thus if the tail is to be folded, we know we don't need to run the
3227 // remainder and we can use the previous value for the condition (true).
3228 // 3) Otherwise, construct a runtime check.
3229 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3230 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3231 Count, VectorTripCount, "cmp.n",
3232 LoopMiddleBlock->getTerminator());
3233
3234 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3235 // of the corresponding compare because they may have ended up with
3236 // different line numbers and we want to avoid awkward line stepping while
3237   // debugging. E.g., if the compare has a line number inside the loop.
3238 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3239 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3240 }
3241
3242#ifdef EXPENSIVE_CHECKS
3243   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3244#endif
3245
3246 return LoopVectorPreHeader;
3247}
3248
3249std::pair<BasicBlock *, Value *>
3250InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3251 /*
3252 In this function we generate a new loop. The new loop will contain
3253 the vectorized instructions while the old loop will continue to run the
3254 scalar remainder.
3255
3256 [ ] <-- loop iteration number check.
3257 / |
3258 / v
3259 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3260 | / |
3261 | / v
3262 || [ ] <-- vector pre header.
3263 |/ |
3264 | v
3265 | [ ] \
3266 | [ ]_| <-- vector loop (created during VPlan execution).
3267 | |
3268 | v
3269 \ -[ ] <--- middle-block.
3270 \/ |
3271 /\ v
3272 | ->[ ] <--- new preheader.
3273 | |
3274 (opt) v <-- edge from middle to exit iff epilogue is not required.
3275 | [ ] \
3276 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3277 \ |
3278 \ v
3279 >[ ] <-- exit block(s).
3280 ...
3281 */
3282
3283 // Create an empty vector loop, and prepare basic blocks for the runtime
3284 // checks.
3285 createVectorLoopSkeleton("");
3286
3287   // Now, compare the new count to zero. If it is zero, skip the vector loop and
3288 // jump to the scalar loop. This check also covers the case where the
3289 // backedge-taken count is uint##_max: adding one to it will overflow leading
3290 // to an incorrect trip count of zero. In this (rare) case we will also jump
3291 // to the scalar loop.
3292 emitIterationCountCheck(LoopScalarPreHeader);
3293
3294 // Generate the code to check any assumptions that we've made for SCEV
3295 // expressions.
3296 emitSCEVChecks(LoopScalarPreHeader);
3297
3298   // Generate the code that checks at runtime whether arrays overlap. We put the
3299 // checks into a separate block to make the more common case of few elements
3300 // faster.
3301 emitMemRuntimeChecks(LoopScalarPreHeader);
3302
3303 // Emit phis for the new starting index of the scalar loop.
3304 createInductionResumeValues();
3305
3306 return {completeLoopSkeleton(), nullptr};
3307}
3308
3309// Fix up external users of the induction variable. At this point, we are
3310// in LCSSA form, with all external PHIs that use the IV having one input value,
3311// coming from the remainder loop. We need those PHIs to also have a correct
3312// value for the IV when arriving directly from the middle block.
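// A shorthand illustration (block and value names hypothetical):
//
//   exit.block:
//     %iv.lcssa = phi i64 [ %iv, %scalar.loop ]
//
// Such a phi starts with only the incoming value from the remainder loop;
// this function adds a second incoming value on the edge from the middle
// block so the phi is also correct when the vector loop handles all
// iterations.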
3313void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3314 const InductionDescriptor &II,
3315 Value *VectorTripCount, Value *EndValue,
3316 BasicBlock *MiddleBlock,
3317 BasicBlock *VectorHeader, VPlan &Plan) {
3318 // There are two kinds of external IV usages - those that use the value
3319 // computed in the last iteration (the PHI) and those that use the penultimate
3320 // value (the value that feeds into the phi from the loop latch).
3321 // We allow both, but they, obviously, have different values.
3322
3323   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3324
3325 DenseMap<Value *, Value *> MissingVals;
3326
3327 // An external user of the last iteration's value should see the value that
3328 // the remainder loop uses to initialize its own IV.
3329 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3330 for (User *U : PostInc->users()) {
3331 Instruction *UI = cast<Instruction>(U);
3332 if (!OrigLoop->contains(UI)) {
3333       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3334 MissingVals[UI] = EndValue;
3335 }
3336 }
3337
3338   // An external user of the penultimate value needs to see EndValue - Step.
3339 // The simplest way to get this is to recompute it from the constituent SCEVs,
3340 // that is Start + (Step * (CRD - 1)).
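  // For example, with hypothetical values Start = 0, Step = 2 and a vector
  // trip count of 8, the escaped value is 0 + 2 * (8 - 1) = 14.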
3341 for (User *U : OrigPhi->users()) {
3342 auto *UI = cast<Instruction>(U);
3343 if (!OrigLoop->contains(UI)) {
3344       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3345
3346 IRBuilder<> B(MiddleBlock->getTerminator());
3347
3348 // Fast-math-flags propagate from the original induction instruction.
3349 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3350 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3351
3352 Value *CountMinusOne = B.CreateSub(
3353 VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3354 CountMinusOne->setName("cmo");
3355 Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3356 VectorHeader->getTerminator());
3357 Value *Escape =
3358 emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II);
3359 Escape->setName("ind.escape");
3360 MissingVals[UI] = Escape;
3361 }
3362 }
3363
3364 for (auto &I : MissingVals) {
3365 PHINode *PHI = cast<PHINode>(I.first);
3366     // One corner case we have to handle is two IVs "chasing" each other,
3367 // that is %IV2 = phi [...], [ %IV1, %latch ]
3368 // In this case, if IV1 has an external use, we need to avoid adding both
3369 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3370 // don't already have an incoming value for the middle block.
3371 if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3372 PHI->addIncoming(I.second, MiddleBlock);
3373 Plan.removeLiveOut(PHI);
3374 }
3375 }
3376}
3377
3378namespace {
3379
3380struct CSEDenseMapInfo {
3381 static bool canHandle(const Instruction *I) {
3382 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3383 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3384 }
3385
3386 static inline Instruction *getEmptyKey() {
3387 return DenseMapInfo<Instruction *>::getEmptyKey();
3388 }
3389
3390 static inline Instruction *getTombstoneKey() {
3391 return DenseMapInfo<Instruction *>::getTombstoneKey();
3392 }
3393
3394 static unsigned getHashValue(const Instruction *I) {
3395     assert(canHandle(I) && "Unknown instruction!");
3396 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3397 I->value_op_end()));
3398 }
3399
3400 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3401 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3402 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3403 return LHS == RHS;
3404 return LHS->isIdenticalTo(RHS);
3405 }
3406};
3407
3408} // end anonymous namespace
3409
3410 /// Perform CSE of induction variable instructions.
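/// For example, if unrolling produced two identical getelementptr
/// instructions computing the same address, the second is replaced by the
/// first and erased (an illustrative sketch; only the instruction kinds
/// accepted by CSEDenseMapInfo::canHandle are considered).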
3411static void cse(BasicBlock *BB) {
3412 // Perform simple cse.
3413 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3414 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3415 if (!CSEDenseMapInfo::canHandle(&In))
3416 continue;
3417
3418 // Check if we can replace this instruction with any of the
3419 // visited instructions.
3420 if (Instruction *V = CSEMap.lookup(&In)) {
3421 In.replaceAllUsesWith(V);
3422 In.eraseFromParent();
3423 continue;
3424 }
3425
3426 CSEMap[&In] = &In;
3427 }
3428}
3429
3430InstructionCost
3431LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3432 bool &NeedToScalarize) const {
3433 Function *F = CI->getCalledFunction();
3434 Type *ScalarRetTy = CI->getType();
3435 SmallVector<Type *, 4> Tys, ScalarTys;
3436 for (auto &ArgOp : CI->args())
3437 ScalarTys.push_back(ArgOp->getType());
3438
3439 // Estimate cost of scalarized vector call. The source operands are assumed
3440 // to be vectors, so we need to extract individual elements from there,
3441 // execute VF scalar calls, and then gather the result into the vector return
3442 // value.
3443 InstructionCost ScalarCallCost =
3444 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3445 if (VF.isScalar())
3446 return ScalarCallCost;
3447
3448 // Compute corresponding vector type for return value and arguments.
3449 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3450 for (Type *ScalarTy : ScalarTys)
3451 Tys.push_back(ToVectorTy(ScalarTy, VF));
3452
3453 // Compute costs of unpacking argument values for the scalar calls and
3454 // packing the return values to a vector.
3455 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3456
3457 InstructionCost Cost =
3458 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
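  // For example, with hypothetical costs: a scalar call of cost 10 at VF = 4
  // with a scalarization overhead of 12 gives an estimate of 10 * 4 + 12 = 52.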
3459
3460 // If we can't emit a vector call for this function, then the currently found
3461 // cost is the cost we need to return.
3462 NeedToScalarize = true;
3463 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3464 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3465
3466 if (!TLI || CI->isNoBuiltin() || !VecFunc)
3467 return Cost;
3468
3469 // If the corresponding vector cost is cheaper, return its cost.
3470 InstructionCost VectorCallCost =
3471 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3472 if (VectorCallCost < Cost) {
3473 NeedToScalarize = false;
3474 Cost = VectorCallCost;
3475 }
3476 return Cost;
3477}
3478
3479static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3480 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3481 return Elt;
3482 return VectorType::get(Elt, VF);
3483}
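// For example, MaybeVectorizeType(i32, VF = 4) yields <4 x i32>, while a
// scalar VF or a non-integer, non-pointer, non-floating-point element type
// returns the type unchanged.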
3484
3485InstructionCost
3486LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3487 ElementCount VF) const {
3488 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3489   assert(ID && "Expected intrinsic call!");
3490 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3491 FastMathFlags FMF;
3492 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3493 FMF = FPMO->getFastMathFlags();
3494
3495 SmallVector<const Value *> Arguments(CI->args());
3496 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3497 SmallVector<Type *> ParamTys;
3498 std::transform(FTy->param_begin(), FTy->param_end(),
3499 std::back_inserter(ParamTys),
3500 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3501
3502 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3503 dyn_cast<IntrinsicInst>(CI));
3504 return TTI.getIntrinsicInstrCost(CostAttrs,
3505 TargetTransformInfo::TCK_RecipThroughput);
3506}
3507
3508static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3509 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3510 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3511 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3512}
3513
3514static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3515 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3516 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3517 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3518}
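// For example, given <4 x i8> and <4 x i32>, smallestIntegerVectorType
// returns <4 x i8> and largestIntegerVectorType returns <4 x i32>.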
3519
3520void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3521 // For every instruction `I` in MinBWs, truncate the operands, create a
3522 // truncated version of `I` and reextend its result. InstCombine runs
3523 // later and will remove any ext/trunc pairs.
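  // A minimal illustration: an i32 operation whose minimal bitwidth is 8 has
  // its operands truncated to <VF x i8>, is re-created in i8, and its result
  // zero-extended back to <VF x i32>.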
3524 SmallPtrSet<Value *, 4> Erased;
3525 for (const auto &KV : Cost->getMinimalBitwidths()) {
3526 // If the value wasn't vectorized, we must maintain the original scalar
3527 // type. The absence of the value from State indicates that it
3528 // wasn't vectorized.
3529 // FIXME: Should not rely on getVPValue at this point.
3530 VPValue *Def = State.Plan->getVPValue(KV.first, true);
3531 if (!State.hasAnyVectorValue(Def))
3532 continue;
3533 for (unsigned Part = 0; Part < UF; ++Part) {
3534 Value *I = State.get(Def, Part);
3535 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3536 continue;
3537 Type *OriginalTy = I->getType();
3538 Type *ScalarTruncatedTy =
3539 IntegerType::get(OriginalTy->getContext(), KV.second);
3540 auto *TruncatedTy = VectorType::get(
3541 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3542 if (TruncatedTy == OriginalTy)
3543 continue;
3544
3545 IRBuilder<> B(cast<Instruction>(I));
3546 auto ShrinkOperand = [&](Value *V) -> Value * {
3547 if (auto *ZI = dyn_cast<ZExtInst>(V))
3548 if (ZI->getSrcTy() == TruncatedTy)
3549 return ZI->getOperand(0);
3550 return B.CreateZExtOrTrunc(V, TruncatedTy);
3551 };
3552
3553 // The actual instruction modification depends on the instruction type,
3554 // unfortunately.
3555 Value *NewI = nullptr;
3556 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3557 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3558 ShrinkOperand(BO->getOperand(1)));
3559
3560 // Any wrapping introduced by shrinking this operation shouldn't be
3561 // considered undefined behavior. So, we can't unconditionally copy
3562 // arithmetic wrapping flags to NewI.
3563 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3564 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3565 NewI =
3566 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3567 ShrinkOperand(CI->getOperand(1)));
3568 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3569 NewI = B.CreateSelect(SI->getCondition(),
3570 ShrinkOperand(SI->getTrueValue()),
3571 ShrinkOperand(SI->getFalseValue()));
3572 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3573 switch (CI->getOpcode()) {
3574 default:
3575           llvm_unreachable("Unhandled cast!");
3576 case Instruction::Trunc:
3577 NewI = ShrinkOperand(CI->getOperand(0));
3578 break;
3579 case Instruction::SExt:
3580 NewI = B.CreateSExtOrTrunc(
3581 CI->getOperand(0),
3582 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3583 break;
3584 case Instruction::ZExt:
3585 NewI = B.CreateZExtOrTrunc(
3586 CI->getOperand(0),
3587 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3588 break;
3589 }
3590 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3591 auto Elements0 =
3592 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3593 auto *O0 = B.CreateZExtOrTrunc(
3594 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3595 auto Elements1 =
3596 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3597 auto *O1 = B.CreateZExtOrTrunc(
3598 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3599
3600 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3601 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3602 // Don't do anything with the operands, just extend the result.
3603 continue;
3604 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3605 auto Elements =
3606 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3607 auto *O0 = B.CreateZExtOrTrunc(
3608 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3609 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3610 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3611 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3612 auto Elements =
3613 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3614 auto *O0 = B.CreateZExtOrTrunc(
3615 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3616         NewI = B.CreateExtractElement(O0, EE->getOperand(1));
3617 } else {
3618 // If we don't know what to do, be conservative and don't do anything.
3619 continue;
3620 }
3621
3622 // Lastly, extend the result.
3623 NewI->takeName(cast<Instruction>(I));
3624 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3625 I->replaceAllUsesWith(Res);
3626 cast<Instruction>(I)->eraseFromParent();
3627 Erased.insert(I);
3628 State.reset(Def, Res, Part);
3629 }
3630 }
3631
3632 // We'll have created a bunch of ZExts that are now parentless. Clean up.
3633 for (const auto &KV : Cost->getMinimalBitwidths()) {
3634 // If the value wasn't vectorized, we must maintain the original scalar
3635 // type. The absence of the value from State indicates that it
3636 // wasn't vectorized.
3637 // FIXME: Should not rely on getVPValue at this point.
3638 VPValue *Def = State.Plan->getVPValue(KV.first, true);
3639 if (!State.hasAnyVectorValue(Def))
3640 continue;
3641 for (unsigned Part = 0; Part < UF; ++Part) {
3642 Value *I = State.get(Def, Part);
3643 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3644 if (Inst && Inst->use_empty()) {
3645 Value *NewI = Inst->getOperand(0);
3646 Inst->eraseFromParent();
3647 State.reset(Def, NewI, Part);
3648 }
3649 }
3650 }
3651}
3652
3653void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3654 VPlan &Plan) {
3655 // Insert truncates and extends for any truncated instructions as hints to
3656 // InstCombine.
3657 if (VF.isVector())
3658 truncateToMinimalBitwidths(State);
3659
3660 // Fix widened non-induction PHIs by setting up the PHI operands.
3661 if (EnableVPlanNativePath)
3662 fixNonInductionPHIs(Plan, State);
3663
3664 // At this point every instruction in the original loop is widened to a
3665 // vector form. Now we need to fix the recurrences in the loop. These PHI
3666 // nodes are currently empty because we did not want to introduce cycles.
3667 // This is the second stage of vectorizing recurrences.
3668 fixCrossIterationPHIs(State);
3669
3670 // Forget the original basic block.
3671 PSE.getSE()->forgetLoop(OrigLoop);
3672
3673 VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
3674 Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3675 if (Cost->requiresScalarEpilogue(VF)) {
3676 // No edge from the middle block to the unique exit block has been inserted
3677 // and there is nothing to fix from vector loop; phis should have incoming
3678 // from scalar loop only.
3679 Plan.clearLiveOuts();
3680 } else {
3681 // If we inserted an edge from the middle block to the unique exit block,
3682 // update uses outside the loop (phis) to account for the newly inserted
3683 // edge.
3684
3685 // Fix-up external users of the induction variables.
3686 for (const auto &Entry : Legal->getInductionVars())
3687 fixupIVUsers(Entry.first, Entry.second,
3688 getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3689 IVEndValues[Entry.first], LoopMiddleBlock,
3690 VectorLoop->getHeader(), Plan);
3691 }
3692
3693 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3694 // in the exit block, so update the builder.
3695 State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
3696 for (const auto &KV : Plan.getLiveOuts())
3697 KV.second->fixPhi(Plan, State);
3698
3699 for (Instruction *PI : PredicatedInstructions)
3700 sinkScalarOperands(&*PI);
3701
3702 // Remove redundant induction instructions.
3703 cse(VectorLoop->getHeader());
3704
3705 // Set/update profile weights for the vector and remainder loops as original
3706 // loop iterations are now distributed among them. Note that original loop
3707 // represented by LoopScalarBody becomes remainder loop after vectorization.
3708 //
3709   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3710   // end up getting a slightly roughened result but that should be OK since
3711 // profile is not inherently precise anyway. Note also possible bypass of
3712 // vector code caused by legality checks is ignored, assigning all the weight
3713 // to the vector loop, optimistically.
3714 //
3715 // For scalable vectorization we can't know at compile time how many iterations
3716 // of the loop are handled in one vector iteration, so instead assume a pessimistic
3717 // vscale of '1'.
3718 setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3719 LI->getLoopFor(LoopScalarBody),
3720 VF.getKnownMinValue() * UF);
3721}
3722
3723void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3724 // In order to support recurrences we need to be able to vectorize Phi nodes.
3725 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3726 // stage #2: We now need to fix the recurrences by adding incoming edges to
3727 // the currently empty PHI nodes. At this point every instruction in the
3728 // original loop is widened to a vector form so we can use them to construct
3729 // the incoming edges.
3730 VPBasicBlock *Header =
3731 State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3732 for (VPRecipeBase &R : Header->phis()) {
3733 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3734 fixReduction(ReductionPhi, State);
3735 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3736 fixFixedOrderRecurrence(FOR, State);
3737 }
3738}
3739
3740void InnerLoopVectorizer::fixFixedOrderRecurrence(
3741 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3742 // This is the second phase of vectorizing first-order recurrences. An
3743 // overview of the transformation is described below. Suppose we have the
3744 // following loop.
3745 //
3746 // for (int i = 0; i < n; ++i)
3747 // b[i] = a[i] - a[i - 1];
3748 //
3749 // There is a first-order recurrence on "a". For this loop, the shorthand
3750 // scalar IR looks like:
3751 //
3752 // scalar.ph:
3753 // s_init = a[-1]
3754 // br scalar.body
3755 //
3756 // scalar.body:
3757 // i = phi [0, scalar.ph], [i+1, scalar.body]
3758 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3759 // s2 = a[i]
3760 // b[i] = s2 - s1
3761 // br cond, scalar.body, ...
3762 //
3763   // In this example, s1 is a recurrence because its value depends on the
3764 // previous iteration. In the first phase of vectorization, we created a
3765 // vector phi v1 for s1. We now complete the vectorization and produce the
3766 // shorthand vector IR shown below (for VF = 4, UF = 1).
3767 //
3768 // vector.ph:
3769 // v_init = vector(..., ..., ..., a[-1])
3770 // br vector.body
3771 //
3772 // vector.body
3773 // i = phi [0, vector.ph], [i+4, vector.body]
3774 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3775 // v2 = a[i, i+1, i+2, i+3];
3776 // v3 = vector(v1(3), v2(0, 1, 2))
3777 // b[i, i+1, i+2, i+3] = v2 - v3
3778 // br cond, vector.body, middle.block
3779 //
3780 // middle.block:
3781 // x = v2(3)
3782 // br scalar.ph
3783 //
3784 // scalar.ph:
3785 // s_init = phi [x, middle.block], [a[-1], otherwise]
3786 // br scalar.body
3787 //
3788 // After execution completes the vector loop, we extract the next value of
3789 // the recurrence (x) to use as the initial value in the scalar loop.
3790
3791 // Extract the last vector element in the middle block. This will be the
3792 // initial value for the recurrence when jumping to the scalar loop.
3793 VPValue *PreviousDef = PhiR->getBackedgeValue();
3794 Value *Incoming = State.get(PreviousDef, UF - 1);
3795 auto *ExtractForScalar = Incoming;
3796 auto *IdxTy = Builder.getInt32Ty();
3797 if (VF.isVector()) {
3798 auto *One = ConstantInt::get(IdxTy, 1);
3799 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3800 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3801 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3802 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3803 "vector.recur.extract");
3804 }
3805 // Extract the second last element in the middle block if the
3806 // Phi is used outside the loop. We need to extract the phi itself
3807 // and not the last element (the phi update in the current iteration). This
3808 // will be the value when jumping to the exit block from the LoopMiddleBlock,
3809 // when the scalar loop is not run at all.
3810 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3811 if (VF.isVector()) {
3812 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3813 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3814 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3815 Incoming, Idx, "vector.recur.extract.for.phi");
3816 } else if (UF > 1)
3817     // When the loop is unrolled without vectorizing, initialize
3818     // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value
3819 // of `Incoming`. This is analogous to the vectorized case above: extracting
3820 // the second last element when VF > 1.
3821 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3822
3823 // Fix the initial value of the original recurrence in the scalar loop.
3824 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3825 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3826 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3827 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3828 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3829 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3830 Start->addIncoming(Incoming, BB);
3831 }
3832
3833 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3834 Phi->setName("scalar.recur");
3835
3836 // Finally, fix users of the recurrence outside the loop. The users will need
3837 // either the last value of the scalar recurrence or the last value of the
3838 // vector recurrence we extracted in the middle block. Since the loop is in
3839 // LCSSA form, we just need to find all the phi nodes for the original scalar
3840 // recurrence in the exit block, and then add an edge for the middle block.
3841 // Note that LCSSA does not imply single entry when the original scalar loop
3842 // had multiple exiting edges (as we always run the last iteration in the
3843   // scalar epilogue); in that case, there is no edge from middle to exit,
3844   // and thus no phis which need to be updated.
3845 if (!Cost->requiresScalarEpilogue(VF))
3846 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3847 if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3848 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3849 State.Plan->removeLiveOut(&LCSSAPhi);
3850 }
3851}
3852
3853void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3854 VPTransformState &State) {
3855 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3856   // Get its reduction variable descriptor.
3857   assert(Legal->isReductionVariable(OrigPhi) &&
3858          "Unable to find the reduction variable");
3859 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3860
3861 RecurKind RK = RdxDesc.getRecurrenceKind();
3862 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3863 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3864 State.setDebugLocFromInst(ReductionStartValue);
3865
3866 VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3867 // This is the vector-clone of the value that leaves the loop.
3868 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3869
3870 // Wrap flags are in general invalid after vectorization, clear them.
3871 clearReductionWrapFlags(PhiR, State);
3872
3873 // Before each round, move the insertion point right between
3874 // the PHIs and the values we are going to write.
3875 // This allows us to write both PHINodes and the extractelement
3876 // instructions.
3877 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3878
3879 State.setDebugLocFromInst(LoopExitInst);
3880
3881 Type *PhiTy = OrigPhi->getType();
3882
3883 VPBasicBlock *LatchVPBB =
3884 PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
3885 BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
3886 // If tail is folded by masking, the vector value to leave the loop should be
3887 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3888 // instead of the former. For an inloop reduction the reduction will already
3889 // be predicated, and does not need to be handled here.
3890 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3891 for (unsigned Part = 0; Part < UF; ++Part) {
3892 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3893 SelectInst *Sel = nullptr;
3894 for (User *U : VecLoopExitInst->users()) {
3895 if (isa<SelectInst>(U)) {
3896           assert(!Sel && "Reduction exit feeding two selects");
3897 Sel = cast<SelectInst>(U);
3898 } else
3899           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3900 }
3901       assert(Sel && "Reduction exit feeds no select");
3902 State.reset(LoopExitInstDef, Sel, Part);
3903
3904 if (isa<FPMathOperator>(Sel))
3905 Sel->setFastMathFlags(RdxDesc.getFastMathFlags());
3906
3907 // If the target can create a predicated operator for the reduction at no
3908 // extra cost in the loop (for example a predicated vadd), it can be
3909 // cheaper for the select to remain in the loop than be sunk out of it,
3910 // and so use the select value for the phi instead of the old
3911 // LoopExitValue.
3912 if (PreferPredicatedReductionSelect ||
3913 TTI->preferPredicatedReductionSelect(
3914 RdxDesc.getOpcode(), PhiTy,
3915 TargetTransformInfo::ReductionFlags())) {
3916 auto *VecRdxPhi =
3917 cast<PHINode>(State.get(PhiR, Part));
3918 VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3919 }
3920 }
3921 }
3922
3923 // If the vector reduction can be performed in a smaller type, we truncate
3924 // then extend the loop exit value to enable InstCombine to evaluate the
3925 // entire expression in the smaller type.
3926 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3927     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3928 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3929 Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3930 VectorParts RdxParts(UF);
3931 for (unsigned Part = 0; Part < UF; ++Part) {
3932 RdxParts[Part] = State.get(LoopExitInstDef, Part);
3933 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3934 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3935 : Builder.CreateZExt(Trunc, VecTy);
3936 for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3937 if (U != Trunc) {
3938 U->replaceUsesOfWith(RdxParts[Part], Extnd);
3939 RdxParts[Part] = Extnd;
3940 }
3941 }
3942 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3943 for (unsigned Part = 0; Part < UF; ++Part) {
3944 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3945 State.reset(LoopExitInstDef, RdxParts[Part], Part);
3946 }
3947 }
3948
3949 // Reduce all of the unrolled parts into a single vector.
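  // For example, with UF = 3 and an add reduction, the parts are combined as
  // ((part0 + part1) + part2) before the final across-lanes reduction below.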
3950 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3951 unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3952
3953 // The middle block terminator has already been assigned a DebugLoc here (the
3954 // OrigLoop's single latch terminator). We want the whole middle block to
3955 // appear to execute on this line because: (a) it is all compiler generated,
3956 // (b) these instructions are always executed after evaluating the latch
3957 // conditional branch, and (c) other passes may add new predecessors which
3958 // terminate on this line. This is the easiest way to ensure we don't
3959 // accidentally cause an extra step back into the loop while debugging.
3960 State.setDebugLocFromInst(LoopMiddleBlock->getTerminator());
3961 if (PhiR->isOrdered())
3962 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3963 else {
3964 // Floating-point operations should have some FMF to enable the reduction.
3965 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3966 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3967 for (unsigned Part = 1; Part < UF; ++Part) {
3968 Value *RdxPart = State.get(LoopExitInstDef, Part);
3969 if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
3970 ReducedPartRdx = Builder.CreateBinOp(
3971 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3972 } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
3973 ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
3974 ReducedPartRdx, RdxPart);
3975 else
3976 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
3977 }
3978 }
3979
3980 // Create the reduction after the loop. Note that inloop reductions create the
3981 // target reduction in the loop using a Reduction recipe.
3982 if (VF.isVector() && !PhiR->isInLoop()) {
3983 ReducedPartRdx =
3984 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
3985 // If the reduction can be performed in a smaller type, we need to extend
3986 // the reduction to the wider type before we branch to the original loop.
3987 if (PhiTy != RdxDesc.getRecurrenceType())
3988 ReducedPartRdx = RdxDesc.isSigned()
3989 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
3990 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
3991 }
3992
3993 PHINode *ResumePhi =
3994 dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
3995
3996 // Create a phi node that merges control-flow from the backedge-taken check
3997 // block and the middle block.
3998 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
3999 LoopScalarPreHeader->getTerminator());
4000
4001 // If we are fixing reductions in the epilogue loop then we should already
4002 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4003 // we carry over the incoming values correctly.
4004 for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4005 if (Incoming == LoopMiddleBlock)
4006 BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4007 else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4008 BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4009 Incoming);
4010 else
4011 BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4012 }
4013
4014 // Set the resume value for this reduction
4015 ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4016
4017 // If there were stores of the reduction value to a uniform memory address
4018 // inside the loop, create the final store here.
4019 if (StoreInst *SI = RdxDesc.IntermediateStore) {
4020 StoreInst *NewSI =
4021 Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
4022 propagateMetadata(NewSI, SI);
4023
4024 // If the reduction value is used in other places,
4025 // then let the code below create PHI's for that.
4026 }
4027
4028 // Now, we need to fix the users of the reduction variable
4029 // inside and outside of the scalar remainder loop.
4030
4031 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4032 // in the exit blocks. See comment on analogous loop in
4033   // fixFixedOrderRecurrence for a more complete explanation of the logic.
4034 if (!Cost->requiresScalarEpilogue(VF))
4035 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4036 if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
4037 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4038 State.Plan->removeLiveOut(&LCSSAPhi);
4039 }
4040
4041 // Fix the scalar loop reduction variable with the incoming reduction sum
4042 // from the vector body and from the backedge value.
4043 int IncomingEdgeBlockIdx =
4044 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4045   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4046 // Pick the other block.
4047 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4048 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4049 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4050}
4051
4052void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
4053 VPTransformState &State) {
4054 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4055 RecurKind RK = RdxDesc.getRecurrenceKind();
4056 if (RK != RecurKind::Add && RK != RecurKind::Mul)
4057 return;
4058
4059 SmallVector<VPValue *, 8> Worklist;
4060 SmallPtrSet<VPValue *, 8> Visited;
4061 Worklist.push_back(PhiR);
4062 Visited.insert(PhiR);
4063
4064 while (!Worklist.empty()) {
4065 VPValue *Cur = Worklist.pop_back_val();
4066 for (unsigned Part = 0; Part < UF; ++Part) {
4067 Value *V = State.get(Cur, Part);
4068 if (!isa<OverflowingBinaryOperator>(V))
4069 break;
4070 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4071 }
4072
4073 for (VPUser *U : Cur->users()) {
4074 auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
4075 if (!UserRecipe)
4076 continue;
4077 for (VPValue *V : UserRecipe->definedValues())
4078 if (Visited.insert(V).second)
4079 Worklist.push_back(V);
4080 }
4081 }
4082}
4083
4084void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4085 // The basic block and loop containing the predicated instruction.
4086 auto *PredBB = PredInst->getParent();
4087 auto *VectorLoop = LI->getLoopFor(PredBB);
4088
4089 // Initialize a worklist with the operands of the predicated instruction.
4090 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4091
4092 // Holds instructions that we need to analyze again. An instruction may be
4093 // reanalyzed if we don't yet know if we can sink it or not.
4094 SmallVector<Instruction *, 8> InstsToReanalyze;
4095
4096 // Returns true if a given use occurs in the predicated block. Phi nodes use
4097 // their operands in their corresponding predecessor blocks.
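  // For example, for 'phi [ %x, %pred.bb ], [ %y, %other.bb ]' (names
  // hypothetical), the use of %x is treated as occurring in %pred.bb rather
  // than in the phi's own block.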
4098 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4099 auto *I = cast<Instruction>(U.getUser());
4100 BasicBlock *BB = I->getParent();
4101 if (auto *Phi = dyn_cast<PHINode>(I))
4102 BB = Phi->getIncomingBlock(
4103 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4104 return BB == PredBB;
4105 };
4106
4107 // Iteratively sink the scalarized operands of the predicated instruction
4108   // into the block we created for it. When an instruction is sunk, its
4109   // operands are then added to the worklist. The algorithm ends when a full
4110   // pass over the worklist fails to sink a single instruction.
4111 bool Changed;
4112 do {
4113 // Add the instructions that need to be reanalyzed to the worklist, and
4114 // reset the changed indicator.
4115 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4116 InstsToReanalyze.clear();
4117 Changed = false;
4118
4119 while (!Worklist.empty()) {
4120 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4121
4122 // We can't sink an instruction if it is a phi node, is not in the loop,
4123 // or may have side effects.
4124 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4125 I->mayHaveSideEffects())
4126 continue;
4127
4128 // If the instruction is already in PredBB, check if we can sink its
4129 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4130 // sinking the scalar instruction I, hence it appears in PredBB; but it
4131 // may have failed to sink I's operands (recursively), which we try
4132 // (again) here.
4133 if (I->getParent() == PredBB) {
4134 Worklist.insert(I->op_begin(), I->op_end());
4135 continue;
4136 }
4137
4138 // It's legal to sink the instruction if all its uses occur in the
4139 // predicated block. Otherwise, there's nothing to do yet, and we may
4140 // need to reanalyze the instruction.
4141 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4142 InstsToReanalyze.push_back(I);
4143 continue;
4144 }
4145
4146 // Move the instruction to the beginning of the predicated block, and add
4147       // its operands to the worklist.
4148 I->moveBefore(&*PredBB->getFirstInsertionPt());
4149 Worklist.insert(I->op_begin(), I->op_end());
4150
4151 // The sinking may have enabled other instructions to be sunk, so we will
4152 // need to iterate.
4153 Changed = true;
4154 }
4155 } while (Changed);
4156}
4157
4158void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
4159 VPTransformState &State) {
4160 auto Iter = depth_first(
4161 VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry()));
4162 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4163 for (VPRecipeBase &P : VPBB->phis()) {
4164 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
4165 if (!VPPhi)
4166 continue;
4167 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4168 // Make sure the builder has a valid insert point.
4169 Builder.SetInsertPoint(NewPhi);
4170 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4171 VPValue *Inc = VPPhi->getIncomingValue(i);
4172 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4173 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4174 }
4175 }
4176 }
4177}
4178
4179bool InnerLoopVectorizer::useOrderedReductions(
4180 const RecurrenceDescriptor &RdxDesc) {
4181 return Cost->useOrderedReductions(RdxDesc);
4182}
4183
4184void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4185 // We should not collect Scalars more than once per VF. Right now, this
4186 // function is called from collectUniformsAndScalars(), which already does
4187 // this check. Collecting Scalars for VF=1 does not make any sense.
4188   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4189          "This function should not be visited twice for the same VF");
4190
4191 // This avoids any chances of creating a REPLICATE recipe during planning
4192 // since that would result in generation of scalarized code during execution,
4193 // which is not supported for scalable vectors.
4194 if (VF.isScalable()) {
4195 Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
4196 return;
4197 }
4198
4199 SmallSetVector<Instruction *, 8> Worklist;
4200
4201 // These sets are used to seed the analysis with pointers used by memory
4202 // accesses that will remain scalar.
4203 SmallSetVector<Instruction *, 8> ScalarPtrs;
4204 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4205 auto *Latch = TheLoop->getLoopLatch();
4206
4207 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4208 // The pointer operands of loads and stores will be scalar as long as the
4209 // memory access is not a gather or scatter operation. The value operand of a
4210 // store will remain scalar if the store is scalarized.
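  // For example, a consecutive widened store keeps a single scalar pointer
  // for the whole vector, so its pointer operand is a scalar use; a
  // gather/scatter consumes a vector of pointers, so its pointer operand is
  // not (a sketch of the decision encoded below).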
4211 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4212 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4213     assert(WideningDecision != CM_Unknown &&
4214            "Widening decision should be ready at this moment");
4215 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4216 if (Ptr == Store->getValueOperand())
4217 return WideningDecision == CM_Scalarize;
4218     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4219            "Ptr is neither a value or pointer operand");
4220 return WideningDecision != CM_GatherScatter;
4221 };
4222
4223 // A helper that returns true if the given value is a bitcast or
4224 // getelementptr instruction contained in the loop.
4225 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4226 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4227 isa<GetElementPtrInst>(V)) &&
4228 !TheLoop->isLoopInvariant(V);
4229 };
4230
4231 // A helper that evaluates a memory access's use of a pointer. If the use will
4232 // be a scalar use and the pointer is only used by memory accesses, we place
4233 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4234 // PossibleNonScalarPtrs.
4235 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4236 // We only care about bitcast and getelementptr instructions contained in
4237 // the loop.
4238 if (!isLoopVaryingBitCastOrGEP(Ptr))
4239 return;
4240
4241 // If the pointer has already been identified as scalar (e.g., if it was
4242 // also identified as uniform), there's nothing to do.
4243 auto *I = cast<Instruction>(Ptr);
4244 if (Worklist.count(I))
4245 return;
4246
4247 // If the use of the pointer will be a scalar use, and all users of the
4248 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4249 // place the pointer in PossibleNonScalarPtrs.
4250 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4251 return isa<LoadInst>(U) || isa<StoreInst>(U);
4252 }))
4253 ScalarPtrs.insert(I);
4254 else
4255 PossibleNonScalarPtrs.insert(I);
4256 };
4257
4258 // We seed the scalars analysis with two classes of instructions: (1)
4259 // instructions marked uniform-after-vectorization and (2) bitcast,
4260 // getelementptr and (pointer) phi instructions used by memory accesses
4261 // requiring a scalar use.
4262 //
4263 // (1) Add to the worklist all instructions that have been identified as
4264 // uniform-after-vectorization.
4265 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4266
4267 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4268 // memory accesses requiring a scalar use. The pointer operands of loads and
4269 // stores will be scalar as long as the memory access is not a gather or
4270 // scatter operation. The value operand of a store will remain scalar if the
4271 // store is scalarized.
4272 for (auto *BB : TheLoop->blocks())
4273 for (auto &I : *BB) {
4274 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4275 evaluatePtrUse(Load, Load->getPointerOperand());
4276 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4277 evaluatePtrUse(Store, Store->getPointerOperand());
4278 evaluatePtrUse(Store, Store->getValueOperand());
4279 }
4280 }
4281 for (auto *I : ScalarPtrs)
4282 if (!PossibleNonScalarPtrs.count(I)) {
4283       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4284 Worklist.insert(I);
4285 }
4286
4287 // Insert the forced scalars.
4288 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
4289 // induction variable when the PHI user is scalarized.
4290 auto ForcedScalar = ForcedScalars.find(VF);
4291 if (ForcedScalar != ForcedScalars.end())
4292 for (auto *I : ForcedScalar->second) {
4293       LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
4294 Worklist.insert(I);
4295 }
4296
4297 // Expand the worklist by looking through any bitcasts and getelementptr
4298 // instructions we've already identified as scalar. This is similar to the
4299 // expansion step in collectLoopUniforms(); however, here we're only
4300 // expanding to include additional bitcasts and getelementptr instructions.
4301 unsigned Idx = 0;
4302 while (Idx != Worklist.size()) {
4303 Instruction *Dst = Worklist[Idx++];
4304 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4305 continue;
4306 auto *Src = cast<Instruction>(Dst->getOperand(0));
4307 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4308 auto *J = cast<Instruction>(U);
4309 return !TheLoop->contains(J) || Worklist.count(J) ||
4310 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4311 isScalarUse(J, Src));
4312 })) {
4313 Worklist.insert(Src);
4314       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4315 }
4316 }
4317
4318 // An induction variable will remain scalar if all users of the induction
4319 // variable and induction variable update remain scalar.
4320 for (const auto &Induction : Legal->getInductionVars()) {
4321 auto *Ind = Induction.first;
4322 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4323
4324 // If tail-folding is applied, the primary induction variable will be used
4325 // to feed a vector compare.
4326 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4327 continue;
4328
4329 // Returns true if \p Indvar is a pointer induction that is used directly by
4330 // load/store instruction \p I.
4331 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4332 Instruction *I) {
4333 return Induction.second.getKind() ==
4334 InductionDescriptor::IK_PtrInduction &&
4335 (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4336 Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4337 };
4338
4339 // Determine if all users of the induction variable are scalar after
4340 // vectorization.
4341 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4342 auto *I = cast<Instruction>(U);
4343 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4344 IsDirectLoadStoreFromPtrIndvar(Ind, I);
4345 });
4346 if (!ScalarInd)
4347 continue;
4348
4349 // Determine if all users of the induction variable update instruction are
4350 // scalar after vectorization.
4351 auto ScalarIndUpdate =
4352 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4353 auto *I = cast<Instruction>(U);
4354 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4355 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4356 });
4357 if (!ScalarIndUpdate)
4358 continue;
4359
4360 // The induction variable and its update instruction will remain scalar.
4361 Worklist.insert(Ind);
4362 Worklist.insert(IndUpdate);
4363     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4364     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4365                << "\n");
4366 }
4367
4368 Scalars[VF].insert(Worklist.begin(), Worklist.end());
4369}
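
[Editor's note] The two expansion loops above lean on a property of SmallSetVector: iteration order is insertion order and insert() deduplicates, so comparing an index against the live size naturally picks up elements appended mid-scan until a fixpoint is reached. A standalone sketch of that idiom using std containers (the seed value and the expansion rule are invented for illustration):

#include <cstdio>
#include <unordered_set>
#include <vector>

// A tiny insertion-ordered set: push_back only if not seen before,
// approximating llvm::SmallSetVector for this sketch.
struct SetVector {
  std::vector<int> Items;
  std::unordered_set<int> Seen;
  bool insert(int V) {
    if (!Seen.insert(V).second)
      return false;
    Items.push_back(V);
    return true;
  }
};

int main() {
  SetVector Worklist;
  Worklist.insert(8); // seed, like the uniforms seeding Scalars above

  // Process elements in order; anything inserted mid-loop is picked up
  // because the index is compared against the live size each iteration.
  for (size_t Idx = 0; Idx != Worklist.Items.size(); ++Idx) {
    int V = Worklist.Items[Idx];
    std::printf("visiting %d\n", V);
    if (V > 1)
      Worklist.insert(V / 2); // hypothetical expansion rule
  }
}
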
4370
4371bool LoopVectorizationCostModel::isScalarWithPredication(
4372 Instruction *I, ElementCount VF) const {
4373 if (!isPredicatedInst(I))
4374 return false;
4375
4376 // Do we have a non-scalar lowering for this predicated
4377 // instruction? No - it is scalar with predication.
4378 switch(I->getOpcode()) {
4379 default:
4380 return true;
4381 case Instruction::Load:
4382 case Instruction::Store: {
4383 auto *Ptr = getLoadStorePointerOperand(I);
4384 auto *Ty = getLoadStoreType(I);
4385 Type *VTy = Ty;
4386 if (VF.isVector())
4387 VTy = VectorType::get(Ty, VF);
4388 const Align Alignment = getLoadStoreAlignment(I);
4389 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4390 TTI.isLegalMaskedGather(VTy, Alignment))
4391 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4392 TTI.isLegalMaskedScatter(VTy, Alignment));
4393 }
4394 case Instruction::UDiv:
4395 case Instruction::SDiv:
4396 case Instruction::SRem:
4397 case Instruction::URem: {
4398 // We have the option to use the safe-divisor idiom to avoid predication.
4399 // The cost based decision here will always select safe-divisor for
4400 // scalable vectors as scalarization isn't legal.
4401 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
4402 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
4403 }
4404 }
4405}
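
[Editor's note] For the div/rem case above, the safe-divisor idiom avoids predication by substituting a harmless divisor in inactive lanes via a select, so the divide can execute unconditionally across the whole vector. A scalar-level sketch of the transformation (illustrative only; the vectorizer emits the vector form through VPlan recipes):

#include <cstdio>

int main() {
  int A[4] = {10, 20, 30, 40}, D[4] = {2, 0, 5, 0}, R[4] = {};
  bool M[4] = {true, false, true, false}; // lane mask

  for (int I = 0; I < 4; ++I) {
    // Predicated source would divide only where the mask is set.
    // Safe-divisor form: pick 1 for inactive lanes so the divide is
    // always well defined, then keep the old value where masked off.
    int Den = M[I] ? D[I] : 1;
    int Q = A[I] / Den;
    R[I] = M[I] ? Q : R[I];
  }
  for (int V : R)
    std::printf("%d ", V);
  std::printf("\n");
}
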
4406
4407bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
4408 if (!blockNeedsPredicationForAnyReason(I->getParent()))
4409 return false;
4410
4411 // Can we prove this instruction is safe to unconditionally execute?
4412 // If not, we must use some form of predication.
4413 switch(I->getOpcode()) {
4414 default:
4415 return false;
4416 case Instruction::Load:
4417 case Instruction::Store: {
4418 if (!Legal->isMaskRequired(I))
4419 return false;
4420 // When we know the load's address is loop invariant and the instruction
4421 // in the original scalar loop was unconditionally executed, then we
4422 // don't need to mark it as a predicated instruction. Tail folding may
4423 // introduce additional predication, but we're guaranteed to always have
4424 // at least one active lane. We call Legal->blockNeedsPredication here
4425 // because it doesn't query tail-folding. For stores, we need to prove
4426 // both speculation safety (which follows from the same argument as loads)
4427 // and that the value being stored is correct. The easiest
4428 // form of the latter is to require that all values stored are the same.
4429 if (Legal->isUniformMemOp(*I) &&
4430 (isa<LoadInst>(I) ||
4431 (isa<StoreInst>(I) &&
4432 TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
4433 !Legal->blockNeedsPredication(I->getParent()))
4434 return false;
4435 return true;
4436 }
4437 case Instruction::UDiv:
4438 case Instruction::SDiv:
4439 case Instruction::SRem:
4440 case Instruction::URem:
4441 // TODO: We can use the loop-preheader as the context point here and get
4442 // context-sensitive reasoning.
4443 return !isSafeToSpeculativelyExecute(I);
4444 }
4445}
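
[Editor's note] The store condition above separates two situations: storing the same loop-invariant value to one address (safe to execute without a mask, since any active lane stores the correct value) and storing an iteration-dependent value (where only the last active lane is correct, so predication must stay). A small hypothetical illustration of the two cases:

#include <cstdio>

int main() {
  int Sink = 0, X = 7;

  // Case 1: value is loop-invariant; executing the store for every lane,
  // masked or not, still leaves the correct value behind.
  for (int I = 0; I < 8; ++I)
    Sink = X;

  // Case 2: value varies with I; only the final active iteration's value
  // is correct, so the vectorized store must remain predicated.
  for (int I = 0; I < 8; ++I)
    Sink = I;

  std::printf("%d\n", Sink);
}
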
4446
4447std::pair<InstructionCost, InstructionCost>
4448LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
4449 ElementCount VF) const {
4450   assert(I->getOpcode() == Instruction::UDiv ||
4451          I->getOpcode() == Instruction::SDiv ||
4452          I->getOpcode() == Instruction::SRem ||
4453          I->getOpcode() == Instruction::URem);
4454   assert(!isSafeToSpeculativelyExecute(I));
4455
4456 const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4457
4458 // Scalarization isn't legal for scalable vector types
4459 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
4460 if (!VF.isScalable()) {
4461 // Get the scalarization cost and scale this amount by the probability of
4462 // executing the predicated block. If the instruction is not predicated,
4463 // we fall through to the next case.
4464 ScalarizationCost = 0;
4465
4466 // These instructions have a non-void type, so account for the phi nodes
4467 // that we will create. This cost is likely to be zero. The phi node
4468 // cost, if any, should be scaled by the block probability because it
4469 // models a copy at the end of each predicated block.
4470 ScalarizationCost += VF.getKnownMinValue() *
4471 TTI.getCFInstrCost(Instruction::PHI, CostKind);
4472
4473 // The cost of the non-predicated instruction.
4474 ScalarizationCost += VF.getKnownMinValue() *
4475 TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4476
4477 // The cost of insertelement and extractelement instructions needed for
4478 // scalarization.
4479 ScalarizationCost += getScalarizationOverhead(I, VF);
4480
4481 // Scale the cost by the probability of executing the predicated blocks.
4482 // This assumes the predicated block for each vector lane is equally
4483 // likely.
4484 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
4485 }
4486 InstructionCost SafeDivisorCost = 0;
4487
4488 auto *VecTy = ToVectorTy(I->getType(), VF);
4489
4490 // The cost of the select guard to ensure all lanes are well defined
4491 // after we speculate above any internal control flow.
4492 SafeDivisorCost += TTI.getCmpSelInstrCost(
4493 Instruction::Select, VecTy,
4494 ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4495 CmpInst::BAD_ICMP_PREDICATE, CostKind);
4496
4497 // Certain instructions can be cheaper to vectorize if they have a constant
4498 // second vector operand. One example of this is shifts on x86.
4499 Value *Op2 = I->getOperand(1);
4500 auto Op2Info = TTI.getOperandInfo(Op2);
4501 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
4502 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
4503
4504 SmallVector<const Value *, 4> Operands(I->operand_values());
4505 SafeDivisorCost += TTI.getArithmeticInstrCost(
4506 I->getOpcode(), VecTy, CostKind,
4507 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4508 Op2Info, Operands, I);
4509 return {ScalarizationCost, SafeDivisorCost};
4510}
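
[Editor's note] Plugging hypothetical target costs into the two formulas above makes the trade-off concrete; every constant below is invented for illustration, with real values coming from the TTI hooks used in the function:

#include <cstdio>

int main() {
  // Hypothetical per-instruction costs; real values come from TTI.
  unsigned VF = 4;
  unsigned PhiCost = 0, ScalarDivCost = 20, InsExtOverhead = 12;
  unsigned RecipPredBlockProb = 2; // predicated block runs ~1/2 the time

  unsigned ScalarizationCost =
      (VF * PhiCost + VF * ScalarDivCost + InsExtOverhead) /
      RecipPredBlockProb; // (0 + 80 + 12) / 2 = 46

  unsigned SelectCost = 1, VectorDivCost = 30;
  unsigned SafeDivisorCost = SelectCost + VectorDivCost; // 31

  std::printf("scalarize=%u safe-divisor=%u -> prefer %s\n",
              ScalarizationCost, SafeDivisorCost,
              SafeDivisorCost <= ScalarizationCost ? "safe-divisor"
                                                   : "scalarization");
}
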
4511
4512bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4513 Instruction *I, ElementCount VF) {
4514   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4515   assert(getWideningDecision(I, VF) == CM_Unknown &&
4516          "Decision should not be set yet.");
4517 auto *Group = getInterleavedAccessGroup(I);
4518   assert(Group && "Must have a group.");
4519
4520 // If the instruction's allocated size doesn't equal its type size, it
4521 // requires padding and will be scalarized.
4522 auto &DL = I->getModule()->getDataLayout();
4523 auto *ScalarTy = getLoadStoreType(I);
4524 if (hasIrregularType(ScalarTy, DL))
4525 return false;
4526
4527 // If the group involves a non-integral pointer, we may not be able to
4528 // losslessly cast all values to a common type.
4529 unsigned InterleaveFactor = Group->getFactor();
4530 bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4531 for (unsigned i = 0; i < InterleaveFactor; i++) {
4532 Instruction *Member = Group->getMember(i);
4533 if (!Member)
4534 continue;
4535 auto *MemberTy = getLoadStoreType(Member);
4536 bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4537 // Don't coerce non-integral pointers to integers or vice versa.
4538 if (MemberNI != ScalarNI) {
4539 // TODO: Consider adding special nullptr value case here
4540 return false;
4541 } else if (MemberNI && ScalarNI &&
4542 ScalarTy->getPointerAddressSpace() !=
4543 MemberTy->getPointerAddressSpace()) {
4544 return false;
4545 }
4546 }
4547
4548 // Check if masking is required.
4549 // A Group may need masking for one of two reasons: it resides in a block that
4550 // needs predication, or it was decided to use masking to deal with gaps
4551 // (either a gap at the end of a load-access that may result in a speculative
4552 // load, or any gaps in a store-access).
4553 bool PredicatedAccessRequiresMasking =
4554 blockNeedsPredicationForAnyReason(I->getParent()) &&
4555 Legal->isMaskRequired(I);
4556 bool LoadAccessWithGapsRequiresEpilogMasking =
4557 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4558 !isScalarEpilogueAllowed();
4559 bool StoreAccessWithGapsRequiresMasking =
4560 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4561 if (!PredicatedAccessRequiresMasking &&
4562 !LoadAccessWithGapsRequiresEpilogMasking &&
4563 !StoreAccessWithGapsRequiresMasking)
4564 return true;
4565
4566 // If masked interleaving is required, we expect that the user/target had
4567 // enabled it, because otherwise it either wouldn't have been created or
4568 // it should have been invalidated by the CostModel.
4569   assert(useMaskedInterleavedAccesses(TTI) &&
4570          "Masked interleave-groups for predicated accesses are not enabled.");
4571
4572 if (Group->isReverse())
4573 return false;
4574
4575 auto *Ty = getLoadStoreType(I);
4576 const Align Alignment = getLoadStoreAlignment(I);
4577 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4578 : TTI.isLegalMaskedStore(Ty, Alignment);
4579}
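
[Editor's note] A store group "with gaps" in the sense used above writes fewer members than the interleave factor covers, for example updating only one field of an array of pairs; a plain wide store would clobber the untouched field, hence the masking requirement. A hypothetical loop that would form such a group (factor 2, one member):

#include <cstdio>

struct Pair { int A, B; };

// Writes stride-2 locations: an interleave group with factor 2 but only
// one member, so getNumMembers() < getFactor(), and masking is needed to
// avoid clobbering the untouched 'B' fields with a plain wide store.
void zeroA(Pair *P, int N) {
  for (int I = 0; I < N; ++I)
    P[I].A = 0;
}

int main() {
  Pair Buf[4] = {{1, 2}, {3, 4}, {5, 6}, {7, 8}};
  zeroA(Buf, 4);
  for (auto &Q : Buf)
    std::printf("{%d,%d} ", Q.A, Q.B);
  std::printf("\n");
}
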
4580
4581bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4582 Instruction *I, ElementCount VF) {
4583 // Get and ensure we have a valid memory instruction.
4584   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4585
4586 auto *Ptr = getLoadStorePointerOperand(I);
4587 auto *ScalarTy = getLoadStoreType(I);
4588
4589 // In order to be widened, the pointer should be consecutive, first of all.
4590 if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4591 return false;
4592
4593 // If the instruction is a store located in a predicated block, it will be
4594 // scalarized.
4595 if (isScalarWithPredication(I, VF))
4596 return false;
4597
4598 // If the instruction's allocated size doesn't equal its type size, it
4599 // requires padding and will be scalarized.
4600 auto &DL = I->getModule()->getDataLayout();
4601 if (hasIrregularType(ScalarTy, DL))
4602 return false;
4603
4604 return true;
4605}
4606
4607void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4608 // We should not collect Uniforms more than once per VF. Right now,
4609 // this function is called from collectUniformsAndScalars(), which
4610 // already does this check. Collecting Uniforms for VF=1 does not make any
4611 // sense.
4612
4613   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4614          "This function should not be visited twice for the same VF");
4615
4616 // Visit the list of Uniforms. Even if we find no uniform value, we will
4617 // not analyze this VF again: Uniforms.count(VF) will return 1.
4618 Uniforms[VF].clear();
4619
4620 // We now know that the loop is vectorizable!
4621 // Collect instructions inside the loop that will remain uniform after
4622 // vectorization.
4623
4624 // Global values, params and instructions outside of current loop are out of
4625 // scope.
4626 auto isOutOfScope = [&](Value *V) -> bool {
4627 Instruction *I = dyn_cast<Instruction>(V);
4628 return (!I || !TheLoop->contains(I));
4629 };
4630
4631 // Worklist containing uniform instructions demanding lane 0.
4632 SetVector<Instruction *> Worklist;
4633 BasicBlock *Latch = TheLoop->getLoopLatch();
4634
4635 // Add uniform instructions demanding lane 0 to the worklist. Instructions
4636 // that are scalar with predication must not be considered uniform after
4637 // vectorization, because that would create an erroneous replicating region
4638 // where only a single instance out of VF should be formed.
4639 // TODO: optimize such rare cases if they prove important; see PR40816.
4640 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4641 if (isOutOfScope(I)) {
4642       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4643                  << *I << "\n");
4644 return;
4645 }
4646 if (isScalarWithPredication(I, VF)) {
4647       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4648                  << *I << "\n");
4649 return;
4650 }
4651     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4652 Worklist.insert(I);
4653 };
4654
4655 // Start with the conditional branch. If the branch condition is an
4656 // instruction contained in the loop that is only used by the branch, it is
4657 // uniform.
4658 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4659 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4660 addToWorklistIfAllowed(Cmp);
4661
4662 // Return true if all lanes perform the same memory operation, and we can
4663 // thus choose to execute only one.
4664 auto isUniformMemOpUse = [&](Instruction *I) {
4665 if (!Legal->isUniformMemOp(*I))
4666 return false;
4667 if (isa<LoadInst>(I))
4668 // Loading the same address always produces the same result - at least
4669 // assuming aliasing and ordering which have already been checked.
4670 return true;
4671 // Storing the same value on every iteration.
4672 return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4673 };
4674
4675 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4676 InstWidening WideningDecision = getWideningDecision(I, VF);
4677     assert(WideningDecision != CM_Unknown &&
4678            "Widening decision should be ready at this moment");
4679
4680 if (isUniformMemOpUse(I))
4681 return true;
4682
4683 return (WideningDecision == CM_Widen ||
4684 WideningDecision == CM_Widen_Reverse ||
4685 WideningDecision == CM_Interleave);
4686 };
4687
4688
4689 // Returns true if Ptr is the pointer operand of a memory access instruction
4690 // I, and I is known to not require scalarization.
4691 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4692 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4693 };
4694
4695 // Holds a list of values which are known to have at least one uniform use.
4696 // Note that there may be other uses which aren't uniform. A "uniform use"
4697 // here is something which only demands lane 0 of the unrolled iterations;
4698 // it does not imply that all lanes produce the same value (e.g. this is not
4699 // the usual meaning of uniform).
4700 SetVector<Value *> HasUniformUse;
4701
4702 // Scan the loop for instructions which are either a) known to have only
4703 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4704 for (auto *BB : TheLoop->blocks())
4705 for (auto &I : *BB) {
4706 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4707 switch (II->getIntrinsicID()) {
4708 case Intrinsic::sideeffect:
4709 case Intrinsic::experimental_noalias_scope_decl:
4710 case Intrinsic::assume:
4711 case Intrinsic::lifetime_start:
4712 case Intrinsic::lifetime_end:
4713 if (TheLoop->hasLoopInvariantOperands(&I))
4714 addToWorklistIfAllowed(&I);
4715 break;
4716 default:
4717 break;
4718 }
4719 }
4720
4721 // ExtractValue instructions must be uniform, because the operands are
4722 // known to be loop-invariant.
4723 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4724         assert(isOutOfScope(EVI->getAggregateOperand()) &&
4725                "Expected aggregate value to be loop invariant");
4726 addToWorklistIfAllowed(EVI);
4727 continue;
4728 }
4729
4730 // If there's no pointer operand, there's nothing to do.
4731 auto *Ptr = getLoadStorePointerOperand(&I);
4732 if (!Ptr)
4733 continue;
4734
4735 if (isUniformMemOpUse(&I))
4736 addToWorklistIfAllowed(&I);
4737
4738 if (isUniformDecision(&I, VF)) {
4739         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
4740 HasUniformUse.insert(Ptr);
4741 }
4742 }
4743
4744 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4745 // demanding) users. Since loops are assumed to be in LCSSA form, this
4746 // disallows uses outside the loop as well.
4747 for (auto *V : HasUniformUse) {
4748 if (isOutOfScope(V))
4749 continue;
4750 auto *I = cast<Instruction>(V);
4751 auto UsersAreMemAccesses =
4752 llvm::all_of(I->users(), [&](User *U) -> bool {
4753 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4754 });
4755 if (UsersAreMemAccesses)
4756 addToWorklistIfAllowed(I);
4757 }
4758
4759 // Expand Worklist in topological order: whenever a new instruction
4760 // is added, its users should already be inside Worklist. This ensures
4761 // that a uniform instruction will only be used by uniform instructions.
4762 unsigned idx = 0;
4763 while (idx != Worklist.size()) {
4764 Instruction *I = Worklist[idx++];
4765
4766 for (auto *OV : I->operand_values()) {
4767 // isOutOfScope operands cannot be uniform instructions.
4768 if (isOutOfScope(OV))
4769 continue;
4770 // First-order recurrence phis should typically be considered
4771 // non-uniform.
4772 auto *OP = dyn_cast<PHINode>(OV);
4773 if (OP && Legal->isFixedOrderRecurrence(OP))
4774 continue;
4775 // If all the users of the operand are uniform, then add the
4776 // operand into the uniform worklist.
4777 auto *OI = cast<Instruction>(OV);
4778 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4779 auto *J = cast<Instruction>(U);
4780 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4781 }))
4782 addToWorklistIfAllowed(OI);
4783 }
4784 }
4785
4786 // For an instruction to be added into Worklist above, all its users inside
4787 // the loop should also be in Worklist. However, this condition cannot be
4788 // true for phi nodes that form a cyclic dependence. We must process phi
4789 // nodes separately. An induction variable will remain uniform if all users
4790 // of the induction variable and induction variable update remain uniform.
4791 // The code below handles both pointer and non-pointer induction variables.
4792 for (const auto &Induction : Legal->getInductionVars()) {
4793 auto *Ind = Induction.first;
4794 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4795
4796 // Determine if all users of the induction variable are uniform after
4797 // vectorization.
4798 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4799 auto *I = cast<Instruction>(U);
4800 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4801 isVectorizedMemAccessUse(I, Ind);
4802 });
4803 if (!UniformInd)
4804 continue;
4805
4806 // Determine if all users of the induction variable update instruction are
4807 // uniform after vectorization.
4808 auto UniformIndUpdate =
4809 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4810 auto *I = cast<Instruction>(U);
4811 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4812 isVectorizedMemAccessUse(I, IndUpdate);
4813 });
4814 if (!UniformIndUpdate)
4815 continue;
4816
4817 // The induction variable and its update instruction will remain uniform.
4818 addToWorklistIfAllowed(Ind);
4819 addToWorklistIfAllowed(IndUpdate);
4820 }
4821
4822 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4823}
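
[Editor's note] The "uniform use" notion above means only lane 0 is demanded, as with the address of an invariant-address load: every lane of the widened loop would compute the same pointer, so one scalar copy suffices. A hypothetical loop illustrating the concept, where the pointer Q is the same on every iteration while the data it feeds is not:

#include <cstdio>

int sum(const int *Q, const int *A, int N) {
  int S = 0;
  for (int I = 0; I < N; ++I)
    S += A[I] * *Q; // *Q loads the same address every iteration: the
                    // address only needs lane 0 after vectorization.
  return S;
}

int main() {
  int K = 3, A[4] = {1, 2, 3, 4};
  std::printf("%d\n", sum(&K, A, 4));
}
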
4824
4825bool LoopVectorizationCostModel::runtimeChecksRequired() {
4826   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4827
4828 if (Legal->getRuntimePointerChecking()->Need) {
4829 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4830 "runtime pointer checks needed. Enable vectorization of this "
4831 "loop with '#pragma clang loop vectorize(enable)' when "
4832 "compiling with -Os/-Oz",
4833 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4834 return true;
4835 }
4836
4837 if (!PSE.getPredicate().isAlwaysTrue()) {
4838 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4839 "runtime SCEV checks needed. Enable vectorization of this "
4840 "loop with '#pragma clang loop vectorize(enable)' when "
4841 "compiling with -Os/-Oz",
4842 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4843 return true;
4844 }
4845
4846 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4847 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4848 reportVectorizationFailure("Runtime stride check for small trip count",
4849 "runtime stride == 1 checks needed. Enable vectorization of "
4850 "this loop without such check by compiling with -Os/-Oz",
4851 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4852 return true;
4853 }
4854
4855 return false;
4856}
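
[Editor's note] Each of the failure remarks above points the user at the same loop pragma. Applied to source, the hint named in those messages looks like this (the function itself is a placeholder):

// Opts this loop into vectorization even when compiling with -Os/-Oz,
// accepting the runtime pointer/SCEV/stride checks the messages describe.
void scale(float *A, const float *B, int N) {
#pragma clang loop vectorize(enable)
  for (int I = 0; I < N; ++I)
    A[I] = 2.0f * B[I];
}
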
4857
4858ElementCount
4859LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4860 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4861 return ElementCount::getScalable(0);
4862
4863 if (Hints->isScalableVectorizationDisabled()) {
4864 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4865 "ScalableVectorizationDisabled", ORE, TheLoop);
4866 return ElementCount::getScalable(0);
4867 }
4868
4869   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4870
4871 auto MaxScalableVF = ElementCount::getScalable(
4872 std::numeric_limits<ElementCount::ScalarTy>::max());
4873
4874 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4875 // FIXME: While for scalable vectors this is currently sufficient, this should
4876 // be replaced by a more detailed mechanism that filters out specific VFs,
4877 // instead of invalidating vectorization for a whole set of VFs based on the
4878 // MaxVF.
4879
4880 // Disable scalable vectorization if the loop contains unsupported reductions.
4881 if (!canVectorizeReductions(MaxScalableVF)) {
4882 reportVectorizationInfo(
4883 "Scalable vectorization not supported for the reduction "
4884 "operations found in this loop.",
4885 "ScalableVFUnfeasible", ORE, TheLoop);
4886 return ElementCount::getScalable(0);
4887 }
4888
4889 // Disable scalable vectorization if the loop contains any instructions
4890 // with element types not supported for scalable vectors.
4891 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4892 return !Ty->isVoidTy() &&
4893 !this->TTI.isElementTypeLegalForScalableVector(Ty);
4894 })) {
4895 reportVectorizationInfo("Scalable vectorization is not supported "
4896 "for all element types found in this loop.",
4897 "ScalableVFUnfeasible", ORE, TheLoop);
4898 return ElementCount::getScalable(0);
4899 }
4900
4901 if (Legal->isSafeForAnyVectorWidth())
4902 return MaxScalableVF;
4903
4904 // Limit MaxScalableVF by the maximum safe dependence distance.
4905 std::optional<unsigned> MaxVScale = TTI.getMaxVScale();
4906 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
4907 MaxVScale =
4908 TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
4909 MaxScalableVF =
4910 ElementCount::getScalable(MaxVScale ? (MaxSafeElements / *MaxVScale) : 0);
4911 if (!MaxScalableVF)
4912 reportVectorizationInfo(
4913 "Max legal vector width too small, scalable vectorization "
4914 "unfeasible.",
4915 "ScalableVFUnfeasible", ORE, TheLoop);
4916
4917 return MaxScalableVF;
4918}
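
[Editor's note] The clamping at the end is plain integer arithmetic over the vscale_range attribute. A sketch with invented numbers, where MaxSafeElements = 32 and a maximum vscale of 16 yield a scalable VF of vscale x 2:

#include <cstdio>
#include <optional>

int main() {
  unsigned MaxSafeElements = 32;          // hypothetical, from LAA
  std::optional<unsigned> MaxVScale = 16; // hypothetical vscale_range max

  // Mirrors the clamp above: unknown MaxVScale yields vscale x 0.
  unsigned ScalableMin = MaxVScale ? MaxSafeElements / *MaxVScale : 0;
  std::printf("MaxScalableVF = vscale x %u\n", ScalableMin); // vscale x 2
}
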
4919
4920FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4921 unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4922 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4923 unsigned SmallestType, WidestType;
4924 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4925
4926 // Get the maximum safe dependence distance in bits computed by LAA.
4927 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4928 // the memory accesses that is most restrictive (involved in the smallest
4929 // dependence distance).
4930 unsigned MaxSafeElements =
4931 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
4932
4933 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4934 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4935
4936   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4937              << ".\n");
4938   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4939              << ".\n");
4940
4941 // First analyze the UserVF, fall back if the UserVF should be ignored.
4942 if (UserVF) {
4943 auto MaxSafeUserVF =
4944 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4945
4946 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4947 // If `VF=vscale x N` is safe, then so is `VF=N`
4948 if (UserVF.isScalable())
4949 return FixedScalableVFPair(
4950 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4951 else
4952 return UserVF;
4953 }
4954
4955     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4956
4957 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4958 // is better to ignore the hint and let the compiler choose a suitable VF.
4959 if (!UserVF.isScalable()) {
4960       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4961                  << " is unsafe, clamping to max safe VF="
4962                  << MaxSafeFixedVF << ".\n");
4963 ORE->emit([&]() {
4964         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4965 TheLoop->getStartLoc(),
4966 TheLoop->getHeader())
4967 << "User-specified vectorization factor "
4968 << ore::NV("UserVectorizationFactor", UserVF)
4969 << " is unsafe, clamping to maximum safe vectorization factor "
4970 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4971 });
4972 return MaxSafeFixedVF;
4973 }
4974
4975 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4976       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4977                  << " is ignored because scalable vectors are not "
4978                     "available.\n");
4979 ORE->emit([&]() {
4980         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4981 TheLoop->getStartLoc(),
4982 TheLoop->getHeader())
4983 << "User-specified vectorization factor "
4984 << ore::NV("UserVectorizationFactor", UserVF)
4985 << " is ignored because the target does not support scalable "
4986 "vectors. The compiler will pick a more suitable value.";
4987 });
4988 } else {
4989       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4990                  << " is unsafe. Ignoring scalable UserVF.\n");
4991 ORE->emit([&]() {
4992         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4993 TheLoop->getStartLoc(),
4994 TheLoop->getHeader())
4995 << "User-specified vectorization factor "
4996 << ore::NV("UserVectorizationFactor", UserVF)
4997 << " is unsafe. Ignoring the hint to let the compiler pick a "
4998 "more suitable value.";
4999 });
5000 }
5001 }
5002
5003   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5004              << " / " << WidestType << " bits.\n");
5005
5006 FixedScalableVFPair Result(ElementCount::getFixed(1),
5007 ElementCount::getScalable(0));
5008 if (auto MaxVF =
5009 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5010 MaxSafeFixedVF, FoldTailByMasking))
5011 Result.FixedVF = MaxVF;
5012
5013 if (auto MaxVF =
5014 getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5015 MaxSafeScalableVF, FoldTailByMasking))
5016 if (MaxVF.isScalable()) {
5017 Result.ScalableVF = MaxVF;
5018       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5019                  << "\n");
5020 }
5021
5022 return Result;
5023}
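
[Editor's note] The MaxSafeElements computation near the top divides LAA's safe dependence width by the widest element type and rounds down to a power of two. A standalone sketch with invented inputs (384 safe bits, 32-bit widest type, giving PowerOf2Floor(12) = 8):

#include <cstdio>

// Round down to a power of two, approximating llvm::PowerOf2Floor.
unsigned powerOf2Floor(unsigned X) {
  if (X == 0)
    return 0;
  unsigned R = 1;
  while (R * 2 <= X)
    R *= 2;
  return R;
}

int main() {
  unsigned MaxSafeVectorWidthInBits = 384; // hypothetical LAA result
  unsigned WidestType = 32;                // widest element type in bits
  unsigned MaxSafeElements =
      powerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
  std::printf("MaxSafeElements = %u\n", MaxSafeElements); // 8
}
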
5024
5025FixedScalableVFPair
5026LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5027 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5028 // TODO: It may be useful to do this, since it's still likely to be
5029 // dynamically uniform if the target can skip.
5030 reportVectorizationFailure(
5031 "Not inserting runtime ptr check for divergent target",
5032 "runtime pointer checks needed. Not enabled for divergent target",
5033 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5034 return FixedScalableVFPair::getNone();
5035 }
5036
5037 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5038   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5039 if (TC == 1) {
5040 reportVectorizationFailure("Single iteration (non) loop",
5041 "loop trip count is one, irrelevant for vectorization",
5042 "SingleIterationLoop", ORE, TheLoop);
5043 return FixedScalableVFPair::getNone();
5044 }
5045
5046 switch (ScalarEpilogueStatus) {
5047 case CM_ScalarEpilogueAllowed:
5048 return computeFeasibleMaxVF(TC, UserVF, false);
5049 case CM_ScalarEpilogueNotAllowedUsePredicate:
5050 [[fallthrough]];
5051 case CM_ScalarEpilogueNotNeededUsePredicate:
5052     LLVM_DEBUG(
5053         dbgs() << "LV: vector predicate hint/switch found.\n"
5054                << "LV: Not allowing scalar epilogue, creating predicated "
5055                << "vector loop.\n");
5056 break;
5057 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5058 // fallthrough as a special case of OptForSize
5059 case CM_ScalarEpilogueNotAllowedOptSize:
5060 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5061       LLVM_DEBUG(
5062           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5063 else
5064       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5065                  << "count.\n");
5066
5067 // Bail if runtime checks are required, which are not good when optimising
5068 // for size.
5069 if (runtimeChecksRequired())
5070 return FixedScalableVFPair::getNone();
5071
5072 break;
5073 }
5074
5075 // The only loops we can vectorize without a scalar epilogue are loops with
5076 // a bottom-test and a single exiting block. We'd have to handle the fact
5077 // that not every instruction executes on the last iteration. This will
5078 // require a lane mask which varies through the vector loop body. (TODO)
5079 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5080 // If there was a tail-folding hint/switch, but we can't fold the tail by
5081 // masking, fallback to a vectorization with a scalar epilogue.
5082 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5083       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5084                     "scalar epilogue instead.\n");
5085 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5086 return computeFeasibleMaxVF(TC, UserVF, false);
5087 }
5088 return FixedScalableVFPair::getNone();
5089 }
5090
5091 // Now try the tail folding
5092
5093 // Invalidate interleave groups that require an epilogue if we can't mask
5094 // the interleave-group.
5095 if (!useMaskedInterleavedAccesses(TTI)) {
5096     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5097            "No decisions should have been taken at this point");
5098 // Note: There is no need to invalidate any cost modeling decisions here, as
5099 // none were taken so far.
5100 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5101 }
5102
5103 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5104 // Avoid tail folding if the trip count is known to be a multiple of any VF
5105 // we chose.
5106 // FIXME: The condition below pessimises the case for fixed-width vectors,
5107 // when scalable VFs are also candidates for vectorization.
5108 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5109 ElementCount MaxFixedVF = MaxFactors.FixedVF;
5110     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5111            "MaxFixedVF must be a power of 2");
5112 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5113 : MaxFixedVF.getFixedValue();
5114 ScalarEvolution *SE = PSE.getSE();
5115 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5116 const SCEV *ExitCount = SE->getAddExpr(
5117 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5118 const SCEV *Rem = SE->getURemExpr(
5119 SE->applyLoopGuards(ExitCount, TheLoop),
5120 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5121 if (Rem->isZero()) {
5122 // Accept MaxFixedVF if we do not have a tail.
5123       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5124 return MaxFactors;
5125 }
5126 }
5127
5128 // If we don't know the precise trip count, or if the trip count that we
5129 // found modulo the vectorization factor is not zero, try to fold the tail
5130 // by masking.
5131 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5132 if (Legal->prepareToFoldTailByMasking()) {
5133 FoldTailByMasking = true;
5134 return MaxFactors;
5135 }
5136
5137 // If there was a tail-folding hint/switch, but we can't fold the tail by
5138 // masking, fallback to a vectorization with a scalar epilogue.
5139 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5140     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5141                   "scalar epilogue instead.\n");
5142 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5143 return MaxFactors;
5144 }
5145
5146 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5147     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5148 return FixedScalableVFPair::getNone();
5149 }
5150
5151 if (TC == 0) {
5152 reportVectorizationFailure(
5153 "Unable to calculate the loop count due to complex control flow",
5154 "unable to calculate the loop count due to complex control flow",
5155 "UnknownLoopCountComplexCFG", ORE, TheLoop);
5156 return FixedScalableVFPair::getNone();
5157 }
5158
5159 reportVectorizationFailure(
5160 "Cannot optimize for size and vectorize at the same time.",
5161 "cannot optimize for size and vectorize at the same time. "
5162 "Enable vectorization of this loop with '#pragma clang loop "
5163 "vectorize(enable)' when compiling with -Os/-Oz",
5164 "NoTailLoopWithOptForSize", ORE, TheLoop);
5165 return FixedScalableVFPair::getNone();
5166}
5167
5168ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5169 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5170 ElementCount MaxSafeVF, bool FoldTailByMasking) {
5171 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5172 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
5173 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5174 : TargetTransformInfo::RGK_FixedWidthVector);
5175
5176 // Convenience function to return the minimum of two ElementCounts.
5177 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5178 assert((LHS.isScalable() == RHS.isScalable()) &&
5179 "Scalable flags must match");
5180 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5181 };
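// Editor's note, for illustration (not in the source): given two fixed
// counts, MinVF(ElementCount::getFixed(8), ElementCount::getFixed(4))
// yields the fixed count 4; mixing a fixed and a scalable count would trip
// the assertion above, since the scalable flags must match.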
5182
5183 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5184 // Note that both WidestRegister and WidestType may not be powers of 2.
5185 auto MaxVectorElementCount = ElementCount::get(
5186 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5187 ComputeScalableMaxVF);
5188 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5189 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5190 << (MaxVectorElementCount * WidestType) << " bits.\n");
5191
5192 if (!MaxVectorElementCount) {
5193 LLVM_DEBUG(dbgs() << "LV: The target has no "
5194 << (ComputeScalableMaxVF ? "scalable" : "fixed")
5195 << " vector registers.\n");
5196 return ElementCount::getFixed(1);
5197 }
5198
5199 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
5200 if (MaxVectorElementCount.isScalable() &&
5201 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5202 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5203 auto Min = Attr.getVScaleRangeMin();
5204 WidestRegisterMinEC *= Min;
5205 }
5206 if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC &&
5207 (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5208 // If loop trip count (TC) is known at compile time there is no point in
5209 // choosing VF greater than TC (as done in the loop below). Select maximum
5210 // power of two which doesn't exceed TC.
5211 // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5212 // when the TC is less than or equal to the known number of lanes.
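// Worked example (editor's illustration with assumed values): with
// ConstTripCount = 7 and WidestRegisterMinEC = 8, the guard above holds and
// PowerOf2Floor(7) clamps the VF to a fixed width of 4, so no lanes are
// wasted past the trip count.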
5213 auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5214 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5215 "exceeding the constant trip count: "
5216 << ClampedConstTripCount << "\n");
5217 return ElementCount::getFixed(ClampedConstTripCount);
5218 }
5219
5220 TargetTransformInfo::RegisterKind RegKind =
5221 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5222 : TargetTransformInfo::RGK_FixedWidthVector;
5223 ElementCount MaxVF = MaxVectorElementCount;
5224 if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
5225 TTI.shouldMaximizeVectorBandwidth(RegKind))) {
5226 auto MaxVectorElementCountMaxBW = ElementCount::get(
5227 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5228 ComputeScalableMaxVF);
5229 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5230
5231 // Collect all viable vectorization factors larger than the default MaxVF
5232 // (i.e. MaxVectorElementCount).
5233 SmallVector<ElementCount, 8> VFs;
5234 for (ElementCount VS = MaxVectorElementCount * 2;
5235 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5236 VFs.push_back(VS);
5237
5238 // For each VF calculate its register usage.
5239 auto RUs = calculateRegisterUsage(VFs);
5240
5241 // Select the largest VF which doesn't require more registers than existing
5242 // ones.
5243 for (int i = RUs.size() - 1; i >= 0; --i) {
5244 bool Selected = true;
5245 for (auto &pair : RUs[i].MaxLocalUsers) {
5246 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5247 if (pair.second > TargetNumRegisters)
5248 Selected = false;
5249 }
5250 if (Selected) {
5251 MaxVF = VFs[i];
5252 break;
5253 }
5254 }
5255 if (ElementCount MinVF =
5256 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5257 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5258 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5259 << ") with target's minimum: " << MinVF << '\n');
5260 MaxVF = MinVF;
5261 }
5262 }
5263
5264 // Invalidate any widening decisions we might have made, in case the loop
5265 // requires prediction (decided later), but we have already made some
5266 // load/store widening decisions.
5267 invalidateCostModelingDecisions();
5268 }
5269 return MaxVF;
5270}
5271
5272std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5273 if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5274 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5275 auto Min = Attr.getVScaleRangeMin();
5276 auto Max = Attr.getVScaleRangeMax();
5277 if (Max && Min == Max)
5278 return Max;
5279 }
5280
5281 return TTI.getVScaleForTuning();
5282}
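// Editor's note, for illustration: for a function carrying the attribute
// vscale_range(2,2), Min == Max == 2 and the code above returns 2; with
// vscale_range(1,16) the bounds differ, so the decision falls through to
// TTI.getVScaleForTuning().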
5283
5284bool LoopVectorizationCostModel::isMoreProfitable(
5285 const VectorizationFactor &A, const VectorizationFactor &B) const {
5286 InstructionCost CostA = A.Cost;
5287 InstructionCost CostB = B.Cost;
5288
5289 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5290
5291 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5292 MaxTripCount) {
5293 // If we are folding the tail and the trip count is a known (possibly small)
5294 // constant, the trip count will be rounded up to an integer number of
5295 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5296 // which we compare directly. When not folding the tail, the total cost will
5297 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5298 // approximated with the per-lane cost below instead of using the tripcount
5299 // as here.
5300 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5301 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5302 return RTCostA < RTCostB;
5303 }
5304
5305 // Improve estimate for the vector width if it is scalable.
5306 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5307 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5308 if (std::optional<unsigned> VScale = getVScaleForTuning()) {
5309 if (A.Width.isScalable())
5310 EstimatedWidthA *= *VScale;
5311 if (B.Width.isScalable())
5312 EstimatedWidthB *= *VScale;
5313 }
5314
5315 // Assume vscale may be larger than 1 (or the value being tuned for),
5316 // so that scalable vectorization is slightly favorable over fixed-width
5317 // vectorization.
5318 if (A.Width.isScalable() && !B.Width.isScalable())
5319 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5320
5321 // To avoid the need for FP division:
5322 // (CostA / A.Width) < (CostB / B.Width)
5323 // <=> (CostA * B.Width) < (CostB * A.Width)
5324 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5325}
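// Worked example (editor's illustration with made-up costs): comparing
// A = {VF=4, CostA=10} against B = {VF=8, CostB=18}, the cross-multiplied
// per-lane test computes 10 * 8 = 80 versus 18 * 4 = 72, so B is the more
// profitable factor without any FP division. Under tail folding with a
// known MaxTripCount of 10, the same candidates would instead compare
// 10 * ceil(10/4) = 30 against 18 * ceil(10/8) = 36, favoring A.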
5326
5327VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5328 const ElementCountSet &VFCandidates) {
5329 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5330 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5331 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5332 assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5333 "Expected Scalar VF to be a candidate");
5334
5335 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
5336 ExpectedCost);
5337 VectorizationFactor ChosenFactor = ScalarCost;
5338
5339 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5340 if (ForceVectorization && VFCandidates.size() > 1) {
5341 // Ignore scalar width, because the user explicitly wants vectorization.
5342 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5343 // evaluation.
5344 ChosenFactor.Cost = InstructionCost::getMax();
5345 }
5346
5347 SmallVector<InstructionVFPair> InvalidCosts;
5348 for (const auto &i : VFCandidates) {
5349 // The cost for scalar VF=1 is already calculated, so ignore it.
5350 if (i.isScalar())
5351 continue;
5352
5353 VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5354 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5355
5356#ifndef NDEBUG
5357 unsigned AssumedMinimumVscale = 1;
5358 if (std::optional<unsigned> VScale = getVScaleForTuning())
5359 AssumedMinimumVscale = *VScale;
5360 unsigned Width =
5361 Candidate.Width.isScalable()
5362 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5363 : Candidate.Width.getFixedValue();
5364 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5365 << " costs: " << (Candidate.Cost / Width));
5366 if (i.isScalable())
5367 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5368 << AssumedMinimumVscale << ")");
5369 LLVM_DEBUG(dbgs() << ".\n");
5370#endif
5371
5372 if (!C.second && !ForceVectorization) {
5373 LLVM_DEBUG(
5374 dbgs() << "LV: Not considering vector loop of width " << i
5375 << " because it will not generate any vector instructions.\n");
5376 continue;
5377 }
5378
5379 // If profitable add it to ProfitableVF list.
5380 if (isMoreProfitable(Candidate, ScalarCost))
5381 ProfitableVFs.push_back(Candidate);
5382
5383 if (isMoreProfitable(Candidate, ChosenFactor))
5384 ChosenFactor = Candidate;
5385 }
5386
5387 // Emit a report of VFs with invalid costs in the loop.
5388 if (!InvalidCosts.empty()) {
5389 // Group the remarks per instruction, keeping the instruction order from
5390 // InvalidCosts.
5391 std::map<Instruction *, unsigned> Numbering;
5392 unsigned I = 0;
5393 for (auto &Pair : InvalidCosts)
5394 if (!Numbering.count(Pair.first))
5395 Numbering[Pair.first] = I++;
5396
5397 // Sort the list, first on instruction(number) then on VF.
5398 llvm::sort(InvalidCosts,
5399 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5400 if (Numbering[A.first] != Numbering[B.first])
5401 return Numbering[A.first] < Numbering[B.first];
5402 ElementCountComparator ECC;
5403 return ECC(A.second, B.second);
5404 });
5405
5406 // For a list of ordered instruction-vf pairs:
5407 // [(load, vf1), (load, vf2), (store, vf1)]
5408 // Group the instructions together to emit separate remarks for:
5409 // load (vf1, vf2)
5410 // store (vf1)
5411 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5412 auto Subset = ArrayRef<InstructionVFPair>();
5413 do {
5414 if (Subset.empty())
5415 Subset = Tail.take_front(1);
5416
5417 Instruction *I = Subset.front().first;
5418
5419 // If the next instruction is different, or if there are no other pairs,
5420 // emit a remark for the collated subset. e.g.
5421 // [(load, vf1), (load, vf2)]
5422 // to emit:
5423 // remark: invalid costs for 'load' at VF=(vf1, vf2)
5424 if (Subset == Tail || Tail[Subset.size()].first != I) {
5425 std::string OutString;
5426 raw_string_ostream OS(OutString);
5427 assert(!Subset.empty() && "Unexpected empty range");
5428 OS << "Instruction with invalid costs prevented vectorization at VF=(";
5429 for (const auto &Pair : Subset)
5430 OS << (Pair.second == Subset.front().second ? "" : ", ")
5431 << Pair.second;
5432 OS << "):";
5433 if (auto *CI = dyn_cast<CallInst>(I))
5434 OS << " call to " << CI->getCalledFunction()->getName();
5435 else
5436 OS << " " << I->getOpcodeName();
5437 OS.flush();
5438 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5439 Tail = Tail.drop_front(Subset.size());
5440 Subset = {};
5441 } else
5442 // Grow the subset by one element
5443 Subset = Tail.take_front(Subset.size() + 1);
5444 } while (!Tail.empty());
5445 }
5446
5447 if (!EnableCondStoresVectorization && NumPredStores) {
5448 reportVectorizationFailure("There are conditional stores.",
5449 "store that is conditionally executed prevents vectorization",
5450 "ConditionalStore", ORE, TheLoop);
5451 ChosenFactor = ScalarCost;
5452 }
5453
5454 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5455 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5456 << "LV: Vectorization seems to be not beneficial, "
5457 << "but was forced by a user.\n");
5458 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5459 return ChosenFactor;
5460}
5461
5462bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5463 const Loop &L, ElementCount VF) const {
5464 // Cross iteration phis such as reductions need special handling and are
5465 // currently unsupported.
5466 if (any_of(L.getHeader()->phis(),
5467 [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5468 return false;
5469
5470 // Phis with uses outside of the loop require special handling and are
5471 // currently unsupported.
5472 for (const auto &Entry : Legal->getInductionVars()) {
5473 // Look for uses of the value of the induction at the last iteration.
5474 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5475 for (User *U : PostInc->users())
5476 if (!L.contains(cast<Instruction>(U)))
5477 return false;
5478 // Look for uses of penultimate value of the induction.
5479 for (User *U : Entry.first->users())
5480 if (!L.contains(cast<Instruction>(U)))
5481 return false;
5482 }
5483
5484 // Epilogue vectorization code has not been audited to ensure it handles
5485 // non-latch exits properly. It may be fine, but it needs to be audited and
5486 // tested.
5487 if (L.getExitingBlock() != L.getLoopLatch())
5488 return false;
5489
5490 return true;
5491}
5492
5493bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5494 const ElementCount VF) const {
5495 // FIXME: We need a much better cost-model to take different parameters such
5496 // as register pressure, code size increase and cost of extra branches into
5497 // account. For now we apply a very crude heuristic and only consider loops
5498 // with vectorization factors larger than a certain value.
5499
5500 // Allow the target to opt out entirely.
5501 if (!TTI.preferEpilogueVectorization())
5502 return false;
5503
5504 // We also consider epilogue vectorization unprofitable for targets that don't
5505 // consider interleaving beneficial (e.g., MVE).
5506 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5507 return false;
5508 // FIXME: We should consider changing the threshold for scalable
5509 // vectors to take VScaleForTuning into account.
5510 if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
5511 return true;
5512 return false;
5513}
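// Editor's illustration: on a target whose max interleave factor is 1
// (e.g. MVE, as noted above), this always returns false; otherwise a main
// VF whose known minimum meets EpilogueVectorizationMinVF (assumed default
// of 16) makes epilogue vectorization worthwhile under this crude heuristic.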
5514
5515VectorizationFactor
5516LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5517 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5518 VectorizationFactor Result = VectorizationFactor::Disabled();
5519 if (!EnableEpilogueVectorization) {
5520 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
5521 return Result;
5522 }
5523
5524 if (!isScalarEpilogueAllowed()) {
5525 LLVM_DEBUG(
5526 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5527 "allowed.\n");
5528 return Result;
5529 }
5530
5531 // Not really a cost consideration, but check for unsupported cases here to
5532 // simplify the logic.
5533 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5534 LLVM_DEBUG(
5535 dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5536 "not a supported candidate.\n");
5537 return Result;
5538 }
5539
5540 if (EpilogueVectorizationForceVF > 1) {
5541 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
5542 ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5543 if (LVP.hasPlanWithVF(ForcedEC))
5544 return {ForcedEC, 0, 0};
5545 else {
5546 LLVM_DEBUG(
5547 dbgs()
5548 << "LEV: Epilogue vectorization forced factor is not viable.\n");
5549 return Result;
5550 }
5551 }
5552
5553 if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5554 TheLoop->getHeader()->getParent()->hasMinSize()) {
5555 LLVM_DEBUG(
5556 dbgs()
5557 << "LEV: Epilogue vectorization skipped due to opt for size.\n");
5558 return Result;
5559 }
5560
5561 if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
5562 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5563 "this loop\n");
5564 return Result;
5565 }
5566
5567 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5568 // the main loop handles 8 lanes per iteration. We could still benefit from
5569 // vectorizing the epilogue loop with VF=4.
5570 ElementCount EstimatedRuntimeVF = MainLoopVF;
5571 if (MainLoopVF.isScalable()) {
5572 EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5573 if (std::optional<unsigned> VScale = getVScaleForTuning())
5574 EstimatedRuntimeVF *= *VScale;
5575 }
5576
5577 for (auto &NextVF : ProfitableVFs)
5578 if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5579 ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
5580 ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
5581 (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
5582 LVP.hasPlanWithVF(NextVF.Width))
5583 Result = NextVF;
5584
5585 if (Result != VectorizationFactor::Disabled())
5586 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5587 << Result.Width << "\n");
5588 return Result;
5589}
5590
5591std::pair<unsigned, unsigned>
5592LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5593 unsigned MinWidth = -1U;
5594 unsigned MaxWidth = 8;
5595 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5596 // For in-loop reductions, no element types are added to ElementTypesInLoop
5597 // if there are no loads/stores in the loop. In this case, check through the
5598 // reduction variables to determine the maximum width.
5599 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5600 // Reset MaxWidth so that we can find the smallest type used by recurrences
5601 // in the loop.
5602 MaxWidth = -1U;
5603 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
5604 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5605 // When finding the min width used by the recurrence we need to account
5606 // for casts on the input operands of the recurrence.
5607 MaxWidth = std::min<unsigned>(
5608 MaxWidth, std::min<unsigned>(
5609 RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5610 RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5611 }
5612 } else {
5613 for (Type *T : ElementTypesInLoop) {
5614 MinWidth = std::min<unsigned>(
5615 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5616 MaxWidth = std::max<unsigned>(
5617 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5618 }
5619 }
5620 return {MinWidth, MaxWidth};
5621}
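// Editor's illustration (hypothetical loop): a loop loading i8 elements and
// storing i32 elements records both types in ElementTypesInLoop, so this
// returns {8, 32}; a load/store-free loop with only an i16 in-loop reduction
// would instead take the reduction path above and report MaxWidth = 16.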
5622
5623void LoopVectorizationCostModel::collectElementTypesForWidening() {
5624 ElementTypesInLoop.clear();
5625 // For each block.
5626 for (BasicBlock *BB : TheLoop->blocks()) {
5627 // For each instruction in the loop.
5628 for (Instruction &I : BB->instructionsWithoutDebug()) {
5629 Type *T = I.getType();
5630
5631 // Skip ignored values.
5632 if (ValuesToIgnore.count(&I))
5633 continue;
5634
5635 // Only examine Loads, Stores and PHINodes.
5636 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5637 continue;
5638
5639 // Examine PHI nodes that are reduction variables. Update the type to
5640 // account for the recurrence type.
5641 if (auto *PN = dyn_cast<PHINode>(&I)) {
5642 if (!Legal->isReductionVariable(PN))
5643 continue;
5644 const RecurrenceDescriptor &RdxDesc =
5645 Legal->getReductionVars().find(PN)->second;
5646 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5647 TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5648 RdxDesc.getRecurrenceType(),
5649 TargetTransformInfo::ReductionFlags()))
5650 continue;
5651 T = RdxDesc.getRecurrenceType();
5652 }
5653
5654 // Examine the stored values.
5655 if (auto *ST = dyn_cast<StoreInst>(&I))
5656 T = ST->getValueOperand()->getType();
5657
5658 assert(T->isSized() &&
5659 "Expected the load/store/recurrence type to be sized");
5660
5661 ElementTypesInLoop.insert(T);
5662 }
5663 }
5664}
5665
5666unsigned
5667LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5668 InstructionCost LoopCost) {
5669 // -- The interleave heuristics --
5670 // We interleave the loop in order to expose ILP and reduce the loop overhead.
5671 // There are many micro-architectural considerations that we can't predict
5672 // at this level. For example, frontend pressure (on decode or fetch) due to
5673 // code size, or the number and capabilities of the execution ports.
5674 //
5675 // We use the following heuristics to select the interleave count:
5676 // 1. If the code has reductions, then we interleave to break the cross
5677 // iteration dependency.
5678 // 2. If the loop is really small, then we interleave to reduce the loop
5679 // overhead.
5680 // 3. We don't interleave if we think that we will spill registers to memory
5681 // due to the increased register pressure.
5682
5683 if (!isScalarEpilogueAllowed())
5684 return 1;
5685
5686 // We used the distance for the interleave count.
5687 if (Legal->getMaxSafeDepDistBytes() != -1U)
5688 return 1;
5689
5690 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5691 const bool HasReductions = !Legal->getReductionVars().empty();
5692 // Do not interleave loops with a relatively small known or estimated trip
5693 // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5694 // enabled, and the code has scalar reductions (HasReductions && VF == 1),
5695 // because with the above conditions interleaving can expose ILP and break
5696 // cross iteration dependences for reductions.
5697 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5698 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5699 return 1;
5700
5701 // If we did not calculate the cost for VF (because the user selected the VF)
5702 // then we calculate the cost of VF here.
5703 if (LoopCost == 0) {
5704 LoopCost = expectedCost(VF).first;
5705 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5706
5707 // Loop body is free and there is no need for interleaving.
5708 if (LoopCost == 0)
5709 return 1;
5710 }
5711
5712 RegisterUsage R = calculateRegisterUsage({VF})[0];
5713 // We divide by these constants so assume that we have at least one
5714 // instruction that uses at least one register.
5715 for (auto& pair : R.MaxLocalUsers) {
5716 pair.second = std::max(pair.second, 1U);
5717 }
5718
5719 // We calculate the interleave count using the following formula.
5720 // Subtract the number of loop invariants from the number of available
5721 // registers. These registers are used by all of the interleaved instances.
5722 // Next, divide the remaining registers by the number of registers that is
5723 // required by the loop, in order to estimate how many parallel instances
5724 // fit without causing spills. All of this is rounded down if necessary to be
5725 // a power of two. We want power of two interleave count to simplify any
5726 // addressing operations or alignment considerations.
5727 // We also want power of two interleave counts to ensure that the induction
5728 // variable of the vector loop wraps to zero, when tail is folded by masking;
5729 // this currently happens when OptForSize, in which case IC is set to 1 above.
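// Worked example (editor's illustration with made-up numbers): with 32
// vector registers, 2 loop-invariant values, and a maximum of 6 registers
// live at once, (32 - 2) / 6 = 5 interleaved copies fit, which
// PowerOf2Floor rounds down to an interleave count of 4.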
5730 unsigned IC = UINT_MAX;
5731
5732 for (auto& pair : R.MaxLocalUsers) {
5733 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5734 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5735 << " registers of "
5736 << TTI.getRegisterClassName(pair.first) << " register class\n");
5737 if (VF.isScalar()) {
5738 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5739 TargetNumRegisters = ForceTargetNumScalarRegs;
5740 } else {
5741 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5742 TargetNumRegisters = ForceTargetNumVectorRegs;
5743 }
5744 unsigned MaxLocalUsers = pair.second;
5745 unsigned LoopInvariantRegs = 0;
5746 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5747 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5748
5749 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5750 // Don't count the induction variable as interleaved.
5751 if (EnableIndVarRegisterHeur) {
5752 TmpIC =
5753 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5754 std::max(1U, (MaxLocalUsers - 1)));
5755 }
5756
5757 IC = std::min(IC, TmpIC);
5758 }
5759
5760 // Clamp the interleave ranges to reasonable counts.
5761 unsigned MaxInterleaveCount =
5762 TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5763
5764 // Check if the user has overridden the max.
5765 if (VF.isScalar()) {
5766 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5767 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5768 } else {
5769 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5770 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5771 }
5772
5773 // If trip count is known or estimated compile time constant, limit the
5774 // interleave count to be less than the trip count divided by VF, provided it
5775 // is at least 1.
5776 //
5777 // For scalable vectors we can't know if interleaving is beneficial. It may
5778 // not be beneficial for small loops if none of the lanes in the second vector
5779 // iteration is enabled. However, for larger loops, there is likely to be a
5780 // similar benefit as for fixed-width vectors. For now, we choose to leave
5781 // the InterleaveCount as if vscale is '1', although if some information about
5782 // the vector is known (e.g. min vector size), we can make a better decision.
5783 if (BestKnownTC) {
5784 MaxInterleaveCount =
5785 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5786 // Make sure MaxInterleaveCount is greater than 0.
5787 MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5788 }
5789
5790 assert(MaxInterleaveCount > 0 &&
5791 "Maximum interleave count must be greater than 0");
5792
5793 // Clamp the calculated IC to be between the 1 and the max interleave count
5794 // that the target and trip count allows.
5795 if (IC > MaxInterleaveCount)
5796 IC = MaxInterleaveCount;
5797 else
5798 // Make sure IC is greater than 0.
5799 IC = std::max(1u, IC);
5800
5801 assert(IC > 0 && "Interleave count must be greater than 0.");
5802
5803 // Interleave if we vectorized this loop and there is a reduction that could
5804 // benefit from interleaving.
5805 if (VF.isVector() && HasReductions) {
5806 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5807 return IC;
5808 }
5809
5810 // For any scalar loop that either requires runtime checks or predication we
5811 // are better off leaving this to the unroller. Note that if we've already
5812 // vectorized the loop we will have done the runtime check and so interleaving
5813 // won't require further checks.
5814 bool ScalarInterleavingRequiresPredication =
5815 (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5816 return Legal->blockNeedsPredication(BB);
5817 }));
5818 bool ScalarInterleavingRequiresRuntimePointerCheck =
5819 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5820
5821 // We want to interleave small loops in order to reduce the loop overhead and
5822 // potentially expose ILP opportunities.
5823 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5824 << "LV: IC is " << IC << '\n'
5825 << "LV: VF is " << VF << '\n');
5826 const bool AggressivelyInterleaveReductions =
5827 TTI.enableAggressiveInterleaving(HasReductions);
5828 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5829 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5830 // We assume that the cost overhead is 1 and we use the cost model
5831 // to estimate the cost of the loop and interleave until the cost of the
5832 // loop overhead is about 5% of the cost of the loop.
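// Editor's illustration (assumed values): with SmallLoopCost = 20 and a
// computed LoopCost of 6, PowerOf2Floor(20 / 6) = PowerOf2Floor(3) = 2,
// so SmallIC below becomes min(IC, 2).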
5833 unsigned SmallIC = std::min(
5834 IC, (unsigned)PowerOf2Floor(SmallLoopCost / *LoopCost.getValue()));
5835
5836 // Interleave until store/load ports (estimated by max interleave count) are
5837 // saturated.
5838 unsigned NumStores = Legal->getNumStores();
5839 unsigned NumLoads = Legal->getNumLoads();
5840 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5841 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5842
5843 // There is little point in interleaving for reductions containing selects
5844 // and compares when VF=1 since it may just create more overhead than it's
5845 // worth for loops with small trip counts. This is because we still have to
5846 // do the final reduction after the loop.
5847 bool HasSelectCmpReductions =
5848 HasReductions &&
5849 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5850 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5851 return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
5852 RdxDesc.getRecurrenceKind());
5853 });
5854 if (HasSelectCmpReductions) {
5855 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5856 return 1;
5857 }
5858
5859 // If we have a scalar reduction (vector reductions are already dealt with
5860 // by this point), we can increase the critical path length if the loop
5861 // we're interleaving is inside another loop. For tree-wise reductions
5862 // set the limit to 2, and for ordered reductions it's best to disable
5863 // interleaving entirely.
5864 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5865 bool HasOrderedReductions =
5866 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5867 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5868 return RdxDesc.isOrdered();
5869 });
5870 if (HasOrderedReductions) {
5871 LLVM_DEBUG(
5872 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5873 return 1;
5874 }
5875
5876 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5877 SmallIC = std::min(SmallIC, F);
5878 StoresIC = std::min(StoresIC, F);
5879 LoadsIC = std::min(LoadsIC, F);
5880 }
5881
5882 if (EnableLoadStoreRuntimeInterleave &&
5883 std::max(StoresIC, LoadsIC) > SmallIC) {
5884 LLVM_DEBUG(
5885 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5886 return std::max(StoresIC, LoadsIC);
5887 }
5888
5889 // If there are scalar reductions and TTI has enabled aggressive
5890 // interleaving for reductions, we will interleave to expose ILP.
5891 if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5892 AggressivelyInterleaveReductions) {
5893 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5894 // Interleave no less than SmallIC but not as aggressively as the normal IC
5895 // to satisfy the rare situation when resources are too limited.
5896 return std::max(IC / 2, SmallIC);
5897 } else {
5898 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5899 return SmallIC;
5900 }
5901 }
5902
5903 // Interleave if this is a large loop (small loops are already dealt with by
5904 // this point) that could benefit from interleaving.
5905 if (AggressivelyInterleaveReductions) {
5906 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5907 return IC;
5908 }
5909
5910 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5911 return 1;
5912}
5913
5914SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5915LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5916 // This function calculates the register usage by measuring the highest number
5917 // of values that are alive at a single location. Obviously, this is a very
5918 // rough estimation. We scan the loop in topological order and assign a
5919 // number to each instruction. We use RPO to ensure that defs are
5920 // met before their users. We assume that each instruction that has in-loop
5921 // users starts an interval. We record every time that an in-loop value is
5922 // used, so we have a list of the first and last occurrences of each
5923 // instruction. Next, we transpose this data structure into a multi map that
5924 // holds the list of intervals that *end* at a specific location. This multi
5925 // map allows us to perform a linear search. We scan the instructions linearly
5926 // and record each time that a new interval starts, by placing it in a set.
5927 // If we find this value in the multi-map then we remove it from the set.
5928 // The max register usage is the maximum size of the set.
5929 // We also search for instructions that are defined outside the loop, but are
5930 // used inside the loop. We need this number separately from the max-interval
5931 // usage number because when we unroll, loop-invariant values do not take
5932 // more registers.
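// Editor's sketch of the interval bookkeeping below (hypothetical loop with
// instructions a, b, c numbered 0..2, where c uses both a and b): EndPoint
// maps a and b to the index at which c consumes them, so when the linear
// scan reaches c it closes both intervals; the peak size of the
// open-interval set (2 here) is the usage estimate for that register class.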
5933 LoopBlocksDFS DFS(TheLoop);
5934 DFS.perform(LI);
5935
5936 RegisterUsage RU;
5937
5938 // Each 'key' in the map opens a new interval. The values
5939 // of the map are the index of the 'last seen' usage of the
5940 // instruction that is the key.
5941 using IntervalMap = DenseMap<Instruction *, unsigned>;
5942
5943 // Maps instruction to its index.
5944 SmallVector<Instruction *, 64> IdxToInstr;
5945 // Marks the end of each interval.
5946 IntervalMap EndPoint;
5947 // Saves the list of instruction indices that are used in the loop.
5948 SmallPtrSet<Instruction *, 8> Ends;
5949 // Saves the list of values that are used in the loop but are defined outside
5950 // the loop (not including non-instruction values such as arguments and
5951 // constants).
5952 SmallPtrSet<Value *, 8> LoopInvariants;
5953
5954 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5955 for (Instruction &I : BB->instructionsWithoutDebug()) {
5956 IdxToInstr.push_back(&I);
5957
5958 // Save the end location of each USE.
5959 for (Value *U : I.operands()) {
5960 auto *Instr = dyn_cast<Instruction>(U);
5961
5962 // Ignore non-instruction values such as arguments, constants, etc.
5963 // FIXME: Might need some motivation why these values are ignored. If
5964 // for example an argument is used inside the loop it will increase the
5965 // register pressure (so shouldn't we add it to LoopInvariants).
5966 if (!Instr)
5967 continue;
5968
5969 // If this instruction is outside the loop then record it and continue.
5970 if (!TheLoop->contains(Instr)) {
5971 LoopInvariants.insert(Instr);
5972 continue;
5973 }
5974
5975 // Overwrite previous end points.
5976 EndPoint[Instr] = IdxToInstr.size();
5977 Ends.insert(Instr);
5978 }
5979 }
5980 }
5981
5982 // Saves the list of intervals that end with the index in 'key'.
5983 using InstrList = SmallVector<Instruction *, 2>;
5984 DenseMap<unsigned, InstrList> TransposeEnds;
5985
5986 // Transpose the EndPoints to a list of values that end at each index.
5987 for (auto &Interval : EndPoint)
5988 TransposeEnds[Interval.second].push_back(Interval.first);
5989
5990 SmallPtrSet<Instruction *, 8> OpenIntervals;
5991 SmallVector<RegisterUsage, 8> RUs(VFs.size());
5992 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5993
5994 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5995
5996 const auto &TTICapture = TTI;
5997 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5998 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5999 return 0;
6000 return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
6001 };
6002
6003 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6004 Instruction *I = IdxToInstr[i];
6005
6006 // Remove all of the instructions that end at this location.
6007 InstrList &List = TransposeEnds[i];
6008 for (Instruction *ToRemove : List)
6009 OpenIntervals.erase(ToRemove);
6010
6011 // Ignore instructions that are never used within the loop.
6012 if (!Ends.count(I))
6013 continue;
6014
6015 // Skip ignored values.
6016 if (ValuesToIgnore.count(I))
6017 continue;
6018
6019 // For each VF find the maximum usage of registers.
6020 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6021 // Count the number of registers used, per register class, given all open
6022 // intervals.
6023 // Note that elements in this SmallMapVector will be default constructed
6024 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
6025 // there is no previous entry for ClassID.
6026 SmallMapVector<unsigned, unsigned, 4> RegUsage;
6027
6028 if (VFs[j].isScalar()) {
6029 for (auto *Inst : OpenIntervals) {
6030 unsigned ClassID =
6031 TTI.getRegisterClassForType(false, Inst->getType());
6032 // FIXME: The target might use more than one register for the type
6033 // even in the scalar case.
6034 RegUsage[ClassID] += 1;
6035 }
6036 } else {
6037 collectUniformsAndScalars(VFs[j]);
6038 for (auto *Inst : OpenIntervals) {
6039 // Skip ignored values for VF > 1.
6040 if (VecValuesToIgnore.count(Inst))
6041 continue;
6042 if (isScalarAfterVectorization(Inst, VFs[j])) {
6043 unsigned ClassID =
6044 TTI.getRegisterClassForType(false, Inst->getType());
6045 // FIXME: The target might use more than one register for the type
6046 // even in the scalar case.
6047 RegUsage[ClassID] += 1;
6048 } else {
6049 unsigned ClassID =
6050 TTI.getRegisterClassForType(true, Inst->getType());
6051 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6052 }
6053 }
6054 }
6055
6056 for (auto& pair : RegUsage) {
6057 auto &Entry = MaxUsages[j][pair.first];
6058 Entry = std::max(Entry, pair.second);
6059 }
6060 }
6061
6062 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6063 << OpenIntervals.size() << '\n');
6064
6065 // Add the current instruction to the list of open intervals.
6066 OpenIntervals.insert(I);
6067 }
6068
6069 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6070 // Note that elements in this SmallMapVector will be default constructed
6071 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
6072 // there is no previous entry for ClassID.
6073 SmallMapVector<unsigned, unsigned, 4> Invariant;
6074
6075 for (auto *Inst : LoopInvariants) {
6076 // FIXME: The target might use more than one register for the type
6077 // even in the scalar case.
6078 unsigned Usage =
6079 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6080 unsigned ClassID =
6081 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6082 Invariant[ClassID] += Usage;
6083 }
6084
6085    LLVM_DEBUG({
6086      dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6087      dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6088             << " item\n";
6089      for (const auto &pair : MaxUsages[i]) {
6090        dbgs() << "LV(REG): RegisterClass: "
6091               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6092               << " registers\n";
6093      }
6094      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6095             << " item\n";
6096      for (const auto &pair : Invariant) {
6097        dbgs() << "LV(REG): RegisterClass: "
6098               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6099               << " registers\n";
6100      }
6101    });
6102
6103 RU.LoopInvariantRegs = Invariant;
6104 RU.MaxLocalUsers = MaxUsages[i];
6105 RUs[i] = RU;
6106 }
6107
6108 return RUs;
6109}
6110
6111bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
6112 ElementCount VF) {
6113 // TODO: Cost model for emulated masked load/store is completely
6114 // broken. This hack guides the cost model to use an artificially
6115 // high enough value to practically disable vectorization with such
6116 // operations, except where the previously deployed legality hack allowed
6117 // using very low cost values. This is to avoid regressions coming simply
6118 // from moving the "masked load/store" check from legality to the cost model.
6119 // Masked Load/Gather emulation was previously never allowed.
6120 // Only a limited number of emulated Masked Store/Scatter operations were allowed.
6121 assert((isPredicatedInst(I)) &&
6122        "Expecting a scalar emulated instruction");
6123 return isa<LoadInst>(I) ||
6124 (isa<StoreInst>(I) &&
6125 NumPredStores > NumberOfStoresToPredicate);
6126}
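// Illustrative sketch, not part of the original source: the "emulated"
// lowering this hook penalizes. A masked load the target cannot lower
// natively is scalarized into a per-lane compare, branch, and scalar
// memory op, roughly:
//
//   for (unsigned Lane = 0; Lane < VF; ++Lane) {
//     if (MaskBit(Lane))            // extractelement of the i1 mask
//       Result[Lane] = Ptr[Lane];   // scalar load in a predicated block
//   }
//
// When this hook returns true, getMemInstScalarizationCost() below replaces
// the computed cost with a large constant to effectively forbid such plans.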
6127
6128void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6129 // If we aren't vectorizing the loop, or if we've already collected the
6130 // instructions to scalarize, there's nothing to do. Collection may already
6131 // have occurred if we have a user-selected VF and are now computing the
6132 // expected cost for interleaving.
6133 if (VF.isScalar() || VF.isZero() ||
6134 InstsToScalarize.find(VF) != InstsToScalarize.end())
6135 return;
6136
6137 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6138 // not profitable to scalarize any instructions, the presence of VF in the
6139 // map will indicate that we've analyzed it already.
6140 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6141
6142 PredicatedBBsAfterVectorization[VF].clear();
6143
6144 // Find all the instructions that are scalar with predication in the loop and
6145 // determine if it would be better to not if-convert the blocks they are in.
6146 // If so, we also record the instructions to scalarize.
6147 for (BasicBlock *BB : TheLoop->blocks()) {
6148 if (!blockNeedsPredicationForAnyReason(BB))
6149 continue;
6150 for (Instruction &I : *BB)
6151 if (isScalarWithPredication(&I, VF)) {
6152 ScalarCostsTy ScalarCosts;
6153 // Do not apply discount if scalable, because that would lead to
6154 // invalid scalarization costs.
6155 // Do not apply discount logic if hacked cost is needed
6156 // for emulated masked memrefs.
6157 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6158 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6159 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6160 // Remember that BB will remain after vectorization.
6161 PredicatedBBsAfterVectorization[VF].insert(BB);
6162 }
6163 }
6164}
6165
6166InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
6167 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6168 assert(!isUniformAfterVectorization(PredInst, VF) &&
6169        "Instruction marked uniform-after-vectorization will be predicated");
6170
6171 // Initialize the discount to zero, meaning that the scalar version and the
6172 // vector version cost the same.
6173 InstructionCost Discount = 0;
6174
6175 // Holds instructions to analyze. The instructions we visit are mapped in
6176 // ScalarCosts. Those instructions are the ones that would be scalarized if
6177 // we find that the scalar version costs less.
6178 SmallVector<Instruction *, 8> Worklist;
6179
6180 // Returns true if the given instruction can be scalarized.
6181 auto canBeScalarized = [&](Instruction *I) -> bool {
6182 // We only attempt to scalarize instructions forming a single-use chain
6183 // from the original predicated block that would otherwise be vectorized.
6184 // Although not strictly necessary, we give up on instructions we know will
6185 // already be scalar to avoid traversing chains that are unlikely to be
6186 // beneficial.
6187 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6188 isScalarAfterVectorization(I, VF))
6189 return false;
6190
6191 // If the instruction is scalar with predication, it will be analyzed
6192 // separately. We ignore it within the context of PredInst.
6193 if (isScalarWithPredication(I, VF))
6194 return false;
6195
6196 // If any of the instruction's operands are uniform after vectorization,
6197 // the instruction cannot be scalarized. This prevents, for example, a
6198 // masked load from being scalarized.
6199 //
6200 // We assume we will only emit a value for lane zero of an instruction
6201 // marked uniform after vectorization, rather than VF identical values.
6202 // Thus, if we scalarize an instruction that uses a uniform, we would
6203 // create uses of values corresponding to the lanes we aren't emitting code
6204 // for. This behavior can be changed by allowing getScalarValue to clone
6205 // the lane zero values for uniforms rather than asserting.
6206 for (Use &U : I->operands())
6207 if (auto *J = dyn_cast<Instruction>(U.get()))
6208 if (isUniformAfterVectorization(J, VF))
6209 return false;
6210
6211 // Otherwise, we can scalarize the instruction.
6212 return true;
6213 };
6214
6215 // Compute the expected cost discount from scalarizing the entire expression
6216 // feeding the predicated instruction. We currently only consider expressions
6217 // that are single-use instruction chains.
6218 Worklist.push_back(PredInst);
6219 while (!Worklist.empty()) {
6220 Instruction *I = Worklist.pop_back_val();
6221
6222 // If we've already analyzed the instruction, there's nothing to do.
6223 if (ScalarCosts.find(I) != ScalarCosts.end())
6224 continue;
6225
6226 // Compute the cost of the vector instruction. Note that this cost already
6227 // includes the scalarization overhead of the predicated instruction.
6228 InstructionCost VectorCost = getInstructionCost(I, VF).first;
6229
6230 // Compute the cost of the scalarized instruction. This cost is the cost of
6231 // the instruction as if it wasn't if-converted and instead remained in the
6232 // predicated block. We will scale this cost by block probability after
6233 // computing the scalarization overhead.
6234 InstructionCost ScalarCost =
6235 VF.getFixedValue() *
6236 getInstructionCost(I, ElementCount::getFixed(1)).first;
6237
6238 // Compute the scalarization overhead of needed insertelement instructions
6239 // and phi nodes.
6240 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
6241 ScalarCost += TTI.getScalarizationOverhead(
6242 cast<VectorType>(ToVectorTy(I->getType(), VF)),
6243 APInt::getAllOnes(VF.getFixedValue()), true, false);
6244 ScalarCost +=
6245 VF.getFixedValue() *
6246 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6247 }
6248
6249 // Compute the scalarization overhead of needed extractelement
6250 // instructions. For each of the instruction's operands, if the operand can
6251 // be scalarized, add it to the worklist; otherwise, account for the
6252 // overhead.
6253 for (Use &U : I->operands())
6254 if (auto *J = dyn_cast<Instruction>(U.get())) {
6255 assert(VectorType::isValidElementType(J->getType()) &&
6256        "Instruction has non-scalar type");
6257 if (canBeScalarized(J))
6258 Worklist.push_back(J);
6259 else if (needsExtract(J, VF)) {
6260 ScalarCost += TTI.getScalarizationOverhead(
6261 cast<VectorType>(ToVectorTy(J->getType(), VF)),
6262 APInt::getAllOnes(VF.getFixedValue()), false, true);
6263 }
6264 }
6265
6266 // Scale the total scalar cost by block probability.
6267 ScalarCost /= getReciprocalPredBlockProb();
6268
6269 // Compute the discount. A non-negative discount means the vector version
6270 // of the instruction costs more, and scalarizing would be beneficial.
6271 Discount += VectorCost - ScalarCost;
6272 ScalarCosts[I] = ScalarCost;
6273 }
6274
6275 return Discount;
6276}
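// Illustrative worked example, not part of the original source, assuming
// VF = 4 and the default getReciprocalPredBlockProb() of 2 (a predicated
// block is modeled as executing on half the iterations):
//
//   VectorCost (if-converted, incl. scalarization overhead) = 10
//   ScalarCost = 4 lanes * 3 per lane = 12; scaled: 12 / 2 = 6
//   Discount += 10 - 6 = 4    // non-negative: scalarization is a win
//
// A non-negative total Discount makes collectInstsToScalarize() above
// record the whole chain in InstsToScalarize[VF].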
6277
6278LoopVectorizationCostModel::VectorizationCostTy
6279LoopVectorizationCostModel::expectedCost(
6280 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6281 VectorizationCostTy Cost;
6282
6283 // For each block.
6284 for (BasicBlock *BB : TheLoop->blocks()) {
6285 VectorizationCostTy BlockCost;
6286
6287 // For each instruction in the old loop.
6288 for (Instruction &I : BB->instructionsWithoutDebug()) {
6289 // Skip ignored values.
6290 if (ValuesToIgnore.count(&I) ||
6291 (VF.isVector() && VecValuesToIgnore.count(&I)))
6292 continue;
6293
6294 VectorizationCostTy C = getInstructionCost(&I, VF);
6295
6296 // Check if we should override the cost.
6297 if (C.first.isValid() &&
6298 ForceTargetInstructionCost.getNumOccurrences() > 0)
6299 C.first = InstructionCost(ForceTargetInstructionCost);
6300
6301 // Keep a list of instructions with invalid costs.
6302 if (Invalid && !C.first.isValid())
6303 Invalid->emplace_back(&I, VF);
6304
6305 BlockCost.first += C.first;
6306 BlockCost.second |= C.second;
6307 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6308                   << " for VF " << VF << " For instruction: " << I
6309                   << '\n');
6310 }
6311
6312 // If we are vectorizing a predicated block, it will have been
6313 // if-converted. This means that the block's instructions (aside from
6314 // stores and instructions that may divide by zero) will now be
6315 // unconditionally executed. For the scalar case, we may not always execute
6316 // the predicated block, if it is an if-else block. Thus, scale the block's
6317 // cost by the probability of executing it. blockNeedsPredication from
6318 // Legal is used so as not to include all blocks in tail-folded loops.
6319 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6320 BlockCost.first /= getReciprocalPredBlockProb();
6321
6322 Cost.first += BlockCost.first;
6323 Cost.second |= BlockCost.second;
6324 }
6325
6326 return Cost;
6327}
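// Illustrative worked example, not part of the original source: at scalar
// VF, a predicated block whose instructions sum to a cost of 8 contributes
// 8 / getReciprocalPredBlockProb() = 8 / 2 = 4 to the loop cost, since the
// block is assumed to execute on only about half of the iterations.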
6328
6329/// Gets Address Access SCEV after verifying that the access pattern
6330/// is loop invariant except the induction variable dependence.
6331///
6332/// This SCEV can be sent to the Target in order to estimate the address
6333/// calculation cost.
6334static const SCEV *getAddressAccessSCEV(
6335 Value *Ptr,
6336 LoopVectorizationLegality *Legal,
6337 PredicatedScalarEvolution &PSE,
6338 const Loop *TheLoop) {
6339
6340 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6341 if (!Gep)
6342 return nullptr;
6343
6344 // We are looking for a gep with all loop invariant indices except for one
6345 // which should be an induction variable.
6346 auto SE = PSE.getSE();
6347 unsigned NumOperands = Gep->getNumOperands();
6348 for (unsigned i = 1; i < NumOperands; ++i) {
6349 Value *Opd = Gep->getOperand(i);
6350 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6351 !Legal->isInductionVariable(Opd))
6352 return nullptr;
6353 }
6354
6355 // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6356 return PSE.getSCEV(Ptr);
6357}
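// Illustrative example, not part of the original source: a GEP this
// function accepts, with every index loop-invariant except the induction
// variable %iv:
//
//   %gep = getelementptr inbounds [64 x i32], ptr %base, i64 %inv, i64 %iv
//
// SE->isLoopInvariant() holds for %inv and Legal->isInductionVariable()
// holds for %iv, so the pointer's SCEV is returned for address-cost queries.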
6358
6359static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6360 return Legal->hasStride(I->getOperand(0)) ||
6361 Legal->hasStride(I->getOperand(1));
6362}
6363
6364InstructionCost
6365LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6366 ElementCount VF) {
6367 assert(VF.isVector() &&
6368        "Scalarization cost of instruction implies vectorization.");
6369 if (VF.isScalable())
6370 return InstructionCost::getInvalid();
6371
6372 Type *ValTy = getLoadStoreType(I);
6373 auto SE = PSE.getSE();
6374
6375 unsigned AS = getLoadStoreAddressSpace(I);
6376 Value *Ptr = getLoadStorePointerOperand(I);
6377 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6378 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6379 // that it is being called from this specific place.
6380
6381 // Figure out whether the access is strided and get the stride value
6382 // if it's known at compile time.
6383 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6384
6385 // Get the cost of the scalar memory instruction and address computation.
6386 InstructionCost Cost =
6387 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6388
6389 // Don't pass *I here, since it is scalar but will actually be part of a
6390 // vectorized loop where the user of it is a vectorized instruction.
6391 const Align Alignment = getLoadStoreAlignment(I);
6392 Cost += VF.getKnownMinValue() *
6393 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6394 AS, TTI::TCK_RecipThroughput);
6395
6396 // Get the overhead of the extractelement and insertelement instructions
6397 // we might create due to scalarization.
6398 Cost += getScalarizationOverhead(I, VF);
6399
6400 // If we have a predicated load/store, it will need extra i1 extracts and
6401 // conditional branches, but may not be executed for each vector lane. Scale
6402 // the cost by the probability of executing the predicated block.
6403 if (isPredicatedInst(I)) {
6404 Cost /= getReciprocalPredBlockProb();
6405
6406 // Add the cost of an i1 extract and a branch
6407 auto *Vec_i1Ty =
6408 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6409 Cost += TTI.getScalarizationOverhead(
6410 Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6411 /*Insert=*/false, /*Extract=*/true);
6412 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6413
6414 if (useEmulatedMaskMemRefHack(I, VF))
6415 // Artificially setting to a high enough value to practically disable
6416 // vectorization with such operations.
6417 Cost = 3000000;
6418 }
6419
6420 return Cost;
6421}
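// Illustrative worked example, not part of the original source, assuming
// VF = 4, an address-computation cost of 1, and a scalar memory-op cost
// of 1:
//
//   Cost = 4 * 1 (addresses) + 4 * 1 (scalar ops) + scalarization overhead
//
// If the access is predicated, Cost is divided by 2 (block probability),
// then the per-lane i1 extracts and one branch cost are added; finally
// useEmulatedMaskMemRefHack() may clamp the result to 3000000.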
6422
6423InstructionCost
6424LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6425 ElementCount VF) {
6426 Type *ValTy = getLoadStoreType(I);
6427 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6428 Value *Ptr = getLoadStorePointerOperand(I);
6429 unsigned AS = getLoadStoreAddressSpace(I);
6430 int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6431 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6432
6433 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6434        "Stride should be 1 or -1 for consecutive memory access");
6435 const Align Alignment = getLoadStoreAlignment(I);
6436 InstructionCost Cost = 0;
6437 if (Legal->isMaskRequired(I)) {
6438 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6439 CostKind);
6440 } else {
6441 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6442 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6443 CostKind, OpInfo, I);
6444 }
6445
6446 bool Reverse = ConsecutiveStride < 0;
6447 if (Reverse)
6448 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6449 std::nullopt, CostKind, 0);
6450 return Cost;
6451}
6452
6453InstructionCost
6454LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6455 ElementCount VF) {
6456 assert(Legal->isUniformMemOp(*I));
6457
6458 Type *ValTy = getLoadStoreType(I);
6459 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6460 const Align Alignment = getLoadStoreAlignment(I);
6461 unsigned AS = getLoadStoreAddressSpace(I);
6462 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6463 if (isa<LoadInst>(I)) {
6464 return TTI.getAddressComputationCost(ValTy) +
6465 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6466 CostKind) +
6467 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6468 }
6469 StoreInst *SI = cast<StoreInst>(I);
6470
6471 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6472 return TTI.getAddressComputationCost(ValTy) +
6473 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6474 CostKind) +
6475 (isLoopInvariantStoreValue
6476 ? 0
6477 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6478 VF.getKnownMinValue() - 1));
6479}
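// Illustrative worked example, not part of the original source: a uniform
// load at VF = 4 is charged one scalar address computation, one scalar
// load, and one SK_Broadcast shuffle to splat the value across the 4
// lanes. A uniform store whose stored value varies per iteration instead
// pays an extract of the last lane (index VF - 1 = 3), since only the
// final lane's store is observable.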
6480
6481InstructionCost
6482LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6483 ElementCount VF) {
6484 Type *ValTy = getLoadStoreType(I);
6485 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6486 const Align Alignment = getLoadStoreAlignment(I);
6487 const Value *Ptr = getLoadStorePointerOperand(I);
6488
6489 return TTI.getAddressComputationCost(VectorTy) +
6490 TTI.getGatherScatterOpCost(
6491 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6492 TargetTransformInfo::TCK_RecipThroughput, I);
6493}
6494
6495InstructionCost
6496LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6497 ElementCount VF) {
6498 // TODO: Once we have support for interleaving with scalable vectors
6499 // we can calculate the cost properly here.
6500 if (VF.isScalable())
6501 return InstructionCost::getInvalid();
6502
6503 Type *ValTy = getLoadStoreType(I);
6504 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6505 unsigned AS = getLoadStoreAddressSpace(I);
6506 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6507
6508 auto Group = getInterleavedAccessGroup(I);
6509 assert(Group && "Fail to get an interleaved access group.");
6510
6511 unsigned InterleaveFactor = Group->getFactor();
6512 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6513
6514 // Holds the indices of existing members in the interleaved group.
6515 SmallVector<unsigned, 4> Indices;
6516 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6517 if (Group->getMember(IF))
6518 Indices.push_back(IF);
6519
6520 // Calculate the cost of the whole interleaved group.
6521 bool UseMaskForGaps =
6522 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6523 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6524 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6525 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6526 AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
6527
6528 if (Group->isReverse()) {
6529 // TODO: Add support for reversed masked interleaved access.
6530 assert(!Legal->isMaskRequired(I) &&
6531        "Reverse masked interleaved access not supported.");
6532 Cost += Group->getNumMembers() *
6533 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6534 std::nullopt, CostKind, 0);
6535 }
6536 return Cost;
6537}
6538
6539std::optional<InstructionCost>
6540LoopVectorizationCostModel::getReductionPatternCost(
6541 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6542 using namespace llvm::PatternMatch;
6543 // Early exit for no inloop reductions
6544 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6545 return std::nullopt;
6546 auto *VectorTy = cast<VectorType>(Ty);
6547
6548 // We are looking for one of the following patterns, finding the minimal acceptable cost:
6549 // reduce(mul(ext(A), ext(B))) or
6550 // reduce(mul(A, B)) or
6551 // reduce(ext(A)) or
6552 // reduce(A).
6553 // The basic idea is that we walk down the tree to do that, finding the root
6554 // reduction instruction in InLoopReductionImmediateChains. From there we find
6555 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6556 // of the components. If the reduction cost is lower then we return it for the
6557 // reduction instruction and 0 for the other instructions in the pattern. If
6558 // it is not, we return an invalid cost specifying that the original cost method
6559 // should be used.
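// Illustrative example, not part of the original source: the first pattern
// above, reduce.add(mul(ext(A), ext(B))), written as IR:
//
//   %a.ext = sext <8 x i8> %a to <8 x i32>
//   %b.ext = sext <8 x i8> %b to <8 x i32>
//   %mul   = mul <8 x i32> %a.ext, %b.ext
//   %red   = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %mul)
//
// If TTI.getMulAccReductionCost() beats the summed component costs, the
// whole pattern is charged to the reduction and the ext/mul instructions
// report a cost of 0.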
6560 Instruction *RetI = I;
6561 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6562 if (!RetI->hasOneUser())
6563 return std::nullopt;
6564 RetI = RetI->user_back();
6565 }
6566
6567 if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6568 RetI->user_back()->getOpcode() == Instruction::Add) {
6569 RetI = RetI->user_back();
6570 }
6571
6572 // Test if the found instruction is a reduction, and if not return an invalid
6573 // cost specifying the parent to use the original cost modelling.
6574 if (!InLoopReductionImmediateChains.count(RetI))
6575 return std::nullopt;
6576
6577 // Find the reduction this chain is a part of and calculate the basic cost of
6578 // the reduction on its own.
6579 Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6580 Instruction *ReductionPhi = LastChain;
6581 while (!isa<PHINode>(ReductionPhi))
6582 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6583
6584 const RecurrenceDescriptor &RdxDesc =
6585 Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6586
6587 InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6588 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6589
6590 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6591 // normal fmul instruction to the cost of the fadd reduction.
6592 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6593 BaseCost +=
6594 TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6595
6596 // If we're using ordered reductions then we can just return the base cost
6597 // here, since getArithmeticReductionCost calculates the full ordered
6598 // reduction cost when FP reassociation is not allowed.
6599 if (useOrderedReductions(RdxDesc))
6600 return BaseCost;
6601
6602 // Get the operand that was not the reduction chain and match it to one of the
6603 // patterns, returning the better cost if it is found.
6604 Instruction *RedOp = RetI->getOperand(1) == LastChain
6605 ? dyn_cast<Instruction>(RetI->getOperand(0))
6606 : dyn_cast<Instruction>(RetI->getOperand(1));
6607
6608 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6609
6610 Instruction *Op0, *Op1;
6611 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6612 match(RedOp,
6613 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6614 match(Op0, m_ZExtOrSExt(m_Value())) &&
6615 Op0->getOpcode() == Op1->getOpcode() &&
6616 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6617 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6618 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6619
6620 // Matched reduce.add(ext(mul(ext(A), ext(B)))
6621 // Note that the extend opcodes need to all match, or if A==B they will have
6622 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6623 // which is equally fine.
6624 bool IsUnsigned = isa<ZExtInst>(Op0);
6625 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6626 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6627
6628 InstructionCost ExtCost =
6629 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6630 TTI::CastContextHint::None, CostKind, Op0);
6631 InstructionCost MulCost =
6632 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6633 InstructionCost Ext2Cost =
6634 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6635 TTI::CastContextHint::None, CostKind, RedOp);
6636
6637 InstructionCost RedCost = TTI.getMulAccReductionCost(
6638 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6639
6640 if (RedCost.isValid() &&
6641 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6642 return I == RetI ? RedCost : 0;
6643 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6644 !TheLoop->isLoopInvariant(RedOp)) {
6645 // Matched reduce(ext(A))
6646 bool IsUnsigned = isa<ZExtInst>(RedOp);
6647 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6648 InstructionCost RedCost = TTI.getExtendedReductionCost(
6649 RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6650 RdxDesc.getFastMathFlags(), CostKind);
6651
6652 InstructionCost ExtCost =
6653 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6654 TTI::CastContextHint::None, CostKind, RedOp);
6655 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6656 return I == RetI ? RedCost : 0;
6657 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6658 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6659 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6660 Op0->getOpcode() == Op1->getOpcode() &&
6661 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6662 bool IsUnsigned = isa<ZExtInst>(Op0);
6663 Type *Op0Ty = Op0->getOperand(0)->getType();
6664 Type *Op1Ty = Op1->getOperand(0)->getType();
6665 Type *LargestOpTy =
6666 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6667 : Op0Ty;
6668 auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6669
6670 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
6671 // different sizes. We take the largest type as the ext to reduce, and add
6672 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6673 InstructionCost ExtCost0 = TTI.getCastInstrCost(
6674 Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6675 TTI::CastContextHint::None, CostKind, Op0);
6676 InstructionCost ExtCost1 = TTI.getCastInstrCost(
6677 Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6678 TTI::CastContextHint::None, CostKind, Op1);
6679 InstructionCost MulCost =
6680 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6681
6682 InstructionCost RedCost = TTI.getMulAccReductionCost(
6683 IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6684 InstructionCost ExtraExtCost = 0;
6685 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6686 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6687 ExtraExtCost = TTI.getCastInstrCost(
6688 ExtraExtOp->getOpcode(), ExtType,
6689 VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6690 TTI::CastContextHint::None, CostKind, ExtraExtOp);
6691 }
6692
6693 if (RedCost.isValid() &&
6694 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6695 return I == RetI ? RedCost : 0;
6696 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6697 // Matched reduce.add(mul())
6698 InstructionCost MulCost =
6699 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6700
6701 InstructionCost RedCost = TTI.getMulAccReductionCost(
6702 true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6703
6704 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6705 return I == RetI ? RedCost : 0;
6706 }
6707 }
6708
6709 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6710}
6711
6712InstructionCost
6713LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6714 ElementCount VF) {
6715 // Calculate scalar cost only. Vectorization cost should be ready at this
6716 // moment.
6717 if (VF.isScalar()) {
6718 Type *ValTy = getLoadStoreType(I);
6719 const Align Alignment = getLoadStoreAlignment(I);
6720 unsigned AS = getLoadStoreAddressSpace(I);
6721
6722 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6723 return TTI.getAddressComputationCost(ValTy) +
6724 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6725 TTI::TCK_RecipThroughput, OpInfo, I);
6726 }
6727 return getWideningCost(I, VF);
6728}
6729
6730LoopVectorizationCostModel::VectorizationCostTy
6731LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6732 ElementCount VF) {
6733 // If we know that this instruction will remain uniform, check the cost of
6734 // the scalar version.
6735 if (isUniformAfterVectorization(I, VF))
6736 VF = ElementCount::getFixed(1);
6737
6738 if (VF.isVector() && isProfitableToScalarize(I, VF))
6739 return VectorizationCostTy(InstsToScalarize[VF][I], false);
6740
6741 // Forced scalars do not have any scalarization overhead.
6742 auto ForcedScalar = ForcedScalars.find(VF);
6743 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6744 auto InstSet = ForcedScalar->second;
6745 if (InstSet.count(I))
6746 return VectorizationCostTy(
6747 (getInstructionCost(I, ElementCount::getFixed(1)).first *
6748 VF.getKnownMinValue()),
6749 false);
6750 }
6751
6752 Type *VectorTy;
6753 InstructionCost C = getInstructionCost(I, VF, VectorTy);
6754
6755 bool TypeNotScalarized = false;
6756 if (VF.isVector() && VectorTy->isVectorTy()) {
6757 if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6758 if (VF.isScalable())
6759 // <vscale x 1 x iN> is assumed to be profitable over iN because
6760 // scalable registers are a distinct register class from scalar ones.
6761 // If we ever find a target which wants to lower scalable vectors
6762 // back to scalars, we'll need to update this code to explicitly
6763 // ask TTI about the register class uses for each part.
6764 TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6765 else
6766 TypeNotScalarized = NumParts < VF.getKnownMinValue();
6767 } else
6768 C = InstructionCost::getInvalid();
6769 }
6770 return VectorizationCostTy(C, TypeNotScalarized);
6771}
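// Illustrative worked example, not part of the original source: with fixed
// VF = 8 and a v8i64 result on a target whose widest register is 256 bits,
// TTI.getNumberOfParts() returns 2, and 2 < 8 marks the type as not
// scalarized (it splits into parts, not into 8 scalars). A NumParts of 8
// or more would instead flag the instruction as effectively scalarized.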
6772
6773InstructionCost
6774LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6775 ElementCount VF) const {
6776
6777 // There is no mechanism yet to create a scalable scalarization loop,
6778 // so this is currently Invalid.
6779 if (VF.isScalable())
6780 return InstructionCost::getInvalid();
6781
6782 if (VF.isScalar())
6783 return 0;
6784
6785 InstructionCost Cost = 0;
6786 Type *RetTy = ToVectorTy(I->getType(), VF);
6787 if (!RetTy->isVoidTy() &&
6788 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6789 Cost += TTI.getScalarizationOverhead(
6790 cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
6791 false);
6792
6793 // Some targets keep addresses scalar.
6794 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6795 return Cost;
6796
6797 // Some targets support efficient element stores.
6798 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6799 return Cost;
6800
6801 // Collect operands to consider.
6802 CallInst *CI = dyn_cast<CallInst>(I);
6803 Instruction::op_range Ops = CI ? CI->args() : I->operands();
6804
6805 // Skip operands that do not require extraction/scalarization and do not incur
6806 // any overhead.
6807 SmallVector<Type *> Tys;
6808 for (auto *V : filterExtractingOperands(Ops, VF))
6809 Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6810 return Cost + TTI.getOperandsScalarizationOverhead(
6811 filterExtractingOperands(Ops, VF), Tys);
6812}
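// Illustrative example, not part of the original source: for a VF = 4
// instruction with a vector result and one vectorizable operand, the
// overhead modeled above is 4 inserts to rebuild the result vector plus 4
// extracts to feed the scalar copies, unless the target reports efficient
// vector element load/stores or prefers scalar addressing for the operand
// in question.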
6813
6814void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6815 if (VF.isScalar())
6816 return;
6817 NumPredStores = 0;
6818 for (BasicBlock *BB : TheLoop->blocks()) {
6819 // For each instruction in the old loop.
6820 for (Instruction &I : *BB) {
6821 Value *Ptr = getLoadStorePointerOperand(&I);
6822 if (!Ptr)
6823 continue;
6824
6825 // TODO: We should generate better code and update the cost model for
6826 // predicated uniform stores. Today they are treated as any other
6827 // predicated store (see added test cases in
6828 // invariant-store-vectorization.ll).
6829 if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6830 NumPredStores++;
6831
6832 if (Legal->isUniformMemOp(I)) {
6833 auto isLegalToScalarize = [&]() {
6834 if (!VF.isScalable())
6835 // Scalarization of fixed length vectors "just works".
6836 return true;
6837
6838 // We have dedicated lowering for unpredicated uniform loads and
6839 // stores. Note that even with tail folding we know that at least
6840 // one lane is active (i.e. generalized predication is not possible
6841 // here), and the logic below depends on this fact.
6842 if (!foldTailByMasking())
6843 return true;
6844
6845 // For scalable vectors, a uniform memop load is always
6846 // uniform-by-parts and we know how to scalarize that.
6847 if (isa<LoadInst>(I))
6848 return true;
6849
6850 // A uniform store isn't necessarily uniform-by-parts
6851 // and we can't assume scalarization.
6852 auto &SI = cast<StoreInst>(I);
6853 return TheLoop->isLoopInvariant(SI.getValueOperand());
6854 };
6855
6856 const InstructionCost GatherScatterCost =
6857 isLegalGatherOrScatter(&I, VF) ?
6858 getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6859
6860 // Load: Scalar load + broadcast
6861 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6862 // FIXME: This cost is a significant under-estimate for tail folded
6863 // memory ops.
6864 const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6865 getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6866
6867 // Choose the better solution for the current VF. Note that Invalid
6868 // costs compare as maximally large. If both are invalid, we get a
6869 // scalable invalid cost, which signals a failure and a vectorization abort.
6870 if (GatherScatterCost < ScalarizationCost)
6871 setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6872 else
6873 setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6874 continue;
6875 }
6876
6877 // We assume that widening is the best solution when possible.
6878 if (memoryInstructionCanBeWidened(&I, VF)) {
6879 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6880 int ConsecutiveStride = Legal->isConsecutivePtr(
6881 getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6882 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6883        "Expected consecutive stride.");
6884 InstWidening Decision =
6885 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6886 setWideningDecision(&I, VF, Decision, Cost);
6887 continue;
6888 }
6889
6890 // Choose between Interleaving, Gather/Scatter or Scalarization.
6891 InstructionCost InterleaveCost = InstructionCost::getInvalid();
6892 unsigned NumAccesses = 1;
6893 if (isAccessInterleaved(&I)) {
6894 auto Group = getInterleavedAccessGroup(&I);
6895 assert(Group && "Fail to get an interleaved access group.");
6896
6897 // Make one decision for the whole group.
6898 if (getWideningDecision(&I, VF) != CM_Unknown)
6899 continue;
6900
6901 NumAccesses = Group->getNumMembers();
6902 if (interleavedAccessCanBeWidened(&I, VF))
6903 InterleaveCost = getInterleaveGroupCost(&I, VF);
6904 }
6905
6906 InstructionCost GatherScatterCost =
6907 isLegalGatherOrScatter(&I, VF)
6908 ? getGatherScatterCost(&I, VF) * NumAccesses
6909 : InstructionCost::getInvalid();
6910
6911 InstructionCost ScalarizationCost =
6912 getMemInstScalarizationCost(&I, VF) * NumAccesses;
6913
6914 // Choose the better solution for the current VF,
6915 // write down this decision and use it during vectorization.
6916 InstructionCost Cost;
6917 InstWidening Decision;
6918 if (InterleaveCost <= GatherScatterCost &&
6919 InterleaveCost < ScalarizationCost) {
6920 Decision = CM_Interleave;
6921 Cost = InterleaveCost;
6922 } else if (GatherScatterCost < ScalarizationCost) {
6923 Decision = CM_GatherScatter;
6924 Cost = GatherScatterCost;
6925 } else {
6926 Decision = CM_Scalarize;
6927 Cost = ScalarizationCost;
6928 }
6929 // If the instruction belongs to an interleave group, the whole group
6930 // receives the same decision. The whole group receives the cost, but
6931 // the cost will actually be assigned to one instruction.
6932 if (auto Group = getInterleavedAccessGroup(&I))
6933 setWideningDecision(Group, VF, Decision, Cost);
6934 else
6935 setWideningDecision(&I, VF, Decision, Cost);
6936 }
6937 }
6938
6939 // Make sure that any load of address and any other address computation
6940 // remains scalar unless there is gather/scatter support. This avoids
6941 // inevitable extracts into address registers, and also has the benefit of
6942 // activating LSR more, since that pass can't optimize vectorized
6943 // addresses.
6944 if (TTI.prefersVectorizedAddressing())
6945 return;
6946
6947 // Start with all scalar pointer uses.
6948 SmallPtrSet<Instruction *, 8> AddrDefs;
6949 for (BasicBlock *BB : TheLoop->blocks())
6950 for (Instruction &I : *BB) {
6951 Instruction *PtrDef =
6952 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6953 if (PtrDef && TheLoop->contains(PtrDef) &&
6954 getWideningDecision(&I, VF) != CM_GatherScatter)
6955 AddrDefs.insert(PtrDef);
6956 }
6957
6958 // Add all instructions used to generate the addresses.
6959 SmallVector<Instruction *, 4> Worklist;
6960 append_range(Worklist, AddrDefs);
6961 while (!Worklist.empty()) {
6962 Instruction *I = Worklist.pop_back_val();
6963 for (auto &Op : I->operands())
6964 if (auto *InstOp = dyn_cast<Instruction>(Op))
6965 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6966 AddrDefs.insert(InstOp).second)
6967 Worklist.push_back(InstOp);
6968 }
6969
6970 for (auto *I : AddrDefs) {
6971 if (isa<LoadInst>(I)) {
6972 // Setting the desired widening decision should ideally be handled by
6973 // cost functions, but since this involves the task of finding out
6974 // if the loaded register is involved in an address computation, it is
6975 // instead changed here when we know this is the case.
6976 InstWidening Decision = getWideningDecision(I, VF);
6977 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6978 // Scalarize a widened load of address.
6979 setWideningDecision(
6980 I, VF, CM_Scalarize,
6981 (VF.getKnownMinValue() *
6982 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6983 else if (auto Group = getInterleavedAccessGroup(I)) {
6984 // Scalarize an interleave group of address loads.
6985 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6986 if (Instruction *Member = Group->getMember(I))
6987 setWideningDecision(
6988 Member, VF, CM_Scalarize,
6989 (VF.getKnownMinValue() *
6990 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6991 }
6992 }
6993 } else
6994 // Make sure I gets scalarized and a cost estimate without
6995 // scalarization overhead.
6996 ForcedScalars[VF].insert(I);
6997 }
6998}
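// Illustrative worked example, not part of the original source: for an
// interleave group with NumAccesses = 2 and hypothetical costs
// Interleave = 6, GatherScatter = 4 * 2 = 8, and Scalarization = 10, the
// checks above pick CM_Interleave (6 <= 8 and 6 < 10) and record that
// single decision for every member of the group.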
6999
7000InstructionCost
7001LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7002 Type *&VectorTy) {
7003 Type *RetTy = I->getType();
7004 if (canTruncateToMinimalBitwidth(I, VF))
7005 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7006 auto SE = PSE.getSE();
7007 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7008
7009 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7010 ElementCount VF) -> bool {
7011 if (VF.isScalar())
7012 return true;
7013
7014 auto Scalarized = InstsToScalarize.find(VF);
7015 assert(Scalarized != InstsToScalarize.end() &&
7016        "VF not yet analyzed for scalarization profitability");
7017 return !Scalarized->second.count(I) &&
7018 llvm::all_of(I->users(), [&](User *U) {
7019 auto *UI = cast<Instruction>(U);
7020 return !Scalarized->second.count(UI);
7021 });
7022 };
7023 (void) hasSingleCopyAfterVectorization;
7024
7025 if (isScalarAfterVectorization(I, VF)) {
7026 // With the exception of GEPs and PHIs, after scalarization there should
7027 // only be one copy of the instruction generated in the loop. This is
7028 // because the VF is either 1, or any instructions that need scalarizing
7029 // have already been dealt with by the time we get here. As a result,
7030 // it means we don't have to multiply the instruction cost by VF.
7031 assert(I->getOpcode() == Instruction::GetElementPtr ||
7032        I->getOpcode() == Instruction::PHI ||
7033        (I->getOpcode() == Instruction::BitCast &&
7034         I->getType()->isPointerTy()) ||
7035        hasSingleCopyAfterVectorization(I, VF));
7036 VectorTy = RetTy;
7037 } else
7038 VectorTy = ToVectorTy(RetTy, VF);
7039
7040 // TODO: We need to estimate the cost of intrinsic calls.
7041 switch (I->getOpcode()) {
7042 case Instruction::GetElementPtr:
7043 // We mark this instruction as zero-cost because the cost of GEPs in
7044 // vectorized code depends on whether the corresponding memory instruction
7045 // is scalarized or not. Therefore, we handle GEPs with the memory
7046 // instruction cost.
7047 return 0;
7048 case Instruction::Br: {
7049 // In cases of scalarized and predicated instructions, there will be VF
7050 // predicated blocks in the vectorized loop. Each branch around these
7051 // blocks also requires an extract of its vector compare i1 element.
7052 bool ScalarPredicatedBB = false;
7053 BranchInst *BI = cast<BranchInst>(I);
7054 if (VF.isVector() && BI->isConditional() &&
7055 (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
7056 PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
7057 ScalarPredicatedBB = true;
7058
7059 if (ScalarPredicatedBB) {
7060 // Not possible to scalarize scalable vector with predicated instructions.
7061 if (VF.isScalable())
7062 return InstructionCost::getInvalid();
7063 // Return cost for branches around scalarized and predicated blocks.
7064 auto *Vec_i1Ty =
7065 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7066 return (
7067 TTI.getScalarizationOverhead(
7068 Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
7069 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7070 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7071 // The back-edge branch will remain, as will all scalar branches.
7072 return TTI.getCFInstrCost(Instruction::Br, CostKind);
7073 else
7074 // This branch will be eliminated by if-conversion.
7075 return 0;
7076 // Note: We currently assume zero cost for an unconditional branch inside
7077 // a predicated block since it will become a fall-through, although we
7078 // may decide in the future to call TTI for all branches.
7079 }
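// A minimal, hypothetical sketch of the branch-cost rule above for a fixed
// VF (illustrative only, not part of this file): branches around scalarized,
// predicated blocks pay one i1 lane extract plus one scalar branch per lane.
//
//   unsigned predicatedBranchCost(unsigned ExtractOverhead,
//                                 unsigned ScalarBrCost, unsigned VF) {
//     // mirrors getScalarizationOverhead(...) + getCFInstrCost(Br) * VF
//     return ExtractOverhead + ScalarBrCost * VF;
//   }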
7080 case Instruction::PHI: {
7081 auto *Phi = cast<PHINode>(I);
7082
7083 // First-order recurrences are replaced by vector shuffles inside the loop.
7084 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
7085 SmallVector<int> Mask(VF.getKnownMinValue());
7086 std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
7087 return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
7088 cast<VectorType>(VectorTy), Mask, CostKind,
7089 VF.getKnownMinValue() - 1);
7090 }
7091
7092 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7093 // converted into select instructions. We require N - 1 selects per phi
7094 // node, where N is the number of incoming values.
7095 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7096 return (Phi->getNumIncomingValues() - 1) *
7097 TTI.getCmpSelInstrCost(
7098 Instruction::Select, ToVectorTy(Phi->getType(), VF),
7099 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7100 CmpInst::BAD_ICMP_PREDICATE, CostKind);
7101
7102 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7103 }
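// An illustrative sketch of the splice mask built above for first-order
// recurrences (not part of this file): std::iota starting at VF - 1 selects
// the last lane of the previous vector followed by the first VF - 1 lanes
// of the current one.
//
//   #include <numeric>
//   #include <vector>
//   std::vector<int> spliceMask(int VF) {
//     std::vector<int> Mask(VF);
//     std::iota(Mask.begin(), Mask.end(), VF - 1); // VF = 4 -> {3, 4, 5, 6}
//     return Mask;
//   }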
7104 case Instruction::UDiv:
7105 case Instruction::SDiv:
7106 case Instruction::URem:
7107 case Instruction::SRem:
7108 if (VF.isVector() && isPredicatedInst(I)) {
7109 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
7110 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
7111 ScalarCost : SafeDivisorCost;
7112 }
7113 // We've proven all lanes safe to speculate, fall through.
7114 [[fallthrough]];
7115 case Instruction::Add:
7116 case Instruction::FAdd:
7117 case Instruction::Sub:
7118 case Instruction::FSub:
7119 case Instruction::Mul:
7120 case Instruction::FMul:
7121 case Instruction::FDiv:
7122 case Instruction::FRem:
7123 case Instruction::Shl:
7124 case Instruction::LShr:
7125 case Instruction::AShr:
7126 case Instruction::And:
7127 case Instruction::Or:
7128 case Instruction::Xor: {
7129 // Since we will replace the stride by 1, the multiplication should go away.
7130 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7131 return 0;
7132
7133 // Detect reduction patterns
7134 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7135 return *RedCost;
7136
7137 // Certain instructions can be cheaper to vectorize if they have a constant
7138 // second vector operand. One example of this are shifts on x86.
7139 Value *Op2 = I->getOperand(1);
7140 auto Op2Info = TTI.getOperandInfo(Op2);
7141 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7142 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
7143
7144 SmallVector<const Value *, 4> Operands(I->operand_values());
7145 return TTI.getArithmeticInstrCost(
7146 I->getOpcode(), VectorTy, CostKind,
7147 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7148 Op2Info, Operands, I);
7149 }
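// A small sketch of the operand-info tweak above (illustrative only): a
// second operand that TTI classifies as an arbitrary value but that legality
// proves loop-invariant is upgraded to OK_UniformValue, letting targets
// (e.g. x86 shifts) report a cheaper vector cost.
//
//   enum OpKind { AnyValue, UniformValue };
//   OpKind classifyOp2(OpKind FromTTI, bool IsLoopUniform) {
//     return (FromTTI == AnyValue && IsLoopUniform) ? UniformValue : FromTTI;
//   }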
7150 case Instruction::FNeg: {
7151 return TTI.getArithmeticInstrCost(
7152 I->getOpcode(), VectorTy, CostKind,
7153 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7154 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7155 I->getOperand(0), I);
7156 }
7157 case Instruction::Select: {
7158 SelectInst *SI = cast<SelectInst>(I);
7159 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7160 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7161
7162 const Value *Op0, *Op1;
7163 using namespace llvm::PatternMatch;
7164 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7165 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7166 // select x, y, false --> x & y
7167 // select x, true, y --> x | y
7168 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
7169 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
7170 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7171 Op1->getType()->getScalarSizeInBits() == 1);
7172
7173 SmallVector<const Value *, 2> Operands{Op0, Op1};
7174 return TTI.getArithmeticInstrCost(
7175 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7176 CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
7177 }
7178
7179 Type *CondTy = SI->getCondition()->getType();
7180 if (!ScalarCond)
7181 CondTy = VectorType::get(CondTy, VF);
7182
7183 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7184 if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7185 Pred = Cmp->getPredicate();
7186 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7187 CostKind, I);
7188 }
7189 case Instruction::ICmp:
7190 case Instruction::FCmp: {
7191 Type *ValTy = I->getOperand(0)->getType();
7192 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7193 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7194 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7195 VectorTy = ToVectorTy(ValTy, VF);
7196 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7197 cast<CmpInst>(I)->getPredicate(), CostKind,
7198 I);
7199 }
7200 case Instruction::Store:
7201 case Instruction::Load: {
7202 ElementCount Width = VF;
7203 if (Width.isVector()) {
7204 InstWidening Decision = getWideningDecision(I, Width);
7205 assert(Decision != CM_Unknown &&
7206 "CM decision should be taken at this point");
7207 if (getWideningCost(I, VF) == InstructionCost::getInvalid())
7208 return InstructionCost::getInvalid();
7209 if (Decision == CM_Scalarize)
7210 Width = ElementCount::getFixed(1);
7211 }
7212 VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7213 return getMemoryInstructionCost(I, VF);
7214 }
7215 case Instruction::BitCast:
7216 if (I->getType()->isPointerTy())
7217 return 0;
7218 [[fallthrough]];
7219 case Instruction::ZExt:
7220 case Instruction::SExt:
7221 case Instruction::FPToUI:
7222 case Instruction::FPToSI:
7223 case Instruction::FPExt:
7224 case Instruction::PtrToInt:
7225 case Instruction::IntToPtr:
7226 case Instruction::SIToFP:
7227 case Instruction::UIToFP:
7228 case Instruction::Trunc:
7229 case Instruction::FPTrunc: {
7230 // Computes the CastContextHint from a Load/Store instruction.
7231 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7232 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7233 "Expected a load or a store!");
7234
7235 if (VF.isScalar() || !TheLoop->contains(I))
7236 return TTI::CastContextHint::Normal;
7237
7238 switch (getWideningDecision(I, VF)) {
7239 case LoopVectorizationCostModel::CM_GatherScatter:
7240 return TTI::CastContextHint::GatherScatter;
7241 case LoopVectorizationCostModel::CM_Interleave:
7242 return TTI::CastContextHint::Interleave;
7243 case LoopVectorizationCostModel::CM_Scalarize:
7244 case LoopVectorizationCostModel::CM_Widen:
7245 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7246 : TTI::CastContextHint::Normal;
7247 case LoopVectorizationCostModel::CM_Widen_Reverse:
7248 return TTI::CastContextHint::Reversed;
7249 case LoopVectorizationCostModel::CM_Unknown:
7250 llvm_unreachable("Instr did not go through cost modelling?");
7251 }
7252
7253 llvm_unreachable("Unhandled case!");
7254 };
7255
7256 unsigned Opcode = I->getOpcode();
7257 TTI::CastContextHint CCH = TTI::CastContextHint::None;
7258 // For Trunc, the context is the only user, which must be a StoreInst.
7259 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7260 if (I->hasOneUse())
7261 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7262 CCH = ComputeCCH(Store);
7263 }
7264 // For Z/Sext, the context is the operand, which must be a LoadInst.
7265 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7266 Opcode == Instruction::FPExt) {
7267 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7268 CCH = ComputeCCH(Load);
7269 }
7270
7271 // We optimize the truncation of induction variables having constant
7272 // integer steps. The cost of these truncations is the same as the scalar
7273 // operation.
7274 if (isOptimizableIVTruncate(I, VF)) {
7275 auto *Trunc = cast<TruncInst>(I);
7276 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7277 Trunc->getSrcTy(), CCH, CostKind, Trunc);
7278 }
7279
7280 // Detect reduction patterns
7281 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7282 return *RedCost;
7283
7284 Type *SrcScalarTy = I->getOperand(0)->getType();
7285 Type *SrcVecTy =
7286 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7287 if (canTruncateToMinimalBitwidth(I, VF)) {
7288 // This cast is going to be shrunk. This may remove the cast or it might
7289 // turn it into a slightly different cast. For example, if MinBW == 16,
7290 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7291 //
7292 // Calculate the modified src and dest types.
7293 Type *MinVecTy = VectorTy;
7294 if (Opcode == Instruction::Trunc) {
7295 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7296 VectorTy =
7297 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7298 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7299 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7300 VectorTy =
7301 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7302 }
7303 }
7304
7305 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7306 }
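// A hedged sketch of the bitwidth shrinking above (illustrative only): for
// zext/sext the source type is clamped up to MinBW and the destination down
// to MinBW, so a cast whose source and destination straddle MinBW is costed
// as a cast between MinBW-wide vector types.
//
//   #include <algorithm>
//   unsigned shrunkSrcBits(unsigned SrcBits, unsigned MinBW) {
//     return std::max(SrcBits, MinBW); // largestIntegerVectorType
//   }
//   unsigned shrunkDstBits(unsigned DstBits, unsigned MinBW) {
//     return std::min(DstBits, MinBW); // smallestIntegerVectorType
//   }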
7307 case Instruction::Call: {
7308 if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7309 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7310 return *RedCost;
7311 bool NeedToScalarize;
7312 CallInst *CI = cast<CallInst>(I);
7313 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7314 if (getVectorIntrinsicIDForCall(CI, TLI)) {
7315 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7316 return std::min(CallCost, IntrinsicCost);
7317 }
7318 return CallCost;
7319 }
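// Sketch of the call-cost choice above (illustrative only): when the call
// also maps to a vector intrinsic, the cheaper of the two lowerings wins.
//
//   #include <algorithm>
//   unsigned vectorCallCost(unsigned CallCost, bool HasIntrinsic,
//                           unsigned IntrinsicCost) {
//     return HasIntrinsic ? std::min(CallCost, IntrinsicCost) : CallCost;
//   }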
7320 case Instruction::ExtractValue:
7321 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7322 case Instruction::Alloca:
7323 // We cannot easily widen alloca to a scalable alloca, as
7324 // the result would need to be a vector of pointers.
7325 if (VF.isScalable())
7326 return InstructionCost::getInvalid();
7327 [[fallthrough]];
7328 default:
7329 // This opcode is unknown. Assume that it is the same as 'mul'.
7330 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7331 } // end of switch.
7332}
7333
7334char LoopVectorize::ID = 0;
7335
7336static const char lv_name[] = "Loop Vectorization";
7337
7338INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7339INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7340INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7341INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7342INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7343INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7344INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7345INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7346INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7347INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7348INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7349INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7350INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7351INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7352INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7353
7354namespace llvm {
7355
7356Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7357
7358Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7359 bool VectorizeOnlyWhenForced) {
7360 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7361}
7362
7363} // end namespace llvm
7364
7365void LoopVectorizationCostModel::collectValuesToIgnore() {
7366 // Ignore ephemeral values.
7367 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7368
7369 // Find all stores to invariant variables. Since they are going to sink
7370 // outside the loop, we do not need to calculate a cost for them.
7371 for (BasicBlock *BB : TheLoop->blocks())
7372 for (Instruction &I : *BB) {
7373 StoreInst *SI;
7374 if ((SI = dyn_cast<StoreInst>(&I)) &&
7375 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7376 ValuesToIgnore.insert(&I);
7377 }
7378
7379 // Ignore type-promoting instructions we identified during reduction
7380 // detection.
7381 for (const auto &Reduction : Legal->getReductionVars()) {
7382 const RecurrenceDescriptor &RedDes = Reduction.second;
7383 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7384 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7385 }
7386 // Ignore type-casting instructions we identified during induction
7387 // detection.
7388 for (const auto &Induction : Legal->getInductionVars()) {
7389 const InductionDescriptor &IndDes = Induction.second;
7390 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7391 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7392 }
7393}
7394
7395void LoopVectorizationCostModel::collectInLoopReductions() {
7396 for (const auto &Reduction : Legal->getReductionVars()) {
7397 PHINode *Phi = Reduction.first;
7398 const RecurrenceDescriptor &RdxDesc = Reduction.second;
7399
7400 // We don't collect reductions that are type promoted (yet).
7401 if (RdxDesc.getRecurrenceType() != Phi->getType())
7402 continue;
7403
7404 // If the target would prefer this reduction to happen "in-loop", then we
7405 // want to record it as such.
7406 unsigned Opcode = RdxDesc.getOpcode();
7407 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7408 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7409 TargetTransformInfo::ReductionFlags()))
7410 continue;
7411
7412 // Check that we can correctly put the reductions into the loop, by
7413 // finding the chain of operations that leads from the phi to the loop
7414 // exit value.
7415 SmallVector<Instruction *, 4> ReductionOperations =
7416 RdxDesc.getReductionOpChain(Phi, TheLoop);
7417 bool InLoop = !ReductionOperations.empty();
7418 if (InLoop) {
7419 InLoopReductionChains[Phi] = ReductionOperations;
7420 // Add the elements to InLoopReductionImmediateChains for cost modelling.
7421 Instruction *LastChain = Phi;
7422 for (auto *I : ReductionOperations) {
7423 InLoopReductionImmediateChains[I] = LastChain;
7424 LastChain = I;
7425 }
7426 }
7427 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7428 << " reduction for phi: " << *Phi << "\n");
7429 }
7430}
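// An illustrative model of the chain bookkeeping above (not part of this
// file): for a reduction chain phi -> add1 -> add2, each operation is mapped
// to its immediate predecessor in the chain.
//
//   #include <map>
//   #include <string>
//   #include <vector>
//   std::map<std::string, std::string>
//   immediateChains(const std::vector<std::string> &Ops) {
//     std::map<std::string, std::string> Chains;
//     std::string LastChain = "phi";
//     for (const auto &I : Ops) {
//       Chains[I] = LastChain; // {"add1" -> "phi", "add2" -> "add1"}
//       LastChain = I;
//     }
//     return Chains;
//   }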
7431
7432// TODO: we could return a pair of values that specify the max VF and
7433// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7434// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7435// doesn't have a cost model that can choose which plan to execute if
7436// more than one is generated.
7437static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7438 LoopVectorizationCostModel &CM) {
7439 unsigned WidestType;
7440 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7441 return WidestVectorRegBits / WidestType;
7442}
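// Worked example for determineVPlanVF (assumed register widths, for
// illustration only): with a 512-bit widest fixed-width register and a
// 32-bit widest scalar type, the chosen VF is 512 / 32 = 16; with 128-bit
// registers and 64-bit elements it is 2.
//
//   unsigned vplanVF(unsigned WidestVectorRegBits, unsigned WidestTypeBits) {
//     return WidestVectorRegBits / WidestTypeBits;
//   }
//   // vplanVF(512, 32) == 16, vplanVF(128, 64) == 2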
7443
7444VectorizationFactor
7445LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7446 assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7447 ElementCount VF = UserVF;
7448 // Outer loop handling: They may require CFG and instruction level
7449 // transformations before even evaluating whether vectorization is profitable.
7450 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7451 // the vectorization pipeline.
7452 if (!OrigLoop->isInnermost()) {
7453 // If the user doesn't provide a vectorization factor, determine a
7454 // reasonable one.
7455 if (UserVF.isZero()) {
7456 VF = ElementCount::getFixed(determineVPlanVF(
7457 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7458 .getFixedSize(),
7459 CM));
7460 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7461
7462 // Make sure we have a VF > 1 for stress testing.
7463 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7464 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7465 << "overriding computed VF.\n");
7466 VF = ElementCount::getFixed(4);
7467 }
7468 }
7469 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7470 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7471 "VF needs to be a power of two");
7472 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7473 << "VF " << VF << " to build VPlans.\n");
7474 buildVPlans(VF, VF);
7475
7476 // For VPlan build stress testing, we bail out after VPlan construction.
7477 if (VPlanBuildStressTest)
7478 return VectorizationFactor::Disabled();
7479
7480 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7481 }
7482
7483 LLVM_DEBUG(
7484 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7485 "VPlan-native path.\n");
7486 return VectorizationFactor::Disabled();
7487}
7488
7489std::optional<VectorizationFactor>
7490LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7491 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7492 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7493 if (!MaxFactors) // Cases that should not be vectorized or interleaved.
7494 return std::nullopt;
7495
7496 // Invalidate interleave groups if all blocks of loop will be predicated.
7497 if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7498 !useMaskedInterleavedAccesses(*TTI)) {
7499 LLVM_DEBUG(
7500 dbgs()
7501 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7502 "which requires masked-interleaved support.\n");
7503 if (CM.InterleaveInfo.invalidateGroups())
7504 // Invalidating interleave groups also requires invalidating all decisions
7505 // based on them, which includes widening decisions and uniform and scalar
7506 // values.
7507 CM.invalidateCostModelingDecisions();
7508 }
7509
7510 ElementCount MaxUserVF =
7511 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7512 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7513 if (!UserVF.isZero() && UserVFIsLegal) {
7514 assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7515 "VF needs to be a power of two");
7516 // Collect the instructions (and their associated costs) that will be more
7517 // profitable to scalarize.
7518 if (CM.selectUserVectorizationFactor(UserVF)) {
7519 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7520 CM.collectInLoopReductions();
7521 buildVPlansWithVPRecipes(UserVF, UserVF);
7522 LLVM_DEBUG(printPlans(dbgs()));
7523 return {{UserVF, 0, 0}};
7524 } else
7525 reportVectorizationInfo("UserVF ignored because of invalid costs.",
7526 "InvalidCost", ORE, OrigLoop);
7527 }
7528
7529 // Populate the set of Vectorization Factor Candidates.
7530 ElementCountSet VFCandidates;
7531 for (auto VF = ElementCount::getFixed(1);
7532 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7533 VFCandidates.insert(VF);
7534 for (auto VF = ElementCount::getScalable(1);
7535 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7536 VFCandidates.insert(VF);
7537
7538 for (const auto &VF : VFCandidates) {
7539 // Collect Uniform and Scalar instructions after vectorization with VF.
7540 CM.collectUniformsAndScalars(VF);
7541
7542 // Collect the instructions (and their associated costs) that will be more
7543 // profitable to scalarize.
7544 if (VF.isVector())
7545 CM.collectInstsToScalarize(VF);
7546 }
7547
7548 CM.collectInLoopReductions();
7549 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7550 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7551
7552 LLVM_DEBUG(printPlans(dbgs()));
7553 if (!MaxFactors.hasVector())
7554 return VectorizationFactor::Disabled();
7555
7556 // Select the optimal vectorization factor.
7557 VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates);
7558 assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
7559 return VF;
7560}
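// An illustrative sketch of the candidate set populated above (not part of
// this file): fixed and scalable VFs are powers of two up to the respective
// maxima, e.g. MaxFixedVF == 16 yields {1, 2, 4, 8, 16} in addition to the
// scalable candidates vscale x {1, 2, ...}.
//
//   #include <set>
//   std::set<unsigned> fixedVFCandidates(unsigned MaxFixedVF) {
//     std::set<unsigned> VFs;
//     for (unsigned VF = 1; VF <= MaxFixedVF; VF *= 2)
//       VFs.insert(VF);
//     return VFs; // MaxFixedVF = 16 -> {1, 2, 4, 8, 16}
//   }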
7561
7562VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7563 assert(count_if(VPlans,
7564 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7565 1 &&
7566 "Best VF has not a single VPlan.");
7567
7568 for (const VPlanPtr &Plan : VPlans) {
7569 if (Plan->hasVF(VF))
7570 return *Plan.get();
7571 }
7572 llvm_unreachable("No plan found!");
7573}
7574
7575static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7576 SmallVector<Metadata *, 4> MDs;
7577 // Reserve first location for self reference to the LoopID metadata node.
7578 MDs.push_back(nullptr);
7579 bool IsUnrollMetadata = false;
7580 MDNode *LoopID = L->getLoopID();
7581 if (LoopID) {
7582 // First find existing loop unrolling disable metadata.
7583 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7584 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7585 if (MD) {
7586 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7587 IsUnrollMetadata =
7588 S && S->getString().startswith("llvm.loop.unroll.disable");
7589 }
7590 MDs.push_back(LoopID->getOperand(i));
7591 }
7592 }
7593
7594 if (!IsUnrollMetadata) {
7595 // Add runtime unroll disable metadata.
7596 LLVMContext &Context = L->getHeader()->getContext();
7597 SmallVector<Metadata *, 1> DisableOperands;
7598 DisableOperands.push_back(
7599 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7600 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7601 MDs.push_back(DisableNode);
7602 MDNode *NewLoopID = MDNode::get(Context, MDs);
7603 // Set operand 0 to refer to the loop id itself.
7604 NewLoopID->replaceOperandWith(0, NewLoopID);
7605 L->setLoopID(NewLoopID);
7606 }
7607}
7608
7609void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7610 VPlan &BestVPlan,
7611 InnerLoopVectorizer &ILV,
7612 DominatorTree *DT,
7613 bool IsEpilogueVectorization) {
7614 assert(BestVPlan.hasVF(BestVF) &&
7615 "Trying to execute plan with unsupported VF");
7616 assert(BestVPlan.hasUF(BestUF) &&
7617 "Trying to execute plan with unsupported UF");
7618
7619 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
7620 << '\n');
7621
7622 // Workaround! Compute the trip count of the original loop and cache it
7623 // before we start modifying the CFG. This code has a systemic problem
7624 // wherein it tries to run analysis over partially constructed IR; this is
7625 // wrong, and not simply for SCEV. The trip count of the original loop
7626 // simply happens to be prone to hitting this in practice. In theory, we
7627 // can hit the same issue for any SCEV, or ValueTracking query done during
7628 // mutation. See PR49900.
7629 ILV.getOrCreateTripCount(OrigLoop->getLoopPreheader());
7630
7631 if (!IsEpilogueVectorization)
7632 VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7633
7634 // Perform the actual loop transformation.
7635
7636 // 1. Set up the skeleton for vectorization, including vector pre-header and
7637 // middle block. The vector loop is created during VPlan execution.
7638 VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7639 Value *CanonicalIVStartValue;
7640 std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7641 ILV.createVectorizedLoopSkeleton();
7642
7643 // Only use noalias metadata when using memory checks guaranteeing no overlap
7644 // across all iterations.
7645 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7646 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7647 !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7648
7649 // We currently don't use LoopVersioning for the actual loop cloning but we
7650 // still use it to add the noalias metadata.
7651 // TODO: Find a better way to re-use LoopVersioning functionality to add
7652 // metadata.
7653 State.LVer = std::make_unique<LoopVersioning>(
7654 *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7655 PSE.getSE());
7656 State.LVer->prepareNoAliasMetadata();
7657 }
7658
7659 ILV.collectPoisonGeneratingRecipes(State);
7660
7661 ILV.printDebugTracesAtStart();
7662
7663 //===------------------------------------------------===//
7664 //
7665 // Notice: any optimization or new instruction that goes
7666 // into the code below should also be implemented in
7667 // the cost-model.
7668 //
7669 //===------------------------------------------------===//
7670
7671 // 2. Copy and widen instructions from the old loop into the new loop.
7672 BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7673 ILV.getOrCreateVectorTripCount(nullptr),
7674 CanonicalIVStartValue, State,
7675 IsEpilogueVectorization);
7676
7677 BestVPlan.execute(&State);
7678
7679 // Keep all loop hints from the original loop on the vector loop (we'll
7680 // replace the vectorizer-specific hints below).
7681 MDNode *OrigLoopID = OrigLoop->getLoopID();
7682
7683 std::optional<MDNode *> VectorizedLoopID =
7684 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7685 LLVMLoopVectorizeFollowupVectorized});
7686
7687 VPBasicBlock *HeaderVPBB =
7688 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7689 Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7690 if (VectorizedLoopID)
7691 L->setLoopID(*VectorizedLoopID);
7692 else {
7693 // Keep all loop hints from the original loop on the vector loop (we'll
7694 // replace the vectorizer-specific hints below).
7695 if (MDNode *LID = OrigLoop->getLoopID())
7696 L->setLoopID(LID);
7697
7698 LoopVectorizeHints Hints(L, true, *ORE);
7699 Hints.setAlreadyVectorized();
7700 }
7701 // Disable runtime unrolling when vectorizing the epilogue loop.
7702 if (CanonicalIVStartValue)
7703 AddRuntimeUnrollDisableMetaData(L);
7704
7705 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7706 // predication, updating analyses.
7707 ILV.fixVectorizedLoop(State, BestVPlan);
7708
7709 ILV.printDebugTracesAtEnd();
7710}
7711
7712#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7713void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7714 for (const auto &Plan : VPlans)
7715 if (PrintVPlansInDotFormat)
7716 Plan->printDOT(O);
7717 else
7718 Plan->print(O);
7719}
7720#endif
7721
7722Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7723
7724//===--------------------------------------------------------------------===//
7725// EpilogueVectorizerMainLoop
7726//===--------------------------------------------------------------------===//
7727
7728/// This function is partially responsible for generating the control flow
7729/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7730std::pair<BasicBlock *, Value *>
7731EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7732 createVectorLoopSkeleton("");
7733
7734 // Generate the code to check the minimum iteration count of the vector
7735 // epilogue (see below).
7736 EPI.EpilogueIterationCountCheck =
7737 emitIterationCountCheck(LoopScalarPreHeader, true);
7738 EPI.EpilogueIterationCountCheck->setName("iter.check");
7739
7740 // Generate the code to check any assumptions that we've made for SCEV
7741 // expressions.
7742 EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7743
7744 // Generate the code that checks at runtime if arrays overlap. We put the
7745 // checks into a separate block to make the more common case of few elements
7746 // faster.
7747 EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7748
7749 // Generate the iteration count check for the main loop, *after* the check
7750 // for the epilogue loop, so that the path-length is shorter for the case
7751 // that goes directly through the vector epilogue. The longer-path length for
7752 // the main loop is compensated for, by the gain from vectorizing the larger
7753 // trip count. Note: the branch will get updated later on when we vectorize
7754 // the epilogue.
7755 EPI.MainLoopIterationCountCheck =
7756 emitIterationCountCheck(LoopScalarPreHeader, false);
7757
7758 // Generate the induction variable.
7759 EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7760
7761 // Skip induction resume value creation here because they will be created in
7762 // the second pass for the scalar loop. The induction resume values for the
7763 // inductions in the epilogue loop are created before executing the plan for
7764 // the epilogue loop.
7765
7766 return {completeLoopSkeleton(), nullptr};
7767}
7768
7769void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7770 LLVM_DEBUG({
7771 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7772 << "Main Loop VF:" << EPI.MainLoopVF
7773 << ", Main Loop UF:" << EPI.MainLoopUF
7774 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7775 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7776 });
7777}
7778
7779void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7780 DEBUG_WITH_TYPE(VerboseDebug, {
7781 dbgs() << "intermediate fn:\n"
7782 << *OrigLoop->getHeader()->getParent() << "\n";
7783 });
7784}
7785
7786BasicBlock *
7787EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7788 bool ForEpilogue) {
7789 assert(Bypass && "Expected valid bypass basic block.");
7790 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7791 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7792 Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
7793 // Reuse existing vector loop preheader for TC checks.
7794 // Note that new preheader block is generated for vector loop.
7795 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7796 IRBuilder<> Builder(TCCheckBlock->getTerminator());
7797
7798 // Generate code to check if the loop's trip count is less than VF * UF of the
7799 // main vector loop.
7800 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
7801 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7802
7803 Value *CheckMinIters = Builder.CreateICmp(
7804 P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7805 "min.iters.check");
7806
7807 if (!ForEpilogue)
7808 TCCheckBlock->setName("vector.main.loop.iter.check");
7809
7810 // Create new preheader for vector loop.
7811 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7812 DT, LI, nullptr, "vector.ph");
7813
7814 if (ForEpilogue) {
7815 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7816 DT->getNode(Bypass)->getIDom()) &&
7817 "TC check is expected to dominate Bypass");
7818
7819 // Update dominator for Bypass & LoopExit.
7820 DT->changeImmediateDominator(Bypass, TCCheckBlock);
7821 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7822 // For loops with multiple exits, there's no edge from the middle block
7823 // to exit blocks (as the epilogue must run) and thus no need to update
7824 // the immediate dominator of the exit blocks.
7825 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7826
7827 LoopBypassBlocks.push_back(TCCheckBlock);
7828
7829 // Save the trip count so we don't have to regenerate it in the
7830 // vec.epilog.iter.check. This is safe to do because the trip count
7831 // generated here dominates the vector epilog iter check.
7832 EPI.TripCount = Count;
7833 }
7834
7835 ReplaceInstWithInst(
7836 TCCheckBlock->getTerminator(),
7837 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7838
7839 return TCCheckBlock;
7840}
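// A simplified model of the min.iters.check emitted above (illustrative
// only): the bypass to the scalar loop is taken when the trip count cannot
// cover one vector step of VF * UF iterations; ULE replaces ULT when a
// scalar epilogue must always run.
//
//   #include <cstdint>
//   bool bypassVectorLoop(uint64_t TripCount, uint64_t VF, uint64_t UF,
//                         bool RequiresScalarEpilogue) {
//     uint64_t Step = VF * UF;
//     return RequiresScalarEpilogue ? TripCount <= Step : TripCount < Step;
//   }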
7841
7842//===--------------------------------------------------------------------===//
7843// EpilogueVectorizerEpilogueLoop
7844//===--------------------------------------------------------------------===//
7845
7846/// This function is partially responsible for generating the control flow
7847/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7848std::pair<BasicBlock *, Value *>
7849EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7850 createVectorLoopSkeleton("vec.epilog.");
7851
7852 // Now, compare the remaining count and if there aren't enough iterations to
7853 // execute the vectorized epilogue, skip to the scalar part.
7854 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7855 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7856 LoopVectorPreHeader =
7857 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7858 LI, nullptr, "vec.epilog.ph");
7859 emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7860 VecEpilogueIterationCountCheck);
7861
7862 // Adjust the control flow taking the state info from the main loop
7863 // vectorization into account.
7864 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7865 "expected this to be saved from the previous pass.");
7866 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7867 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7868
7869 DT->changeImmediateDominator(LoopVectorPreHeader,
7870 EPI.MainLoopIterationCountCheck);
7871
7872 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7873 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7874
7875 if (EPI.SCEVSafetyCheck)
7876 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7877 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7878 if (EPI.MemSafetyCheck)
7879 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7880 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7881
7882 DT->changeImmediateDominator(
7883 VecEpilogueIterationCountCheck,
7884 VecEpilogueIterationCountCheck->getSinglePredecessor());
7885
7886 DT->changeImmediateDominator(LoopScalarPreHeader,
7887 EPI.EpilogueIterationCountCheck);
7888 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7889 // If there is an epilogue which must run, there's no edge from the
7890 // middle block to exit blocks and thus no need to update the immediate
7891 // dominator of the exit blocks.
7892 DT->changeImmediateDominator(LoopExitBlock,
7893 EPI.EpilogueIterationCountCheck);
7894
7895 // Keep track of bypass blocks, as they feed start values to the induction and
7896 // reduction phis in the scalar loop preheader.
7897 if (EPI.SCEVSafetyCheck)
7898 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7899 if (EPI.MemSafetyCheck)
7900 LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7901 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7902
7903 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7904 // reductions which merge control-flow from the latch block and the middle
7905 // block. Update the incoming values here and move the Phi into the preheader.
7906 SmallVector<PHINode *, 4> PhisInBlock;
7907 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7908 PhisInBlock.push_back(&Phi);
7909
7910 for (PHINode *Phi : PhisInBlock) {
7911 Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7912 Phi->replaceIncomingBlockWith(
7913 VecEpilogueIterationCountCheck->getSinglePredecessor(),
7914 VecEpilogueIterationCountCheck);
7915
7916 // If the phi doesn't have an incoming value from the
7917 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7918 // value and also those from other check blocks. This is needed for
7919 // reduction phis only.
7920 if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7921 return EPI.EpilogueIterationCountCheck == IncB;
7922 }))
7923 continue;
7924 Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7925 if (EPI.SCEVSafetyCheck)
7926 Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7927 if (EPI.MemSafetyCheck)
7928 Phi->removeIncomingValue(EPI.MemSafetyCheck);
7929 }
7930
7931 // Generate a resume induction for the vector epilogue and put it in the
7932 // vector epilogue preheader
7933 Type *IdxTy = Legal->getWidestInductionType();
7934 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7935 LoopVectorPreHeader->getFirstNonPHI());
7936 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7937 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7938 EPI.MainLoopIterationCountCheck);
7939
7940 // Generate induction resume values. These variables save the new starting
7941 // indexes for the scalar loop. They are used to test if there are any tail
7942 // iterations left once the vector loop has completed.
7943 // Note that when the vectorized epilogue is skipped due to iteration count
7944 // check, then the resume value for the induction variable comes from
7945 // the trip count of the main vector loop, hence passing the AdditionalBypass
7946 // argument.
7947 createInductionResumeValues({VecEpilogueIterationCountCheck,
7948 EPI.VectorTripCount} /* AdditionalBypass */);
7949
7950 return {completeLoopSkeleton(), EPResumeVal};
7951}
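// Sketch of the vec.epilog.resume.val phi created above (illustrative
// only): the epilogue induction resumes at the main loop's vector trip
// count, or at 0 when the main vector loop was bypassed entirely.
//
//   #include <cstdint>
//   uint64_t epilogueResumeValue(bool MainVectorLoopRan,
//                                uint64_t MainVectorTripCount) {
//     return MainVectorLoopRan ? MainVectorTripCount : 0;
//   }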
7952
7953BasicBlock *
7954EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7955 BasicBlock *Bypass, BasicBlock *Insert) {
7956
7957 assert(EPI.TripCount &&
7958 "Expected trip count to have been saved in the first pass.");
7959 assert(
7960 (!isa<Instruction>(EPI.TripCount) ||
7961 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7962 "saved trip count does not dominate insertion point.");
7963 Value *TC = EPI.TripCount;
7964 IRBuilder<> Builder(Insert->getTerminator());
7965 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7966
7967 // Generate code to check if the loop's trip count is less than VF * UF of the
7968 // vector epilogue loop.
7969 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
7970 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7971
7972 Value *CheckMinIters =
7973 Builder.CreateICmp(P, Count,
7974 createStepForVF(Builder, Count->getType(),
7975 EPI.EpilogueVF, EPI.EpilogueUF),
7976 "min.epilog.iters.check");
7977
7978 ReplaceInstWithInst(
7979 Insert->getTerminator(),
7980 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7981
7982 LoopBypassBlocks.push_back(Insert);
7983 return Insert;
7984}
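The check built above can be restated as a standalone scalar predicate (a minimal sketch under assumed names, not LLVM API): after the main vector loop covers VectorTC iterations, the epilogue loop is bypassed when the remaining count is below its VF * UF step, using ULE instead of ULT when a scalar epilogue must still run.

#include <cassert>
#include <cstdint>

static bool bypassEpilogue(uint64_t TC, uint64_t VectorTC,
                           uint64_t EpilogueStep, bool RequiresScalarEpilogue) {
  uint64_t Remaining = TC - VectorTC; // n.vec.remaining
  return RequiresScalarEpilogue ? Remaining <= EpilogueStep  // ICMP_ULE
                                : Remaining < EpilogueStep;  // ICMP_ULT
}

int main() {
  assert(bypassEpilogue(100, 96, 8, false));  // 4 left: skip the epilogue loop
  assert(!bypassEpilogue(100, 88, 8, false)); // 12 left: run the epilogue loop
}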
7985
7986void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7987 LLVM_DEBUG({
7988 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7989 << "Epilogue Loop VF:" << EPI.EpilogueVF
7990 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7991 });
7992}
7993
7994void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7995 DEBUG_WITH_TYPE(VerboseDebug, {
7996 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7997 });
7998}
7999
8000bool LoopVectorizationPlanner::getDecisionAndClampRange(
8001 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8002 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8003 bool PredicateAtRangeStart = Predicate(Range.Start);
8004
8005 for (ElementCount TmpVF = Range.Start * 2;
8006 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8007 if (Predicate(TmpVF) != PredicateAtRangeStart) {
8008 Range.End = TmpVF;
8009 break;
8010 }
8011
8012 return PredicateAtRangeStart;
8013}
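The clamping above keeps every VF remaining in the range under a single decision. A minimal standalone sketch of the same idea (plain unsigned VFs stand in for ElementCount; none of these names are LLVM API):

#include <cassert>
#include <functional>

struct SimpleVFRange { unsigned Start, End; }; // half-open [Start, End)

static bool clampRange(const std::function<bool(unsigned)> &Pred,
                       SimpleVFRange &R) {
  bool AtStart = Pred(R.Start);
  // Shrink End to the first VF whose decision differs from the decision at
  // Start, so every VF left in the range agrees with it.
  for (unsigned VF = R.Start * 2; VF < R.End; VF *= 2)
    if (Pred(VF) != AtStart) {
      R.End = VF;
      break;
    }
  return AtStart;
}

int main() {
  SimpleVFRange R{2, 32};
  bool Decision = clampRange([](unsigned VF) { return VF <= 8; }, R);
  assert(Decision && R.End == 16); // the predicate first flips at VF = 16
}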
8014
8015/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8016/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8017/// of VF's starting at a given VF and extending it as much as possible. Each
8018/// vectorization decision can potentially shorten this sub-range during
8019/// buildVPlan().
8020void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8021 ElementCount MaxVF) {
8022 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8023 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8024 VFRange SubRange = {VF, MaxVFPlusOne};
8025 VPlans.push_back(buildVPlan(SubRange));
8026 VF = SubRange.End;
8027 }
8028}
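Per the doc comment above, each buildVPlan call may shrink its sub-range, and the outer loop resumes at the shrunken end, so the sub-ranges tile {MinVF, ..., MaxVF} without gaps or overlap. A hedged sketch of just that iteration pattern (printf stands in for building a plan; the flip point is made up):

#include <cstdio>

int main() {
  unsigned MinVF = 1, MaxVFPlusOne = 17; // MaxVF = 16
  for (unsigned VF = MinVF; VF < MaxVFPlusOne;) {
    unsigned End = MaxVFPlusOne;
    if (VF < 4)   // pretend some vectorization decision flips at VF = 4
      End = 4;
    std::printf("one plan covers VFs [%u, %u)\n", VF, End);
    VF = End;     // resume exactly where the last plan stopped
  }
}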
8029
8030VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8031 VPlanPtr &Plan) {
8032 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8033
8034 // Look for cached value.
8035 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8036 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8037 if (ECEntryIt != EdgeMaskCache.end())
8038 return ECEntryIt->second;
8039
8040 VPValue *SrcMask = createBlockInMask(Src, Plan);
8041
8042 // The terminator has to be a branch inst!
8043 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8044 assert(BI && "Unexpected terminator found");
8045
8046 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8047 return EdgeMaskCache[Edge] = SrcMask;
8048
8049 // If source is an exiting block, we know the exit edge is dynamically dead
8050 // in the vector loop, and thus we don't need to restrict the mask. Avoid
8051 // adding uses of an otherwise potentially dead instruction.
8052 if (OrigLoop->isLoopExiting(Src))
8053 return EdgeMaskCache[Edge] = SrcMask;
8054
8055 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8056 assert(EdgeMask && "No Edge Mask found for condition");
8057
8058 if (BI->getSuccessor(0) != Dst)
8059 EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8060
8061 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8062 // The condition is 'SrcMask && EdgeMask', which is equivalent to
8063 // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8064 // The select version does not introduce new UB if SrcMask is false and
8065 // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8066 VPValue *False = Plan->getOrAddVPValue(
8067 ConstantInt::getFalse(BI->getCondition()->getType()));
8068 EdgeMask =
8069 Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8070 }
8071
8072 return EdgeMaskCache[Edge] = EdgeMask;
8073}
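The select-vs-and point above can be modeled with std::optional<bool> standing in for a possibly-poison i1 (nullopt models poison). This is only an analogy for the IR semantics, not LLVM code:

#include <cassert>
#include <optional>

using I1 = std::optional<bool>; // nullopt models poison

static I1 selectMask(I1 Src, I1 Edge) {
  if (!Src)
    return std::nullopt;          // select on a poison condition is poison
  return *Src ? Edge : I1(false); // the false arm hides a poison EdgeMask
}

static I1 andMask(I1 Src, I1 Edge) {
  if (!Src || !Edge)
    return std::nullopt;          // 'and' propagates poison unconditionally
  return *Src && *Edge;
}

int main() {
  I1 Src = false, Edge = std::nullopt;        // dead edge, poison condition
  assert(selectMask(Src, Edge) == I1(false)); // well-defined: lane is off
  assert(!andMask(Src, Edge).has_value());    // 'and' would yield poison
}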
8074
8075VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8076 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8077
8078 // Look for cached value.
8079 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8080 if (BCEntryIt != BlockMaskCache.end())
8081 return BCEntryIt->second;
8082
8083 // All-one mask is modelled as no-mask following the convention for masked
8084 // load/store/gather/scatter. Initialize BlockMask to no-mask.
8085 VPValue *BlockMask = nullptr;
8086
8087 if (OrigLoop->getHeader() == BB) {
8088 if (!CM.blockNeedsPredicationForAnyReason(BB))
8089 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8090
8091 assert(CM.foldTailByMasking() && "must fold the tail");
8092
8093 // If we're using the active lane mask for control flow, then we get the
8094 // mask from the active lane mask PHI that is cached in the VPlan.
8095 PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask();
8096 if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow)
8097 return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi();
8098
8099 // Introduce the early-exit compare IV <= BTC to form header block mask.
8100 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8101 // constructing the desired canonical IV in the header block as its first
8102 // non-phi instructions.
8103
8104 VPBasicBlock *HeaderVPBB =
8105 Plan->getVectorLoopRegion()->getEntryBasicBlock();
8106 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8107 auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
8108 HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
8109
8110 VPBuilder::InsertPointGuard Guard(Builder);
8111 Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8112 if (EmitGetActiveLaneMask != PredicationStyle::None) {
8113 VPValue *TC = Plan->getOrCreateTripCount();
8114 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
8115 nullptr, "active.lane.mask");
8116 } else {
8117 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8118 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8119 }
8120 return BlockMaskCache[BB] = BlockMask;
8121 }
8122
8123 // This is the block mask. We OR all incoming edges.
8124 for (auto *Predecessor : predecessors(BB)) {
8125 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8126 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8127 return BlockMaskCache[BB] = EdgeMask;
8128
8129 if (!BlockMask) { // BlockMask has its initialized nullptr value.
8130 BlockMask = EdgeMask;
8131 continue;
8132 }
8133
8134 BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8135 }
8136
8137 return BlockMaskCache[BB] = BlockMask;
8138}
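The IV <= BTC rationale in the header-mask comment above is easy to see with concrete numbers. A minimal sketch; the wrap case assumes an i32 IV with a trip count of 2^32:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t TC = 0;       // a trip count of 2^32 wraps to 0 in 32 bits
  uint32_t BTC = TC - 1; // backedge-taken count: 0xFFFFFFFF, representable
  uint32_t IV = 12345;   // any lane index that is genuinely in bounds
  assert(!(IV < TC));    // IV < TC would wrongly mask every lane off
  assert(IV <= BTC);     // IV <= BTC keeps the lane active, as intended
}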
8139
8140VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8141 ArrayRef<VPValue *> Operands,
8142 VFRange &Range,
8143 VPlanPtr &Plan) {
8144 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8145 "Must be called with either a load or store");
8146
8147 auto willWiden = [&](ElementCount VF) -> bool {
8148 LoopVectorizationCostModel::InstWidening Decision =
8149 CM.getWideningDecision(I, VF);
8150 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8151 "CM decision should be taken at this point.");
8152 if (Decision == LoopVectorizationCostModel::CM_Interleave)
8153 return true;
8154 if (CM.isScalarAfterVectorization(I, VF) ||
8155 CM.isProfitableToScalarize(I, VF))
8156 return false;
8157 return Decision != LoopVectorizationCostModel::CM_Scalarize;
8158 };
8159
8160 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8161 return nullptr;
8162
8163 VPValue *Mask = nullptr;
8164 if (Legal->isMaskRequired(I))
8165 Mask = createBlockInMask(I->getParent(), Plan);
8166
8167 // Determine if the pointer operand of the access is either consecutive or
8168 // reverse consecutive.
8169 LoopVectorizationCostModel::InstWidening Decision =
8170 CM.getWideningDecision(I, Range.Start);
8171 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8172 bool Consecutive =
8173 Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8174
8175 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8176 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8177 Consecutive, Reverse);
8178
8179 StoreInst *Store = cast<StoreInst>(I);
8180 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8181 Mask, Consecutive, Reverse);
8182}
8183
8184/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8185/// insert a recipe to expand the step for the induction recipe.
8186static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
8187 PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start,
8188 const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM,
8189 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) {
8190 // Returns true if an instruction \p I should be scalarized instead of
8191 // vectorized for the chosen vectorization factor.
8192 auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
8193 return CM.isScalarAfterVectorization(I, VF) ||
8194 CM.isProfitableToScalarize(I, VF);
8195 };
8196
8197 bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
8198 [&](ElementCount VF) {
8199 return ShouldScalarizeInstruction(PhiOrTrunc, VF);
8200 },
8201 Range);
8202 assert(IndDesc.getStartValue() ==
8203 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8204 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8205 "step must be loop invariant");
8206
8207 VPValue *Step =
8208 vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8209 if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8210 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI,
8211 !NeedsScalarIVOnly);
8212 }
8213 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8214 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc,
8215 !NeedsScalarIVOnly);
8216}
8217
8218VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
8219 PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
8220
8221 // Check if this is an integer or fp induction. If so, build the recipe that
8222 // produces its scalar and vector values.
8223 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8224 return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan,
8225 *PSE.getSE(), *OrigLoop, Range);
8226
8227 // Check if this is pointer induction. If so, build the recipe for it.
8228 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8229 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8230 *PSE.getSE());
8231 assert(isa<SCEVConstant>(II->getStep()));
8232 return new VPWidenPointerInductionRecipe(
8233 Phi, Operands[0], Step, *II,
8234 LoopVectorizationPlanner::getDecisionAndClampRange(
8235 [&](ElementCount VF) {
8236 return CM.isScalarAfterVectorization(Phi, VF);
8237 },
8238 Range));
8239 }
8240 return nullptr;
8241}
8242
8243VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8244 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
8245 // Optimize the special case where the source is a constant integer
8246 // induction variable. Notice that we can only optimize the 'trunc' case
8247 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8248 // (c) other casts depend on pointer size.
8249
8250 // Determine whether \p K is a truncation based on an induction variable that
8251 // can be optimized.
8252 auto isOptimizableIVTruncate =
8253 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8254 return [=](ElementCount VF) -> bool {
8255 return CM.isOptimizableIVTruncate(K, VF);
8256 };
8257 };
8258
8259 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8260 isOptimizableIVTruncate(I), Range)) {
8261
8262 auto *Phi = cast<PHINode>(I->getOperand(0));
8263 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8264 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8265 return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan,
8266 *PSE.getSE(), *OrigLoop, Range);
8267 }
8268 return nullptr;
8269}
8270
8271VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8272 ArrayRef<VPValue *> Operands,
8273 VPlanPtr &Plan) {
8274 // If all incoming values are equal, the incoming VPValue can be used directly
8275 // instead of creating a new VPBlendRecipe.
8276 if (llvm::all_equal(Operands))
8277 return Operands[0];
8278
8279 unsigned NumIncoming = Phi->getNumIncomingValues();
8280 // For in-loop reductions, we do not need to create an additional select.
8281 VPValue *InLoopVal = nullptr;
8282 for (unsigned In = 0; In < NumIncoming; In++) {
8283 PHINode *PhiOp =
8284 dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8285 if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8286 assert(!InLoopVal && "Found more than one in-loop reduction!");
8287 InLoopVal = Operands[In];
8288 }
8289 }
8290
8291 assert((!InLoopVal || NumIncoming == 2) &&
8292 "Found an in-loop reduction for PHI with unexpected number of "
8293 "incoming values");
8294 if (InLoopVal)
8295 return Operands[Operands[0] == InLoopVal ? 1 : 0];
8296
8297 // We know that all PHIs in non-header blocks are converted into selects, so
8298 // we don't have to worry about the insertion order and we can just use the
8299 // builder. At this point we generate the predication tree. There may be
8300 // duplications since this is a simple recursive scan, but future
8301 // optimizations will clean it up.
8302 SmallVector<VPValue *, 2> OperandsWithMask;
8303
8304 for (unsigned In = 0; In < NumIncoming; In++) {
8305 VPValue *EdgeMask =
8306 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8307 assert((EdgeMask || NumIncoming == 1) &&
8308 "Multiple predecessors with one having a full mask");
8309 OperandsWithMask.push_back(Operands[In]);
8310 if (EdgeMask)
8311 OperandsWithMask.push_back(EdgeMask);
8312 }
8313 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8314}
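The (value, mask) operand pairs collected above are later materialized as a chain of selects. A scalar stand-in for that lowering (illustrative only, not how VPBlendRecipe is spelled in LLVM):

#include <cassert>
#include <vector>

struct Incoming { int Value; bool Mask; };

static int blend(const std::vector<Incoming> &Ins) {
  int Result = Ins[0].Value; // start from the first incoming value
  for (size_t I = 1; I < Ins.size(); ++I)
    Result = Ins[I].Mask ? Ins[I].Value : Result; // select(mask, value, prev)
  return Result;
}

int main() {
  // The last incoming whose edge mask is true wins, as with the real blends.
  assert(blend({{10, true}, {20, false}, {30, true}}) == 30);
}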
8315
8316VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8317 ArrayRef<VPValue *> Operands,
8318 VFRange &Range) const {
8319
8320 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8321 [this, CI](ElementCount VF) {
8322 return CM.isScalarWithPredication(CI, VF);
8323 },
8324 Range);
8325
8326 if (IsPredicated)
8327 return nullptr;
8328
8329 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8330 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8331 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8332 ID == Intrinsic::pseudoprobe ||
8333 ID == Intrinsic::experimental_noalias_scope_decl))
8334 return nullptr;
8335
8336 ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8337
8338 // Is it beneficial to perform intrinsic call compared to lib call?
8339 bool ShouldUseVectorIntrinsic =
8340 ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8341 [&](ElementCount VF) -> bool {
8342 bool NeedToScalarize = false;
8343 // Is it beneficial to perform intrinsic call compared to lib
8344 // call?
8345 InstructionCost CallCost =
8346 CM.getVectorCallCost(CI, VF, NeedToScalarize);
8347 InstructionCost IntrinsicCost =
8348 CM.getVectorIntrinsicCost(CI, VF);
8349 return IntrinsicCost <= CallCost;
8350 },
8351 Range);
8352 if (ShouldUseVectorIntrinsic)
8353 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);
8354
8355 // Is it better to call a vectorized version of the function than to
8356 // scalarize the call?
8357 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8358 [&](ElementCount VF) -> bool {
8359 // The following case may be scalarized depending on the VF.
8360 // The flag shows whether we can use a usual Call for the vectorized
8361 // version of the instruction.
8362 bool NeedToScalarize = false;
8363 CM.getVectorCallCost(CI, VF, NeedToScalarize);
8364 return !NeedToScalarize;
8365 },
8366 Range);
8367 if (ShouldUseVectorCall)
8368 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
8369 Intrinsic::not_intrinsic);
8370
8371 return nullptr;
8372}
8373
8374bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8375 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8376 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8377 // Instruction should be widened, unless it is scalar after vectorization,
8378 // scalarization is profitable or it is predicated.
8379 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8380 return CM.isScalarAfterVectorization(I, VF) ||
8381 CM.isProfitableToScalarize(I, VF) ||
8382 CM.isScalarWithPredication(I, VF);
8383 };
8384 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8385 Range);
8386}
8387
8388VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
8389 ArrayRef<VPValue *> Operands,
8390 VPBasicBlock *VPBB, VPlanPtr &Plan) {
8391 switch (I->getOpcode()) {
8392 default:
8393 return nullptr;
8394 case Instruction::SDiv:
8395 case Instruction::UDiv:
8396 case Instruction::SRem:
8397 case Instruction::URem: {
8398 // If not provably safe, use a select to form a safe divisor before widening the
8399 // div/rem operation itself. Otherwise fall through to general handling below.
8400 if (CM.isPredicatedInst(I)) {
8401 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8402 VPValue *Mask = createBlockInMask(I->getParent(), Plan);
8403 VPValue *One =
8404 Plan->getOrAddExternalDef(ConstantInt::get(I->getType(), 1u, false));
8405 auto *SafeRHS =
8406 new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
8407 I->getDebugLoc());
8408 VPBB->appendRecipe(SafeRHS);
8409 Ops[1] = SafeRHS;
8410 return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8411 }
8412 LLVM_FALLTHROUGH;
8413 }
8414 case Instruction::Add:
8415 case Instruction::And:
8416 case Instruction::AShr:
8417 case Instruction::BitCast:
8418 case Instruction::FAdd:
8419 case Instruction::FCmp:
8420 case Instruction::FDiv:
8421 case Instruction::FMul:
8422 case Instruction::FNeg:
8423 case Instruction::FPExt:
8424 case Instruction::FPToSI:
8425 case Instruction::FPToUI:
8426 case Instruction::FPTrunc:
8427 case Instruction::FRem:
8428 case Instruction::FSub:
8429 case Instruction::ICmp:
8430 case Instruction::IntToPtr:
8431 case Instruction::LShr:
8432 case Instruction::Mul:
8433 case Instruction::Or:
8434 case Instruction::PtrToInt:
8435 case Instruction::Select:
8436 case Instruction::SExt:
8437 case Instruction::Shl:
8438 case Instruction::SIToFP:
8439 case Instruction::Sub:
8440 case Instruction::Trunc:
8441 case Instruction::UIToFP:
8442 case Instruction::Xor:
8443 case Instruction::ZExt:
8444 case Instruction::Freeze:
8445 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8446 };
8447}
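The safe-divisor path above replaces masked-off divisors with 1 so the widened div/rem cannot trap. A scalar sketch of the same idea (names are illustrative):

#include <cassert>

static int safeSDiv(bool LaneActive, int A, int B) {
  int SafeB = LaneActive ? B : 1; // select(mask, B, 1): neutralize dead lanes
  int Q = A / SafeB;              // never divides by a masked-off zero
  return LaneActive ? Q : 0;      // inactive lanes' results are discarded
}

int main() {
  assert(safeSDiv(true, 12, 4) == 3);
  assert(safeSDiv(false, 12, 0) == 0); // would trap without the select
}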
8448
8449void VPRecipeBuilder::fixHeaderPhis() {
8450 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8451 for (VPHeaderPHIRecipe *R : PhisToFix) {
8452 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8453 VPRecipeBase *IncR =
8454 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8455 R->addOperand(IncR->getVPSingleValue());
8456 }
8457}
8458
8459VPBasicBlock *VPRecipeBuilder::handleReplication(
8460 Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8461 VPlanPtr &Plan) {
8462 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8463 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8464 Range);
8465
8466 bool IsPredicated = CM.isPredicatedInst(I);
8467
8468 // Even if the instruction is not marked as uniform, there are certain
8469 // intrinsic calls that can be effectively treated as such, so we check for
8470 // them here. Conservatively, we only do this for scalable vectors, since
8471 // for fixed-width VFs we can always fall back on full scalarization.
8472 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8473 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8474 case Intrinsic::assume:
8475 case Intrinsic::lifetime_start:
8476 case Intrinsic::lifetime_end:
8477 // For scalable vectors if one of the operands is variant then we still
8478 // want to mark as uniform, which will generate one instruction for just
8479 // the first lane of the vector. We can't scalarize the call in the same
8480 // way as for fixed-width vectors because we don't know how many lanes
8481 // there are.
8482 //
8483 // The reasons for doing it this way for scalable vectors are:
8484 // 1. For the assume intrinsic generating the instruction for the first
8485 // lane is still better than not generating any at all. For
8486 // example, the input may be a splat across all lanes.
8487 // 2. For the lifetime start/end intrinsics the pointer operand only
8488 // does anything useful when the input comes from a stack object,
8489 // which suggests it should always be uniform. For non-stack objects
8490 // the effect is to poison the object, which still allows us to
8491 // remove the call.
8492 IsUniform = true;
8493 break;
8494 default:
8495 break;
8496 }
8497 }
8498
8499 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8500 IsUniform, IsPredicated);
8501
8502 // Find if I uses a predicated instruction. If so, it will use its scalar
8503 // value. Avoid hoisting the insert-element which packs the scalar value into
8504 // a vector value, as that happens iff all users use the vector value.
8505 for (VPValue *Op : Recipe->operands()) {
8506 auto *PredR =
8507 dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDefiningRecipe());
8508 if (!PredR)
8509 continue;
8510 auto *RepR = cast<VPReplicateRecipe>(
8511 PredR->getOperand(0)->getDefiningRecipe());
8512 assert(RepR->isPredicated() &&
8513 "expected Replicate recipe to be predicated");
8514 RepR->setAlsoPack(false);
8515 }
8516
8517 // Finalize the recipe for Instr, first if it is not predicated.
8518 if (!IsPredicated) {
8519 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8520 setRecipe(I, Recipe);
8521 Plan->addVPValue(I, Recipe);
8522 VPBB->appendRecipe(Recipe);
8523 return VPBB;
8524 }
8525 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8526
8527 VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8528 assert(SingleSucc && "VPBB must have a single successor when handling "
8529 "predicated replication.");
8530 VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8531 // Record predicated instructions for above packing optimizations.
8532 VPBlockBase *Region = createReplicateRegion(Recipe, Plan);
8533 VPBlockUtils::insertBlockAfter(Region, VPBB);
8534 auto *RegSucc = new VPBasicBlock();
8535 VPBlockUtils::insertBlockAfter(RegSucc, Region);
8536 VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8537 return RegSucc;
8538}
8539
8540VPRegionBlock *
8541VPRecipeBuilder::createReplicateRegion(VPReplicateRecipe *PredRecipe,
8542 VPlanPtr &Plan) {
8543 Instruction *Instr = PredRecipe->getUnderlyingInstr();
8544 // Instructions marked for predication are replicated and placed under an
8545 // if-then construct to prevent side-effects.
8546 // Generate recipes to compute the block mask for this region.
8547 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8548
8549 // Build the triangular if-then region.
8550 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8551 assert(Instr->getParent() && "Predicated instruction not in any basic block");
8552 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8553 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8554 auto *PHIRecipe = Instr->getType()->isVoidTy()
8555 ? nullptr
8556 : new VPPredInstPHIRecipe(PredRecipe);
8557 if (PHIRecipe) {
8558 setRecipe(Instr, PHIRecipe);
8559 Plan->addVPValue(Instr, PHIRecipe);
8560 } else {
8561 setRecipe(Instr, PredRecipe);
8562 Plan->addVPValue(Instr, PredRecipe);
8563 }
8564
8565 auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8566 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8567 VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
8568
8569 // Note: first set Entry as region entry and then connect successors starting
8570 // from it in order, to propagate the "parent" of each VPBasicBlock.
8571 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
8572 VPBlockUtils::connectBlocks(Pred, Exiting);
8573
8574 return Region;
8575}
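The triangular region built above (entry branching either into the if block or straight to continue) behaves like a guarded statement plus a merge phi. A scalar sketch of that shape, under assumed names:

#include <cassert>

static int predicatedLoad(bool Mask, const int *P) {
  int Phi = 0;  // pred.load.continue: default for lanes whose mask is false
  if (Mask)     // pred.load.entry: the VPBranchOnMaskRecipe
    Phi = *P;   // pred.load.if: the replicated (predicated) instruction
  return Phi;   // merged by the VPPredInstPHIRecipe in the continue block
}

int main() {
  int X = 42;
  assert(predicatedLoad(true, &X) == 42);
  assert(predicatedLoad(false, nullptr) == 0); // the load never executes
}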
8576
8577VPRecipeOrVPValueTy
8578VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8579 ArrayRef<VPValue *> Operands,
8580 VFRange &Range, VPBasicBlock *VPBB,
8581 VPlanPtr &Plan) {
8582 // First, check for specific widening recipes that deal with inductions, Phi
8583 // nodes, calls and memory operations.
8584 VPRecipeBase *Recipe;
8585 if (auto Phi = dyn_cast<PHINode>(Instr)) {
8586 if (Phi->getParent() != OrigLoop->getHeader())
8587 return tryToBlend(Phi, Operands, Plan);
8588
8589 // Always record recipes for header phis. Later first-order recurrence phis
8590 // can have earlier phis as incoming values.
8591 recordRecipeOf(Phi);
8592
8593 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8594 return toVPRecipeResult(Recipe);
8595
8596 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8597 assert((Legal->isReductionVariable(Phi) ||
8598 Legal->isFixedOrderRecurrence(Phi)) &&
8599 "can only widen reductions and fixed-order recurrences here");
8600 VPValue *StartV = Operands[0];
8601 if (Legal->isReductionVariable(Phi)) {
8602 const RecurrenceDescriptor &RdxDesc =
8603 Legal->getReductionVars().find(Phi)->second;
8604 assert(RdxDesc.getRecurrenceStartValue() ==
8605 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8606 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8607 CM.isInLoopReduction(Phi),
8608 CM.useOrderedReductions(RdxDesc));
8609 } else {
8610 // TODO: Currently fixed-order recurrences are modeled as chains of
8611 // first-order recurrences. If there are no users of the intermediate
8612 // recurrences in the chain, the fixed order recurrence should be modeled
8613 // directly, enabling more efficient codegen.
8614 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8615 }
8616
8617 // Record the incoming value from the backedge, so we can add the incoming
8618 // value from the backedge after all recipes have been created.
8619 auto *Inc = cast<Instruction>(
8620 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
8621 auto RecipeIter = Ingredient2Recipe.find(Inc);
8622 if (RecipeIter == Ingredient2Recipe.end())
8623 recordRecipeOf(Inc);
8624
8625 PhisToFix.push_back(PhiRecipe);
8626 return toVPRecipeResult(PhiRecipe);
8627 }
8628
8629 if (isa<TruncInst>(Instr) &&
8630 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8631 Range, *Plan)))
8632 return toVPRecipeResult(Recipe);
8633
8634 // All widen recipes below deal only with VF > 1.
8635 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8636 [&](ElementCount VF) { return VF.isScalar(); }, Range))
8637 return nullptr;
8638
8639 if (auto *CI = dyn_cast<CallInst>(Instr))
8640 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8641
8642 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8643 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8644
8645 if (!shouldWiden(Instr, Range))
8646 return nullptr;
8647
8648 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8649 return toVPRecipeResult(new VPWidenGEPRecipe(
8650 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8651
8652 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8653 bool InvariantCond =
8654 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8655 return toVPRecipeResult(new VPWidenSelectRecipe(
8656 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8657 }
8658
8659 return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
8660}
8661
8662void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8663 ElementCount MaxVF) {
8664 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8665
8666 // Add assume instructions we need to drop to DeadInstructions, to prevent
8667 // them from being added to the VPlan.
8668 // TODO: We only need to drop assumes in blocks that get flattened. If the
8669 // control flow is preserved, we should keep them.
8670 SmallPtrSet<Instruction *, 4> DeadInstructions;
8671 auto &ConditionalAssumes = Legal->getConditionalAssumes();
8672 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8673
8674 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8675 // Dead instructions do not need sinking. Remove them from SinkAfter.
8676 for (Instruction *I : DeadInstructions)
8677 SinkAfter.erase(I);
8678
8679 // Cannot sink instructions after dead instructions (there won't be any
8680 // recipes for them). Instead, find the first non-dead previous instruction.
8681 for (auto &P : Legal->getSinkAfter()) {
8682 Instruction *SinkTarget = P.second;
8683 Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8684 (void)FirstInst;
8685 while (DeadInstructions.contains(SinkTarget)) {
8686 assert(
8687 SinkTarget != FirstInst &&
8688 "Must find a live instruction (at least the one feeding the "
8689 "fixed-order recurrence PHI) before reaching beginning of the block");
8690 SinkTarget = SinkTarget->getPrevNode();
8691 assert(SinkTarget != P.first &&
8692 "sink source equals target, no sinking required");
8693 }
8694 P.second = SinkTarget;
8695 }
8696
8697 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8698 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8699 VFRange SubRange = {VF, MaxVFPlusOne};
8700 VPlans.push_back(
8701 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8702 VF = SubRange.End;
8703 }
8704}
8705
8706// Add the necessary canonical IV and branch recipes required to control the
8707// loop.
8708static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8709 bool HasNUW,
8710 bool UseLaneMaskForLoopControlFlow) {
8711 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8712 auto *StartV = Plan.getOrAddVPValue(StartIdx);
8713
8714 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8715 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8716 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8717 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8718 Header->insert(CanonicalIVPHI, Header->begin());
8719
8720 // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
8721 // IV by VF * UF.
8722 auto *CanonicalIVIncrement =
8723 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8724 : VPInstruction::CanonicalIVIncrement,
8725 {CanonicalIVPHI}, DL, "index.next");
8726 CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8727
8728 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8729 EB->appendRecipe(CanonicalIVIncrement);
8730
8731 if (UseLaneMaskForLoopControlFlow) {
8732 // Create the active lane mask instruction in the vplan preheader.
8733 VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock();
8734
8735 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
8736 // we have to take unrolling into account. Each part needs to start at
8737 // Part * VF
8738 auto *CanonicalIVIncrementParts =
8739 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
8740 : VPInstruction::CanonicalIVIncrementForPart,
8741 {StartV}, DL, "index.part.next");
8742 Preheader->appendRecipe(CanonicalIVIncrementParts);
8743
8744 // Create the ActiveLaneMask instruction using the correct start values.
8745 VPValue *TC = Plan.getOrCreateTripCount();
8746 auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask,
8747 {CanonicalIVIncrementParts, TC}, DL,
8748 "active.lane.mask.entry");
8749 Preheader->appendRecipe(EntryALM);
8750
8751 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
8752 // preheader ActiveLaneMask instruction.
8753 auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
8754 Header->insert(LaneMaskPhi, Header->getFirstNonPhi());
8755
8756 // Create the active lane mask for the next iteration of the loop.
8757 CanonicalIVIncrementParts =
8758 new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
8759 : VPInstruction::CanonicalIVIncrementForPart,
8760 {CanonicalIVIncrement}, DL);
8761 EB->appendRecipe(CanonicalIVIncrementParts);
8762
8763 auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask,
8764 {CanonicalIVIncrementParts, TC}, DL,
8765 "active.lane.mask.next");
8766 EB->appendRecipe(ALM);
8767 LaneMaskPhi->addOperand(ALM);
8768
8769 // We have to invert the mask here because a true condition means jumping
8770 // to the exit block.
8771 auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL);
8772 EB->appendRecipe(NotMask);
8773
8774 VPInstruction *BranchBack =
8775 new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL);
8776 EB->appendRecipe(BranchBack);
8777 } else {
8778 // Add the BranchOnCount VPInstruction to the latch.
8779 VPInstruction *BranchBack = new VPInstruction(
8780 VPInstruction::BranchOnCount,
8781 {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8782 EB->appendRecipe(BranchBack);
8783 }
8784}
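In the plain (non-lane-mask) shape built above, the canonical IV advances by VF * UF each iteration and BranchOnCount exits when it reaches the vector trip count. A scalar sketch with VF * UF folded into a single Step (illustrative names; assumes VectorTripCount is a nonzero multiple of Step, as the vectorizer guarantees):

#include <cassert>
#include <cstdint>

static uint64_t countVectorIterations(uint64_t VectorTripCount, uint64_t Step) {
  uint64_t Iters = 0;
  for (uint64_t IV = 0;;) {    // VPCanonicalIVPHIRecipe starts at 0
    ++Iters;                   // one vector iteration of the loop body
    IV += Step;                // CanonicalIVIncrement: IV += VF * UF
    if (IV == VectorTripCount) // BranchOnCount: exit when the counts match
      break;
  }
  return Iters;
}

int main() { assert(countVectorIterations(16, 4) == 4); }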
8785
8786// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8787// original exit block.
8788static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
8789 VPBasicBlock *MiddleVPBB, Loop *OrigLoop,
8790 VPlan &Plan) {
8791 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8792 BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8793 // Only handle single-exit loops with unique exit blocks for now.
8794 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8795 return;
8796
8797 // Introduce VPUsers modeling the exit values.
8798 for (PHINode &ExitPhi : ExitBB->phis()) {
8799 Value *IncomingValue =
8800 ExitPhi.getIncomingValueForBlock(ExitingBB);
8801 VPValue *V = Plan.getOrAddVPValue(IncomingValue, true);
8802 Plan.addLiveOut(&ExitPhi, V);
8803 }
8804}
8805
8806VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8807 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8808 const MapVector<Instruction *, Instruction *> &SinkAfter) {
8809
8810 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8811
8812 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8813
8814 // ---------------------------------------------------------------------------
8815 // Pre-construction: record ingredients whose recipes we'll need to further
8816 // process after constructing the initial VPlan.
8817 // ---------------------------------------------------------------------------
8818
8819 // Mark instructions we'll need to sink later and their targets as
8820 // ingredients whose recipe we'll need to record.
8821 for (const auto &Entry : SinkAfter) {
8822 RecipeBuilder.recordRecipeOf(Entry.first);
8823 RecipeBuilder.recordRecipeOf(Entry.second);
8824 }
8825 for (const auto &Reduction : CM.getInLoopReductionChains()) {
1: Assuming '__begin1' is equal to '__end1'
8826 PHINode *Phi = Reduction.first;
8827 RecurKind Kind =
8828 Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8829 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8830
8831 RecipeBuilder.recordRecipeOf(Phi);
8832 for (const auto &R : ReductionOperations) {
8833 RecipeBuilder.recordRecipeOf(R);
8834 // For min/max reductions, where we have a pair of icmp/select, we also
8835 // need to record the ICmp recipe, so it can be removed later.
8836 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8837 "Only min/max recurrences allowed for inloop reductions");
8838 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8839 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8840 }
8841 }
8842
8843 // For each interleave group which is relevant for this (possibly trimmed)
8844 // Range, add it to the set of groups to be later applied to the VPlan and add
8845 // placeholders for its members' Recipes which we'll be replacing with a
8846 // single VPInterleaveRecipe.
8847 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8848 auto applyIG = [IG, this](ElementCount VF) -> bool {
8849 return (VF.isVector() && // Query is illegal for VF == 1
8850 CM.getWideningDecision(IG->getInsertPos(), VF) ==
8851 LoopVectorizationCostModel::CM_Interleave);
8852 };
8853 if (!getDecisionAndClampRange(applyIG, Range))
8854 continue;
8855 InterleaveGroups.insert(IG);
8856 for (unsigned i = 0; i < IG->getFactor(); i++)
8857 if (Instruction *Member = IG->getMember(i))
8858 RecipeBuilder.recordRecipeOf(Member);
8859 };
8860
8861 // ---------------------------------------------------------------------------
8862 // Build initial VPlan: Scan the body of the loop in a topological order to
8863 // visit each basic block after having visited its predecessor basic blocks.
8864 // ---------------------------------------------------------------------------
8865
8866 // Create initial VPlan skeleton, starting with a block for the pre-header,
8867 // followed by a region for the vector loop, followed by the middle block. The
8868 // skeleton vector loop region contains a header and latch block.
8869 VPBasicBlock *Preheader = new VPBasicBlock("vector.ph");
8870 auto Plan = std::make_unique<VPlan>(Preheader);
8871
8872 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
2: Memory is allocated
8873 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8874 VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8875 auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
8876 VPBlockUtils::insertBlockAfter(TopRegion, Preheader);
8877 VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
8878 VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
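The skeleton built by the statements above forms a simple chain; a sketch, using the block names from the constructors:

// vector.ph
//    |
// [vector loop region: vector.body -> vector.latch]
//    |
// middle.block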
8879
8880 Instruction *DLInst =
8881 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8882 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
8883 DLInst
2.1
'DLInst' is non-null
? DLInst->getDebugLoc() : DebugLoc(),
3
'?' condition is true
8884 !CM.foldTailByMasking(),
4
Assuming the condition is false
8885 CM.useActiveLaneMaskForControlFlow());
8886
8887 // Scan the body of the loop in a topological order to visit each basic block
8888 // after having visited its predecessor basic blocks.
8889 LoopBlocksDFS DFS(OrigLoop);
8890 DFS.perform(LI);
8891
8892 VPBasicBlock *VPBB = HeaderVPBB;
8893 SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
8894 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8895 // Relevant instructions from basic block BB will be grouped into VPRecipe
8896 // ingredients and fill a new VPBasicBlock.
8897 unsigned VPBBsForBB = 0;
8898 if (VPBB != HeaderVPBB)
8899 VPBB->setName(BB->getName());
8900 Builder.setInsertPoint(VPBB);
8901
8902 // Introduce each ingredient into VPlan.
8903 // TODO: Model and preserve debug intrinsics in VPlan.
8904 for (Instruction &I : BB->instructionsWithoutDebug()) {
8905 Instruction *Instr = &I;
8906
8907 // First filter out irrelevant instructions, to ensure no recipes are
8908 // built for them.
8909 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8910 continue;
8911
8912 SmallVector<VPValue *, 4> Operands;
8913 auto *Phi = dyn_cast<PHINode>(Instr);
8914 if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8915 Operands.push_back(Plan->getOrAddVPValue(
8916 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8917 } else {
8918 auto OpRange = Plan->mapToVPValues(Instr->operands());
8919 Operands = {OpRange.begin(), OpRange.end()};
8920 }
8921
8922 // Invariant stores inside the loop will be deleted and a single store
8923 // with the final reduction value will be added to the exit block.
8924 StoreInst *SI;
8925 if ((SI = dyn_cast<StoreInst>(&I)) &&
8926 Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8927 continue;
8928
8929 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8930 Instr, Operands, Range, VPBB, Plan)) {
8931 // If Instr can be simplified to an existing VPValue, use it.
8932 if (RecipeOrValue.is<VPValue *>()) {
8933 auto *VPV = RecipeOrValue.get<VPValue *>();
8934 Plan->addVPValue(Instr, VPV);
8935 // If the re-used value is a recipe, register the recipe for the
8936 // instruction, in case the recipe for Instr needs to be recorded.
8937 if (VPRecipeBase *R = VPV->getDefiningRecipe())
8938 RecipeBuilder.setRecipe(Instr, R);
8939 continue;
8940 }
8941 // Otherwise, add the new recipe.
8942 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8943 for (auto *Def : Recipe->definedValues()) {
8944 auto *UV = Def->getUnderlyingValue();
8945 Plan->addVPValue(UV, Def);
8946 }
8947
8948 if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
8949 HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
8950 // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
8951 // of the header block. That can happen for truncates of induction
8952 // variables. Those recipes are moved to the phi section of the header
8953 // block after applying SinkAfter, which relies on the original
8954 // position of the trunc.
8955 assert(isa<TruncInst>(Instr));
8956 InductionsToMove.push_back(
8957 cast<VPWidenIntOrFpInductionRecipe>(Recipe));
8958 }
8959 RecipeBuilder.setRecipe(Instr, Recipe);
8960 VPBB->appendRecipe(Recipe);
8961 continue;
8962 }
8963
8964 // Otherwise, if all widening options failed, the instruction is to be
8965 // replicated. This may create a successor for VPBB.
8966 VPBasicBlock *NextVPBB =
8967 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8968 if (NextVPBB != VPBB) {
8969 VPBB = NextVPBB;
8970 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8971 : "");
8972 }
8973 }
8974
8975 VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8976 VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8977 }
8978
8979 HeaderVPBB->setName("vector.body");
8980
8981 // Fold the last, empty block into its predecessor.
8982 VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
5
Calling 'VPBlockUtils::tryToMergeBlockIntoPredecessor'
14
Returning; memory was released via 1st parameter
8983 assert(VPBB && "expected to fold last (empty) block");
15
Assert condition is true
8984 // After here, VPBB should not be used.
8985 VPBB = nullptr;
8986
8987 addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan);
16
Use of memory after it is freed
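On the reported path, VPBB still equals HeaderVPBB when the fold at line 8982 runs: VPBB is initialized to HeaderVPBB at line 8892, and the analyzer explores a path where the traversal at line 8894 visits no blocks (it cannot know OrigLoop always contains at least one), so the merge releases the block allocated at line 8873 and the call above then dereferences it. A minimal sketch of the aliasing hazard, with hypothetical names:

// VPBasicBlock *Header = new VPBasicBlock("h"); // the allocation from step 2
// VPBasicBlock *Cur = Header;                   // alias, like VPBB = HeaderVPBB
// tryToMergeBlockIntoPredecessor(Cur);          // frees *Cur on this path
// addUsersInExitBlock(Header, ...);             // Header now dangles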
8988
8989 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8990 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8991 "entry block must be set to a VPRegionBlock having a non-empty entry "
8992 "VPBasicBlock");
8993 RecipeBuilder.fixHeaderPhis();
8994
8995 // ---------------------------------------------------------------------------
8996 // Transform initial VPlan: Apply previously taken decisions, in order, to
8997 // bring the VPlan to its final state.
8998 // ---------------------------------------------------------------------------
8999
9000 // Apply Sink-After legal constraints.
9001 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
9002 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
9003 if (Region && Region->isReplicator()) {
9004 assert(Region->getNumSuccessors() == 1 &&
9005 Region->getNumPredecessors() == 1 && "Expected SESE region!");
9006 assert(R->getParent()->size() == 1 &&
9007 "A recipe in an original replicator region must be the only "
9008 "recipe in its block");
9009 return Region;
9010 }
9011 return nullptr;
9012 };
9013 for (const auto &Entry : SinkAfter) {
9014 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
9015 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
9016
9017 auto *TargetRegion = GetReplicateRegion(Target);
9018 auto *SinkRegion = GetReplicateRegion(Sink);
9019 if (!SinkRegion) {
9020 // If the sink source is not a replicate region, sink the recipe directly.
9021 if (TargetRegion) {
9022 // The target is in a replication region, make sure to move Sink to
9023 // the block after it, not into the replication region itself.
9024 VPBasicBlock *NextBlock =
9025 cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
9026 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
9027 } else
9028 Sink->moveAfter(Target);
9029 continue;
9030 }
9031
9032 // The sink source is in a replicate region. Unhook the region from the CFG.
9033 auto *SinkPred = SinkRegion->getSinglePredecessor();
9034 auto *SinkSucc = SinkRegion->getSingleSuccessor();
9035 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
9036 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
9037 VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
9038
9039 if (TargetRegion) {
9040 // The target recipe is also in a replicate region, move the sink region
9041 // after the target region.
9042 auto *TargetSucc = TargetRegion->getSingleSuccessor();
9043 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
9044 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
9045 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
9046 } else {
9047 // The sink source is in a replicate region, we need to move the whole
9048 // replicate region, which should only contain a single recipe in the
9049 // main block.
9050 auto *SplitBlock =
9051 Target->getParent()->splitAt(std::next(Target->getIterator()));
9052
9053 auto *SplitPred = SplitBlock->getSinglePredecessor();
9054
9055 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
9056 VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
9057 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
9058 }
9059 }
9060
9061 VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
9062 VPlanTransforms::removeRedundantInductionCasts(*Plan);
9063
9064 // Now that sink-after is done, move induction recipes for optimized truncates
9065 // to the phi section of the header block.
9066 for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
9067 Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9068
9069 // Adjust the recipes for any inloop reductions.
9070 adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan,
9071 RecipeBuilder, Range.Start);
9072
9073 // Introduce a recipe to combine the incoming and previous values of a
9074 // fixed-order recurrence.
9075 for (VPRecipeBase &R :
9076 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9077 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
9078 if (!RecurPhi)
9079 continue;
9080
9081 VPRecipeBase *PrevRecipe = &RecurPhi->getBackedgeRecipe();
9082 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
9083 // to terminate.
9084 while (auto *PrevPhi =
9085 dyn_cast<VPFirstOrderRecurrencePHIRecipe>(PrevRecipe))
9086 PrevRecipe = &PrevPhi->getBackedgeRecipe();
9087 VPBasicBlock *InsertBlock = PrevRecipe->getParent();
9088 auto *Region = GetReplicateRegion(PrevRecipe);
9089 if (Region)
9090 InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor());
9091 if (!InsertBlock) {
9092 InsertBlock = new VPBasicBlock(Region->getName() + ".succ");
9093 VPBlockUtils::insertBlockAfter(InsertBlock, Region);
9094 }
9095 if (Region || PrevRecipe->isPhi())
9096 Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
9097 else
9098 Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
9099
9100 auto *RecurSplice = cast<VPInstruction>(
9101 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
9102 {RecurPhi, RecurPhi->getBackedgeValue()}));
9103
9104 RecurPhi->replaceAllUsesWith(RecurSplice);
9105 // Set the first operand of RecurSplice to RecurPhi again, after replacing
9106 // all users.
9107 RecurSplice->setOperand(0, RecurPhi);
9108 }
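Conceptually, FirstOrderRecurrenceSplice concatenates the last lane of the previous vector iteration with the first VF-1 lanes of the current one. A worked example, assuming VF = 4 and a scalar loop computing b[i] = f(a[i], a[i-1]):

// recurrence phi (previous iteration) : <a0, a1, a2, a3>
// backedge value (current iteration)  : <a4, a5, a6, a7>
// splice(phi, backedge)               : <a3, a4, a5, a6>  // a[i-1] in every lane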
9109
9110 // Interleave memory: for each Interleave Group we marked earlier as relevant
9111 // for this VPlan, replace the Recipes widening its memory instructions with a
9112 // single VPInterleaveRecipe at its insertion point.
9113 for (const auto *IG : InterleaveGroups) {
9114 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9115 RecipeBuilder.getRecipe(IG->getInsertPos()));
9116 SmallVector<VPValue *, 4> StoredValues;
9117 for (unsigned i = 0; i < IG->getFactor(); ++i)
9118 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9119 auto *StoreR =
9120 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9121 StoredValues.push_back(StoreR->getStoredValue());
9122 }
9123
9124 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9125 Recipe->getMask());
9126 VPIG->insertBefore(Recipe);
9127 unsigned J = 0;
9128 for (unsigned i = 0; i < IG->getFactor(); ++i)
9129 if (Instruction *Member = IG->getMember(i)) {
9130 if (!Member->getType()->isVoidTy()) {
9131 VPValue *OriginalV = Plan->getVPValue(Member);
9132 Plan->removeVPValueFor(Member);
9133 Plan->addVPValue(Member, VPIG->getVPValue(J));
9134 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9135 J++;
9136 }
9137 RecipeBuilder.getRecipe(Member)->eraseFromParent();
9138 }
9139 }
9140
9141 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9142 VF *= 2)
9143 Plan->addVF(VF);
9144 Plan->setName("Initial VPlan");
9145
9146 // From this point onwards, VPlan-to-VPlan transformations may change the plan
9147 // in ways that accessing values using original IR values is incorrect.
9148 Plan->disableValue2VPValue();
9149
9150 VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
9151 VPlanTransforms::removeDeadRecipes(*Plan);
9152 VPlanTransforms::sinkScalarOperands(*Plan);
9153 VPlanTransforms::mergeReplicateRegions(*Plan);
9154 VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
9155
9156 // Fold Exit block into its predecessor if possible.
9157 // TODO: Fold block earlier once all VPlan transforms properly maintain a
9158 // VPBasicBlock as exit.
9159 VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting());
9160
9161 assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9162 return Plan;
9163}
9164
9165VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9166 // Outer loop handling: They may require CFG and instruction level
9167 // transformations before even evaluating whether vectorization is profitable.
9168 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9169 // the vectorization pipeline.
9170 assert(!OrigLoop->isInnermost());
9171 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9172
9173 // Create new empty VPlan
9174 auto Plan = std::make_unique<VPlan>();
9175
9176 // Build hierarchical CFG
9177 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9178 HCFGBuilder.buildHierarchicalCFG();
9179
9180 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9181 VF *= 2)
9182 Plan->addVF(VF);
9183
9184 SmallPtrSet<Instruction *, 1> DeadInstructions;
9185 VPlanTransforms::VPInstructionsToVPRecipes(
9186 OrigLoop, Plan,
9187 [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9188 DeadInstructions, *PSE.getSE(), *TLI);
9189
9190 // Remove the existing terminator of the exiting block of the top-most region.
9191 // A BranchOnCount will be added instead when adding the canonical IV recipes.
9192 auto *Term =
9193 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9194 Term->eraseFromParent();
9195
9196 addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9197 true, CM.useActiveLaneMaskForControlFlow());
9198 return Plan;
9199}
9200
9201 // Adjust the recipes for reductions. For in-loop reductions, the chain of
9202 // instructions leading from the loop exit instr to the phi needs to be
9203 // converted to reductions, with one operand being vector and the other being
9204 // the scalar reduction chain. For other reductions, a select is introduced
9205 // between the phi and live-out recipes when folding the tail.
9206void LoopVectorizationPlanner::adjustRecipesForReductions(
9207 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9208 ElementCount MinVF) {
9209 for (const auto &Reduction : CM.getInLoopReductionChains()) {
9210 PHINode *Phi = Reduction.first;
9211 const RecurrenceDescriptor &RdxDesc =
9212 Legal->getReductionVars().find(Phi)->second;
9213 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9214
9215 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9216 continue;
9217
9218 // ReductionOperations are ordered top-down from the phi's use to the
9219 // LoopExitValue. We keep track of the previous item (the Chain) to tell
9220 // which of the two operands will remain scalar and which will be reduced.
9221 // For minmax the chain will be the select instructions.
9222 Instruction *Chain = Phi;
9223 for (Instruction *R : ReductionOperations) {
9224 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9225 RecurKind Kind = RdxDesc.getRecurrenceKind();
9226
9227 VPValue *ChainOp = Plan->getVPValue(Chain);
9228 unsigned FirstOpId;
9229 assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9230 "Only min/max recurrences allowed for inloop reductions");
9231 // Recognize a call to the llvm.fmuladd intrinsic.
9232 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9233 assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9234 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9235 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9236 assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9237 "Expected to replace a VPWidenSelectSC");
9238 FirstOpId = 1;
9239 } else {
9240 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9241 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9242 "Expected to replace a VPWidenSC");
9243 FirstOpId = 0;
9244 }
9245 unsigned VecOpId =
9246 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9247 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9248
9249 auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
9250 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9251 : nullptr;
9252
9253 if (IsFMulAdd) {
9254 // If the instruction is a call to the llvm.fmuladd intrinsic then we
9255 // need to create an fmul recipe to use as the vector operand for the
9256 // fadd reduction.
9257 VPInstruction *FMulRecipe = new VPInstruction(
9258 Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9259 FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9260 WidenRecipe->getParent()->insert(FMulRecipe,
9261 WidenRecipe->getIterator());
9262 VecOp = FMulRecipe;
9263 }
9264 VPReductionRecipe *RedRecipe =
9265 new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9266 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9267 Plan->removeVPValueFor(R);
9268 Plan->addVPValue(R, RedRecipe);
9269 // Append the recipe to the end of the VPBasicBlock because we need to
9270 // ensure that it comes after all of its inputs, including CondOp.
9271 WidenRecipe->getParent()->appendRecipe(RedRecipe);
9273 WidenRecipe->eraseFromParent();
9274
9275 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9276 VPRecipeBase *CompareRecipe =
9277 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9278 assert(isa<VPWidenRecipe>(CompareRecipe) &&
9279 "Expected to replace a VPWidenSC");
9280 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9281 "Expected no remaining users");
9282 CompareRecipe->eraseFromParent();
9283 }
9284 Chain = R;
9285 }
9286 }
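The net effect of the chain rewrite above, sketched on a simple in-loop sum (the recipe spellings are illustrative, not exact VPlan syntax):

// before: WIDEN  %sum.next = add %sum.phi, %wide.load          ; vector result
// after : REDUCE %sum.next = reduce.add(%sum.phi, %wide.load)  ; scalar chain
// For llvm.fmuladd(%a, %b, %acc) an FMul recipe is emitted first and the fadd
// half becomes the reduction:
//   %mul = fmul %a, %b
//   REDUCE %acc.next = reduce.fadd(%acc, %mul)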
9287
9288 // If the tail is folded by masking, introduce selects between the phi
9289 // and the live-out instruction of each reduction, at the beginning of the
9290 // dedicated latch block.
9291 if (CM.foldTailByMasking()) {
9292 Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
9293 for (VPRecipeBase &R :
9294 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9295 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9296 if (!PhiR || PhiR->isInLoop())
9297 continue;
9298 VPValue *Cond =
9299 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9300 VPValue *Red = PhiR->getBackedgeValue();
9301 assert(Red->getDefiningRecipe()->getParent() != LatchVPBB &&
9302 "reduction recipe must be defined before latch");
9303 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9304 }
9305 }
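The select keeps masked-off lanes at the phi's value, so the final horizontal reduction only accumulates lanes the fold mask enabled. A sketch for a summation, assuming VF = 4 with two active lanes in the last iteration:

// mask = <1, 1, 0, 0>
// latch: %sel = select %mask, %red.next, %red.phi  ; lanes 2-3 keep old partials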
9306}
9307
9308#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9309void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9310 VPSlotTracker &SlotTracker) const {
9311 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9312 IG->getInsertPos()->printAsOperand(O, false);
9313 O << ", ";
9314 getAddr()->printAsOperand(O, SlotTracker);
9315 VPValue *Mask = getMask();
9316 if (Mask) {
9317 O << ", ";
9318 Mask->printAsOperand(O, SlotTracker);
9319 }
9320
9321 unsigned OpIdx = 0;
9322 for (unsigned i = 0; i < IG->getFactor(); ++i) {
9323 if (!IG->getMember(i))
9324 continue;
9325 if (getNumStoreOperands() > 0) {
9326 O << "\n" << Indent << " store ";
9327 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9328 O << " to index " << i;
9329 } else {
9330 O << "\n" << Indent << " ";
9331 getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9332 O << " = load from index " << i;
9333 }
9334 ++OpIdx;
9335 }
9336}
9337#endif
9338
9339void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9340 assert(!State.Instance && "Int or FP induction being replicated.");
9341
9342 Value *Start = getStartValue()->getLiveInIRValue();
9343 const InductionDescriptor &ID = getInductionDescriptor();
9344 TruncInst *Trunc = getTruncInst();
9345 IRBuilderBase &Builder = State.Builder;
9346 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
9347 assert(State.VF.isVector() && "must have vector VF");
9348
9349 // The value from the original loop to which we are mapping the new induction
9350 // variable.
9351 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
9352
9353 // Fast-math-flags propagate from the original induction instruction.
9354 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9355 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
9356 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
9357
9358 // Now do the actual transformations, and start with fetching the step value.
9359 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9360
9361 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
9362 "Expected either an induction phi-node or a truncate of it!");
9363
9364 // Construct the initial value of the vector IV in the vector loop preheader
9365 auto CurrIP = Builder.saveIP();
9366 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9367 Builder.SetInsertPoint(VectorPH->getTerminator());
9368 if (isa<TruncInst>(EntryVal)) {
9369 assert(Start->getType()->isIntegerTy() &&
9370 "Truncation requires an integer type");
9371 auto *TruncType = cast<IntegerType>(EntryVal->getType());
9372 Step = Builder.CreateTrunc(Step, TruncType);
9373 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
9374 }
9375
9376 Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
9377 Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
9378 Value *SteppedStart = getStepVector(
9379 SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
9380
9381 // We create vector phi nodes for both integer and floating-point induction
9382 // variables. Here, we determine the kind of arithmetic we will perform.
9383 Instruction::BinaryOps AddOp;
9384 Instruction::BinaryOps MulOp;
9385 if (Step->getType()->isIntegerTy()) {
9386 AddOp = Instruction::Add;
9387 MulOp = Instruction::Mul;
9388 } else {
9389 AddOp = ID.getInductionOpcode();
9390 MulOp = Instruction::FMul;
9391 }
9392
9393 // Multiply the vectorization factor by the step using integer or
9394 // floating-point arithmetic as appropriate.
9395 Type *StepType = Step->getType();
9396 Value *RuntimeVF;
9397 if (Step->getType()->isFloatingPointTy())
9398 RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
9399 else
9400 RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
9401 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
9402
9403 // Create a vector splat to use in the induction update.
9404 //
9405 // FIXME: If the step is non-constant, we create the vector splat with
9406 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
9407 // handle a constant vector splat.
9408 Value *SplatVF = isa<Constant>(Mul)
9409 ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
9410 : Builder.CreateVectorSplat(State.VF, Mul);
9411 Builder.restoreIP(CurrIP);
9412
9413 // We may need to add the step a number of times, depending on the unroll
9414 // factor. The last of those goes into the PHI.
9415 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
9416 &*State.CFG.PrevBB->getFirstInsertionPt());
9417 VecInd->setDebugLoc(EntryVal->getDebugLoc());
9418 Instruction *LastInduction = VecInd;
9419 for (unsigned Part = 0; Part < State.UF; ++Part) {
9420 State.set(this, LastInduction, Part);
9421
9422 if (isa<TruncInst>(EntryVal))
9423 State.addMetadata(LastInduction, EntryVal);
9424
9425 LastInduction = cast<Instruction>(
9426 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
9427 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
9428 }
9429
9430 LastInduction->setName("vec.ind.next");
9431 VecInd->addIncoming(SteppedStart, VectorPH);
9432 // Add induction update using an incorrect block temporarily. The phi node
9433 // will be fixed after VPlan execution. Note that at this point the latch
9434 // block cannot be used, as it does not exist yet.
9435 // TODO: Model increment value in VPlan, by turning the recipe into a
9436 // multi-def and a subclass of VPHeaderPHIRecipe.
9437 VecInd->addIncoming(LastInduction, VectorPH);
9438}
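A worked example of the values this emits, assuming fixed VF = 4, UF = 2, and an i32 induction starting at 0 with step 1:

// SteppedStart      = <0, 1, 2, 3>   ; splat(start) stepped by step
// SplatVF           = <4, 4, 4, 4>   ; splat(step * runtime VF)
// part 0: vec.ind   = <0, 1, 2, 3>
// part 1: step.add  = <4, 5, 6, 7>
// vec.ind.next      = <8, 9, 10, 11> ; feeds the phi's backedge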
9439
9440void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9441 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9442 "Not a pointer induction according to InductionDescriptor!");
9443 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9444 "Unexpected type.");
9445
9446 auto *IVR = getParent()->getPlan()->getCanonicalIV();
9447 PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
9448
9449 if (onlyScalarsGenerated(State.VF)) {
9450 // This is the normalized GEP that starts counting at zero.
9451 Value *PtrInd = State.Builder.CreateSExtOrTrunc(
9452 CanonicalIV, IndDesc.getStep()->getType());
9453 // Determine the number of scalars we need to generate for each unroll
9454 // iteration. If the instruction is uniform, we only need to generate the
9455 // first lane. Otherwise, we generate all VF values.
9456 bool IsUniform = vputils::onlyFirstLaneUsed(this);
9457 assert((IsUniform || !State.VF.isScalable()) &&
9458 "Cannot scalarize a scalable VF");
9459 unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
9460
9461 for (unsigned Part = 0; Part < State.UF; ++Part) {
9462 Value *PartStart =
9463 createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
9464
9465 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
9466 Value *Idx = State.Builder.CreateAdd(
9467 PartStart, ConstantInt::get(PtrInd->getType(), Lane));
9468 Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
9469
9470 Value *Step = State.get(getOperand(1), VPIteration(0, Part));
9471 Value *SclrGep = emitTransformedIndex(
9472 State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
9473 SclrGep->setName("next.gep");
9474 State.set(this, SclrGep, VPIteration(Part, Lane));
9475 }
9476 }
9477 return;
9478 }
9479
9480 assert(isa<SCEVConstant>(IndDesc.getStep()) &&
9481 "Induction step not a SCEV constant!");
9482 Type *PhiType = IndDesc.getStep()->getType();
9483
9484 // Build a pointer phi
9485 Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9486 Type *ScStValueType = ScalarStartValue->getType();
9487 PHINode *NewPointerPhi =
9488 PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
9489
9490 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9491 NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9492
9493 // A pointer induction, performed using a GEP.
9494 Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
9495
9496 Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9497 Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9498 Value *NumUnrolledElems =
9499 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9500 Value *InductionGEP = GetElementPtrInst::Create(
9501 IndDesc.getElementType(), NewPointerPhi,
9502 State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9503 InductionLoc);
9504 // Add induction update using an incorrect block temporarily. The phi node
9505 // will be fixed after VPlan execution. Note that at this point the latch
9506 // block cannot be used, as it does not exist yet.
9507 // TODO: Model increment value in VPlan, by turning the recipe into a
9508 // multi-def and a subclass of VPHeaderPHIRecipe.
9509 NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9510
9511 // Create UF many actual address geps that use the pointer
9512 // phi as base and a vectorized version of the step value
9513 // (<step*0, ..., step*N>) as offset.
9514 for (unsigned Part = 0; Part < State.UF; ++Part) {
9515 Type *VecPhiType = VectorType::get(PhiType, State.VF);
9516 Value *StartOffsetScalar =
9517 State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9518 Value *StartOffset =
9519 State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9520 // Create a vector of consecutive numbers from zero to VF.
9521 StartOffset = State.Builder.CreateAdd(
9522 StartOffset, State.Builder.CreateStepVector(VecPhiType));
9523
9524 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(0, Part)) &&
9525 "scalar step must be the same across all parts");
9526 Value *GEP = State.Builder.CreateGEP(
9527 IndDesc.getElementType(), NewPointerPhi,
9528 State.Builder.CreateMul(
9529 StartOffset,
9530 State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9531 "vector.gep"));
9532 State.set(this, GEP, Part);
9533 }
9534}
9535
9536void VPDerivedIVRecipe::execute(VPTransformState &State) {
9537 assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9538
9539 // Fast-math-flags propagate from the original induction instruction.
9540 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9541 if (IndDesc.getInductionBinOp() &&
9542 isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9543 State.Builder.setFastMathFlags(
9544 IndDesc.getInductionBinOp()->getFastMathFlags());
9545
9546 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9547 Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9548 Value *DerivedIV =
9549 emitTransformedIndex(State.Builder, CanonicalIV,
9550 getStartValue()->getLiveInIRValue(), Step, IndDesc);
9551 DerivedIV->setName("offset.idx");
9552 if (ResultTy != DerivedIV->getType()) {
9553 assert(Step->getType()->isIntegerTy() &&
9554 "Truncation requires an integer step");
9555 DerivedIV = State.Builder.CreateTrunc(DerivedIV, ResultTy);
9556 }
9557 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9558
9559 State.set(this, DerivedIV, VPIteration(0, 0));
9560}
9561
9562void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
9563 // Fast-math-flags propagate from the original induction instruction.
9564 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9565 if (IndDesc.getInductionBinOp() &&
9566 isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9567 State.Builder.setFastMathFlags(
9568 IndDesc.getInductionBinOp()->getFastMathFlags());
9569
9570 Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
9571 Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9572
9573 buildScalarSteps(BaseIV, Step, IndDesc, this, State);
9574}
9575
9576void VPInterleaveRecipe::execute(VPTransformState &State) {
9577 assert(!State.Instance && "Interleave group being replicated.");
9578 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9579 getStoredValues(), getMask());
9580}
9581
9582void VPReductionRecipe::execute(VPTransformState &State) {
9583 assert(!State.Instance && "Reduction being replicated.");
9584 Value *PrevInChain = State.get(getChainOp(), 0);
9585 RecurKind Kind = RdxDesc->getRecurrenceKind();
9586 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9587 // Propagate the fast-math flags carried by the underlying instruction.
9588 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9589 State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9590 for (unsigned Part = 0; Part < State.UF; ++Part) {
9591 Value *NewVecOp = State.get(getVecOp(), Part);
9592 if (VPValue *Cond = getCondOp()) {
9593 Value *NewCond = State.get(Cond, Part);
9594 VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9595 Value *Iden = RdxDesc->getRecurrenceIdentity(
9596 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9597 Value *IdenVec =
9598 State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9599 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9600 NewVecOp = Select;
9601 }
9602 Value *NewRed;
9603 Value *NextInChain;
9604 if (IsOrdered) {
9605 if (State.VF.isVector())
9606 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9607 PrevInChain);
9608 else
9609 NewRed = State.Builder.CreateBinOp(
9610 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9611 NewVecOp);
9612 PrevInChain = NewRed;
9613 } else {
9614 PrevInChain = State.get(getChainOp(), Part);
9615 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9616 }
9617 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9618 NextInChain =
9619 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9620 NewRed, PrevInChain);
9621 } else if (IsOrdered)
9622 NextInChain = NewRed;
9623 else
9624 NextInChain = State.Builder.CreateBinOp(
9625 (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9626 PrevInChain);
9627 State.set(this, NextInChain, Part);
9628 }
9629}
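The conditional path above neutralizes masked-off lanes with the recurrence identity before reducing. A sketch for an integer add reduction, assuming VF = 4:

// NewVecOp = <7, 3, 5, 2>, NewCond = <1, 0, 1, 0>, Iden = 0
// select   -> <7, 0, 5, 0>   ; inactive lanes contribute the identity
// reduce   -> 12, then combined with the scalar chain value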
9630
9631void VPReplicateRecipe::execute(VPTransformState &State) {
9632 Instruction *UI = getUnderlyingInstr();
9633 if (State.Instance) { // Generate a single instance.
9634 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9635 State.ILV->scalarizeInstruction(UI, this, *State.Instance,
9636 IsPredicated, State);
9637 // Insert scalar instance packing it into a vector.
9638 if (AlsoPack && State.VF.isVector()) {
9639 // If we're constructing lane 0, initialize to start from poison.
9640 if (State.Instance->Lane.isFirstLane()) {
9641 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9642 Value *Poison = PoisonValue::get(
9643 VectorType::get(UI->getType(), State.VF));
9644 State.set(this, Poison, State.Instance->Part);
9645 }
9646 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9647 }
9648 return;
9649 }
9650
9651 if (IsUniform) {
9652 // If the recipe is uniform across all parts (instead of just per VF), only
9653 // generate a single instance.
9654 if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9655 all_of(operands(), [](VPValue *Op) {
9656 return Op->isDefinedOutsideVectorRegions();
9657 })) {
9658 State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), IsPredicated,
9659 State);
9660 if (user_begin() != user_end()) {
9661 for (unsigned Part = 1; Part < State.UF; ++Part)
9662 State.set(this, State.get(this, VPIteration(0, 0)),
9663 VPIteration(Part, 0));
9664 }
9665 return;
9666 }
9667
9668 // Uniform within VL means we need to generate lane 0 only for each
9669 // unrolled copy.
9670 for (unsigned Part = 0; Part < State.UF; ++Part)
9671 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0),
9672 IsPredicated, State);
9673 return;
9674 }
9675
9676 // A store of a loop-varying value to a loop-invariant address needs
9677 // only the last copy of the store.
9678 if (isa<StoreInst>(UI) && !getOperand(1)->hasDefiningRecipe()) {
9679 auto Lane = VPLane::getLastLaneForVF(State.VF);
9680 State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
9681 IsPredicated, State);
9682 return;
9683 }
9684
9685 // Generate scalar instances for all VF lanes of all UF parts.
9686 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9687 const unsigned EndLane = State.VF.getKnownMinValue();
9688 for (unsigned Part = 0; Part < State.UF; ++Part)
9689 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9690 State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane),
9691 IsPredicated, State);
9692}
9693
9694void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9695 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9696
9697 // Attempt to issue a wide load.
9698 LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9699 StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9700
9701 assert((LI || SI) && "Invalid Load/Store instruction");
9702 assert((!SI || StoredValue) && "No stored value provided for widened store");
9703 assert((!LI || !StoredValue) && "Stored value provided for widened load");
9704
9705 Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9706
9707 auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9708 const Align Alignment = getLoadStoreAlignment(&Ingredient);
9709 bool CreateGatherScatter = !Consecutive;
9710
9711 auto &Builder = State.Builder;
9712 InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9713 bool isMaskRequired = getMask();
9714 if (isMaskRequired)
9715 for (unsigned Part = 0; Part < State.UF; ++Part)
9716 BlockInMaskParts[Part] = State.get(getMask(), Part);
9717
9718 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9719 // Calculate the pointer for the specific unroll-part.
9720 GetElementPtrInst *PartPtr = nullptr;
9721
9722 bool InBounds = false;
9723 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9724 InBounds = gep->isInBounds();
9725 if (Reverse) {
9726 // If the address is consecutive but reversed, then the
9727 // wide store needs to start at the last vector element.
9728 // RunTimeVF = VScale * VF.getKnownMinValue()
9729 // For fixed-width vectors VScale is 1, so RunTimeVF = VF.getKnownMinValue()
9730 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9731 // NumElt = -Part * RunTimeVF
9732 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9733 // LastLane = 1 - RunTimeVF
9734 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
9735 PartPtr =
9736 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9737 PartPtr->setIsInBounds(InBounds);
9738 PartPtr = cast<GetElementPtrInst>(
9739 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9740 PartPtr->setIsInBounds(InBounds);
9741 if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9742 BlockInMaskParts[Part] =
9743 Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9744 } else {
9745 Value *Increment =
9746 createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9747 PartPtr = cast<GetElementPtrInst>(
9748 Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9749 PartPtr->setIsInBounds(InBounds);
9750 }
9751
9752 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
9753 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
9754 };
9755
9756 // Handle Stores:
9757 if (SI) {
9758 State.setDebugLocFromInst(SI);
9759
9760 for (unsigned Part = 0; Part < State.UF; ++Part) {
9761 Instruction *NewSI = nullptr;
9762 Value *StoredVal = State.get(StoredValue, Part);
9763 if (CreateGatherScatter) {
9764 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9765 Value *VectorGep = State.get(getAddr(), Part);
9766 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9767 MaskPart);
9768 } else {
9769 if (Reverse) {
9770 // If we store to reverse consecutive memory locations, then we need
9771 // to reverse the order of elements in the stored value.
9772 StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9773 // We don't want to update the value in the map as it might be used in
9774 // another expression. So don't call resetVectorValue(StoredVal).
9775 }
9776 auto *VecPtr =
9777 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9778 if (isMaskRequired)
9779 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9780 BlockInMaskParts[Part]);
9781 else
9782 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9783 }
9784 State.addMetadata(NewSI, SI);
9785 }
9786 return;
9787 }
9788
9789 // Handle loads.
9790 assert(LI && "Must have a load instruction");
9791 State.setDebugLocFromInst(LI);
9792 for (unsigned Part = 0; Part < State.UF; ++Part) {
9793 Value *NewLI;
9794 if (CreateGatherScatter) {
9795 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9796 Value *VectorGep = State.get(getAddr(), Part);
9797 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9798 nullptr, "wide.masked.gather");
9799 State.addMetadata(NewLI, LI);
9800 } else {
9801 auto *VecPtr =
9802 CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9803 if (isMaskRequired)
9804 NewLI = Builder.CreateMaskedLoad(
9805 DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
9806 PoisonValue::get(DataTy), "wide.masked.load");
9807 else
9808 NewLI =
9809 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
9810
9811 // Add metadata to the load, but setVectorValue to the reverse shuffle.
9812 State.addMetadata(NewLI, LI);
9813 if (Reverse)
9814 NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9815 }
9816
9817 State.set(getVPSingleValue(), NewLI, Part);
9818 }
9819}
9820
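To make the reverse-access arithmetic in CreateVecPtr above concrete: for a fixed-width vector VScale is 1, so RunTimeVF collapses to VF.getKnownMinValue() and the two GEP offsets become -Part * VF and 1 - VF. A small standalone sketch (a hypothetical helper, not part of the pass) that prints the resulting first-lane offsets:

#include <cstdio>

// Mirrors the NumElt/LastLane offsets CreateVecPtr computes for a
// reversed, consecutive access with fixed-width VF (VScale == 1).
static void printReverseOffsets(int VF, int UF) {
  for (int Part = 0; Part < UF; ++Part) {
    int RunTimeVF = VF;              // RunTimeVF = VScale * VF
    int NumElt = -Part * RunTimeVF;  // first GEP: step back Part groups
    int LastLane = 1 - RunTimeVF;    // second GEP: start at the last lane
    std::printf("Part %d: first lane at base%+d\n", Part, NumElt + LastLane);
  }
}

int main() {
  // For VF = 4, UF = 2 this prints -3 and -7: each wide access starts
  // VF - 1 elements before the scalar pointer so that, once the loaded
  // or stored value is reversed, lane 0 lines up with the scalar order.
  printReverseOffsets(4, 2);
}
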
9821// Determine how to lower the scalar epilogue, which depends on 1) optimising
9822// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9823// predication, and 4) a TTI hook that analyses whether the loop is suitable
9824// for predication.
9825static ScalarEpilogueLowering getScalarEpilogueLowering(
9826 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9827 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9828 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
9829 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9830 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9831 // don't look at hints or options, and don't request a scalar epilogue.
9832 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9833 // LoopAccessInfo (due to code dependency and not being able to reliably get
9834 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9835 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9836 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9837 // back to the old way and vectorize with versioning when forced. See D81345.)
9838 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9839 PGSOQueryType::IRPass) &&
9840 Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9841 return CM_ScalarEpilogueNotAllowedOptSize;
9842
9843 // 2) If set, obey the directives
9844 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9845 switch (PreferPredicateOverEpilogue) {
9846 case PreferPredicateTy::ScalarEpilogue:
9847 return CM_ScalarEpilogueAllowed;
9848 case PreferPredicateTy::PredicateElseScalarEpilogue:
9849 return CM_ScalarEpilogueNotNeededUsePredicate;
9850 case PreferPredicateTy::PredicateOrDontVectorize:
9851 return CM_ScalarEpilogueNotAllowedUsePredicate;
9852 };
9853 }
9854
9855 // 3) If set, obey the hints
9856 switch (Hints.getPredicate()) {
9857 case LoopVectorizeHints::FK_Enabled:
9858 return CM_ScalarEpilogueNotNeededUsePredicate;
9859 case LoopVectorizeHints::FK_Disabled:
9860 return CM_ScalarEpilogueAllowed;
9861 };
9862
9863 // 4) if the TTI hook indicates this is profitable, request predication.
9864 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI))
9865 return CM_ScalarEpilogueNotNeededUsePredicate;
9866
9867 return CM_ScalarEpilogueAllowed;
9868}
9869
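Step 3 above is what a source-level predication hint feeds into. Assuming clang's loop-hint pragma spelling, a loop written as below carries llvm.loop.vectorize.predicate.enable metadata, so Hints.getPredicate() reports FK_Enabled and the function returns CM_ScalarEpilogueNotNeededUsePredicate (tail folding) instead of allowing a scalar remainder loop:

// Sketch of a user-annotated loop; the pragma is the only assumption here.
void saxpy(float *x, const float *y, float a, int n) {
#pragma clang loop vectorize_predicate(enable)
  for (int i = 0; i < n; ++i)
    x[i] = a * x[i] + y[i];
}
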
9870Value *VPTransformState::get(VPValue *Def, unsigned Part) {
9871 // If Values have been set for this Def return the one relevant for \p Part.
9872 if (hasVectorValue(Def, Part))
9873 return Data.PerPartOutput[Def][Part];
9874
9875 if (!hasScalarValue(Def, {Part, 0})) {
9876 Value *IRV = Def->getLiveInIRValue();
9877 Value *B = ILV->getBroadcastInstrs(IRV);
9878 set(Def, B, Part);
9879 return B;
9880 }
9881
9882 Value *ScalarValue = get(Def, {Part, 0});
9883 // If we aren't vectorizing, we can just copy the scalar map values over
9884 // to the vector map.
9885 if (VF.isScalar()) {
9886 set(Def, ScalarValue, Part);
9887 return ScalarValue;
9888 }
9889
9890 bool IsUniform = vputils::isUniformAfterVectorization(Def);
9891
9892 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
9893 // Check if there is a scalar value for the selected lane.
9894 if (!hasScalarValue(Def, {Part, LastLane})) {
9895 // At the moment, VPWidenIntOrFpInductionRecipes and VPScalarIVStepsRecipes can also be uniform.
9896 assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
9897 isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe())) &&
9898 "unexpected recipe found to be invariant");
9899 IsUniform = true;
9900 LastLane = 0;
9901 }
9902
9903 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
9904 // Set the insert point after the last scalarized instruction or after the
9905 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
9906 // will directly follow the scalar definitions.
9907 auto OldIP = Builder.saveIP();
9908 auto NewIP =
9909 isa<PHINode>(LastInst)
9910 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
9911 : std::next(BasicBlock::iterator(LastInst));
9912 Builder.SetInsertPoint(&*NewIP);
9913
9914 // However, if we are vectorizing, we need to construct the vector values.
9915 // If the value is known to be uniform after vectorization, we can just
9916 // broadcast the scalar value corresponding to lane zero for each unroll
9917 // iteration. Otherwise, we construct the vector values using
9918 // insertelement instructions. Since the resulting vectors are stored in
9919 // State, we will only generate the insertelements once.
9920 Value *VectorValue = nullptr;
9921 if (IsUniform) {
9922 VectorValue = ILV->getBroadcastInstrs(ScalarValue);
9923 set(Def, VectorValue, Part);
9924 } else {
9925 // Initialize packing with insertelements to start from undef.
9926 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
9927 Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
9928 set(Def, Undef, Part);
9929 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
9930 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
9931 VectorValue = get(Def, Part);
9932 }
9933 Builder.restoreIP(OldIP);
9934 return VectorValue;
9935}
9936
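The broadcast-versus-packing split in VPTransformState::get can be pictured with a scalar model (a sketch only; the real code emits broadcast and insertelement IR, not arrays):

#include <array>
#include <cstdio>

constexpr unsigned VF = 4;
using Lanes = std::array<int, VF>; // stand-in for one part's vector value

// Uniform def: every lane equals lane 0, so broadcasting the first
// scalar reproduces the whole vector value.
Lanes broadcast(int Lane0) { return {Lane0, Lane0, Lane0, Lane0}; }

// Non-uniform def: each lane is inserted individually, which is what
// the packScalarIntoVectorValue loop above models (the real code
// starts from poison rather than from zeros).
Lanes pack(const Lanes &Scalars) {
  Lanes V{};
  for (unsigned L = 0; L < VF; ++L)
    V[L] = Scalars[L]; // one insertelement per lane
  return V;
}

int main() {
  Lanes U = broadcast(7);       // {7, 7, 7, 7}
  Lanes P = pack({0, 1, 2, 3}); // {0, 1, 2, 3}
  std::printf("%d %d\n", U[3], P[3]);
}
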
9937// Process the loop in the VPlan-native vectorization path. This path builds
9938// VPlan upfront in the vectorization pipeline, which allows to apply
9939// VPlan-to-VPlan transformations from the very beginning without modifying the
9940// input LLVM IR.
9941static bool processLoopInVPlanNativePath(
9942 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9943 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9944 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9945 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9946 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9947 LoopVectorizationRequirements &Requirements) {
9948
9949 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9950 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9951 return false;
9952 }
9953 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9954 Function *F = L->getHeader()->getParent();
9955 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9956
9957 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9958 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL, &IAI);
9959
9960 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9961 &Hints, IAI);
9962 // Use the planner for outer loop vectorization.
9963 // TODO: CM is not used at this point inside the planner. Turn CM into an
9964 // optional argument if we don't need it in the future.
9965 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
9966
9967 // Get user vectorization factor.
9968 ElementCount UserVF = Hints.getWidth();
9969
9970 CM.collectElementTypesForWidening();
9971
9972 // Plan how to best vectorize, return the best VF and its cost.
9973 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9974
9975 // If we are stress testing VPlan builds, do not attempt to generate vector
9976 // code. Masked vector code generation support will follow soon.
9977 // Also, do not attempt to vectorize if no vector code will be produced.
9978 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9979 return false;
9980
9981 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9982
9983 {
9984 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9985 F->getParent()->getDataLayout());
9986 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9987 VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9988 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9989 << L->getHeader()->getParent()->getName() << "\"\n");
9990 LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9991 }
9992
9993 // Mark the loop as already vectorized to avoid vectorizing again.
9994 Hints.setAlreadyVectorized();
9995 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9996 return true;
9997}
9998
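For orientation, the path below only runs for explicitly annotated outer loops, and only when the off-by-default flag -enable-vplan-native-path is passed (via -mllvm for clang). A representative candidate, assuming clang's loop pragma spelling:

// processLoop defers this nest to processLoopInVPlanNativePath because
// the annotated loop is not innermost.
void outerProduct(float *out, const float *x, const float *y, int n) {
#pragma clang loop vectorize(enable)
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < n; ++j)
      out[i * n + j] = x[i] * y[j];
}
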
9999 // Emit a remark if there are stores to floats that required a floating point
10000 // extension. If the vectorized loop was generated with wider floating point
10001 // operations, there will be a performance penalty from the conversion overhead
10002 // and the change in the vector width.
10003static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10004 SmallVector<Instruction *, 4> Worklist;
10005 for (BasicBlock *BB : L->getBlocks()) {
10006 for (Instruction &Inst : *BB) {
10007 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10008 if (S->getValueOperand()->getType()->isFloatTy())
10009 Worklist.push_back(S);
10010 }
10011 }
10012 }
10013
10014 // Traverse the floating point stores upwards, searching for floating point
10015 // conversions.
10016 SmallPtrSet<const Instruction *, 4> Visited;
10017 SmallPtrSet<const Instruction *, 4> EmittedRemark;
10018 while (!Worklist.empty()) {
10019 auto *I = Worklist.pop_back_val();
10020 if (!L->contains(I))
10021 continue;
10022 if (!Visited.insert(I).second)
10023 continue;
10024
10025 // Emit a remark if the floating point store required a floating
10026 // point conversion.
10027 // TODO: More work could be done to identify the root cause such as a
10028 // constant or a function return type and point the user to it.
10029 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10030 ORE->emit([&]() {
10031 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10032 I->getDebugLoc(), L->getHeader())
10033 << "floating point conversion changes vector width. "
10034 << "Mixed floating point precision requires an up/down "
10035 << "cast that will negatively impact performance.";
10036 });
10037
10038 for (Use &Op : I->operands())
10039 if (auto *OpI = dyn_cast<Instruction>(Op))
10040 Worklist.push_back(OpI);
10041 }
10042}
10043
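A minimal instance of the pattern this remark targets (illustrative only; the double literal is the kind of root cause the TODO above mentions):

// The 0.5 literal promotes f[i] to double, so the loop vectorizes with
// <N x double> intermediates: an fpext widens each loaded float and an
// fptrunc narrows the result back before the store, halving the useful
// lanes per register. Writing 0.5f instead keeps the loop all-float.
void scale(float *f, int n) {
  for (int i = 0; i < n; ++i)
    f[i] = f[i] * 0.5;
}
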
10044static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
10045 VectorizationFactor &VF,
10046 std::optional<unsigned> VScale, Loop *L,
10047 ScalarEvolution &SE) {
10048 InstructionCost CheckCost = Checks.getCost();
10049 if (!CheckCost.isValid())
10050 return false;
10051
10052 // When interleaving only, the scalar and vector costs will be equal, which
10053 // in turn would lead to a divide by 0. Fall back to a hard threshold.
10054 if (VF.Width.isScalar()) {
10055 if (CheckCost > VectorizeMemoryCheckThreshold) {
10056 LLVM_DEBUG(
10057 dbgs()
10058 << "LV: Interleaving only is not profitable due to runtime checks\n");
10059 return false;
10060 }
10061 return true;
10062 }
10063
10064 // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated.
10065 double ScalarC = *VF.ScalarCost.getValue();
10066 if (ScalarC == 0)
10067 return true;
10068
10069 // First, compute the minimum iteration count required so that the vector
10070 // loop outperforms the scalar loop.
10071 // The total cost of the scalar loop is
10072 // ScalarC * TC
10073 // where
10074 // * TC is the actual trip count of the loop.
10075 // * ScalarC is the cost of a single scalar iteration.
10076 //
10077 // The total cost of the vector loop is
10078 // RtC + VecC * (TC / VF) + EpiC
10079 // where
10080 // * RtC is the cost of the generated runtime checks
10081 // * VecC is the cost of a single vector iteration.
10082 // * TC is the actual trip count of the loop
10083 // * VF is the vectorization factor
10084 // * EpiCost is the cost of the generated epilogue, including the cost
10085 // of the remaining scalar operations.
10086 //
10087 // Vectorization is profitable once the total vector cost is less than the
10088 // total scalar cost:
10089 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
10090 //
10091 // Now we can compute the minimum required trip count TC as
10092 // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
10093 //
10094 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
10095 // the computations are performed on doubles, not integers and the result
10096 // is rounded up, hence we get an upper estimate of the TC.
10097 unsigned IntVF = VF.Width.getKnownMinValue();
10098 if (VF.Width.isScalable()) {
10099 unsigned AssumedMinimumVscale = 1;
10100 if (VScale)
10101 AssumedMinimumVscale = *VScale;
10102 IntVF *= AssumedMinimumVscale;
10103 }
10104 double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
10105 double RtC = *CheckCost.getValue();
10106 double MinTC1 = RtC / (ScalarC - VecCOverVF);
10107
10108 // Second, compute a minimum iteration count so that the cost of the
10109 // runtime checks is only a fraction of the total scalar loop cost. This
10110 // adds a loop-dependent bound on the overhead incurred if the runtime
10111 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
10112 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
10113 // cost, compute
10114 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
10115 double MinTC2 = RtC * 10 / ScalarC;
10116
10117 // Now pick the larger minimum. If it is not a multiple of VF, choose the
10118 // next closest multiple of VF. This should partly compensate for ignoring
10119 // the epilogue cost.
10120 uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
10121 VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));
10122
10123 LLVM_DEBUG(
10124 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10125 << VF.MinProfitableTripCount << "\n");
10126
10127 // Skip vectorization if the expected trip count is less than the minimum
10128 // required trip count.
10129 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
10130 if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
10131 VF.MinProfitableTripCount)) {
10132 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
10133 "trip count < minimum profitable VF ("
10134 << *ExpectedTC << " < " << VF.MinProfitableTripCount
10135 << ")\n");
10136
10137 return false;
10138 }
10139 }
10140 return true;
10141}
10142
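To see the two bounds in action, here is a small numeric sketch following the same formulas (all costs below are made up, not produced by the cost model):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  double ScalarC = 4.0; // cost of one scalar iteration
  double VecC = 6.0;    // cost of one vector iteration
  unsigned IntVF = 8;   // effective vectorization factor
  double RtC = 28.0;    // cost of the runtime checks

  // Bound 1: checks + vector loop must beat the scalar loop:
  //   RtC / (ScalarC - VecC / VF) < TC        => ~8.6 here
  double MinTC1 = RtC / (ScalarC - VecC / IntVF);

  // Bound 2: failed checks may cost at most 1/10 of the scalar loop:
  //   RtC * 10 / ScalarC < TC                 => 70 here
  double MinTC2 = RtC * 10 / ScalarC;

  // Take the larger bound, rounded up to the next multiple of VF => 72.
  std::uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
  std::uint64_t Aligned = (MinTC + IntVF - 1) / IntVF * IntVF;
  std::printf("MinTC1=%.1f MinTC2=%.1f MinProfitableTripCount=%llu\n",
              MinTC1, MinTC2, (unsigned long long)Aligned);
}
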
10143LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10144 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10145 !EnableLoopInterleaving),
10146 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10147 !EnableLoopVectorization) {}
10148
10149bool LoopVectorizePass::processLoop(Loop *L) {
10150 assert((EnableVPlanNativePath || L->isInnermost()) &&
10151 "VPlan-native path is not enabled. Only process inner loops.");
10152
10153#ifndef NDEBUG
10154 const std::string DebugLocStr = getDebugLocString(L);
10155#endif /* NDEBUG */
10156
10157 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10158 << L->getHeader()->getParent()->getName() << "' from "
10159 << DebugLocStr << "\n");
10160
10161 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10162
10163 LLVM_DEBUG(
10164 dbgs() << "LV: Loop hints:"
10165 << " force="
10166 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10167 ? "disabled"
10168 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10169 ? "enabled"
10170 : "?"))
10171 << " width=" << Hints.getWidth()
10172 << " interleave=" << Hints.getInterleave() << "\n");
10173
10174 // Function containing loop
10175 Function *F = L->getHeader()->getParent();
10176
10177 // Looking at the diagnostic output is the only way to determine if a loop
10178 // was vectorized (other than looking at the IR or machine code), so it
10179 // is important to generate an optimization remark for each loop. Most of
10180 // these messages are generated as OptimizationRemarkAnalysis. Remarks
10181 // generated as OptimizationRemark and OptimizationRemarkMissed report, less
10182 // verbosely, vectorized loops and unvectorized loops that may benefit from
10183 // vectorization, respectively.
10184
10185 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10186 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10187 return false;
10188 }
10189
10190 PredicatedScalarEvolution PSE(*SE, *L);
10191
10192 // Check if it is legal to vectorize the loop.
10193 LoopVectorizationRequirements Requirements;
10194 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
10195 &Requirements, &Hints, DB, AC, BFI, PSI);
10196 if (!LVL.canVectorize(EnableVPlanNativePath)) {
10197 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10198 Hints.emitRemarkWithHints();
10199 return false;
10200 }
10201
10202 // Entrance to the VPlan-native vectorization path. Outer loops are processed
10203 // here. They may require CFG and instruction level transformations before
10204 // even evaluating whether vectorization is profitable. Since we cannot modify
10205 // the incoming IR, we need to build VPlan upfront in the vectorization
10206 // pipeline.
10207 if (!L->isInnermost())
10208 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10209 ORE, BFI, PSI, Hints, Requirements);
10210
10211 assert(L->isInnermost() && "Inner loop expected.");
10212
10213 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10214 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10215
10216 // If an override option has been passed in for interleaved accesses, use it.
10217 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10218 UseInterleaved = EnableInterleavedMemAccesses;
10219
10220 // Analyze interleaved memory accesses.
10221 if (UseInterleaved)
10222 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10223
10224 // Check the function attributes and profiles to find out if this function
10225 // should be optimized for size.
10226 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10227 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL, &IAI);
10228
10229 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10230 // count by optimizing for size, to minimize overheads.
10231 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10232 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10233 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10234 << "This loop is worth vectorizing only if no scalar "
10235 << "iteration overheads are incurred.");
10236 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10237 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10238 else {
10239 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10240 LLVM_DEBUG(dbgs() << "\n");
10241 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10242 } else {
10243 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10244 "small to consider vectorizing.\n");
10245 reportVectorizationFailure(
10246 "The trip count is below the minial threshold value.",
10247 "loop trip count is too low, avoiding vectorization",
10248 "LowTripCount", ORE, L);
10249 Hints.emitRemarkWithHints();
10250 return false;
10251 }
10252 }
10253 }
10254
10255 // Check the function attributes to see if implicit floats or vectors are
10256 // allowed.
10257 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10258 reportVectorizationFailure(
10259 "Can't vectorize when the NoImplicitFloat attribute is used",
10260 "loop not vectorized due to NoImplicitFloat attribute",
10261 "NoImplicitFloat", ORE, L);
10262 Hints.emitRemarkWithHints();
10263 return false;
10264 }
10265
10266 // Check if the target supports potentially unsafe FP vectorization.
10267 // FIXME: Add a check for the type of safety issue (denormal, signaling)
10268 // for the target we're vectorizing for, to make sure none of the
10269 // additional fp-math flags can help.
10270 if (Hints.isPotentiallyUnsafe() &&
10271 TTI->isFPVectorizationPotentiallyUnsafe()) {
10272 reportVectorizationFailure(
10273 "Potentially unsafe FP op prevents vectorization",
10274 "loop not vectorized due to unsafe FP support.",
10275 "UnsafeFP", ORE, L);
10276 Hints.emitRemarkWithHints();
10277 return false;
10278 }
10279
10280 bool AllowOrderedReductions;
10281 // If the flag is set, use that instead and override the TTI behaviour.
10282 if (ForceOrderedReductions.getNumOccurrences() > 0)
10283 AllowOrderedReductions = ForceOrderedReductions;
10284 else
10285 AllowOrderedReductions = TTI->enableOrderedReductions();
10286 if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10287 ORE->emit([&]() {
10288 auto *ExactFPMathInst = Requirements.getExactFPInst();
10289 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10290 ExactFPMathInst->getDebugLoc(),
10291 ExactFPMathInst->getParent())
10292 << "loop not vectorized: cannot prove it is safe to reorder "
10293 "floating-point operations";
10294 });
10295 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10296 "reorder floating-point operations\n");
10297 Hints.emitRemarkWithHints();
10298 return false;
10299 }
10300
10301 // Use the cost model.
10302 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10303 F, &Hints, IAI);
10304 CM.collectValuesToIgnore();
10305 CM.collectElementTypesForWidening();
10306
10307 // Use the planner for vectorization.
10308 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE);
10309
10310 // Get user vectorization factor and interleave count.
10311 ElementCount UserVF = Hints.getWidth();
10312 unsigned UserIC = Hints.getInterleave();
10313
10314 // Plan how to best vectorize, return the best VF and its cost.
10315 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10316
10317 VectorizationFactor VF = VectorizationFactor::Disabled();
10318 unsigned IC = 1;
10319
10320 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
10321 F->getParent()->getDataLayout());
10322 if (MaybeVF) {
10323 VF = *MaybeVF;
10324 // Select the interleave count.
10325 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10326
10327 unsigned SelectedIC = std::max(IC, UserIC);
10328 // Optimistically generate runtime checks if they are needed. Drop them if
10329 // they turn out to not be profitable.
10330 if (VF.Width.isVector() || SelectedIC > 1)
10331 Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10332
10333 // Check if it is profitable to vectorize with runtime checks.
10334 bool ForceVectorization =
10335 Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10336 if (!ForceVectorization &&
10337 !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L,
10338 *PSE.getSE())) {
10339 ORE->emit([&]() {
10340 return OptimizationRemarkAnalysisAliasing(
10341 DEBUG_TYPE"loop-vectorize", "CantReorderMemOps", L->getStartLoc(),
10342 L->getHeader())
10343 << "loop not vectorized: cannot prove it is safe to reorder "
10344 "memory operations";
10345 });
10346 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10347 Hints.emitRemarkWithHints();
10348 return false;
10349 }
10350 }
10351
10352 // Identify the diagnostic messages that should be produced.
10353 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10354 bool VectorizeLoop = true, InterleaveLoop = true;
10355 if (VF.Width.isScalar()) {
10356 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10357 VecDiagMsg = std::make_pair(
10358 "VectorizationNotBeneficial",
10359 "the cost-model indicates that vectorization is not beneficial");
10360 VectorizeLoop = false;
10361 }
10362
10363 if (!MaybeVF && UserIC > 1) {
10364 // Tell the user interleaving was avoided up-front, despite being explicitly
10365 // requested.
10366 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10367 "interleaving should be avoided up front\n");
10368 IntDiagMsg = std::make_pair(
10369 "InterleavingAvoided",
10370 "Ignoring UserIC, because interleaving was avoided up front");
10371 InterleaveLoop = false;
10372 } else if (IC == 1 && UserIC <= 1) {
10373 // Tell the user interleaving is not beneficial.
10374 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10375 IntDiagMsg = std::make_pair(
10376 "InterleavingNotBeneficial",
10377 "the cost-model indicates that interleaving is not beneficial");
10378 InterleaveLoop = false;
10379 if (UserIC == 1) {
10380 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10381 IntDiagMsg.second +=
10382 " and is explicitly disabled or interleave count is set to 1";
10383 }
10384 } else if (IC > 1 && UserIC == 1) {
10385 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10386 LLVM_DEBUG(
10387 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10388 IntDiagMsg = std::make_pair(
10389 "InterleavingBeneficialButDisabled",
10390 "the cost-model indicates that interleaving is beneficial "
10391 "but is explicitly disabled or interleave count is set to 1");
10392 InterleaveLoop = false;
10393 }
10394
10395 // Override IC if user provided an interleave count.
10396 IC = UserIC > 0 ? UserIC : IC;
10397
10398 // Emit diagnostic messages, if any.
10399 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10400 if (!VectorizeLoop && !InterleaveLoop) {
10401 // Do not vectorize or interleave the loop.
10402 ORE->emit([&]() {
10403 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10404 L->getStartLoc(), L->getHeader())
10405 << VecDiagMsg.second;
10406 });
10407 ORE->emit([&]() {
10408 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10409 L->getStartLoc(), L->getHeader())
10410 << IntDiagMsg.second;
10411 });
10412 return false;
10413 } else if (!VectorizeLoop && InterleaveLoop) {
10414 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10415 ORE->emit([&]() {
10416 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10417 L->getStartLoc(), L->getHeader())
10418 << VecDiagMsg.second;
10419 });
10420 } else if (VectorizeLoop && !InterleaveLoop) {
10421 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10422 << ") in " << DebugLocStr << '\n');
10423 ORE->emit([&]() {
10424 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10425 L->getStartLoc(), L->getHeader())
10426 << IntDiagMsg.second;
10427 });
10428 } else if (VectorizeLoop && InterleaveLoop) {
10429 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10430 << ") in " << DebugLocStr << '\n');
10431 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10432 }
10433
10434 bool DisableRuntimeUnroll = false;
10435 MDNode *OrigLoopID = L->getLoopID();
10436 {
10437 using namespace ore;
10438 if (!VectorizeLoop) {
10439 assert(IC > 1 && "interleave count should not be 1 or 0");
10440 // If we decided that it is not legal to vectorize the loop, then
10441 // interleave it.
10442 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10443 &CM, BFI, PSI, Checks);
10444
10445 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10446 LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10447
10448 ORE->emit([&]() {
10449 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10450 L->getHeader())
10451 << "interleaved loop (interleaved count: "
10452 << NV("InterleaveCount", IC) << ")";
10453 });
10454 } else {
10455 // If we decided that it is *legal* to vectorize the loop, then do it.
10456
10457 // Consider vectorizing the epilogue too if it's profitable.
10458 VectorizationFactor EpilogueVF =
10459 CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10460 if (EpilogueVF.Width.isVector()) {
10461
10462 // The first pass vectorizes the main loop and creates a scalar epilogue
10463 // to be vectorized by executing the plan (potentially with a different
10464 // factor) again shortly afterwards.
10465 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10466 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10467 EPI, &LVL, &CM, BFI, PSI, Checks);
10468
10469 VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10470 LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10471 DT, true);
10472 ++LoopsVectorized;
10473
10474 // Second pass vectorizes the epilogue and adjusts the control flow
10475 // edges from the first pass.
10476 EPI.MainLoopVF = EPI.EpilogueVF;
10477 EPI.MainLoopUF = EPI.EpilogueUF;
10478 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10479 ORE, EPI, &LVL, &CM, BFI, PSI,
10480 Checks);
10481
10482 VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10483 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10484 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10485 Header->setName("vec.epilog.vector.body");
10486
10487 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10488 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10489 // before vectorizing the epilogue loop.
10490 for (VPRecipeBase &R : Header->phis()) {
10491 if (isa<VPCanonicalIVPHIRecipe>(&R))
10492 continue;
10493
10494 Value *ResumeV = nullptr;
10495 // TODO: Move setting of resume values to prepareToExecute.
10496 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10497 ResumeV = MainILV.getReductionResumeValue(
10498 ReductionPhi->getRecurrenceDescriptor());
10499 } else {
10500 // Create induction resume values for both widened pointer and
10501 // integer/fp inductions and update the start value of the induction
10502 // recipes to use the resume value.
10503 PHINode *IndPhi = nullptr;
10504 const InductionDescriptor *ID;
10505 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10506 IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10507 ID = &Ind->getInductionDescriptor();
10508 } else {
10509 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10510 IndPhi = WidenInd->getPHINode();
10511 ID = &WidenInd->getInductionDescriptor();
10512 }
10513
10514 ResumeV = MainILV.createInductionResumeValue(
10515 IndPhi, *ID, {EPI.MainLoopIterationCountCheck});
10516 }
10517 assert(ResumeV && "Must have a resume value");
10518 VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(ResumeV);
10519 cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10520 }
10521
10522 LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10523 DT, true);
10524 ++LoopsEpilogueVectorized;
10525
10526 if (!MainILV.areSafetyChecksAdded())
10527 DisableRuntimeUnroll = true;
10528 } else {
10529 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10530 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10531 PSI, Checks);
10532
10533 VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10534 LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10535 ++LoopsVectorized;
10536
10537 // Add metadata to disable runtime unrolling a scalar loop when there
10538 // are no runtime checks about strides and memory. A scalar loop that is
10539 // rarely used is not worth unrolling.
10540 if (!LB.areSafetyChecksAdded())
10541 DisableRuntimeUnroll = true;
10542 }
10543 // Report the vectorization decision.
10544 ORE->emit([&]() {
10545 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10546 L->getHeader())
10547 << "vectorized loop (vectorization width: "
10548 << NV("VectorizationFactor", VF.Width)
10549 << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10550 });
10551 }
10552
10553 if (ORE->allowExtraAnalysis(LV_NAME))
10554 checkMixedPrecision(L, ORE);
10555 }
10556
10557 std::optional<MDNode *> RemainderLoopID =
10558 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10559 LLVMLoopVectorizeFollowupEpilogue});
10560 if (RemainderLoopID) {
10561 L->setLoopID(*RemainderLoopID);
10562 } else {
10563 if (DisableRuntimeUnroll)
10564 AddRuntimeUnrollDisableMetaData(L);
10565
10566 // Mark the loop as already vectorized to avoid vectorizing again.
10567 Hints.setAlreadyVectorized();
10568 }
10569
10570 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10571 return true;
10572}
10573
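Structurally, the epilogue-vectorization branch above leaves three loops behind. A hand-written equivalent for a trivial loop, with hypothetical factors (main VF = 8, epilogue VF = 4) and inner lane loops standing in for vector instructions:

void addOne(int *a, int n) {
  int i = 0;
  for (; i + 8 <= n; i += 8)    // main vector loop (first executePlan)
    for (int l = 0; l < 8; ++l) //   models one <8 x i32> add
      a[i + l] += 1;
  for (; i + 4 <= n; i += 4)    // vec.epilog.vector.body (second pass)
    for (int l = 0; l < 4; ++l)
      a[i + l] += 1;
  for (; i < n; ++i)            // scalar remainder, < 4 iterations
    a[i] += 1;
}
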
10574LoopVectorizeResult LoopVectorizePass::runImpl(
10575 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10576 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10577 DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
10578 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10579 SE = &SE_;
10580 LI = &LI_;
10581 TTI = &TTI_;
10582 DT = &DT_;
10583 BFI = &BFI_;
10584 TLI = TLI_;
10585 AC = &AC_;
10586 LAIs = &LAIs_;
10587 DB = &DB_;
10588 ORE = &ORE_;
10589 PSI = PSI_;
10590
10591 // Don't attempt if
10592 // 1. the target claims to have no vector registers, and
10593 // 2. interleaving won't help ILP.
10594 //
10595 // The second condition is necessary because, even if the target has no
10596 // vector registers, loop vectorization may still enable scalar
10597 // interleaving.
10598 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10599 TTI->getMaxInterleaveFactor(1) < 2)
10600 return LoopVectorizeResult(false, false);
10601
10602 bool Changed = false, CFGChanged = false;
10603
10604 // The vectorizer requires loops to be in simplified form.
10605 // Since simplification may add new inner loops, it has to run before the
10606 // legality and profitability checks. This means running the loop vectorizer
10607 // will simplify all loops, regardless of whether anything ends up being
10608 // vectorized.
10609 for (const auto &L : *LI)
10610 Changed |= CFGChanged |=
10611 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10612
10613 // Build up a worklist of inner-loops to vectorize. This is necessary as
10614 // the act of vectorizing or partially unrolling a loop creates new loops
10615 // and can invalidate iterators across the loops.
10616 SmallVector<Loop *, 8> Worklist;
10617
10618 for (Loop *L : *LI)
10619 collectSupportedLoops(*L, LI, ORE, Worklist);
10620
10621 LoopsAnalyzed += Worklist.size();
10622
10623 // Now walk the identified inner loops.
10624 while (!Worklist.empty()) {
10625 Loop *L = Worklist.pop_back_val();
10626
10627 // For the inner loops we actually process, form LCSSA to simplify the
10628 // transform.
10629 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10630
10631 Changed |= CFGChanged |= processLoop(L);
10632 }
10633
10634 // Process each loop nest in the function.
10635 return LoopVectorizeResult(Changed, CFGChanged);
10636}
10637
10638PreservedAnalyses LoopVectorizePass::run(Function &F,
10639 FunctionAnalysisManager &AM) {
10640 auto &LI = AM.getResult<LoopAnalysis>(F);
10641 // There are no loops in the function. Return before computing other expensive
10642 // analyses.
10643 if (LI.empty())
10644 return PreservedAnalyses::all();
10645 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10646 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10647 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10648 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10649 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10650 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10651 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10652 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10653
10654 LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
10655 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10656 ProfileSummaryInfo *PSI =
10657 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10658 LoopVectorizeResult Result =
10659 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
10660 if (!Result.MadeAnyChange)
10661 return PreservedAnalyses::all();
10662 PreservedAnalyses PA;
10663
10664 // We currently do not preserve loopinfo/dominator analyses with outer loop
10665 // vectorization. Until this is addressed, mark these analyses as preserved
10666 // only for non-VPlan-native path.
10667 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10668 if (!EnableVPlanNativePath) {
10669 PA.preserve<LoopAnalysis>();
10670 PA.preserve<DominatorTreeAnalysis>();
10671 }
10672
10673 if (Result.MadeCFGChange) {
10674 // Making CFG changes likely means a loop got vectorized. Indicate that
10675 // extra simplification passes should be run.
10676 // TODO: MadeCFGChanges is not a perfect proxy. Extra passes should only
10677 // be run if runtime checks have been added.
10678 AM.getResult<ShouldRunExtraVectorPasses>(F);
10679 PA.preserve<ShouldRunExtraVectorPasses>();
10680 } else {
10681 PA.preserveSet<CFGAnalyses>();
10682 }
10683 return PA;
10684}
10685
10686void LoopVectorizePass::printPipeline(
10687 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10688 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10689 OS, MapClassName2PassName);
10690
10691 OS << "<";
10692 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10693 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10694 OS << ">";
10695}

/usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/bits/basic_string.h

1// Components for manipulating sequences of characters -*- C++ -*-
2
3// Copyright (C) 1997-2020 Free Software Foundation, Inc.
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
25/** @file bits/basic_string.h
26 * This is an internal header file, included by other library headers.
27 * Do not attempt to use it directly. @headername{string}
28 */
29
30//
31// ISO C++ 14882: 21 Strings library
32//
33
34#ifndef _BASIC_STRING_H
35#define _BASIC_STRING_H 1
36
37#pragma GCC system_header
38
39#include <ext/atomicity.h>
40#include <ext/alloc_traits.h>
41#include <debug/debug.h>
42
43#if __cplusplus >= 201103L
44#include <initializer_list>
45#endif
46
47#if __cplusplus >= 201703L
48# include <string_view>
49#endif
50
51
52namespace std _GLIBCXX_VISIBILITY(default)
53{
54_GLIBCXX_BEGIN_NAMESPACE_VERSION
55
56#if _GLIBCXX_USE_CXX11_ABI
57_GLIBCXX_BEGIN_NAMESPACE_CXX11
58 /**
59 * @class basic_string basic_string.h <string>
60 * @brief Managing sequences of characters and character-like objects.
61 *
62 * @ingroup strings
63 * @ingroup sequences
64 *
65 * @tparam _CharT Type of character
66 * @tparam _Traits Traits for character type, defaults to
67 * char_traits<_CharT>.
68 * @tparam _Alloc Allocator type, defaults to allocator<_CharT>.
69 *
70 * Meets the requirements of a <a href="tables.html#65">container</a>, a
71 * <a href="tables.html#66">reversible container</a>, and a
72 * <a href="tables.html#67">sequence</a>. Of the
73 * <a href="tables.html#68">optional sequence requirements</a>, only
74 * @c push_back, @c at, and @c %array access are supported.
75 */
76 template<typename _CharT, typename _Traits, typename _Alloc>
77 class basic_string
78 {
79 typedef typename __gnu_cxx::__alloc_traits<_Alloc>::template
80 rebind<_CharT>::other _Char_alloc_type;
81 typedef __gnu_cxx::__alloc_traits<_Char_alloc_type> _Alloc_traits;
82
83 // Types:
84 public:
85 typedef _Traits traits_type;
86 typedef typename _Traits::char_type value_type;
87 typedef _Char_alloc_type allocator_type;
88 typedef typename _Alloc_traits::size_type size_type;
89 typedef typename _Alloc_traits::difference_type difference_type;
90 typedef typename _Alloc_traits::reference reference;
91 typedef typename _Alloc_traits::const_reference const_reference;
92 typedef typename _Alloc_traits::pointer pointer;
93 typedef typename _Alloc_traits::const_pointer const_pointer;
94 typedef __gnu_cxx::__normal_iterator<pointer, basic_string> iterator;
95 typedef __gnu_cxx::__normal_iterator<const_pointer, basic_string>
96 const_iterator;
97 typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
98 typedef std::reverse_iterator<iterator> reverse_iterator;
99
100 /// Value returned by various member functions when they fail.
101 static const size_type npos = static_cast<size_type>(-1);
102
103 protected:
104 // type used for positions in insert, erase etc.
105#if __cplusplus < 201103L
106 typedef iterator __const_iterator;
107#else
108 typedef const_iterator __const_iterator;
109#endif
110
111 private:
112#if __cplusplus >= 201703L
113 // A helper type for avoiding boiler-plate.
114 typedef basic_string_view<_CharT, _Traits> __sv_type;
115
116 template<typename _Tp, typename _Res>
117 using _If_sv = enable_if_t<
118 __and_<is_convertible<const _Tp&, __sv_type>,
119 __not_<is_convertible<const _Tp*, const basic_string*>>,
120 __not_<is_convertible<const _Tp&, const _CharT*>>>::value,
121 _Res>;
122
123 // Allows an implicit conversion to __sv_type.
124 static __sv_type
125 _S_to_string_view(__sv_type __svt) noexcept
126 { return __svt; }
127
128 // Wraps a string_view by explicit conversion and thus
129 // allows adding an internal constructor that does not
130 // participate in overload resolution when a string_view
131 // is provided.
132 struct __sv_wrapper
133 {
134 explicit __sv_wrapper(__sv_type __sv) noexcept : _M_sv(__sv) { }
135 __sv_type _M_sv;
136 };
137
138 /**
139 * @brief Only internally used: Construct string from a string view
140 * wrapper.
141 * @param __svw string view wrapper.
142 * @param __a Allocator to use.
143 */
144 explicit
145 basic_string(__sv_wrapper __svw, const _Alloc& __a)
146 : basic_string(__svw._M_sv.data(), __svw._M_sv.size(), __a) { }
147#endif
148
149 // Use empty-base optimization: http://www.cantrip.org/emptyopt.html
150 struct _Alloc_hider : allocator_type // TODO check __is_final
151 {
152#if __cplusplus < 201103L
153 _Alloc_hider(pointer __dat, const _Alloc& __a = _Alloc())
154 : allocator_type(__a), _M_p(__dat) { }
155#else
156 _Alloc_hider(pointer __dat, const _Alloc& __a)
157 : allocator_type(__a), _M_p(__dat) { }
158
159 _Alloc_hider(pointer __dat, _Alloc&& __a = _Alloc())
160 : allocator_type(std::move(__a)), _M_p(__dat) { }
161#endif
162
163 pointer _M_p; // The actual data.
164 };
165
166 _Alloc_hider _M_dataplus;
167 size_type _M_string_length;
168
169 enum { _S_local_capacity = 15 / sizeof(_CharT) };
170
171 union
172 {
173 _CharT _M_local_buf[_S_local_capacity + 1];
174 size_type _M_allocated_capacity;
175 };
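
A minimal sketch of what the union above means in practice for std::string (assuming _CharT = char, so _S_local_capacity = 15 / sizeof(char) = 15):

  #include <cassert>
  #include <string>

  int main() {
    std::string s("123456789012345"); // 15 chars fit in _M_local_buf plus '\0'
    assert(s.capacity() >= 15);       // no heap allocation yet
    s += '!';                         // 16th character forces an allocation;
                                      // _M_allocated_capacity becomes the
                                      // active union member
    assert(s.size() == 16);
    return 0;
  }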
176
177 void
178 _M_data(pointer __p)
179 { _M_dataplus._M_p = __p; }
180
181 void
182 _M_length(size_type __length)
183 { _M_string_length = __length; }
184
185 pointer
186 _M_data() const
187 { return _M_dataplus._M_p; }
188
189 pointer
190 _M_local_data()
191 {
192#if __cplusplus >= 201103L
193 return std::pointer_traits<pointer>::pointer_to(*_M_local_buf);
194#else
195 return pointer(_M_local_buf);
196#endif
197 }
198
199 const_pointer
200 _M_local_data() const
201 {
202#if __cplusplus >= 201103L
203 return std::pointer_traits<const_pointer>::pointer_to(*_M_local_buf);
204#else
205 return const_pointer(_M_local_buf);
206#endif
207 }
208
209 void
210 _M_capacity(size_type __capacity)
211 { _M_allocated_capacity = __capacity; }
212
213 void
214 _M_set_length(size_type __n)
215 {
216 _M_length(__n);
217 traits_type::assign(_M_data()[__n], _CharT());
218 }
219
220 bool
221 _M_is_local() const
222 { return _M_data() == _M_local_data(); }
223
224 // Create & Destroy
225 pointer
226 _M_create(size_type&, size_type);
227
228 void
229 _M_dispose()
230 {
231 if (!_M_is_local())
232 _M_destroy(_M_allocated_capacity);
233 }
234
235 void
236 _M_destroy(size_type __size) throw()
237 { _Alloc_traits::deallocate(_M_get_allocator(), _M_data(), __size + 1); }
238
239 // _M_construct_aux is used to implement the 21.3.1 para 15 which
240 // requires special behaviour if _InIterator is an integral type
241 template<typename _InIterator>
242 void
243 _M_construct_aux(_InIterator __beg, _InIterator __end,
244 std::__false_type)
245 {
246 typedef typename iterator_traits<_InIterator>::iterator_category _Tag;
247 _M_construct(__beg, __end, _Tag());
248 }
249
250 // _GLIBCXX_RESOLVE_LIB_DEFECTS
251 // 438. Ambiguity in the "do the right thing" clause
252 template<typename _Integer>
253 void
254 _M_construct_aux(_Integer __beg, _Integer __end, std::__true_type)
255 { _M_construct_aux_2(static_cast<size_type>(__beg), __end); }
256
257 void
258 _M_construct_aux_2(size_type __req, _CharT __c)
259 { _M_construct(__req, __c); }
260
261 template<typename _InIterator>
262 void
263 _M_construct(_InIterator __beg, _InIterator __end)
264 {
265 typedef typename std::__is_integer<_InIterator>::__type _Integral;
266 _M_construct_aux(__beg, __end, _Integral());
267 }
268
269 // For Input Iterators, used in istreambuf_iterators, etc.
270 template<typename _InIterator>
271 void
272 _M_construct(_InIterator __beg, _InIterator __end,
273 std::input_iterator_tag);
274
275 // For forward_iterators up to random_access_iterators, used for
276 // string::iterator, _CharT*, etc.
277 template<typename _FwdIterator>
278 void
279 _M_construct(_FwdIterator __beg, _FwdIterator __end,
280 std::forward_iterator_tag);
281
282 void
283 _M_construct(size_type __req, _CharT __c);
284
285 allocator_type&
286 _M_get_allocator()
287 { return _M_dataplus; }
288
289 const allocator_type&
290 _M_get_allocator() const
291 { return _M_dataplus; }
292
293 private:
294
295#ifdef _GLIBCXX_DISAMBIGUATE_REPLACE_INST
296 // The explicit instantiations in misc-inst.cc require this due to
297 // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64063
298 template<typename _Tp, bool _Requires =
299 !__are_same<_Tp, _CharT*>::__value
300 && !__are_same<_Tp, const _CharT*>::__value
301 && !__are_same<_Tp, iterator>::__value
302 && !__are_same<_Tp, const_iterator>::__value>
303 struct __enable_if_not_native_iterator
304 { typedef basic_string& __type; };
305 template<typename _Tp>
306 struct __enable_if_not_native_iterator<_Tp, false> { };
307#endif
308
309 size_type
310 _M_check(size_type __pos, const char* __s) const
311 {
312 if (__pos > this->size())
313 __throw_out_of_range_fmt(__N("%s: __pos (which is %zu) > "
314 "this->size() (which is %zu)"),
315 __s, __pos, this->size());
316 return __pos;
317 }
318
319 void
320 _M_check_length(size_type __n1, size_type __n2, const char* __s) const
321 {
322 if (this->max_size() - (this->size() - __n1) < __n2)
323 __throw_length_error(__N(__s));
324 }
325
326
327 // NB: _M_limit doesn't check for a bad __pos value.
328 size_type
329 _M_limit(size_type __pos, size_type __off) const _GLIBCXX_NOEXCEPT
330 {
331 const bool __testoff = __off < this->size() - __pos;
332 return __testoff ? __off : this->size() - __pos;
333 }
334
335 // True if _Rep and source do not overlap.
336 bool
337 _M_disjunct(const _CharT* __s) const _GLIBCXX_NOEXCEPT
338 {
339 return (less<const _CharT*>()(__s, _M_data())
340 || less<const _CharT*>()(_M_data() + this->size(), __s));
341 }
342
343 // When __n = 1 this is much faster than the general multichar
344 // traits_type::copy/move/assign.
345 static void
346 _S_copy(_CharT* __d, const _CharT* __s, size_type __n)
347 {
348 if (__n == 1)
349 traits_type::assign(*__d, *__s);
350 else
351 traits_type::copy(__d, __s, __n);
352 }
353
354 static void
355 _S_move(_CharT* __d, const _CharT* __s, size_type __n)
356 {
357 if (__n == 1)
358 traits_type::assign(*__d, *__s);
359 else
360 traits_type::move(__d, __s, __n);
361 }
362
363 static void
364 _S_assign(_CharT* __d, size_type __n, _CharT __c)
365 {
366 if (__n == 1)
367 traits_type::assign(*__d, __c);
368 else
369 traits_type::assign(__d, __n, __c);
370 }
371
372 // _S_copy_chars is a separate template to permit specialization
373 // to optimize for the common case of pointers as iterators.
374 template<class _Iterator>
375 static void
376 _S_copy_chars(_CharT* __p, _Iterator __k1, _Iterator __k2)
377 {
378 for (; __k1 != __k2; ++__k1, (void)++__p)
379 traits_type::assign(*__p, *__k1); // These types are off.
380 }
381
382 static void
383 _S_copy_chars(_CharT* __p, iterator __k1, iterator __k2) _GLIBCXX_NOEXCEPT
384 { _S_copy_chars(__p, __k1.base(), __k2.base()); }
385
386 static void
387 _S_copy_chars(_CharT* __p, const_iterator __k1, const_iterator __k2)
388 _GLIBCXX_NOEXCEPT
389 { _S_copy_chars(__p, __k1.base(), __k2.base()); }
390
391 static void
392 _S_copy_chars(_CharT* __p, _CharT* __k1, _CharT* __k2) _GLIBCXX_NOEXCEPT
393 { _S_copy(__p, __k1, __k2 - __k1); }
394
395 static void
396 _S_copy_chars(_CharT* __p, const _CharT* __k1, const _CharT* __k2)
397 _GLIBCXX_NOEXCEPT
398 { _S_copy(__p, __k1, __k2 - __k1); }
399
400 static int
401 _S_compare(size_type __n1, size_type __n2) _GLIBCXX_NOEXCEPT
402 {
403 const difference_type __d = difference_type(__n1 - __n2);
404
405 if (__d > __gnu_cxx::__numeric_traits<int>::__max)
406 return __gnu_cxx::__numeric_traits<int>::__max;
407 else if (__d < __gnu_cxx::__numeric_traits<int>::__min)
408 return __gnu_cxx::__numeric_traits<int>::__min;
409 else
410 return int(__d);
411 }
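
Worked example of the clamping above (values assumed): the subtraction happens in the unsigned size_type and is then converted, so small differences come out exact, e.g. __n1 = 5, __n2 = 3 gives __d = 2 and returns 2, while __n1 = 3, __n2 = 5 gives __d = -2 and returns -2. The clamps only matter for huge strings: comparing lengths 3221225472 (3 GiB) and 0 on a 64-bit target gives __d larger than INT_MAX, so __numeric_traits<int>::__max is returned instead.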
412
413 void
414 _M_assign(const basic_string&);
415
416 void
417 _M_mutate(size_type __pos, size_type __len1, const _CharT* __s,
418 size_type __len2);
419
420 void
421 _M_erase(size_type __pos, size_type __n);
422
423 public:
424 // Construct/copy/destroy:
425 // NB: We overload ctors in some cases instead of using default
426 // arguments, per 17.4.4.4 para. 2 item 2.
427
428 /**
429 * @brief Default constructor creates an empty string.
430 */
431 basic_string()
432 _GLIBCXX_NOEXCEPT_IF(is_nothrow_default_constructible<_Alloc>::value)
433 : _M_dataplus(_M_local_data())
434 { _M_set_length(0); }
435
436 /**
437 * @brief Construct an empty string using allocator @a a.
438 */
439 explicit
440 basic_string(const _Alloc& __a) _GLIBCXX_NOEXCEPT
441 : _M_dataplus(_M_local_data(), __a)
442 { _M_set_length(0); }
443
444 /**
445 * @brief Construct string with copy of value of @a __str.
446 * @param __str Source string.
447 */
448 basic_string(const basic_string& __str)
449 : _M_dataplus(_M_local_data(),
450 _Alloc_traits::_S_select_on_copy(__str._M_get_allocator()))
451 { _M_construct(__str._M_data(), __str._M_data() + __str.length()); }
452
453 // _GLIBCXX_RESOLVE_LIB_DEFECTS
454 // 2583. no way to supply an allocator for basic_string(str, pos)
455 /**
456 * @brief Construct string as copy of a substring.
457 * @param __str Source string.
458 * @param __pos Index of first character to copy from.
459 * @param __a Allocator to use.
460 */
461 basic_string(const basic_string& __str, size_type __pos,
462 const _Alloc& __a = _Alloc())
463 : _M_dataplus(_M_local_data(), __a)
464 {
465 const _CharT* __start = __str._M_data()
466 + __str._M_check(__pos, "basic_string::basic_string");
467 _M_construct(__start, __start + __str._M_limit(__pos, npos));
468 }
469
470 /**
471 * @brief Construct string as copy of a substring.
472 * @param __str Source string.
473 * @param __pos Index of first character to copy from.
474 * @param __n Number of characters to copy.
475 */
476 basic_string(const basic_string& __str, size_type __pos,
477 size_type __n)
478 : _M_dataplus(_M_local_data())
479 {
480 const _CharT* __start = __str._M_data()
481 + __str._M_check(__pos, "basic_string::basic_string");
482 _M_construct(__start, __start + __str._M_limit(__pos, __n));
483 }
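
A short usage sketch of the substring constructors above, showing how _M_check and _M_limit interact (example values assumed):

  #include <stdexcept>
  #include <string>

  int main() {
    std::string s = "abcdef";          // size() == 6
    std::string t(s, 4, 10);           // _M_limit caps __n at 2: t == "ef"
    try {
      std::string u(s, 7);             // __pos > size(): _M_check throws
    } catch (const std::out_of_range&) {
      // "basic_string::basic_string: __pos (which is 7) > this->size()
      // (which is 6)", per the format string in _M_check above
    }
    return 0;
  }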
484
485 /**
486 * @brief Construct string as copy of a substring.
487 * @param __str Source string.
488 * @param __pos Index of first character to copy from.
489 * @param __n Number of characters to copy.
490 * @param __a Allocator to use.
491 */
492 basic_string(const basic_string& __str, size_type __pos,
493 size_type __n, const _Alloc& __a)
494 : _M_dataplus(_M_local_data(), __a)
495 {
496 const _CharT* __start
497 = __str._M_data() + __str._M_check(__pos, "string::string");
498 _M_construct(__start, __start + __str._M_limit(__pos, __n));
499 }
500
501 /**
502 * @brief Construct string initialized by a character %array.
503 * @param __s Source character %array.
504 * @param __n Number of characters to copy.
505 * @param __a Allocator to use (default is default allocator).
506 *
507 * NB: @a __s must have at least @a __n characters, &apos;\\0&apos;
508 * has no special meaning.
509 */
510 basic_string(const _CharT* __s, size_type __n,
511 const _Alloc& __a = _Alloc())
512 : _M_dataplus(_M_local_data(), __a)
513 { _M_construct(__s, __s + __n); }
514
515 /**
516 * @brief Construct string as copy of a C string.
517 * @param __s Source C string.
518 * @param __a Allocator to use (default is default allocator).
519 */
520#if __cpp_deduction_guides && ! defined _GLIBCXX_DEFINING_STRING_INSTANTIATIONS
521 // _GLIBCXX_RESOLVE_LIB_DEFECTS
522 // 3076. basic_string CTAD ambiguity
523 template<typename = _RequireAllocator<_Alloc>>
524#endif
525 basic_string(const _CharT* __s, const _Alloc& __a = _Alloc())
526 : _M_dataplus(_M_local_data(), __a)
527 { _M_construct(__s, __s ? __s + traits_type::length(__s) : __s+npos); }
528
529 /**
530 * @brief Construct string as multiple characters.
531 * @param __n Number of characters.
532 * @param __c Character to use.
533 * @param __a Allocator to use (default is default allocator).
534 */
535#if __cpp_deduction_guides && ! defined _GLIBCXX_DEFINING_STRING_INSTANTIATIONS
536 // _GLIBCXX_RESOLVE_LIB_DEFECTS
537 // 3076. basic_string CTAD ambiguity
538 template<typename = _RequireAllocator<_Alloc>>
539#endif
540 basic_string(size_type __n, _CharT __c, const _Alloc& __a = _Alloc())
541 : _M_dataplus(_M_local_data(), __a)
542 { _M_construct(__n, __c); }
543
544#if __cplusplus >= 201103L
545 /**
546 * @brief Move construct string.
547 * @param __str Source string.
548 *
549 * The newly-created string contains the exact contents of @a __str.
550 * @a __str is a valid, but unspecified string.
551 **/
552 basic_string(basic_string&& __str) noexcept
553 : _M_dataplus(_M_local_data(), std::move(__str._M_get_allocator()))
554 {
555 if (__str._M_is_local())
556 {
557 traits_type::copy(_M_local_buf, __str._M_local_buf,
558 _S_local_capacity + 1);
559 }
560 else
561 {
562 _M_data(__str._M_data());
563 _M_capacity(__str._M_allocated_capacity);
564 }
565
566 // Must use _M_length() here, not _M_set_length(), because
567 // basic_stringbuf relies on writing into unallocated capacity, so
568 // we would mess up its contents if we put a '\0' in the string.
569 _M_length(__str.length());
570 __str._M_data(__str._M_local_data());
571 __str._M_set_length(0);
572 }
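
Usage sketch for the move constructor above (a long, heap-allocated source takes the else branch, so its buffer is stolen rather than copied):

  #include <cassert>
  #include <string>
  #include <utility>

  int main() {
    std::string a(100, 'x');       // too big for _M_local_buf: heap-allocated
    const char* p = a.data();
    std::string b(std::move(a));   // pointer and capacity move, no copy
    assert(b.data() == p);         // b owns a's old buffer
    assert(a.empty());             // a was reset to its local buffer, length 0
    return 0;
  }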
573
574 /**
575 * @brief Construct string from an initializer %list.
576 * @param __l std::initializer_list of characters.
577 * @param __a Allocator to use (default is default allocator).
578 */
579 basic_string(initializer_list<_CharT> __l, const _Alloc& __a = _Alloc())
580 : _M_dataplus(_M_local_data(), __a)
581 { _M_construct(__l.begin(), __l.end()); }
582
583 basic_string(const basic_string& __str, const _Alloc& __a)
584 : _M_dataplus(_M_local_data(), __a)
585 { _M_construct(__str.begin(), __str.end()); }
586
587 basic_string(basic_string&& __str, const _Alloc& __a)
588 noexcept(_Alloc_traits::_S_always_equal())
589 : _M_dataplus(_M_local_data(), __a)
590 {
591 if (__str._M_is_local())
592 {
593 traits_type::copy(_M_local_buf, __str._M_local_buf,
594 _S_local_capacity + 1);
595 _M_length(__str.length());
596 __str._M_set_length(0);
597 }
598 else if (_Alloc_traits::_S_always_equal()
599 || __str.get_allocator() == __a)
600 {
601 _M_data(__str._M_data());
602 _M_length(__str.length());
603 _M_capacity(__str._M_allocated_capacity);
604 __str._M_data(__str._M_local_buf);
605 __str._M_set_length(0);
606 }
607 else
608 _M_construct(__str.begin(), __str.end());
609 }
610
611#endif // C++11
612
613 /**
614 * @brief Construct string as copy of a range.
615 * @param __beg Start of range.
616 * @param __end End of range.
617 * @param __a Allocator to use (default is default allocator).
618 */
619#if __cplusplus >= 201103L
620 template<typename _InputIterator,
621 typename = std::_RequireInputIter<_InputIterator>>
622#else
623 template<typename _InputIterator>
624#endif
625 basic_string(_InputIterator __beg, _InputIterator __end,
626 const _Alloc& __a = _Alloc())
627 : _M_dataplus(_M_local_data(), __a)
628 { _M_construct(__beg, __end); }
629
630#if __cplusplus >= 201703L
631 /**
632 * @brief Construct string from a substring of a string_view.
633 * @param __t Source object convertible to string view.
634 * @param __pos The index of the first character to copy from __t.
635 * @param __n The number of characters to copy from __t.
636 * @param __a Allocator to use.
637 */
638 template<typename _Tp, typename = _If_sv<_Tp, void>>
639 basic_string(const _Tp& __t, size_type __pos, size_type __n,
640 const _Alloc& __a = _Alloc())
641 : basic_string(_S_to_string_view(__t).substr(__pos, __n), __a) { }
642
643 /**
644 * @brief Construct string from a string_view.
645 * @param __t Source object convertible to string view.
646 * @param __a Allocator to use (default is default allocator).
647 */
648 template<typename _Tp, typename = _If_sv<_Tp, void>>
649 explicit
650 basic_string(const _Tp& __t, const _Alloc& __a = _Alloc())
651 : basic_string(__sv_wrapper(_S_to_string_view(__t)), __a) { }
652#endif // C++17
653
654 /**
655 * @brief Destroy the string instance.
656 */
657 ~basic_string()
658 { _M_dispose(); }
659
660 /**
661 * @brief Assign the value of @a str to this string.
662 * @param __str Source string.
663 */
664 basic_string&
665 operator=(const basic_string& __str)
666 {
667 return this->assign(__str);
668 }
669
670 /**
671 * @brief Copy contents of @a s into this string.
672 * @param __s Source null-terminated string.
673 */
674 basic_string&
675 operator=(const _CharT* __s)
676 { return this->assign(__s); }
677
678 /**
679 * @brief Set value to string of length 1.
680 * @param __c Source character.
681 *
682 * Assigning to a character makes this string length 1 and
683 * (*this)[0] == @a c.
684 */
685 basic_string&
686 operator=(_CharT __c)
687 {
688 this->assign(1, __c);
689 return *this;
690 }
691
692#if __cplusplus >= 201103L
693 /**
694 * @brief Move assign the value of @a str to this string.
695 * @param __str Source string.
696 *
697 * The contents of @a str are moved into this string (without copying).
698 * @a str is a valid, but unspecified string.
699 **/
700 // _GLIBCXX_RESOLVE_LIB_DEFECTS
701 // 2063. Contradictory requirements for string move assignment
702 basic_string&
703 operator=(basic_string&& __str)
704 noexcept(_Alloc_traits::_S_nothrow_move())
705 {
706 if (!_M_is_local() && _Alloc_traits::_S_propagate_on_move_assign()
707 && !_Alloc_traits::_S_always_equal()
708 && _M_get_allocator() != __str._M_get_allocator())
709 {
710 // Destroy existing storage before replacing allocator.
711 _M_destroy(_M_allocated_capacity);
712 _M_data(_M_local_data());
713 _M_set_length(0);
714 }
715 // Replace allocator if POCMA is true.
716 std::__alloc_on_move(_M_get_allocator(), __str._M_get_allocator());
717
718 if (__str._M_is_local())
719 {
720 // We've always got room for a short string, just copy it.
721 if (__str.size())
722 this->_S_copy(_M_data(), __str._M_data(), __str.size());
723 _M_set_length(__str.size());
724 }
725 else if (_Alloc_traits::_S_propagate_on_move_assign()
726 || _Alloc_traits::_S_always_equal()
727 || _M_get_allocator() == __str._M_get_allocator())
728 {
729 // Just move the allocated pointer, our allocator can free it.
730 pointer __data = nullptr;
731 size_type __capacity;
732 if (!_M_is_local())
733 {
734 if (_Alloc_traits::_S_always_equal())
735 {
736 // __str can reuse our existing storage.
737 __data = _M_data();
738 __capacity = _M_allocated_capacity;
739 }
740 else // __str can't use it, so free it.
741 _M_destroy(_M_allocated_capacity);
742 }
743
744 _M_data(__str._M_data());
745 _M_length(__str.length());
746 _M_capacity(__str._M_allocated_capacity);
747 if (__data)
748 {
749 __str._M_data(__data);
750 __str._M_capacity(__capacity);
751 }
752 else
753 __str._M_data(__str._M_local_buf);
754 }
755 else // Need to do a deep copy
756 assign(__str);
757 __str.clear();
758 return *this;
759 }
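
And a sketch of the common move-assignment path above (std::allocator, so _S_always_equal() holds and the allocated pointers are simply exchanged):

  #include <cassert>
  #include <string>
  #include <utility>

  int main() {
    std::string src(64, 'a');   // heap-allocated
    std::string dst(64, 'b');   // heap-allocated
    dst = std::move(src);       // dst takes src's buffer; src may reuse dst's
                                // old buffer, then __str.clear() empties it
    assert(dst.size() == 64 && dst[0] == 'a');
    assert(src.empty());
    return 0;
  }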
760
761 /**
762 * @brief Set value to string constructed from initializer %list.
763 * @param __l std::initializer_list.
764 */
765 basic_string&
766 operator=(initializer_list<_CharT> __l)
767 {
768 this->assign(__l.begin(), __l.size());
769 return *this;
770 }
771#endif // C++11
772
773#if __cplusplus >= 201703L
774 /**
775 * @brief Set value to string constructed from a string_view.
776 * @param __svt An object convertible to string_view.
777 */
778 template<typename _Tp>
779 _If_sv<_Tp, basic_string&>
780 operator=(const _Tp& __svt)
781 { return this->assign(__svt); }
782
783 /**
784 * @brief Convert to a string_view.
785 * @return A string_view.
786 */
787 operator __sv_type() const noexcept
788 { return __sv_type(data(), size()); }
789#endif // C++17
790
791 // Iterators:
792 /**
793 * Returns a read/write iterator that points to the first character in
794 * the %string.
795 */
796 iterator
797 begin() _GLIBCXX_NOEXCEPT
798 { return iterator(_M_data()); }
799
800 /**
801 * Returns a read-only (constant) iterator that points to the first
802 * character in the %string.
803 */
804 const_iterator
805 begin() const _GLIBCXX_NOEXCEPT
806 { return const_iterator(_M_data()); }
807
808 /**
809 * Returns a read/write iterator that points one past the last
810 * character in the %string.
811 */
812 iterator
813 end() _GLIBCXX_NOEXCEPT
814 { return iterator(_M_data() + this->size()); }
815
816 /**
817 * Returns a read-only (constant) iterator that points one past the
818 * last character in the %string.
819 */
820 const_iterator
821 end() const _GLIBCXX_NOEXCEPT
822 { return const_iterator(_M_data() + this->size()); }
823
824 /**
825 * Returns a read/write reverse iterator that points to the last
826 * character in the %string. Iteration is done in reverse element
827 * order.
828 */
829 reverse_iterator
830 rbegin() _GLIBCXX_NOEXCEPT
831 { return reverse_iterator(this->end()); }
832
833 /**
834 * Returns a read-only (constant) reverse iterator that points
835 * to the last character in the %string. Iteration is done in
836 * reverse element order.
837 */
838 const_reverse_iterator
839 rbegin() const _GLIBCXX_NOEXCEPT
840 { return const_reverse_iterator(this->end()); }
841
842 /**
843 * Returns a read/write reverse iterator that points to one before the
844 * first character in the %string. Iteration is done in reverse
845 * element order.
846 */
847 reverse_iterator
848 rend() _GLIBCXX_NOEXCEPT
849 { return reverse_iterator(this->begin()); }
850
851 /**
852 * Returns a read-only (constant) reverse iterator that points
853 * to one before the first character in the %string. Iteration
854 * is done in reverse element order.
855 */
856 const_reverse_iterator
857 rend() const _GLIBCXX_NOEXCEPT
858 { return const_reverse_iterator(this->begin()); }
859
860#if __cplusplus >= 201103L
861 /**
862 * Returns a read-only (constant) iterator that points to the first
863 * character in the %string.
864 */
865 const_iterator
866 cbegin() const noexcept
867 { return const_iterator(this->_M_data()); }
868
869 /**
870 * Returns a read-only (constant) iterator that points one past the
871 * last character in the %string.
872 */
873 const_iterator
874 cend() const noexcept
875 { return const_iterator(this->_M_data() + this->size()); }
876
877 /**
878 * Returns a read-only (constant) reverse iterator that points
879 * to the last character in the %string. Iteration is done in
880 * reverse element order.
881 */
882 const_reverse_iterator
883 crbegin() const noexcept
884 { return const_reverse_iterator(this->end()); }
885
886 /**
887 * Returns a read-only (constant) reverse iterator that points
888 * to one before the first character in the %string. Iteration
889 * is done in reverse element order.
890 */
891 const_reverse_iterator
892 crend() const noexcept
893 { return const_reverse_iterator(this->begin()); }
894#endif
895
896 public:
897 // Capacity:
898 /// Returns the number of characters in the string, not including any
899 /// null-termination.
900 size_type
901 size() const _GLIBCXX_NOEXCEPT
902 { return _M_string_length; }
903
904 /// Returns the number of characters in the string, not including any
905 /// null-termination.
906 size_type
907 length() const _GLIBCXX_NOEXCEPT
908 { return _M_string_length; }
909
910 /// Returns the size() of the largest possible %string.
911 size_type
912 max_size() const _GLIBCXX_NOEXCEPT
913 { return (_Alloc_traits::max_size(_M_get_allocator()) - 1) / 2; }
914
915 /**
916 * @brief Resizes the %string to the specified number of characters.
917 * @param __n Number of characters the %string should contain.
918 * @param __c Character to fill any new elements.
919 *
920 * This function will %resize the %string to the specified
921 * number of characters. If the number is smaller than the
922 * %string's current size the %string is truncated, otherwise
923 * the %string is extended and new elements are %set to @a __c.
924 */
925 void
926 resize(size_type __n, _CharT __c);
927
928 /**
929 * @brief Resizes the %string to the specified number of characters.
930 * @param __n Number of characters the %string should contain.
931 *
932 * This function will resize the %string to the specified length. If
933 * the new size is smaller than the %string's current size the %string
934 * is truncated, otherwise the %string is extended and new characters
935 * are default-constructed. For basic types such as char, this means
936 * setting them to 0.
937 */
938 void
939 resize(size_type __n)
940 { this->resize(__n, _CharT()); }
941
942#if __cplusplus >= 201103L
943 /// A non-binding request to reduce capacity() to size().
944 void
945 shrink_to_fit() noexcept
946 {
947#if __cpp_exceptions
948 if (capacity() > size())
949 {
950 try
951 { reserve(0); }
952 catch(...)
953 { }
954 }
955#endif
956 }
957#endif
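
Usage sketch for shrink_to_fit above (implemented as a reserve(0) whose exceptions are swallowed):

  #include <string>

  int main() {
    std::string s(100, 'x');
    s.resize(5);            // size 5, capacity still >= 100
    s.shrink_to_fit();      // non-binding request; here it calls reserve(0)
                            // and ignores any exception
    // capacity() is now typically back near size() (or the SSO minimum)
    return 0;
  }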
958
959 /**
960 * Returns the total number of characters that the %string can hold
961 * before needing to allocate more memory.
962 */
963 size_type
964 capacity() const _GLIBCXX_NOEXCEPT
965 {
966 return _M_is_local() ? size_type(_S_local_capacity)
967 : _M_allocated_capacity;
968 }
969
970 /**
971 * @brief Attempt to preallocate enough memory for specified number of
972 * characters.
973 * @param __res_arg Number of characters required.
974 * @throw std::length_error If @a __res_arg exceeds @c max_size().
975 *
976 * This function attempts to reserve enough memory for the
977 * %string to hold the specified number of characters. If the
978 * number requested is more than max_size(), length_error is
979 * thrown.
980 *
981 * The advantage of this function is that if optimal code is a
982 * necessity and the user can determine the string length that will be
983 * required, the user can reserve the memory in %advance, and thus
984 * prevent a possible reallocation of memory and copying of %string
985 * data.
986 */
987 void
988 reserve(size_type __res_arg = 0);
989
990 /**
991 * Erases the string, making it empty.
992 */
993 void
994 clear() _GLIBCXX_NOEXCEPT
995 { _M_set_length(0); }
996
997 /**
998 * Returns true if the %string is empty. Equivalent to
999 * <code>*this == ""</code>.
1000 */
1001 _GLIBCXX_NODISCARD bool
1002 empty() const _GLIBCXX_NOEXCEPT
1003 { return this->size() == 0; }
1004
1005 // Element access:
1006 /**
1007 * @brief Subscript access to the data contained in the %string.
1008 * @param __pos The index of the character to access.
1009 * @return Read-only (constant) reference to the character.
1010 *
1011 * This operator allows for easy, array-style, data access.
1012 * Note that data access with this operator is unchecked and
1013 * out_of_range lookups are not defined. (For checked lookups
1014 * see at().)
1015 */
1016 const_reference
1017 operator[] (size_type __pos) const _GLIBCXX_NOEXCEPT
1018 {
1019 __glibcxx_assert(__pos <= size());
1020 return _M_data()[__pos];
1021 }
1022
1023 /**
1024 * @brief Subscript access to the data contained in the %string.
1025 * @param __pos The index of the character to access.
1026 * @return Read/write reference to the character.
1027 *
1028 * This operator allows for easy, array-style, data access.
1029 * Note that data access with this operator is unchecked and
1030 * out_of_range lookups are not defined. (For checked lookups
1031 * see at().)
1032 */
1033 reference
1034 operator[](size_type __pos)
1035 {
1036 // Allow pos == size() both in C++98 mode, as v3 extension,
1037 // and in C++11 mode.
1038 __glibcxx_assert(__pos <= size());
1039 // In pedantic mode be strict in C++98 mode.
1040 _GLIBCXX_DEBUG_PEDASSERT(__cplusplus >= 201103L || __pos < size());
1041 return _M_data()[__pos];
1042 }
1043
1044 /**
1045 * @brief Provides access to the data contained in the %string.
1046 * @param __n The index of the character to access.
1047 * @return Read-only (const) reference to the character.
1048 * @throw std::out_of_range If @a n is an invalid index.
1049 *
1050 * This function provides for safer data access. The parameter is
1051 * first checked that it is in the range of the string. The function
1052 * throws out_of_range if the check fails.
1053 */
1054 const_reference
1055 at(size_type __n) const
1056 {
1057 if (__n >= this->size())
1058 __throw_out_of_range_fmt(__N("basic_string::at: __n "
1059 "(which is %zu) >= this->size() "
1060 "(which is %zu)"),
1061 __n, this->size());
1062 return _M_data()[__n];
1063 }
1064
1065 /**
1066 * @brief Provides access to the data contained in the %string.
1067 * @param __n The index of the character to access.
1068 * @return Read/write reference to the character.
1069 * @throw std::out_of_range If @a n is an invalid index.
1070 *
1071 * This function provides for safer data access. The parameter is
1072 * first checked that it is in the range of the string. The function
1073 * throws out_of_range if the check fails.
1074 */
1075 reference
1076 at(size_type __n)
1077 {
1078 if (__n >= size())
1079 __throw_out_of_range_fmt(__N("basic_string::at: __n "
1080 "(which is %zu) >= this->size() "
1081 "(which is %zu)"),
1082 __n, this->size());
1083 return _M_data()[__n];
1084 }
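
A small example of the checked access above (operator[] with the same out-of-range index would be undefined behaviour):

  #include <stdexcept>
  #include <string>

  int main() {
    std::string s = "abc";
    char c = s.at(1);        // 'b': passes the range check
    try {
      s.at(3);               // __n == size(): out of range, throws
    } catch (const std::out_of_range&) {
      // message per the format string above, e.g.
      // "basic_string::at: __n (which is 3) >= this->size() (which is 3)"
    }
    (void)c;
    return 0;
  }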
1085
1086#if __cplusplus >= 201103L
1087 /**
1088 * Returns a read/write reference to the data at the first
1089 * element of the %string.
1090 */
1091 reference
1092 front() noexcept
1093 {
1094 __glibcxx_assert(!empty());
1095 return operator[](0);
1096 }
1097
1098 /**
1099 * Returns a read-only (constant) reference to the data at the first
1100 * element of the %string.
1101 */
1102 const_reference
1103 front() const noexcept
1104 {
1105 __glibcxx_assert(!empty());
1106 return operator[](0);
1107 }
1108
1109 /**
1110 * Returns a read/write reference to the data at the last
1111 * element of the %string.
1112 */
1113 reference
1114 back() noexcept
1115 {
1116 __glibcxx_assert(!empty());
1117 return operator[](this->size() - 1);
1118 }
1119
1120 /**
1121 * Returns a read-only (constant) reference to the data at the
1122 * last element of the %string.
1123 */
1124 const_reference
1125 back() const noexcept
1126 {
1127 __glibcxx_assert(!empty());
1128 return operator[](this->size() - 1);
1129 }
1130#endif
1131
1132 // Modifiers:
1133 /**
1134 * @brief Append a string to this string.
1135 * @param __str The string to append.
1136 * @return Reference to this string.
1137 */
1138 basic_string&
1139 operator+=(const basic_string& __str)
1140 { return this->append(__str); }
1141
1142 /**
1143 * @brief Append a C string.
1144 * @param __s The C string to append.
1145 * @return Reference to this string.
1146 */
1147 basic_string&
1148 operator+=(const _CharT* __s)
1149 { return this->append(__s); }
1150
1151 /**
1152 * @brief Append a character.
1153 * @param __c The character to append.
1154 * @return Reference to this string.
1155 */
1156 basic_string&
1157 operator+=(_CharT __c)
1158 {
1159 this->push_back(__c);
1160 return *this;
1161 }
1162
1163#if __cplusplus >= 201103L
1164 /**
1165 * @brief Append an initializer_list of characters.
1166 * @param __l The initializer_list of characters to be appended.
1167 * @return Reference to this string.
1168 */
1169 basic_string&
1170 operator+=(initializer_list<_CharT> __l)
1171 { return this->append(__l.begin(), __l.size()); }
1172#endif // C++11
1173
1174#if __cplusplus >= 201703L
1175 /**
1176 * @brief Append a string_view.
1177 * @param __svt An object convertible to string_view to be appended.
1178 * @return Reference to this string.
1179 */
1180 template<typename _Tp>
1181 _If_sv<_Tp, basic_string&>
1182 operator+=(const _Tp& __svt)
1183 { return this->append(__svt); }
1184#endif // C++17
1185
1186 /**
1187 * @brief Append a string to this string.
1188 * @param __str The string to append.
1189 * @return Reference to this string.
1190 */
1191 basic_string&
1192 append(const basic_string& __str)
1193 { return _M_append(__str._M_data(), __str.size()); }
1194
1195 /**
1196 * @brief Append a substring.
1197 * @param __str The string to append.
1198 * @param __pos Index of the first character of str to append.
1199 * @param __n The number of characters to append.
1200 * @return Reference to this string.
1201 * @throw std::out_of_range if @a __pos is not a valid index.
1202 *
1203 * This function appends @a __n characters from @a __str
1204 * starting at @a __pos to this string. If @a __n is larger
1205 * than the number of available characters in @a __str, the
1206 * remainder of @a __str is appended.
1207 */
1208 basic_string&
1209 append(const basic_string& __str, size_type __pos, size_type __n = npos)
1210 { return _M_append(__str._M_data()
1211 + __str._M_check(__pos, "basic_string::append"),
1212 __str._M_limit(__pos, __n)); }
1213
1214 /**
1215 * @brief Append a C substring.
1216 * @param __s The C string to append.
1217 * @param __n The number of characters to append.
1218 * @return Reference to this string.
1219 */
1220 basic_string&
1221 append(const _CharT* __s, size_type __n)
1222 {
1223 __glibcxx_requires_string_len(__s, __n);
1224 _M_check_length(size_type(0), __n, "basic_string::append");
1225 return _M_append(__s, __n);
1226 }
1227
1228 /**
1229 * @brief Append a C string.
1230 * @param __s The C string to append.
1231 * @return Reference to this string.
1232 */
1233 basic_string&
1234 append(const _CharT* __s)
1235 {
1236 __glibcxx_requires_string(__s);
1237 const size_type __n = traits_type::length(__s);
1238 _M_check_length(size_type(0), __n, "basic_string::append");
1239 return _M_append(__s, __n);
1240 }
1241
1242 /**
1243 * @brief Append multiple characters.
1244 * @param __n The number of characters to append.
1245 * @param __c The character to use.
1246 * @return Reference to this string.
1247 *
1248 * Appends __n copies of __c to this string.
1249 */
1250 basic_string&
1251 append(size_type __n, _CharT __c)
1252 { return _M_replace_aux(this->size(), size_type(0), __n, __c); }
1253
1254#if __cplusplus >= 201103L
1255 /**
1256 * @brief Append an initializer_list of characters.
1257 * @param __l The initializer_list of characters to append.
1258 * @return Reference to this string.
1259 */
1260 basic_string&
1261 append(initializer_list<_CharT> __l)
1262 { return this->append(__l.begin(), __l.size()); }
1263#endif // C++11
1264
1265 /**
1266 * @brief Append a range of characters.
1267 * @param __first Iterator referencing the first character to append.
1268 * @param __last Iterator marking the end of the range.
1269 * @return Reference to this string.
1270 *
1271 * Appends characters in the range [__first,__last) to this string.
1272 */
1273#if __cplusplus >= 201103L
1274 template<class _InputIterator,
1275 typename = std::_RequireInputIter<_InputIterator>>
1276#else
1277 template<class _InputIterator>
1278#endif
1279 basic_string&
1280 append(_InputIterator __first, _InputIterator __last)
1281 { return this->replace(end(), end(), __first, __last); }
1282
1283#if __cplusplus >= 201703L
1284 /**
1285 * @brief Append a string_view.
1286 * @param __svt An object convertible to string_view to be appended.
1287 * @return Reference to this string.
1288 */
1289 template<typename _Tp>
1290 _If_sv<_Tp, basic_string&>
1291 append(const _Tp& __svt)
1292 {
1293 __sv_type __sv = __svt;
1294 return this->append(__sv.data(), __sv.size());
1295 }
1296
1297 /**
1298 * @brief Append a range of characters from a string_view.
1299 * @param __svt An object convertible to string_view to be appended from.
1300 * @param __pos The position in the string_view to append from.
1301 * @param __n The number of characters to append from the string_view.
1302 * @return Reference to this string.
1303 */
1304 template<typename _Tp>
1305 _If_sv<_Tp, basic_string&>
1306 append(const _Tp& __svt, size_type __pos, size_type __n = npos)
1307 {
1308 __sv_type __sv = __svt;
1309 return _M_append(__sv.data()
1310 + std::__sv_check(__sv.size(), __pos, "basic_string::append"),
1311 std::__sv_limit(__sv.size(), __pos, __n));
1312 }
1313#endif // C++17
1314
1315 /**
1316 * @brief Append a single character.
1317 * @param __c Character to append.
1318 */
1319 void
1320 push_back(_CharT __c)
1321 {
1322 const size_type __size = this->size();
1323 if (__size + 1 > this->capacity())
1324 this->_M_mutate(__size, size_type(0), 0, size_type(1));
1325 traits_type::assign(this->_M_data()[__size], __c);
1326 this->_M_set_length(__size + 1);
1327 }
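
Sketch of the growth behaviour implied by push_back above: _M_mutate reallocates only when size() + 1 would exceed capacity(), so appending is amortized constant time.

  #include <string>

  int main() {
    std::string s;
    for (int i = 0; i < 1000; ++i) {
      std::string::size_type old_cap = s.capacity();
      s.push_back('x');
      // capacity changes only on the few iterations where _M_mutate actually
      // reallocated; most iterations just store the char and bump the length
      // via _M_set_length
      (void)old_cap;
    }
    return 0;
  }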
1328
1329 /**
1330 * @brief Set value to contents of another string.
1331 * @param __str Source string to use.
1332 * @return Reference to this string.
1333 */
1334 basic_string&
1335 assign(const basic_string& __str)
1336 {
1337#if __cplusplus >= 201103L
1338 if (_Alloc_traits::_S_propagate_on_copy_assign())
1339 {
1340 if (!_Alloc_traits::_S_always_equal() && !_M_is_local()
1341 && _M_get_allocator() != __str._M_get_allocator())
1342 {
1343 // Propagating allocator cannot free existing storage so must
1344 // deallocate it before replacing current allocator.
1345 if (__str.size() <= _S_local_capacity)
1346 {
1347 _M_destroy(_M_allocated_capacity);
1348 _M_data(_M_local_data());
1349 _M_set_length(0);
1350 }
1351 else
1352 {
1353 const auto __len = __str.size();
1354 auto __alloc = __str._M_get_allocator();
1355 // If this allocation throws there are no effects:
1356 auto __ptr = _Alloc_traits::allocate(__alloc, __len + 1);
1357 _M_destroy(_M_allocated_capacity);
1358 _M_data(__ptr);
1359 _M_capacity(__len);
1360 _M_set_length(__len);
1361 }
1362 }
1363 std::__alloc_on_copy(_M_get_allocator(), __str._M_get_allocator());
1364 }
1365#endif
1366 this->_M_assign(__str);
1367 return *this;
1368 }
1369
1370#if __cplusplus >= 201103L
1371 /**
1372 * @brief Set value to contents of another string.
1373 * @param __str Source string to use.
1374 * @return Reference to this string.
1375 *
1376 * This function sets this string to the exact contents of @a __str.
1377 * @a __str is a valid, but unspecified string.
1378 */
1379 basic_string&
1380 assign(basic_string&& __str)
1381 noexcept(_Alloc_traits::_S_nothrow_move())
1382 {
1383 // _GLIBCXX_RESOLVE_LIB_DEFECTS
1384 // 2063. Contradictory requirements for string move assignment
1385 return *this = std::move(__str);
1386 }
1387#endif // C++11
1388
1389 /**
1390 * @brief Set value to a substring of a string.
1391 * @param __str The string to use.
1392 * @param __pos Index of the first character of str.
1393 * @param __n Number of characters to use.
1394 * @return Reference to this string.
1395 * @throw std::out_of_range if @a pos is not a valid index.
1396 *
1397 * This function sets this string to the substring of @a __str
1398 * consisting of @a __n characters at @a __pos. If @a __n is
1399 * larger than the number of available characters in @a
1400 * __str, the remainder of @a __str is used.
1401 */
1402 basic_string&
1403 assign(const basic_string& __str, size_type __pos, size_type __n = npos)
1404 { return _M_replace(size_type(0), this->size(), __str._M_data()
1405 + __str._M_check(__pos, "basic_string::assign"),
1406 __str._M_limit(__pos, __n)); }
1407
1408 /**
1409 * @brief Set value to a C substring.
1410 * @param __s The C string to use.
1411 * @param __n Number of characters to use.
1412 * @return Reference to this string.
1413 *
1414 * This function sets the value of this string to the first @a __n
1415 * characters of @a __s. If @a __n is larger than the number of
1416 * available characters in @a __s, the remainder of @a __s is used.
1417 */
1418 basic_string&
1419 assign(const _CharT* __s, size_type __n)
1420 {
1421 __glibcxx_requires_string_len(__s, __n);
1422 return _M_replace(size_type(0), this->size(), __s, __n);
1423 }
1424
1425 /**
1426 * @brief Set value to contents of a C string.
1427 * @param __s The C string to use.
1428 * @return Reference to this string.
1429 *
1430 * This function sets the value of this string to the value of @a __s.
1431 * The data is copied, so there is no dependence on @a __s once the
1432 * function returns.
1433 */
1434 basic_string&
1435 assign(const _CharT* __s)
1436 {
1437 __glibcxx_requires_string(__s);
1438 return _M_replace(size_type(0), this->size(), __s,
1439 traits_type::length(__s));
1440 }
1441
1442 /**
1443 * @brief Set value to multiple characters.
1444 * @param __n Length of the resulting string.
1445 * @param __c The character to use.
1446 * @return Reference to this string.
1447 *
1448 * This function sets the value of this string to @a __n copies of
1449 * character @a __c.
1450 */
1451 basic_string&
1452 assign(size_type __n, _CharT __c)
1453 { return _M_replace_aux(size_type(0), this->size(), __n, __c); }
1454
1455 /**
1456 * @brief Set value to a range of characters.
1457 * @param __first Iterator referencing the first character to append.
1458 * @param __last Iterator marking the end of the range.
1459 * @return Reference to this string.
1460 *
1461 * Sets value of string to characters in the range [__first,__last).
1462 */
1463#if __cplusplus >= 201103L
1464 template<class _InputIterator,
1465 typename = std::_RequireInputIter<_InputIterator>>
1466#else
1467 template<class _InputIterator>
1468#endif
1469 basic_string&
1470 assign(_InputIterator __first, _InputIterator __last)
1471 { return this->replace(begin(), end(), __first, __last); }
1472
1473#if __cplusplus >= 201103L
1474 /**
1475 * @brief Set value to an initializer_list of characters.
1476 * @param __l The initializer_list of characters to assign.
1477 * @return Reference to this string.
1478 */
1479 basic_string&
1480 assign(initializer_list<_CharT> __l)
1481 { return this->assign(__l.begin(), __l.size()); }
1482#endif // C++11
1483
1484#if __cplusplus >= 201703L
1485 /**
1486 * @brief Set value from a string_view.
1487 * @param __svt The source object convertible to string_view.
1488 * @return Reference to this string.
1489 */
1490 template<typename _Tp>
1491 _If_sv<_Tp, basic_string&>
1492 assign(const _Tp& __svt)
1493 {
1494 __sv_type __sv = __svt;
1495 return this->assign(__sv.data(), __sv.size());
1496 }
1497
1498 /**
1499 * @brief Set value from a range of characters in a string_view.
1500 * @param __svt The source object convertible to string_view.
1501 * @param __pos The position in the string_view to assign from.
1502 * @param __n The number of characters to assign.
1503 * @return Reference to this string.
1504 */
1505 template<typename _Tp>
1506 _If_sv<_Tp, basic_string&>
1507 assign(const _Tp& __svt, size_type __pos, size_type __n = npos)
1508 {
1509 __sv_type __sv = __svt;
1510 return _M_replace(size_type(0), this->size(),
1511 __sv.data()
1512 + std::__sv_check(__sv.size(), __pos, "basic_string::assign"),
1513 std::__sv_limit(__sv.size(), __pos, __n));
1514 }
1515#endif // C++17
1516
1517#if __cplusplus >= 201103L
1518 /**
1519 * @brief Insert multiple characters.
1520 * @param __p Const_iterator referencing location in string to
1521 * insert at.
1522 * @param __n Number of characters to insert
1523 * @param __c The character to insert.
1524 * @return Iterator referencing the first inserted char.
1525 * @throw std::length_error If new length exceeds @c max_size().
1526 *
1527 * Inserts @a __n copies of character @a __c starting at the
1528 * position referenced by iterator @a __p. If adding
1529 * characters causes the length to exceed max_size(),
1530 * length_error is thrown. The value of the string doesn't
1531 * change if an error is thrown.
1532 */
1533 iterator
1534 insert(const_iterator __p, size_type __n, _CharT __c)
1535 {
1536 _GLIBCXX_DEBUG_PEDASSERT(__p >= begin() && __p <= end());
1537 const size_type __pos = __p - begin();
1538 this->replace(__p, __p, __n, __c);
1539 return iterator(this->_M_data() + __pos);
1540 }
1541#else
1542 /**
1543 * @brief Insert multiple characters.
1544 * @param __p Iterator referencing location in string to insert at.
1545 * @param __n Number of characters to insert
1546 * @param __c The character to insert.
1547 * @throw std::length_error If new length exceeds @c max_size().
1548 *
1549 * Inserts @a __n copies of character @a __c starting at the
1550 * position referenced by iterator @a __p. If adding
1551 * characters causes the length to exceed max_size(),
1552 * length_error is thrown. The value of the string doesn't
1553 * change if an error is thrown.
1554 */
1555 void
1556 insert(iterator __p, size_type __n, _CharT __c)
1557 { this->replace(__p, __p, __n, __c); }
1558#endif
1559
1560#if __cplusplus >= 201103L
1561 /**
1562 * @brief Insert a range of characters.
1563 * @param __p Const_iterator referencing location in string to
1564 * insert at.
1565 * @param __beg Start of range.
1566 * @param __end End of range.
1567 * @return Iterator referencing the first inserted char.
1568 * @throw std::length_error If new length exceeds @c max_size().
1569 *
1570 * Inserts characters in range [beg,end). If adding characters
1571 * causes the length to exceed max_size(), length_error is
1572 * thrown. The value of the string doesn't change if an error
1573 * is thrown.
1574 */
1575 template<class _InputIterator,
1576 typename = std::_RequireInputIter<_InputIterator>>
1577 iterator
1578 insert(const_iterator __p, _InputIterator __beg, _InputIterator __end)
1579 {
1580 _GLIBCXX_DEBUG_PEDASSERT(__p >= begin() && __p <= end());
1581 const size_type __pos = __p - begin();
1582 this->replace(__p, __p, __beg, __end);
1583 return iterator(this->_M_data() + __pos);
1584 }
1585#else
1586 /**
1587 * @brief Insert a range of characters.
1588 * @param __p Iterator referencing location in string to insert at.
1589 * @param __beg Start of range.
1590 * @param __end End of range.
1591 * @throw std::length_error If new length exceeds @c max_size().
1592 *
1593 * Inserts characters in range [__beg,__end). If adding
1594 * characters causes the length to exceed max_size(),
1595 * length_error is thrown. The value of the string doesn't
1596 * change if an error is thrown.
1597 */
1598 template<class _InputIterator>
1599 void
1600 insert(iterator __p, _InputIterator __beg, _InputIterator __end)
1601 { this->replace(__p, __p, __beg, __end); }
1602#endif
1603
1604#if __cplusplus >= 201103L
1605 /**
1606 * @brief Insert an initializer_list of characters.
1607 * @param __p Iterator referencing location in string to insert at.
1608 * @param __l The initializer_list of characters to insert.
1609 * @throw std::length_error If new length exceeds @c max_size().
1610 */
1611 iterator
1612 insert(const_iterator __p, initializer_list<_CharT> __l)
1613 { return this->insert(__p, __l.begin(), __l.end()); }
1614
1615#ifdef _GLIBCXX_DEFINING_STRING_INSTANTIATIONS
1616 // See PR libstdc++/83328
1617 void
1618 insert(iterator __p, initializer_list<_CharT> __l)
1619 {
1620 _GLIBCXX_DEBUG_PEDASSERT(__p >= begin() && __p <= end());
1621 this->insert(__p - begin(), __l.begin(), __l.size());
1622 }
1623#endif
1624#endif // C++11
1625
1626 /**
1627 * @brief Insert value of a string.
1628 * @param __pos1 Position in string to insert at.
1629 * @param __str The string to insert.
1630 * @return Reference to this string.
1631 * @throw std::length_error If new length exceeds @c max_size().
1632 *
1633 * Inserts value of @a __str starting at @a __pos1. If adding
1634 * characters causes the length to exceed max_size(),
1635 * length_error is thrown. The value of the string doesn't
1636 * change if an error is thrown.
1637 */
1638 basic_string&
1639 insert(size_type __pos1, const basic_string& __str)
1640 { return this->replace(__pos1, size_type(0),
1641 __str._M_data(), __str.size()); }
1642
1643 /**
1644 * @brief Insert a substring.
1645 * @param __pos1 Position in string to insert at.
1646 * @param __str The string to insert.
1647 * @param __pos2 Start of characters in str to insert.
1648 * @param __n Number of characters to insert.
1649 * @return Reference to this string.
1650 * @throw std::length_error If new length exceeds @c max_size().
1651 * @throw std::out_of_range If @a pos1 > size() or
1652 * @a __pos2 > @a str.size().
1653 *
1654 * Starting at @a __pos1, insert @a __n characters of @a __str
1655 * beginning with @a __pos2. If adding characters causes the
1656 * length to exceed max_size(), length_error is thrown. If @a
1657 * __pos1 is beyond the end of this string or @a __pos2 is
1658 * beyond the end of @a __str, out_of_range is thrown. The
1659 * value of the string doesn't change if an error is thrown.
1660 */
1661 basic_string&
1662 insert(size_type __pos1, const basic_string& __str,
1663 size_type __pos2, size_type __n = npos)
1664 { return this->replace(__pos1, size_type(0), __str._M_data()
1665 + __str._M_check(__pos2, "basic_string::insert"),
1666 __str._M_limit(__pos2, __n)); }
1667
1668 /**
1669 * @brief Insert a C substring.
1670 * @param __pos Position in string to insert at.
1671 * @param __s The C string to insert.
1672 * @param __n The number of characters to insert.
1673 * @return Reference to this string.
1674 * @throw std::length_error If new length exceeds @c max_size().
1675 * @throw std::out_of_range If @a __pos is beyond the end of this
1676 * string.
1677 *
1678 * Inserts the first @a __n characters of @a __s starting at @a
1679 * __pos. If adding characters causes the length to exceed
1680 * max_size(), length_error is thrown. If @a __pos is beyond
1681 * end(), out_of_range is thrown. The value of the string
1682 * doesn't change if an error is thrown.
1683 */
1684 basic_string&
1685 insert(size_type __pos, const _CharT* __s, size_type __n)
1686 { return this->replace(__pos, size_type(0), __s, __n); }
1687
1688 /**
1689 * @brief Insert a C string.
1690 * @param __pos Position in string to insert at.
1691 * @param __s The C string to insert.
1692 * @return Reference to this string.
1693 * @throw std::length_error If new length exceeds @c max_size().
1694 * @throw std::out_of_range If @a pos is beyond the end of this
1695 * string.
1696 *
1697 * Inserts the characters of @a __s starting at @a __pos. If
1698 * adding characters causes the length to exceed max_size(),
1699 * length_error is thrown. If @a __pos is beyond end(), out_of_range is
1700 * thrown. The value of the string doesn't change if an error is
1701 * thrown.
1702 */
1703 basic_string&
1704 insert(size_type __pos, const _CharT* __s)
1705 {
1706 __glibcxx_requires_string(__s);
1707 return this->replace(__pos, size_type(0), __s,
1708 traits_type::length(__s));
1709 }
1710
1711 /**
1712 * @brief Insert multiple characters.
1713 * @param __pos Index in string to insert at.
1714 * @param __n Number of characters to insert
1715 * @param __c The character to insert.
1716 * @return Reference to this string.
1717 * @throw std::length_error If new length exceeds @c max_size().
1718 * @throw std::out_of_range If @a __pos is beyond the end of this
1719 * string.
1720 *
1721 * Inserts @a __n copies of character @a __c starting at index
1722 * @a __pos. If adding characters causes the length to exceed
1723 * max_size(), length_error is thrown. If @a __pos > length(),
1724 * out_of_range is thrown. The value of the string doesn't
1725 * change if an error is thrown.
1726 */
1727 basic_string&
1728 insert(size_type __pos, size_type __n, _CharT __c)
1729 { return _M_replace_aux(_M_check(__pos, "basic_string::insert"),
1730 size_type(0), __n, __c); }
1731
1732 /**
1733 * @brief Insert one character.
1734 * @param __p Iterator referencing position in string to insert at.
1735 * @param __c The character to insert.
1736 * @return Iterator referencing newly inserted char.
1737 * @throw std::length_error If new length exceeds @c max_size().
1738 *
1739 * Inserts character @a __c at position referenced by @a __p.
1740 * If adding character causes the length to exceed max_size(),
1741 * length_error is thrown. If @a __p is beyond end of string,
1742 * out_of_range is thrown. The value of the string doesn't
1743 * change if an error is thrown.
1744 */
1745 iterator
1746 insert(__const_iterator __p, _CharT __c)
1747 {
1748 _GLIBCXX_DEBUG_PEDASSERT(__p >= begin() && __p <= end());
1749 const size_type __pos = __p - begin();
1750 _M_replace_aux(__pos, size_type(0), size_type(1), __c);
1751 return iterator(_M_data() + __pos);
1752 }
1753
1754#if __cplusplus >= 201703L
1755 /**
1756 * @brief Insert a string_view.
1757 * @param __pos Position in string to insert at.
1758 * @param __svt The object convertible to string_view to insert.
1759 * @return Reference to this string.
1760 */
1761 template<typename _Tp>
1762 _If_sv<_Tp, basic_string&>
1763 insert(size_type __pos, const _Tp& __svt)
1764 {
1765 __sv_type __sv = __svt;
1766 return this->insert(__pos, __sv.data(), __sv.size());
1767 }
1768
1769 /**
1770 * @brief Insert a string_view.
1771 * @param __pos1 Position in string to insert at.
1772 * @param __svt The object convertible to string_view to insert from.
1773 * @param __pos2 Start of characters in str to insert.
1774 * @param __n The number of characters to insert.
1775 * @return Reference to this string.
1776 */
1777 template<typename _Tp>
1778 _If_sv<_Tp, basic_string&>
1779 insert(size_type __pos1, const _Tp& __svt,
1780 size_type __pos2, size_type __n = npos)
1781 {
1782 __sv_type __sv = __svt;
1783 return this->replace(__pos1, size_type(0),
1784 __sv.data()
1785 + std::__sv_check(__sv.size(), __pos2, "basic_string::insert"),
1786 std::__sv_limit(__sv.size(), __pos2, __n));
1787 }
1788#endif // C++17
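      // [Editor's note] Usage sketch for the insert() overloads above; a
      // hedged example, not part of the original header, assuming the
      // common std::string instantiation:
      //   std::string s("abef");
      //   s.insert(2, "cd");           // s == "abcdef"
      //   s.insert(0, 2, '-');         // s == "--abcdef"
      //   s.insert(s.begin(), '>');    // s == ">--abcdef"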
1789
1790 /**
1791 * @brief Remove characters.
1792 * @param __pos Index of first character to remove (default 0).
1793 * @param __n Number of characters to remove (default remainder).
1794 * @return Reference to this string.
1795 * @throw std::out_of_range If @a __pos is beyond the end of this
1796 * string.
1797 *
1798 * Removes @a __n characters from this string starting at @a
1799 * __pos. The length of the string is reduced by @a __n. If
1800 * there are < @a __n characters to remove, the remainder of
1801 * the string is truncated. If @a __pos is beyond end of string,
1802 * out_of_range is thrown. The value of the string doesn't
1803 * change if an error is thrown.
1804 */
1805 basic_string&
1806 erase(size_type __pos = 0, size_type __n = npos)
1807 {
1808 _M_check(__pos, "basic_string::erase");
1809 if (__n == npos)
1810 this->_M_set_length(__pos);
1811 else if (__n != 0)
1812 this->_M_erase(__pos, _M_limit(__pos, __n));
1813 return *this;
1814 }
1815
1816 /**
1817 * @brief Remove one character.
1818 * @param __position Iterator referencing the character to remove.
1819 * @return iterator referencing same location after removal.
1820 *
1821 * Removes the character at @a __position from this string. The value
1822 * of the string doesn't change if an error is thrown.
1823 */
1824 iterator
1825 erase(__const_iterator __position)
1826 {
1827 _GLIBCXX_DEBUG_PEDASSERT(__position >= begin()
1828 && __position < end());
1829 const size_type __pos = __position - begin();
1830 this->_M_erase(__pos, size_type(1));
1831 return iterator(_M_data() + __pos);
1832 }
1833
1834 /**
1835 * @brief Remove a range of characters.
1836 * @param __first Iterator referencing the first character to remove.
1837 * @param __last Iterator referencing the end of the range.
1838 * @return Iterator referencing location of first after removal.
1839 *
1840 * Removes the characters in the range [first,last) from this string.
1841 * The value of the string doesn't change if an error is thrown.
1842 */
1843 iterator
1844 erase(__const_iterator __first, __const_iterator __last)
1845 {
1846 _GLIBCXX_DEBUG_PEDASSERT(__first >= begin() && __first <= __last
1847 && __last <= end());
1848 const size_type __pos = __first - begin();
1849 if (__last == end())
1850 this->_M_set_length(__pos);
1851 else
1852 this->_M_erase(__pos, __last - __first);
1853 return iterator(this->_M_data() + __pos);
1854 }
1855
1856#if __cplusplus >= 201103L
1857 /**
1858 * @brief Remove the last character.
1859 *
1860 * The string must be non-empty.
1861 */
1862 void
1863 pop_back() noexcept
1864 {
1865 __glibcxx_assert(!empty());
1866 _M_erase(size() - 1, 1);
1867 }
1868#endif // C++11
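      // [Editor's note] Usage sketch for erase() and pop_back(); a hedged
      // example, not part of the original header:
      //   std::string s("abcdef");
      //   s.erase(1, 2);               // removes "bc"; s == "adef"
      //   s.erase(s.begin());          // removes 'a'; s == "def"
      //   s.pop_back();                // C++11; s == "de"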
1869
1870 /**
1871 * @brief Replace characters with value from another string.
1872 * @param __pos Index of first character to replace.
1873 * @param __n Number of characters to be replaced.
1874 * @param __str String to insert.
1875 * @return Reference to this string.
1876 * @throw std::out_of_range If @a __pos is beyond the end of this
1877 * string.
1878 * @throw std::length_error If new length exceeds @c max_size().
1879 *
1880 * Removes the characters in the range [__pos,__pos+__n) from
1881 * this string. In place, the value of @a __str is inserted.
1882 * If @a __pos is beyond end of string, out_of_range is thrown.
1883 * If the length of the result exceeds max_size(), length_error
1884 * is thrown. The value of the string doesn't change if an
1885 * error is thrown.
1886 */
1887 basic_string&
1888 replace(size_type __pos, size_type __n, const basic_string& __str)
1889 { return this->replace(__pos, __n, __str._M_data(), __str.size()); }
1890
1891 /**
1892 * @brief Replace characters with value from another string.
1893 * @param __pos1 Index of first character to replace.
1894 * @param __n1 Number of characters to be replaced.
1895 * @param __str String to insert.
1896 * @param __pos2 Index of first character of str to use.
1897 * @param __n2 Number of characters from str to use.
1898 * @return Reference to this string.
1899 * @throw std::out_of_range If @a __pos1 > size() or @a __pos2 >
1900 * __str.size().
1901 * @throw std::length_error If new length exceeds @c max_size().
1902 *
1903 * Removes the characters in the range [__pos1,__pos1 + __n1) from this
1904 * string. In place, the value of @a __str is inserted. If @a __pos1 is
1905 * beyond end of string, out_of_range is thrown. If the length of the
1906 * result exceeds max_size(), length_error is thrown. The value of the
1907 * string doesn't change if an error is thrown.
1908 */
1909 basic_string&
1910 replace(size_type __pos1, size_type __n1, const basic_string& __str,
1911 size_type __pos2, size_type __n2 = npos)
1912 { return this->replace(__pos1, __n1, __str._M_data()
1913 + __str._M_check(__pos2, "basic_string::replace"),
1914 __str._M_limit(__pos2, __n2)); }
1915
1916 /**
1917 * @brief Replace characters with value of a C substring.
1918 * @param __pos Index of first character to replace.
1919 * @param __n1 Number of characters to be replaced.
1920 * @param __s C string to insert.
1921 * @param __n2 Number of characters from @a __s to use.
1922 * @return Reference to this string.
1923 * @throw std::out_of_range If @a __pos > size().
1924 * @throw std::length_error If new length exceeds @c max_size().
1925 *
1926 * Removes the characters in the range [__pos,__pos + __n1)
1927 * from this string. In place, the first @a __n2 characters of
1928 * @a __s are inserted, or all of @a __s if @a __n2 is too large. If
1929 * @a __pos is beyond end of string, out_of_range is thrown. If
1930 * the length of result exceeds max_size(), length_error is
1931 * thrown. The value of the string doesn't change if an error
1932 * is thrown.
1933 */
1934 basic_string&
1935 replace(size_type __pos, size_type __n1, const _CharT* __s,
1936 size_type __n2)
1937 {
1938 __glibcxx_requires_string_len(__s, __n2);
1939 return _M_replace(_M_check(__pos, "basic_string::replace"),
1940 _M_limit(__pos, __n1), __s, __n2);
1941 }
1942
1943 /**
1944 * @brief Replace characters with value of a C string.
1945 * @param __pos Index of first character to replace.
1946 * @param __n1 Number of characters to be replaced.
1947 * @param __s C string to insert.
1948 * @return Reference to this string.
1949 * @throw std::out_of_range If @a __pos > size().
1950 * @throw std::length_error If new length exceeds @c max_size().
1951 *
1952 * Removes the characters in the range [__pos,__pos + __n1)
1953 * from this string. In place, the characters of @a __s are
1954 * inserted. If @a __pos is beyond end of string, out_of_range
1955 * is thrown. If the length of result exceeds max_size(),
1956 * length_error is thrown. The value of the string doesn't
1957 * change if an error is thrown.
1958 */
1959 basic_string&
1960 replace(size_type __pos, size_type __n1, const _CharT* __s)
1961 {
1962 __glibcxx_requires_string(__s);
1963 return this->replace(__pos, __n1, __s, traits_type::length(__s));
1964 }
1965
1966 /**
1967 * @brief Replace characters with multiple characters.
1968 * @param __pos Index of first character to replace.
1969 * @param __n1 Number of characters to be replaced.
1970 * @param __n2 Number of characters to insert.
1971 * @param __c Character to insert.
1972 * @return Reference to this string.
1973 * @throw std::out_of_range If @a __pos > size().
1974 * @throw std::length_error If new length exceeds @c max_size().
1975 *
1976 * Removes the characters in the range [__pos,__pos + __n1) from this
1977 * string. In place, @a __n2 copies of @a __c are inserted.
1978 * If @a __pos is beyond end of string, out_of_range is thrown.
1979 * If the length of result exceeds max_size(), length_error is
1980 * thrown. The value of the string doesn't change if an error
1981 * is thrown.
1982 */
1983 basic_string&
1984 replace(size_type __pos, size_type __n1, size_type __n2, _CharT __c)
1985 { return _M_replace_aux(_M_check(__pos, "basic_string::replace"),
1986 _M_limit(__pos, __n1), __n2, __c); }
1987
1988 /**
1989 * @brief Replace range of characters with string.
1990 * @param __i1 Iterator referencing start of range to replace.
1991 * @param __i2 Iterator referencing end of range to replace.
1992 * @param __str String value to insert.
1993 * @return Reference to this string.
1994 * @throw std::length_error If new length exceeds @c max_size().
1995 *
1996 * Removes the characters in the range [__i1,__i2). In place,
1997 * the value of @a __str is inserted. If the length of result
1998 * exceeds max_size(), length_error is thrown. The value of
1999 * the string doesn't change if an error is thrown.
2000 */
2001 basic_string&
2002 replace(__const_iterator __i1, __const_iterator __i2,
2003 const basic_string& __str)
2004 { return this->replace(__i1, __i2, __str._M_data(), __str.size()); }
2005
2006 /**
2007 * @brief Replace range of characters with C substring.
2008 * @param __i1 Iterator referencing start of range to replace.
2009 * @param __i2 Iterator referencing end of range to replace.
2010 * @param __s C string value to insert.
2011 * @param __n Number of characters from @a __s to insert.
2012 * @return Reference to this string.
2013 * @throw std::length_error If new length exceeds @c max_size().
2014 *
2015 * Removes the characters in the range [__i1,__i2). In place,
2016 * the first @a __n characters of @a __s are inserted. If the
2017 * length of result exceeds max_size(), length_error is thrown.
2018 * The value of the string doesn't change if an error is
2019 * thrown.
2020 */
2021 basic_string&
2022 replace(__const_iterator __i1, __const_iterator __i2,
2023 const _CharT* __s, size_type __n)
2024 {
2025 _GLIBCXX_DEBUG_PEDASSERT(begin() <= __i1 && __i1 <= __i2
2026 && __i2 <= end());
2027 return this->replace(__i1 - begin(), __i2 - __i1, __s, __n);
2028 }
2029
2030 /**
2031 * @brief Replace range of characters with C string.
2032 * @param __i1 Iterator referencing start of range to replace.
2033 * @param __i2 Iterator referencing end of range to replace.
2034 * @param __s C string value to insert.
2035 * @return Reference to this string.
2036 * @throw std::length_error If new length exceeds @c max_size().
2037 *
2038 * Removes the characters in the range [__i1,__i2). In place,
2039 * the characters of @a __s are inserted. If the length of
2040 * result exceeds max_size(), length_error is thrown. The
2041 * value of the string doesn't change if an error is thrown.
2042 */
2043 basic_string&
2044 replace(__const_iterator __i1, __const_iterator __i2, const _CharT* __s)
2045 {
2046 __glibcxx_requires_string(__s);
2047 return this->replace(__i1, __i2, __s, traits_type::length(__s));
2048 }
2049
2050 /**
2051 * @brief Replace range of characters with multiple characters
2052 * @param __i1 Iterator referencing start of range to replace.
2053 * @param __i2 Iterator referencing end of range to replace.
2054 * @param __n Number of characters to insert.
2055 * @param __c Character to insert.
2056 * @return Reference to this string.
2057 * @throw std::length_error If new length exceeds @c max_size().
2058 *
2059 * Removes the characters in the range [__i1,__i2). In place,
2060 * @a __n copies of @a __c are inserted. If the length of
2061 * result exceeds max_size(), length_error is thrown. The
2062 * value of the string doesn't change if an error is thrown.
2063 */
2064 basic_string&
2065 replace(__const_iterator __i1, __const_iterator __i2, size_type __n,
2066 _CharT __c)
2067 {
2068 _GLIBCXX_DEBUG_PEDASSERT(begin() <= __i1 && __i1 <= __i2
2069 && __i2 <= end());
2070 return _M_replace_aux(__i1 - begin(), __i2 - __i1, __n, __c);
2071 }
2072
2073 /**
2074 * @brief Replace range of characters with range.
2075 * @param __i1 Iterator referencing start of range to replace.
2076 * @param __i2 Iterator referencing end of range to replace.
2077 * @param __k1 Iterator referencing start of range to insert.
2078 * @param __k2 Iterator referencing end of range to insert.
2079 * @return Reference to this string.
2080 * @throw std::length_error If new length exceeds @c max_size().
2081 *
2082 * Removes the characters in the range [__i1,__i2). In place,
2083 * characters in the range [__k1,__k2) are inserted. If the
2084 * length of result exceeds max_size(), length_error is thrown.
2085 * The value of the string doesn't change if an error is
2086 * thrown.
2087 */
2088#if __cplusplus >= 201103L
2089 template<class _InputIterator,
2090 typename = std::_RequireInputIter<_InputIterator>>
2091 basic_string&
2092 replace(const_iterator __i1, const_iterator __i2,
2093 _InputIterator __k1, _InputIterator __k2)
2094 {
2095 _GLIBCXX_DEBUG_PEDASSERT(begin() <= __i1 && __i1 <= __i2
2096 && __i2 <= end());
2097 __glibcxx_requires_valid_range(__k1, __k2);
2098 return this->_M_replace_dispatch(__i1, __i2, __k1, __k2,
2099 std::__false_type());
2100 }
2101#else
2102 template<class _InputIterator>
2103#ifdef _GLIBCXX_DISAMBIGUATE_REPLACE_INST
2104 typename __enable_if_not_native_iterator<_InputIterator>::__type
2105#else
2106 basic_string&
2107#endif
2108 replace(iterator __i1, iterator __i2,
2109 _InputIterator __k1, _InputIterator __k2)
2110 {
2111 _GLIBCXX_DEBUG_PEDASSERT(begin() <= __i1 && __i1 <= __i2
2112 && __i2 <= end());
2113 __glibcxx_requires_valid_range(__k1, __k2);
2114 typedef typename std::__is_integer<_InputIterator>::__type _Integral;
2115 return _M_replace_dispatch(__i1, __i2, __k1, __k2, _Integral());
2116 }
2117#endif
2118
2119 // Specializations for the common case of pointer and iterator:
2120 // useful to avoid the overhead of temporary buffering in _M_replace.
2121 basic_string&
2122 replace(__const_iterator __i1, __const_iterator __i2,
2123 _CharT* __k1, _CharT* __k2)
2124 {
2125 _GLIBCXX_DEBUG_PEDASSERT(begin() <= __i1 && __i1 <= __i2
2126 && __i2 <= end());
2127 __glibcxx_requires_valid_range(__k1, __k2);
2128 return this->replace(__i1 - begin(), __i2 - __i1,
2129 __k1, __k2 - __k1);
2130 }
2131
2132 basic_string&
2133 replace(__const_iterator __i1, __const_iterator __i2,
2134 const _CharT* __k1, const _CharT* __k2)
2135 {
2136 _GLIBCXX_DEBUG_PEDASSERT(begin() <= __i1 && __i1 <= __i2
2137 && __i2 <= end());
2138 __glibcxx_requires_valid_range(__k1, __k2);
2139 return this->replace(__i1 - begin(), __i2 - __i1,
2140 __k1, __k2 - __k1);
2141 }
2142
2143 basic_string&
2144 replace(__const_iterator __i1, __const_iterator __i2,
2145 iterator __k1, iterator __k2)
2146 {
2147 _GLIBCXX_DEBUG_PEDASSERT(begin() <= __i1 && __i1 <= __i2
2148 && __i2 <= end());
2149 __glibcxx_requires_valid_range(__k1, __k2);
2150 return this->replace(__i1 - begin(), __i2 - __i1,
2151 __k1.base(), __k2 - __k1);
2152 }
2153
2154 basic_string&
2155 replace(__const_iterator __i1, __const_iterator __i2,
2156 const_iterator __k1, const_iterator __k2)
2157 {
2158 _GLIBCXX_DEBUG_PEDASSERT(begin() <= __i1 && __i1 <= __i2
2159 && __i2 <= end());
2160 __glibcxx_requires_valid_range(__k1, __k2);
2161 return this->replace(__i1 - begin(), __i2 - __i1,
2162 __k1.base(), __k2 - __k1);
2163 }
2164
2165#if __cplusplus >= 201103L
2166 /**
2167 * @brief Replace range of characters with initializer_list.
2168 * @param __i1 Iterator referencing start of range to replace.
2169 * @param __i2 Iterator referencing end of range to replace.
2170 * @param __l The initializer_list of characters to insert.
2171 * @return Reference to this string.
2172 * @throw std::length_error If new length exceeds @c max_size().
2173 *
2174 * Removes the characters in the range [__i1,__i2). In place,
2175 * the characters of @a __l are inserted. If the
2176 * length of result exceeds max_size(), length_error is thrown.
2177 * The value of the string doesn't change if an error is
2178 * thrown.
2179 */
2180 basic_string& replace(const_iterator __i1, const_iterator __i2,
2181 initializer_list<_CharT> __l)
2182 { return this->replace(__i1, __i2, __l.begin(), __l.size()); }
2183#endif // C++11
2184
2185#if __cplusplus >= 201703L
2186 /**
2187 * @brief Replace range of characters with string_view.
2188 * @param __pos The position to replace at.
2189 * @param __n The number of characters to replace.
2190 * @param __svt The object convertible to string_view to insert.
2191 * @return Reference to this string.
2192 */
2193 template<typename _Tp>
2194 _If_sv<_Tp, basic_string&>
2195 replace(size_type __pos, size_type __n, const _Tp& __svt)
2196 {
2197 __sv_type __sv = __svt;
2198 return this->replace(__pos, __n, __sv.data(), __sv.size());
2199 }
2200
2201 /**
2202 * @brief Replace range of characters with string_view.
2203 * @param __pos1 The position to replace at.
2204 * @param __n1 The number of characters to replace.
2205 * @param __svt The object convertible to string_view to insert from.
2206 * @param __pos2 The position in the string_view to insert from.
2207 * @param __n2 The number of characters to insert.
2208 * @return Reference to this string.
2209 */
2210 template<typename _Tp>
2211 _If_sv<_Tp, basic_string&>
2212 replace(size_type __pos1, size_type __n1, const _Tp& __svt,
2213 size_type __pos2, size_type __n2 = npos)
2214 {
2215 __sv_type __sv = __svt;
2216 return this->replace(__pos1, __n1,
2217 __sv.data()
2218 + std::__sv_check(__sv.size(), __pos2, "basic_string::replace"),
2219 std::__sv_limit(__sv.size(), __pos2, __n2));
2220 }
2221
2222 /**
2223 * @brief Replace range of characters with string_view.
2224 * @param __i1 An iterator referencing the start position
2225 * to replace at.
2226 * @param __i2 An iterator referencing the end position
2227 * for the replace.
2228 * @param __svt The object convertible to string_view to insert from.
2229 * @return Reference to this string.
2230 */
2231 template<typename _Tp>
2232 _If_sv<_Tp, basic_string&>
2233 replace(const_iterator __i1, const_iterator __i2, const _Tp& __svt)
2234 {
2235 __sv_type __sv = __svt;
2236 return this->replace(__i1 - begin(), __i2 - __i1, __sv);
2237 }
2238#endif // C++17
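      // [Editor's note] Usage sketch for the replace() overloads above; a
      // hedged example, not part of the original header:
      //   std::string s("a brown fox");
      //   s.replace(2, 5, "quick");            // s == "a quick fox"
      //   s.replace(s.size() - 3, 3, 2, 'x');  // s == "a quick xx"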
2239
2240 private:
2241 template<class _Integer>
2242 basic_string&
2243 _M_replace_dispatch(const_iterator __i1, const_iterator __i2,
2244 _Integer __n, _Integer __val, __true_type)
2245 { return _M_replace_aux(__i1 - begin(), __i2 - __i1, __n, __val); }
2246
2247 template<class _InputIterator>
2248 basic_string&
2249 _M_replace_dispatch(const_iterator __i1, const_iterator __i2,
2250 _InputIterator __k1, _InputIterator __k2,
2251 __false_type);
2252
2253 basic_string&
2254 _M_replace_aux(size_type __pos1, size_type __n1, size_type __n2,
2255 _CharT __c);
2256
2257 basic_string&
2258 _M_replace(size_type __pos, size_type __len1, const _CharT* __s,
2259 const size_type __len2);
2260
2261 basic_string&
2262 _M_append(const _CharT* __s, size_type __n);
2263
2264 public:
2265
2266 /**
2267 * @brief Copy substring into C string.
2268 * @param __s C string to copy value into.
2269 * @param __n Number of characters to copy.
2270 * @param __pos Index of first character to copy.
2271 * @return Number of characters actually copied.
2272 * @throw std::out_of_range If @a __pos > size().
2273 *
2274 * Copies up to @a __n characters starting at @a __pos into the
2275 * C string @a __s. If @a __pos is %greater than size(),
2276 * out_of_range is thrown.
2277 */
2278 size_type
2279 copy(_CharT* __s, size_type __n, size_type __pos = 0) const;
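      // [Editor's note] Usage sketch for copy(); a hedged example, not part
      // of the original header. Note that copy() does not append a null
      // terminator:
      //   std::string s("abcdef");
      //   char buf[4];
      //   std::string::size_type n = s.copy(buf, 3, 1); // buf: 'b','c','d'; n == 3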
2280
2281 /**
2282 * @brief Swap contents with another string.
2283 * @param __s String to swap with.
2284 *
2285 * Exchanges the contents of this string with that of @a __s in constant
2286 * time.
2287 */
2288 void
2289 swap(basic_string& __s) _GLIBCXX_NOEXCEPT;
2290
2291 // String operations:
2292 /**
2293 * @brief Return const pointer to null-terminated contents.
2294 *
2295 * This is a handle to internal data. Do not modify or dire things may
2296 * happen.
2297 */
2298 const _CharT*
2299 c_str() const _GLIBCXX_NOEXCEPT
2300 { return _M_data(); }
2301
2302 /**
2303 * @brief Return const pointer to contents.
2304 *
2305 * This is a pointer to internal data. It is undefined to modify
2306 * the contents through the returned pointer. To get a pointer that
2307 * allows modifying the contents use @c &str[0] instead,
2308 * (or in C++17 the non-const @c str.data() overload).
2309 */
2310 const _CharT*
2311 data() const _GLIBCXX_NOEXCEPT
2312 { return _M_data(); }
2313
2314#if __cplusplus >= 201703L
2315 /**
2316 * @brief Return non-const pointer to contents.
2317 *
2318 * This is a pointer to the character sequence held by the string.
2319 * Modifying the characters in the sequence is allowed.
2320 */
2321 _CharT*
2322 data() noexcept
2323 { return _M_data(); }
2324#endif
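      // [Editor's note] Usage sketch for c_str()/data(); a hedged example,
      // not part of the original header:
      //   std::string s("abc");
      //   const char* p = s.c_str();   // null-terminated; valid only until
      //                                // s is modified or destroyed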
2325
2326 /**
2327 * @brief Return copy of allocator used to construct this string.
2328 */
2329 allocator_type
2330 get_allocator() const _GLIBCXX_NOEXCEPT
2331 { return _M_get_allocator(); }
2332
2333 /**
2334 * @brief Find position of a C substring.
2335 * @param __s C string to locate.
2336 * @param __pos Index of character to search from.
2337 * @param __n Number of characters from @a __s to search for.
2338 * @return Index of start of first occurrence.
2339 *
2340 * Starting from @a __pos, searches forward for the first @a
2341 * __n characters in @a __s within this string. If found,
2342 * returns the index where it begins. If not found, returns
2343 * npos.
2344 */
2345 size_type
2346 find(const _CharT* __s, size_type __pos, size_type __n) const
2347 _GLIBCXX_NOEXCEPT;
2348
2349 /**
2350 * @brief Find position of a string.
2351 * @param __str String to locate.
2352 * @param __pos Index of character to search from (default 0).
2353 * @return Index of start of first occurrence.
2354 *
2355 * Starting from @a __pos, searches forward for value of @a __str within
2356 * this string. If found, returns the index where it begins. If not
2357 * found, returns npos.
2358 */
2359 size_type
2360 find(const basic_string& __str, size_type __pos = 0) const
2361 _GLIBCXX_NOEXCEPT
2362 { return this->find(__str.data(), __pos, __str.size()); }
2363
2364#if __cplusplus >= 201703L
2365 /**
2366 * @brief Find position of a string_view.
2367 * @param __svt The object convertible to string_view to locate.
2368 * @param __pos Index of character to search from (default 0).
2369 * @return Index of start of first occurrence.
2370 */
2371 template<typename _Tp>
2372 _If_sv<_Tp, size_type>
2373 find(const _Tp& __svt, size_type __pos = 0) const
2374 noexcept(is_same<_Tp, __sv_type>::value)
2375 {
2376 __sv_type __sv = __svt;
2377 return this->find(__sv.data(), __pos, __sv.size());
2378 }
2379#endif // C++17
2380
2381 /**
2382 * @brief Find position of a C string.
2383 * @param __s C string to locate.
2384 * @param __pos Index of character to search from (default 0).
2385 * @return Index of start of first occurrence.
2386 *
2387 * Starting from @a __pos, searches forward for the value of @a
2388 * __s within this string. If found, returns the index where
2389 * it begins. If not found, returns npos.
2390 */
2391 size_type
2392 find(const _CharT* __s, size_type __pos = 0) const _GLIBCXX_NOEXCEPT
2393 {
2394 __glibcxx_requires_string(__s);
2395 return this->find(__s, __pos, traits_type::length(__s));
2396 }
2397
2398 /**
2399 * @brief Find position of a character.
2400 * @param __c Character to locate.
2401 * @param __pos Index of character to search from (default 0).
2402 * @return Index of first occurrence.
2403 *
2404 * Starting from @a __pos, searches forward for @a __c within
2405 * this string. If found, returns the index where it was
2406 * found. If not found, returns npos.
2407 */
2408 size_type
2409 find(_CharT __c, size_type __pos = 0) const _GLIBCXX_NOEXCEPT;
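      // [Editor's note] Usage sketch for the find() overloads above; a
      // hedged example, not part of the original header:
      //   std::string s("ababab");
      //   s.find("ab");                // == 0
      //   s.find("ab", 1);             // == 2
      //   s.find('z');                 // == std::string::npos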
2410
2411 /**
2412 * @brief Find last position of a string.
2413 * @param __str String to locate.
2414 * @param __pos Index of character to search back from (default end).
2415 * @return Index of start of last occurrence.
2416 *
2417 * Starting from @a __pos, searches backward for value of @a
2418 * __str within this string. If found, returns the index where
2419 * it begins. If not found, returns npos.
2420 */
2421 size_type
2422 rfind(const basic_string& __str, size_type __pos = npos) const
2423 _GLIBCXX_NOEXCEPT
2424 { return this->rfind(__str.data(), __pos, __str.size()); }
2425
2426#if __cplusplus >= 201703L
2427 /**
2428 * @brief Find last position of a string_view.
2429 * @param __svt The object convertible to string_view to locate.
2430 * @param __pos Index of character to search back from (default end).
2431 * @return Index of start of last occurrence.
2432 */
2433 template<typename _Tp>
2434 _If_sv<_Tp, size_type>
2435 rfind(const _Tp& __svt, size_type __pos = npos) const
2436 noexcept(is_same<_Tp, __sv_type>::value)
2437 {
2438 __sv_type __sv = __svt;
2439 return this->rfind(__sv.data(), __pos, __sv.size());
2440 }
2441#endif // C++17
2442
2443 /**
2444 * @brief Find last position of a C substring.
2445 * @param __s C string to locate.
2446 * @param __pos Index of character to search back from.
2447 * @param __n Number of characters from @a __s to search for.
2448 * @return Index of start of last occurrence.
2449 *
2450 * Starting from @a __pos, searches backward for the first @a
2451 * __n characters in @a __s within this string. If found,
2452 * returns the index where it begins. If not found, returns
2453 * npos.
2454 */
2455 size_type
2456 rfind(const _CharT* __s, size_type __pos, size_type __n) const
2457 _GLIBCXX_NOEXCEPT;
2458
2459 /**
2460 * @brief Find last position of a C string.
2461 * @param __s C string to locate.
2462 * @param __pos Index of character to start search at (default end).
2463 * @return Index of start of last occurrence.
2464 *
2465 * Starting from @a __pos, searches backward for the value of
2466 * @a __s within this string. If found, returns the index
2467 * where it begins. If not found, returns npos.
2468 */
2469 size_type
2470 rfind(const _CharT* __s, size_type __pos = npos) const
2471 {
2472 __glibcxx_requires_string(__s);
2473 return this->rfind(__s, __pos, traits_type::length(__s));
2474 }
2475
2476 /**
2477 * @brief Find last position of a character.
2478 * @param __c Character to locate.
2479 * @param __pos Index of character to search back from (default end).
2480 * @return Index of last occurrence.
2481 *
2482 * Starting from @a __pos, searches backward for @a __c within
2483 * this string. If found, returns the index where it was
2484 * found. If not found, returns npos.
2485 */
2486 size_type
2487 rfind(_CharT __c, size_type __pos = npos) const _GLIBCXX_NOEXCEPT;
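      // [Editor's note] Usage sketch for the rfind() overloads above; a
      // hedged example, not part of the original header:
      //   std::string s("ababab");
      //   s.rfind("ab");               // == 4 (last occurrence)
      //   s.rfind("ab", 3);            // == 2 (match must begin at index <= 3)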
2488
2489 /**
2490 * @brief Find position of a character of string.
2491 * @param __str String containing characters to locate.
2492 * @param __pos Index of character to search from (default 0).
2493 * @return Index of first occurrence.
2494 *
2495 * Starting from @a __pos, searches forward for one of the
2496 * characters of @a __str within this string. If found,
2497 * returns the index where it was found. If not found, returns
2498 * npos.
2499 */
2500 size_type
2501 find_first_of(const basic_string& __str, size_type __pos = 0) const
2502 _GLIBCXX_NOEXCEPT
2503 { return this->find_first_of(__str.data(), __pos, __str.size()); }
2504
2505#if __cplusplus >= 201703L
2506 /**
2507 * @brief Find position of a character of a string_view.
2508 * @param __svt An object convertible to string_view containing
2509 * characters to locate.
2510 * @param __pos Index of character to search from (default 0).
2511 * @return Index of first occurrence.
2512 */
2513 template<typename _Tp>
2514 _If_sv<_Tp, size_type>
2515 find_first_of(const _Tp& __svt, size_type __pos = 0) const
2516 noexcept(is_same<_Tp, __sv_type>::value)
2517 {
2518 __sv_type __sv = __svt;
2519 return this->find_first_of(__sv.data(), __pos, __sv.size());
2520 }
2521#endif // C++17
2522
2523 /**
2524 * @brief Find position of a character of C substring.
2525 * @param __s String containing characters to locate.
2526 * @param __pos Index of character to search from.
2527 * @param __n Number of characters from @a __s to search for.
2528 * @return Index of first occurrence.
2529 *
2530 * Starting from @a __pos, searches forward for one of the
2531 * first @a __n characters of @a __s within this string. If
2532 * found, returns the index where it was found. If not found,
2533 * returns npos.
2534 */
2535 size_type
2536 find_first_of(const _CharT* __s, size_type __pos, size_type __n) const
2537 _GLIBCXX_NOEXCEPT;
2538
2539 /**
2540 * @brief Find position of a character of C string.
2541 * @param __s String containing characters to locate.
2542 * @param __pos Index of character to search from (default 0).
2543 * @return Index of first occurrence.
2544 *
2545 * Starting from @a __pos, searches forward for one of the
2546 * characters of @a __s within this string. If found, returns
2547 * the index where it was found. If not found, returns npos.
2548 */
2549 size_type
2550 find_first_of(const _CharT* __s, size_type __pos = 0) const
2551 _GLIBCXX_NOEXCEPT
2552 {
2553 __glibcxx_requires_string(__s);
2554 return this->find_first_of(__s, __pos, traits_type::length(__s));
2555 }
2556
2557 /**
2558 * @brief Find position of a character.
2559 * @param __c Character to locate.
2560 * @param __pos Index of character to search from (default 0).
2561 * @return Index of first occurrence.
2562 *
2563 * Starting from @a __pos, searches forward for the character
2564 * @a __c within this string. If found, returns the index
2565 * where it was found. If not found, returns npos.
2566 *
2567 * Note: equivalent to find(__c, __pos).
2568 */
2569 size_type
2570 find_first_of(_CharT __c, size_type __pos = 0) const _GLIBCXX_NOEXCEPT
2571 { return this->find(__c, __pos); }
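      // [Editor's note] Usage sketch for find_first_of(); a hedged example,
      // not part of the original header:
      //   std::string s("hello world");
      //   s.find_first_of("aeiou");    // == 1 ('e')
      //   s.find_first_of("aeiou", 2); // == 4 ('o')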
2572
2573 /**
2574 * @brief Find last position of a character of string.
2575 * @param __str String containing characters to locate.
2576 * @param __pos Index of character to search back from (default end).
2577 * @return Index of last occurrence.
2578 *
2579 * Starting from @a __pos, searches backward for one of the
2580 * characters of @a __str within this string. If found,
2581 * returns the index where it was found. If not found, returns
2582 * npos.
2583 */
2584 size_type
2585 find_last_of(const basic_string& __str, size_type __pos = npos) const
2586 _GLIBCXX_NOEXCEPT
2587 { return this->find_last_of(__str.data(), __pos, __str.size()); }
2588
2589#if __cplusplus >= 201703L
2590 /**
2591 * @brief Find last position of a character of string.
2592 * @param __svt An object convertible to string_view containing
2593 * characters to locate.
2594 * @param __pos Index of character to search back from (default end).
2595 * @return Index of last occurrence.
2596 */
2597 template<typename _Tp>
2598 _If_sv<_Tp, size_type>
2599 find_last_of(const _Tp& __svt, size_type __pos = npos) const
2600 noexcept(is_same<_Tp, __sv_type>::value)
2601 {
2602 __sv_type __sv = __svt;
2603 return this->find_last_of(__sv.data(), __pos, __sv.size());
2604 }
2605#endif // C++17
2606
2607 /**
2608 * @brief Find last position of a character of C substring.
2609 * @param __s C string containing characters to locate.
2610 * @param __pos Index of character to search back from.
2611 * @param __n Number of characters from @a __s to search for.
2612 * @return Index of last occurrence.
2613 *
2614 * Starting from @a __pos, searches backward for one of the
2615 * first @a __n characters of @a __s within this string. If
2616 * found, returns the index where it was found. If not found,
2617 * returns npos.
2618 */
2619 size_type
2620 find_last_of(const _CharT* __s, size_type __pos, size_type __n) const
2621 _GLIBCXX_NOEXCEPT;
2622
2623 /**
2624 * @brief Find last position of a character of C string.
2625 * @param __s C string containing characters to locate.
2626 * @param __pos Index of character to search back from (default end).
2627 * @return Index of last occurrence.
2628 *
2629 * Starting from @a __pos, searches backward for one of the
2630 * characters of @a __s within this string. If found, returns
2631 * the index where it was found. If not found, returns npos.
2632 */
2633 size_type
2634 find_last_of(const _CharT* __s, size_type __pos = npos) const
2635 _GLIBCXX_NOEXCEPT
2636 {
2637 __glibcxx_requires_string(__s);
2638 return this->find_last_of(__s, __pos, traits_type::length(__s));
2639 }
2640
2641 /**
2642 * @brief Find last position of a character.
2643 * @param __c Character to locate.
2644 * @param __pos Index of character to search back from (default end).
2645 * @return Index of last occurrence.
2646 *
2647 * Starting from @a __pos, searches backward for @a __c within
2648 * this string. If found, returns the index where it was
2649 * found. If not found, returns npos.
2650 *
2651 * Note: equivalent to rfind(__c, __pos).
2652 */
2653 size_type
2654 find_last_of(_CharT __c, size_type __pos = npos) const _GLIBCXX_NOEXCEPT
2655 { return this->rfind(__c, __pos); }
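      // [Editor's note] Usage sketch for find_last_of(); a hedged example,
      // not part of the original header:
      //   std::string s("hello world");
      //   s.find_last_of("aeiou");     // == 7 (the 'o' in "world")
      //   s.find_last_of('l');         // == 9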
2656
2657 /**
2658 * @brief Find position of a character not in string.
2659 * @param __str String containing characters to avoid.
2660 * @param __pos Index of character to search from (default 0).
2661 * @return Index of first occurrence.
2662 *
2663 * Starting from @a __pos, searches forward for a character not contained
2664 * in @a __str within this string. If found, returns the index where it
2665 * was found. If not found, returns npos.
2666 */
2667 size_type
2668 find_first_not_of(const basic_string& __str, size_type __pos = 0) const
2669 _GLIBCXX_NOEXCEPT
2670 { return this->find_first_not_of(__str.data(), __pos, __str.size()); }
2671
2672#if __cplusplus >= 201703L
2673 /**
2674 * @brief Find position of a character not in a string_view.
2675 * @param __svt An object convertible to string_view containing
2676 * characters to avoid.
2677 * @param __pos Index of character to search from (default 0).
2678 * @return Index of first occurrence.
2679 */
2680 template<typename _Tp>
2681 _If_sv<_Tp, size_type>
2682 find_first_not_of(const _Tp& __svt, size_type __pos = 0) const
2683 noexcept(is_same<_Tp, __sv_type>::value)
2684 {
2685 __sv_type __sv = __svt;
2686 return this->find_first_not_of(__sv.data(), __pos, __sv.size());
2687 }
2688#endif // C++17
2689
2690 /**
2691 * @brief Find position of a character not in C substring.
2692 * @param __s C string containing characters to avoid.
2693 * @param __pos Index of character to search from.
2694 * @param __n Number of characters from __s to consider.
2695 * @return Index of first occurrence.
2696 *
2697 * Starting from @a __pos, searches forward for a character not
2698 * contained in the first @a __n characters of @a __s within
2699 * this string. If found, returns the index where it was
2700 * found. If not found, returns npos.
2701 */
2702 size_type
2703 find_first_not_of(const _CharT* __s, size_type __pos,
2704 size_type __n) const _GLIBCXX_NOEXCEPT;
2705
2706 /**
2707 * @brief Find position of a character not in C string.
2708 * @param __s C string containing characters to avoid.
2709 * @param __pos Index of character to search from (default 0).
2710 * @return Index of first occurrence.
2711 *
2712 * Starting from @a __pos, searches forward for a character not
2713 * contained in @a __s within this string. If found, returns
2714 * the index where it was found. If not found, returns npos.
2715 */
2716 size_type
2717 find_first_not_of(const _CharT* __s, size_type __pos = 0) const
2718 _GLIBCXX_NOEXCEPT
2719 {
2720 __glibcxx_requires_string(__s);
2721 return this->find_first_not_of(__s, __pos, traits_type::length(__s));
2722 }
2723
2724 /**
2725 * @brief Find position of a different character.
2726 * @param __c Character to avoid.
2727 * @param __pos Index of character to search from (default 0).
2728 * @return Index of first occurrence.
2729 *
2730 * Starting from @a __pos, searches forward for a character
2731 * other than @a __c within this string. If found, returns the
2732 * index where it was found. If not found, returns npos.
2733 */
2734 size_type
2735 find_first_not_of(_CharT __c, size_type __pos = 0) const
2736 _GLIBCXX_NOEXCEPT;
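      // [Editor's note] Usage sketch for find_first_not_of(); a hedged
      // example, not part of the original header:
      //   std::string s("   indented");
      //   s.find_first_not_of(' ');    // == 3, first non-space character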
2737
2738 /**
2739 * @brief Find last position of a character not in string.
2740 * @param __str String containing characters to avoid.
2741 * @param __pos Index of character to search back from (default end).
2742 * @return Index of last occurrence.
2743 *
2744 * Starting from @a __pos, searches backward for a character
2745 * not contained in @a __str within this string. If found,
2746 * returns the index where it was found. If not found, returns
2747 * npos.
2748 */
2749 size_type
2750 find_last_not_of(const basic_string& __str, size_type __pos = npos) const
2751 _GLIBCXX_NOEXCEPT
2752 { return this->find_last_not_of(__str.data(), __pos, __str.size()); }
2753
2754#if __cplusplus >= 201703L
2755 /**
2756 * @brief Find last position of a character not in a string_view.
2757 * @param __svt An object convertible to string_view containing
2758 * characters to avoid.
2759 * @param __pos Index of character to search back from (default end).
2760 * @return Index of last occurrence.
2761 */
2762 template<typename _Tp>
2763 _If_sv<_Tp, size_type>
2764 find_last_not_of(const _Tp& __svt, size_type __pos = npos) const
2765 noexcept(is_same<_Tp, __sv_type>::value)
2766 {
2767 __sv_type __sv = __svt;
2768 return this->find_last_not_of(__sv.data(), __pos, __sv.size());
2769 }
2770#endif // C++17
2771
2772 /**
2773 * @brief Find last position of a character not in C substring.
2774 * @param __s C string containing characters to avoid.
2775 * @param __pos Index of character to search back from.
2776 * @param __n Number of characters from @a __s to consider.
2777 * @return Index of last occurrence.
2778 *
2779 * Starting from @a __pos, searches backward for a character not
2780 * contained in the first @a __n characters of @a __s within this string.
2781 * If found, returns the index where it was found. If not found,
2782 * returns npos.
2783 */
2784 size_type
2785 find_last_not_of(const _CharT* __s, size_type __pos,
2786 size_type __n) const _GLIBCXX_NOEXCEPT;
2787 /**
2788 * @brief Find last position of a character not in C string.
2789 * @param __s C string containing characters to avoid.
2790 * @param __pos Index of character to search back from (default end).
2791 * @return Index of last occurrence.
2792 *
2793 * Starting from @a __pos, searches backward for a character
2794 * not contained in @a __s within this string. If found,
2795 * returns the index where it was found. If not found, returns
2796 * npos.
2797 */
2798 size_type
2799 find_last_not_of(const _CharT* __s, size_type __pos = npos) const
2800 _GLIBCXX_NOEXCEPT
2801 {
2802 __glibcxx_requires_string(__s);
2803 return this->find_last_not_of(__s, __pos, traits_type::length(__s));
2804 }
2805
2806 /**
2807 * @brief Find last position of a different character.
2808 * @param __c Character to avoid.
2809 * @param __pos Index of character to search back from (default end).
2810 * @return Index of last occurrence.
2811 *
2812 * Starting from @a __pos, searches backward for a character other than
2813 * @a __c within this string. If found, returns the index where it was
2814 * found. If not found, returns npos.
2815 */
2816 size_type
2817 find_last_not_of(_CharT __c, size_type __pos = npos) const
2818 _GLIBCXX_NOEXCEPT;
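      // [Editor's note] Usage sketch for find_last_not_of(), e.g. for
      // right-trimming; a hedged example, not part of the original header:
      //   std::string s("value;;;");
      //   s.find_last_not_of(';');     // == 4 ('e')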
2819
2820 /**
2821 * @brief Get a substring.
2822 * @param __pos Index of first character (default 0).
2823 * @param __n Number of characters in substring (default remainder).
2824 * @return The new string.
2825 * @throw std::out_of_range If @a __pos > size().
2826 *
2827 * Construct and return a new string using the @a __n
2828 * characters starting at @a __pos. If the string is too
2829 * short, use the remainder of the characters. If @a __pos is
2830 * beyond the end of the string, out_of_range is thrown.
2831 */
2832 basic_string
2833 substr(size_type __pos = 0, size_type __n = npos) const
2834 { return basic_string(*this,
2835 _M_check(__pos, "basic_string::substr"), __n); }
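      // [Editor's note] Usage sketch for substr(); a hedged example, not
      // part of the original header:
      //   std::string s("abcdef");
      //   s.substr(2, 3);              // == "cde"
      //   s.substr(4);                 // == "ef" (remainder)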
2836
2837 /**
2838 * @brief Compare to a string.
2839 * @param __str String to compare against.
2840 * @return Integer < 0, 0, or > 0.
2841 *
2842 * Returns an integer < 0 if this string is ordered before @a
2843 * __str, 0 if their values are equivalent, or > 0 if this
2844 * string is ordered after @a __str. Determines the effective
2845 * length rlen of the strings to compare as the smallest of
2846 * size() and str.size(). The function then compares the two
2847 * strings by calling traits::compare(data(), str.data(),rlen).
2848 * If the result of the comparison is nonzero returns it,
2849 * otherwise the shorter one is ordered first.
2850 */
2851 int
2852 compare(const basic_string& __str) const
2853 {
2854 const size_type __size = this->size();
2855 const size_type __osize = __str.size();
2856 const size_type __len = std::min(__size, __osize);
2857
2858 int __r = traits_type::compare(_M_data(), __str.data(), __len);
2859 if (!__r)
2860 __r = _S_compare(__size, __osize);
2861 return __r;
2862 }
2863
2864#if __cplusplus >= 201703L
2865 /**
2866 * @brief Compare to a string_view.
2867 * @param __svt An object convertible to string_view to compare against.
2868 * @return Integer < 0, 0, or > 0.
2869 */
2870 template<typename _Tp>
2871 _If_sv<_Tp, int>
2872 compare(const _Tp& __svt) const
2873 noexcept(is_same<_Tp, __sv_type>::value)
2874 {
2875 __sv_type __sv = __svt;
2876 const size_type __size = this->size();
2877 const size_type __osize = __sv.size();
2878 const size_type __len = std::min(__size, __osize);
2879
2880 int __r = traits_type::compare(_M_data(), __sv.data(), __len);
2881 if (!__r)
2882 __r = _S_compare(__size, __osize);
2883 return __r;
2884 }
2885
2886 /**
2887 * @brief Compare to a string_view.
2888 * @param __pos A position in the string to start comparing from.
2889 * @param __n The number of characters to compare.
2890 * @param __svt An object convertible to string_view to compare
2891 * against.
2892 * @return Integer < 0, 0, or > 0.
2893 */
2894 template<typename _Tp>
2895 _If_sv<_Tp, int>
2896 compare(size_type __pos, size_type __n, const _Tp& __svt) const
2897 noexcept(is_same<_Tp, __sv_type>::value)
2898 {
2899 __sv_type __sv = __svt;
2900 return __sv_type(*this).substr(__pos, __n).compare(__sv);
2901 }
2902
2903 /**
2904 * @brief Compare to a string_view.
2905 * @param __pos1 A position in the string to start comparing from.
2906 * @param __n1 The number of characters to compare.
2907 * @param __svt An object convertible to string_view to compare
2908 * against.
2909 * @param __pos2 A position in the string_view to start comparing from.
2910 * @param __n2 The number of characters to compare.
2911 * @return Integer < 0, 0, or > 0.
2912 */
2913 template<typename _Tp>
2914 _If_sv<_Tp, int>
2915 compare(size_type __pos1, size_type __n1, const _Tp& __svt,
2916 size_type __pos2, size_type __n2 = npos) const
2917 noexcept(is_same<_Tp, __sv_type>::value)
2918 {
2919 __sv_type __sv = __svt;
2920 return __sv_type(*this)
2921 .substr(__pos1, __n1).compare(__sv.substr(__pos2, __n2));
2922 }
2923#endif // C++17
2924
2925 /**
2926 * @brief Compare substring to a string.
2927 * @param __pos Index of first character of substring.
2928 * @param __n Number of characters in substring.
2929 * @param __str String to compare against.
2930 * @return Integer < 0, 0, or > 0.
2931 *
2932 * Form the substring of this string from the @a __n characters
2933 * starting at @a __pos. Returns an integer < 0 if the
2934 * substring is ordered before @a __str, 0 if their values are
2935 * equivalent, or > 0 if the substring is ordered after @a
2936 * __str. Determines the effective length rlen of the strings
2937 * to compare as the smallest of the length of the substring
2938 * and @a __str.size(). The function then compares the two
2939 * strings by calling
2940 * traits::compare(substring.data(),str.data(),rlen). If the
2941 * result of the comparison is nonzero returns it, otherwise
2942 * the shorter one is ordered first.
2943 */
2944 int
2945 compare(size_type __pos, size_type __n, const basic_string& __str) const;
2946
2947 /**
2948 * @brief Compare substring to a substring.
2949 * @param __pos1 Index of first character of substring.
2950 * @param __n1 Number of characters in substring.
2951 * @param __str String to compare against.
2952 * @param __pos2 Index of first character of substring of str.
2953 * @param __n2 Number of characters in substring of str.
2954 * @return Integer < 0, 0, or > 0.
2955 *
2956 * Form the substring of this string from the @a __n1
2957 * characters starting at @a __pos1. Form the substring of @a
2958 * __str from the @a __n2 characters starting at @a __pos2.
2959 * Returns an integer < 0 if this substring is ordered before
2960 * the substring of @a __str, 0 if their values are equivalent,
2961 * or > 0 if this substring is ordered after the substring of
2962 * @a __str. Determines the effective length rlen of the
2963 * strings to compare as the smallest of the lengths of the
2964 * substrings. The function then compares the two strings by
2965 * calling
2966 * traits::compare(substring.data(),str.substr(pos2,n2).data(),rlen).
2967 * If the result of the comparison is nonzero returns it,
2968 * otherwise the shorter one is ordered first.
2969 */
2970 int
2971 compare(size_type __pos1, size_type __n1, const basic_string& __str,
2972 size_type __pos2, size_type __n2 = npos) const;
2973
2974 /**
2975 * @brief Compare to a C string.
2976 * @param __s C string to compare against.
2977 * @return Integer < 0, 0, or > 0.
2978 *
2979 * Returns an integer < 0 if this string is ordered before @a __s, 0 if
2980 * their values are equivalent, or > 0 if this string is ordered after
2981 * @a __s. Determines the effective length rlen of the strings to
2982 * compare as the smallest of size() and the length of a string
2983 * constructed from @a __s. The function then compares the two strings
2984 * by calling traits::compare(data(),s,rlen). If the result of the
2985 * comparison is nonzero returns it, otherwise the shorter one is
2986 * ordered first.
2987 */
2988 int
2989 compare(const _CharT* __s) const _GLIBCXX_NOEXCEPT;
2990
2991 // _GLIBCXX_RESOLVE_LIB_DEFECTS
2992 // 5 String::compare specification questionable
2993 /**
2994 * @brief Compare substring to a C string.
2995 * @param __pos Index of first character of substring.
2996 * @param __n1 Number of characters in substring.
2997 * @param __s C string to compare against.
2998 * @return Integer < 0, 0, or > 0.
2999 *
3000 * Form the substring of this string from the @a __n1
3001 * characters starting at @a __pos. Returns an integer < 0 if
3002 * the substring is ordered before @a __s, 0 if their values
3003 * are equivalent, or > 0 if the substring is ordered after @a
3004 * __s. Determines the effective length rlen of the strings to
3005 * compare as the smallest of the length of the substring and
3006 * the length of a string constructed from @a __s. The
3007 * function then compares the two string by calling
3008 * traits::compare(substring.data(),__s,rlen). If the result of
3009 * the comparison is nonzero returns it, otherwise the shorter
3010 * one is ordered first.
3011 */
3012 int
3013 compare(size_type __pos, size_type __n1, const _CharT* __s) const;
3014
3015 /**
3016 * @brief Compare substring against a character %array.
3017 * @param __pos Index of first character of substring.
3018 * @param __n1 Number of characters in substring.
3019 * @param __s character %array to compare against.
3020 * @param __n2 Number of characters of s.
3021 * @return Integer < 0, 0, or > 0.
3022 *
3023 * Form the substring of this string from the @a __n1
3024 * characters starting at @a __pos. Form a string from the
3025 * first @a __n2 characters of @a __s. Returns an integer < 0
3026 * if this substring is ordered before the string from @a __s,
3027 * 0 if their values are equivalent, or > 0 if this substring
3028 * is ordered after the string from @a __s. Determines the
3029 * effective length rlen of the strings to compare as the
3030 * smallest of the length of the substring and @a __n2. The
3031 * function then compares the two strings by calling
3032 * traits::compare(substring.data(),s,rlen). If the result of
3033 * the comparison is nonzero returns it, otherwise the shorter
3034 * one is ordered first.
3035 *
3036 * NB: @a __s must have at least @a __n2 characters, '\\0' has
3037 * no special meaning.
3038 */
3039 int
3040 compare(size_type __pos, size_type __n1, const _CharT* __s,
3041 size_type __n2) const;
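      // [Editor's note] Usage sketch for the compare() overloads above; a
      // hedged example, not part of the original header:
      //   std::string s("abcd");
      //   s.compare("abcd");           // == 0
      //   s.compare("abce");           // < 0, since 'd' < 'e'
      //   s.compare(0, 2, "ab");       // == 0 (substring "ab" vs "ab")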
3042
3043#if __cplusplus > 201703L
3044 bool
3045 starts_with(basic_string_view<_CharT, _Traits> __x) const noexcept
3046 { return __sv_type(this->data(), this->size()).starts_with(__x); }
3047
3048 bool
3049 starts_with(_CharT __x) const noexcept
3050 { return __sv_type(this->data(), this->size()).starts_with(__x); }
3051
3052 bool
3053 starts_with(const _CharT* __x) const noexcept
3054 { return __sv_type(this->data(), this->size()).starts_with(__x); }
3055
3056 bool
3057 ends_with(basic_string_view<_CharT, _Traits> __x) const noexcept
3058 { return __sv_type(this->data(), this->size()).ends_with(__x); }
3059
3060 bool
3061 ends_with(_CharT __x) const noexcept
3062 { return __sv_type(this->data(), this->size()).ends_with(__x); }
3063
3064 bool
3065 ends_with(const _CharT* __x) const noexcept
3066 { return __sv_type(this->data(), this->size()).ends_with(__x); }
3067#endif // C++20
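      // [Editor's note] Usage sketch for starts_with()/ends_with(); a hedged
      // example, not part of the original header, requiring C++20:
      //   std::string s("basic_string.h");
      //   s.starts_with("basic");      // true
      //   s.ends_with(".h");           // true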
3068
3069 // Allow basic_stringbuf::__xfer_bufptrs to call _M_length:
3070 template<typename, typename, typename> friend class basic_stringbuf;
3071 };
3072_GLIBCXX_END_NAMESPACE_CXX11
3073#else // !_GLIBCXX_USE_CXX11_ABI
3074 // Reference-counted COW string implementation
3075
3076 /**
3077 * @class basic_string basic_string.h <string>
3078 * @brief Managing sequences of characters and character-like objects.
3079 *
3080 * @ingroup strings
3081 * @ingroup sequences
3082 *
3083 * @tparam _CharT Type of character
3084 * @tparam _Traits Traits for character type, defaults to
3085 * char_traits<_CharT>.
3086 * @tparam _Alloc Allocator type, defaults to allocator<_CharT>.
3087 *
3088 * Meets the requirements of a <a href="tables.html#65">container</a>, a
3089 * <a href="tables.html#66">reversible container</a>, and a
3090 * <a href="tables.html#67">sequence</a>. Of the
3091 * <a href="tables.html#68">optional sequence requirements</a>, only
3092 * @c push_back, @c at, and @c %array access are supported.
3093 *
3094 * @doctodo
3095 *
3096 *
3097 * Documentation? What's that?
3098 * Nathan Myers <ncm@cantrip.org>.
3099 *
3100 * A string looks like this:
3101 *
3102 * @code
3103 * [_Rep]
3104 * _M_length
3105 * [basic_string<char_type>] _M_capacity
3106 * _M_dataplus _M_refcount
3107 * _M_p ----------------> unnamed array of char_type
3108 * @endcode
3109 *
3110 * Where the _M_p points to the first character in the string, and
3111 * you cast it to a pointer-to-_Rep and subtract 1 to get a
3112 * pointer to the header.
3113 *
3114 * This approach has the enormous advantage that a string object
3115 * requires only one allocation. All the ugliness is confined
3116 * within a single %pair of inline functions, which each compile to
3117 * a single @a add instruction: _Rep::_M_data(), and
3118 * string::_M_rep(); and the allocation function, which gets a
3119 * block of raw bytes with enough room and constructs a _Rep
3120 * object at the front.
3121 *
3122 * The reason you want _M_data pointing to the character %array and
3123 * not the _Rep is so that the debugger can see the string
3124 * contents. (Probably we should add a non-inline member to get
3125 * the _Rep for the debugger to use, so users can check the actual
3126 * string length.)
3127 *
3128 * Note that the _Rep object is a POD so that you can have a
3129 * static <em>empty string</em> _Rep object already @a constructed before
3130 * static constructors have run. The reference-count encoding is
3131 * chosen so that a 0 indicates one reference, so you never try to
3132 * destroy the empty-string _Rep object.
3133 *
3134 * All but the last paragraph is considered pretty conventional
3135 * for a C++ string implementation.
3136 */
3137 // 21.3 Template class basic_string
3138 template<typename _CharT, typename _Traits, typename _Alloc>
3139 class basic_string
3140 {
3141 typedef typename __gnu_cxx::__alloc_traits<_Alloc>::template
3142 rebind<_CharT>::other _CharT_alloc_type;
3143 typedef __gnu_cxx::__alloc_traits<_CharT_alloc_type> _CharT_alloc_traits;
3144
3145 // Types:
3146 public:
3147 typedef _Traits traits_type;
3148 typedef typename _Traits::char_type value_type;
3149 typedef _Alloc allocator_type;
3150 typedef typename _CharT_alloc_type::size_type size_type;
3151 typedef typename _CharT_alloc_type::difference_type difference_type;
3152#if __cplusplus < 201103L
3153 typedef typename _CharT_alloc_type::reference reference;
3154 typedef typename _CharT_alloc_type::const_reference const_reference;
3155#else
3156 typedef value_type& reference;
3157 typedef const value_type& const_reference;
3158#endif
3159 typedef typename _CharT_alloc_traits::pointer pointer;
3160 typedef typename _CharT_alloc_traits::const_pointer const_pointer;
3161 typedef __gnu_cxx::__normal_iterator<pointer, basic_string> iterator;
3162 typedef __gnu_cxx::__normal_iterator<const_pointer, basic_string>
3163 const_iterator;
3164 typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
3165 typedef std::reverse_iterator<iterator> reverse_iterator;
3166
3167 protected:
3168 // type used for positions in insert, erase etc.
3169 typedef iterator __const_iterator;
3170
3171 private:
3172 // _Rep: string representation
3173 // Invariants:
3174 // 1. String really contains _M_length + 1 characters: due to 21.3.4
3175 // must be kept null-terminated.
3176 // 2. _M_capacity >= _M_length
3177 // Allocated memory is always (_M_capacity + 1) * sizeof(_CharT).
3178 // 3. _M_refcount has three states:
3179 // -1: leaked, one reference, no ref-copies allowed, non-const.
3180 // 0: one reference, non-const.
3181 // n>0: n + 1 references, operations require a lock, const.
3182 // 4. All fields==0 is an empty string, given the extra storage
3183 // beyond-the-end for a null terminator; thus, the shared
3184 // empty string representation needs no constructor.
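    // [Editor's note] A hedged reading of the _M_refcount encoding above
    // (hypothetical helper, not part of the original header):
    //   static long __use_count(_Atomic_word __rc)
    //   { return __rc < 0 ? 1 : long(__rc) + 1; }
    // i.e. -1 (leaked) and 0 both mean exactly one reference; n > 0 means
    // n + 1 references, and copy-on-write semantics apply.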
3185
3186 struct _Rep_base
3187 {
3188 size_type _M_length;
3189 size_type _M_capacity;
3190 _Atomic_word _M_refcount;
3191 };
3192
3193 struct _Rep : _Rep_base
3194 {
3195 // Types:
3196 typedef typename __gnu_cxx::__alloc_traits<_Alloc>::template
3197 rebind<char>::other _Raw_bytes_alloc;
3198
3199 // (Public) Data members:
3200
3201 // The maximum number of individual char_type elements of an
3202 // individual string is determined by _S_max_size. This is the
3203 // value that will be returned by max_size(). (Whereas npos
3204 // is the maximum number of bytes the allocator can allocate.)
3205 // If one was to divvy up the theoretical largest size string,
3206 // with a terminating character and m _CharT elements, it'd
3207 // look like this:
3208 // npos = sizeof(_Rep) + (m * sizeof(_CharT)) + sizeof(_CharT)
3209 // Solving for m:
3210 // m = ((npos - sizeof(_Rep))/sizeof(_CharT)) - 1
3211 // In addition, this implementation quarters this amount.
3212 static const size_type _S_max_size;
3213 static const _CharT _S_terminal;
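      // [Editor's note] A hedged worked example of the formula above,
      // assuming an LP64 target where npos == 2^64 - 1 and, illustratively,
      // sizeof(_Rep) == 24: for _CharT == char,
      //   m = ((2^64 - 1 - 24) / 1) - 1, which, quartered as described,
      //   puts _S_max_size on the order of 2^62 characters.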
3214
3215 // The following storage is init'd to 0 by the linker, resulting
3216 // (carefully) in an empty string with one reference.
3217 static size_type _S_empty_rep_storage[];
3218
3219 static _Rep&
3220 _S_empty_rep() _GLIBCXX_NOEXCEPT
3221 {
3222 // NB: Mild hack to avoid strict-aliasing warnings. Note that
3223 // _S_empty_rep_storage is never modified and the punning should
3224 // be reasonably safe in this case.
3225 void* __p = reinterpret_cast<void*>(&_S_empty_rep_storage);
3226 return *reinterpret_cast<_Rep*>(__p);
3227 }
3228
3229 bool
3230 _M_is_leaked() const _GLIBCXX_NOEXCEPT
3231 {
3232#if defined(__GTHREADS)
3233 // _M_refcount is mutated concurrently by _M_refcopy/_M_dispose,
3234 // so we need to use an atomic load. However, _M_is_leaked
3235 // predicate does not change concurrently (i.e. the string is either
3236 // leaked or not), so a relaxed load is enough.
3237 return __atomic_load_n(&this->_M_refcount, __ATOMIC_RELAXED) < 0;
3238#else
3239 return this->_M_refcount < 0;
3240#endif
3241 }
3242
3243 bool
3244 _M_is_shared() const _GLIBCXX_NOEXCEPT
3245 {
3246#if defined(__GTHREADS)
3247 // _M_refcount is mutated concurrently by _M_refcopy/_M_dispose,
3248 // so we need to use an atomic load. Another thread can drop last
3249 // but one reference concurrently with this check, so we need this
3250 // load to be acquire to synchronize with release fetch_and_add in
3251 // _M_dispose.
3252 return __atomic_load_n(&this->_M_refcount, __ATOMIC_ACQUIRE) > 0;
3253#else
3254 return this->_M_refcount > 0;
3255#endif
3256 }
3257
3258 void
3259 _M_set_leaked() _GLIBCXX_NOEXCEPT
3260 { this->_M_refcount = -1; }
3261
3262 void
3263 _M_set_sharable() _GLIBCXX_NOEXCEPT
3264 { this->_M_refcount = 0; }
3265
3266 void
3267 _M_set_length_and_sharable(size_type __n) _GLIBCXX_NOEXCEPT
3268 {
3269#if _GLIBCXX_FULLY_DYNAMIC_STRING == 0
3270 if (__builtin_expect(this != &_S_empty_rep(), false))
3271#endif
3272 {
3273 this->_M_set_sharable(); // One reference.
3274 this->_M_length = __n;
3275 traits_type::assign(this->_M_refdata()[__n], _S_terminal);
3276 // grrr. (per 21.3.4)
3277 // You cannot leave those LWG people alone for a second.
3278 }
3279 }
3280
3281 _CharT*
3282 _M_refdata() throw()
3283 { return reinterpret_cast<_CharT*>(this + 1); }
3284
3285 _CharT*
3286 _M_grab(const _Alloc& __alloc1, const _Alloc& __alloc2)
3287 {
3288 return (!_M_is_leaked() && __alloc1 == __alloc2)
3289 ? _M_refcopy() : _M_clone(__alloc1);
3290 }
3291
3292 // Create & Destroy
3293 static _Rep*
3294 _S_create(size_type, size_type, const _Alloc&);
3295
3296 void
3297 _M_dispose(const _Alloc& __a) _GLIBCXX_NOEXCEPT
3298 {
3299#if _GLIBCXX_FULLY_DYNAMIC_STRING == 0
3300 if (__builtin_expect(this != &_S_empty_rep(), false))
3301#endif
3302 {
3303 // Be race-detector-friendly. For more info see bits/c++config.
3304 _GLIBCXX_SYNCHRONIZATION_HAPPENS_BEFORE(&this->_M_refcount);
3305 // Decrement of _M_refcount is acq_rel, because:
3306 // - all but last decrements need to release to synchronize with
3307 // the last decrement that will delete the object.
3308 // - the last decrement needs to acquire to synchronize with
3309 // all the previous decrements.
3310 // - last but one decrement needs to release to synchronize with
3311 // the acquire load in _M_is_shared that will conclude that
3312 // the object is not shared anymore.
3313 if (__gnu_cxx::__exchange_and_add_dispatch(&this->_M_refcount,
3314 -1) <= 0)
3315 {
3316 _GLIBCXX_SYNCHRONIZATION_HAPPENS_AFTER(&this->_M_refcount);
3317 _M_destroy(__a);
3318 }
3319 }
3320 } // XXX MT
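// [Editor's note] For readers unfamiliar with __exchange_and_add_dispatch,
// here is the same release/acquire discipline sketched with std::atomic.
// Names and types below are illustrative assumptions, not libstdc++ API.
#if 0
#include <atomic>

struct __refcounted { std::atomic<int> _M_refs{0}; }; // 0 == one owner

void __release(__refcounted* __r)
{
  // fetch_sub returns the previous value. Non-final decrements publish
  // earlier writes (release); the final decrement observes them all
  // (acquire) before the object is destroyed -- hence acq_rel.
  if (__r->_M_refs.fetch_sub(1, std::memory_order_acq_rel) <= 0)
    delete __r; // previous value <= 0: we were the last owner
}
#endif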
3321
3322 void
3323 _M_destroy(const _Alloc&) throw();
3324
3325 _CharT*
3326 _M_refcopy() throw()
3327 {
3328#if _GLIBCXX_FULLY_DYNAMIC_STRING == 0
3329 if (__builtin_expect(this != &_S_empty_rep(), false))
3330#endif
3331 __gnu_cxx::__atomic_add_dispatch(&this->_M_refcount, 1);
3332 return _M_refdata();
3333 } // XXX MT
3334
3335 _CharT*
3336 _M_clone(const _Alloc&, size_type __res = 0);
3337 };
3338
3339 // Use empty-base optimization: http://www.cantrip.org/emptyopt.html
3340 struct _Alloc_hider : _Alloc
3341 {
3342 _Alloc_hider(_CharT* __dat, const _Alloc& __a) _GLIBCXX_NOEXCEPT
3343 : _Alloc(__a), _M_p(__dat) { }
3344
3345 _CharT* _M_p; // The actual data.
3346 };
3347
3348 public:
3349 // Data Members (public):
3350 // NB: This is an unsigned type, and thus represents the maximum
3351 // size that the allocator can hold.
3352 /// Value returned by various member functions when they fail.
3353 static const size_type npos = static_cast<size_type>(-1);
3354
3355 private:
3356 // Data Members (private):
3357 mutable _Alloc_hider _M_dataplus;
3358
3359 _CharT*
3360 _M_data() const _GLIBCXX_NOEXCEPT
3361 { return _M_dataplus._M_p; }
3362
3363 _CharT*
3364 _M_data(_CharT* __p) _GLIBCXX_NOEXCEPT
3365 { return (_M_dataplus._M_p = __p); }
3366
3367 _Rep*
3368 _M_rep() const _GLIBCXX_NOEXCEPT
3369 { return &((reinterpret_cast<_Rep*> (_M_data()))[-1]); }
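// [Editor's note] The [-1] above and the (this + 1) in _M_refdata() are two
// views of one allocation layout, roughly (illustrative diagram only):
//
//   [ _M_length | _M_capacity | _M_refcount ][ c0 c1 ... c(n-1) '\0' ]
//   ^ _M_rep()                               ^ _M_data() == _M_refdata()
//
// The public pointer _M_dataplus._M_p aims at the characters; stepping one
// _Rep backwards recovers the bookkeeping header.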
3370
3371 // For the internal use we have functions similar to `begin'/`end'
3372 // but they do not call _M_leak.
3373 iterator
3374 _M_ibegin() const _GLIBCXX_NOEXCEPT
3375 { return iterator(_M_data()); }
3376
3377 iterator
3378 _M_iend() const _GLIBCXX_NOEXCEPT
3379 { return iterator(_M_data() + this->size()); }
3380
3381 void
3382 _M_leak() // for use in begin() & non-const op[]
3383 {
3384 if (!_M_rep()->_M_is_leaked())
3385 _M_leak_hard();
3386 }
3387
3388 size_type
3389 _M_check(size_type __pos, const char* __s) const
3390 {
3391 if (__pos > this->size())
3392 __throw_out_of_range_fmt(__N("%s: __pos (which is %zu) > "
3393 "this->size() (which is %zu)"),
3394 __s, __pos, this->size());
3395 return __pos;
3396 }
3397
3398 void
3399 _M_check_length(size_type __n1, size_type __n2, const char* __s) const
3400 {
3401 if (this->max_size() - (this->size() - __n1) < __n2)
3402 __throw_length_error(__N(__s));
3403 }
3404
3405 // NB: _M_limit doesn't check for a bad __pos value.
3406 size_type
3407 _M_limit(size_type __pos, size_type __off) const _GLIBCXX_NOEXCEPT
3408 {
3409 const bool __testoff = __off < this->size() - __pos;
3410 return __testoff ? __off : this->size() - __pos;
3411 }
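// [Editor's note] _M_limit is simply min(__off, size() - __pos): it clamps
// a requested count to what the string actually holds past __pos. E.g. with
// size() == 5, _M_limit(3, npos) yields 2, which is how the substring
// overloads of assign/insert/replace below honour a default __n of npos.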
3412
3413 // True if _Rep and source do not overlap.
3414 bool
3415 _M_disjunct(const _CharT* __s) const _GLIBCXX_NOEXCEPT
3416 {
3417 return (less<const _CharT*>()(__s, _M_data())
3418 || less<const _CharT*>()(_M_data() + this->size(), __s));
3419 }
3420
3421 // When __n = 1 way faster than the general multichar
3422 // traits_type::copy/move/assign.
3423 static void
3424 _M_copy(_CharT* __d, const _CharT* __s, size_type __n) _GLIBCXX_NOEXCEPT
3425 {
3426 if (__n == 1)
3427 traits_type::assign(*__d, *__s);
3428 else
3429 traits_type::copy(__d, __s, __n);
3430 }
3431
3432 static void
3433 _M_move(_CharT* __d, const _CharT* __s, size_type __n) _GLIBCXX_NOEXCEPT
3434 {
3435 if (__n == 1)
3436 traits_type::assign(*__d, *__s);
3437 else
3438 traits_type::move(__d, __s, __n);
3439 }
3440
3441 static void
3442 _M_assign(_CharT* __d, size_type __n, _CharT __c) _GLIBCXX_NOEXCEPT
3443 {
3444 if (__n == 1)
3445 traits_type::assign(*__d, __c);
3446 else
3447 traits_type::assign(__d, __n, __c);
3448 }
3449
3450 // _S_copy_chars is a separate template to permit specialization
3451 // to optimize for the common case of pointers as iterators.
3452 template<class _Iterator>
3453 static void
3454 _S_copy_chars(_CharT* __p, _Iterator __k1, _Iterator __k2)
3455 {
3456 for (; __k1 != __k2; ++__k1, (void)++__p)
3457 traits_type::assign(*__p, *__k1); // These types are off.
3458 }
3459
3460 static void
3461 _S_copy_chars(_CharT* __p, iterator __k1, iterator __k2) _GLIBCXX_NOEXCEPT
3462 { _S_copy_chars(__p, __k1.base(), __k2.base()); }
3463
3464 static void
3465 _S_copy_chars(_CharT* __p, const_iterator __k1, const_iterator __k2)
3466 _GLIBCXX_NOEXCEPT
3467 { _S_copy_chars(__p, __k1.base(), __k2.base()); }
3468
3469 static void
3470 _S_copy_chars(_CharT* __p, _CharT* __k1, _CharT* __k2) _GLIBCXX_NOEXCEPT
3471 { _M_copy(__p, __k1, __k2 - __k1); }
3472
3473 static void
3474 _S_copy_chars(_CharT* __p, const _CharT* __k1, const _CharT* __k2)
3475 _GLIBCXX_NOEXCEPT
3476 { _M_copy(__p, __k1, __k2 - __k1); }
3477
3478 static int
3479 _S_compare(size_type __n1, size_type __n2) _GLIBCXX_NOEXCEPT
3480 {
3481 const difference_type __d = difference_type(__n1 - __n2);
3482
3483 if (__d > __gnu_cxx::__numeric_traits<int>::__max)
3484 return __gnu_cxx::__numeric_traits<int>::__max;
3485 else if (__d < __gnu_cxx::__numeric_traits<int>::__min)
3486 return __gnu_cxx::__numeric_traits<int>::__min;
3487 else
3488 return int(__d);
3489 }
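// [Editor's note] _S_compare exists because __n1 - __n2 is formed in
// difference_type, which can exceed the range of the int that compare()
// must return; the clamp preserves the sign instead of truncating. E.g.
// for __n1 == 0 and __n2 == 0x100000000, a plain int(__d) would yield 0,
// falsely reporting equality, while the clamp returns INT_MIN.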
3490
3491 void
3492 _M_mutate(size_type __pos, size_type __len1, size_type __len2);
3493
3494 void
3495 _M_leak_hard();
3496
3497 static _Rep&
3498 _S_empty_rep() _GLIBCXX_NOEXCEPT
3499 { return _Rep::_S_empty_rep(); }
3500
3501#if __cplusplus >= 201703L
3502 // A helper type for avoiding boiler-plate.
3503 typedef basic_string_view<_CharT, _Traits> __sv_type;
3504
3505 template<typename _Tp, typename _Res>
3506 using _If_sv = enable_if_t<
3507 __and_<is_convertible<const _Tp&, __sv_type>,
3508 __not_<is_convertible<const _Tp*, const basic_string*>>,
3509 __not_<is_convertible<const _Tp&, const _CharT*>>>::value,
3510 _Res>;
3511
3512 // Allows an implicit conversion to __sv_type.
3513 static __sv_type
3514 _S_to_string_view(__sv_type __svt) noexcept
3515 { return __svt; }
3516
3517 // Wraps a string_view by explicit conversion and thus
3518 // allows to add an internal constructor that does not
3519 // participate in overload resolution when a string_view
3520 // is provided.
3521 struct __sv_wrapper
3522 {
3523 explicit __sv_wrapper(__sv_type __sv) noexcept : _M_sv(__sv) { }
3524 __sv_type _M_sv;
3525 };
3526
3527 /**
3528 * @brief Only internally used: Construct string from a string view
3529 * wrapper.
3530 * @param __svw string view wrapper.
3531 * @param __a Allocator to use.
3532 */
3533 explicit
3534 basic_string(__sv_wrapper __svw, const _Alloc& __a)
3535 : basic_string(__svw._M_sv.data(), __svw._M_sv.size(), __a) { }
3536#endif
3537
3538 public:
3539 // Construct/copy/destroy:
3540 // NB: We overload ctors in some cases instead of using default
3541 // arguments, per 17.4.4.4 para. 2 item 2.
3542
3543 /**
3544 * @brief Default constructor creates an empty string.
3545 */
3546 basic_string()
3547#if _GLIBCXX_FULLY_DYNAMIC_STRING == 0
3548 _GLIBCXX_NOEXCEPT
3549 : _M_dataplus(_S_empty_rep()._M_refdata(), _Alloc())
3550#else
3551 : _M_dataplus(_S_construct(size_type(), _CharT(), _Alloc()), _Alloc())
3552#endif
3553 { }
3554
3555 /**
3556 * @brief Construct an empty string using allocator @a a.
3557 */
3558 explicit
3559 basic_string(const _Alloc& __a);
3560
3561 // NB: per LWG issue 42, semantics different from IS:
3562 /**
3563 * @brief Construct string with copy of value of @a str.
3564 * @param __str Source string.
3565 */
3566 basic_string(const basic_string& __str);
3567
3568 // _GLIBCXX_RESOLVE_LIB_DEFECTS
3569 // 2583. no way to supply an allocator for basic_string(str, pos)
3570 /**
3571 * @brief Construct string as copy of a substring.
3572 * @param __str Source string.
3573 * @param __pos Index of first character to copy from.
3574 * @param __a Allocator to use.
3575 */
3576 basic_string(const basic_string& __str, size_type __pos,
3577 const _Alloc& __a = _Alloc());
3578
3579 /**
3580 * @brief Construct string as copy of a substring.
3581 * @param __str Source string.
3582 * @param __pos Index of first character to copy from.
3583 * @param __n Number of characters to copy.
3584 */
3585 basic_string(const basic_string& __str, size_type __pos,
3586 size_type __n);
3587 /**
3588 * @brief Construct string as copy of a substring.
3589 * @param __str Source string.
3590 * @param __pos Index of first character to copy from.
3591 * @param __n Number of characters to copy.
3592 * @param __a Allocator to use.
3593 */
3594 basic_string(const basic_string& __str, size_type __pos,
3595 size_type __n, const _Alloc& __a);
3596
3597 /**
3598 * @brief Construct string initialized by a character %array.
3599 * @param __s Source character %array.
3600 * @param __n Number of characters to copy.
3601 * @param __a Allocator to use (default is default allocator).
3602 *
3603 * NB: @a __s must have at least @a __n characters, '\0'
3604 * has no special meaning.
3605 */
3606 basic_string(const _CharT* __s, size_type __n,
3607 const _Alloc& __a = _Alloc());
3608
3609 /**
3610 * @brief Construct string as copy of a C string.
3611 * @param __s Source C string.
3612 * @param __a Allocator to use (default is default allocator).
3613 */
3614#if __cpp_deduction_guides && ! defined _GLIBCXX_DEFINING_STRING_INSTANTIATIONS
3615 // _GLIBCXX_RESOLVE_LIB_DEFECTS
3616 // 3076. basic_string CTAD ambiguity
3617 template<typename = _RequireAllocator<_Alloc>>
3618#endif
3619 basic_string(const _CharT* __s, const _Alloc& __a = _Alloc())
3620 : _M_dataplus(_S_construct(__s, __s ? __s + traits_type::length(__s) :
3621 __s + npos, __a), __a)
3622 { }
3623
3624 /**
3625 * @brief Construct string as multiple characters.
3626 * @param __n Number of characters.
3627 * @param __c Character to use.
3628 * @param __a Allocator to use (default is default allocator).
3629 */
3630 basic_string(size_type __n, _CharT __c, const _Alloc& __a = _Alloc());
3631
3632#if __cplusplus >= 201103L
3633 /**
3634 * @brief Move construct string.
3635 * @param __str Source string.
3636 *
3637 * The newly-created string contains the exact contents of @a __str.
3638 * @a __str is a valid, but unspecified string.
3639 **/
3640 basic_string(basic_string&& __str)
3641#if _GLIBCXX_FULLY_DYNAMIC_STRING == 0
3642 noexcept // FIXME C++11: should always be noexcept.
3643#endif
3644 : _M_dataplus(std::move(__str._M_dataplus))
3645 {
3646#if _GLIBCXX_FULLY_DYNAMIC_STRING == 0
3647 __str._M_data(_S_empty_rep()._M_refdata());
3648#else
3649 __str._M_data(_S_construct(size_type(), _CharT(), get_allocator()));
3650#endif
3651 }
3652
3653 /**
3654 * @brief Construct string from an initializer %list.
3655 * @param __l std::initializer_list of characters.
3656 * @param __a Allocator to use (default is default allocator).
3657 */
3658 basic_string(initializer_list<_CharT> __l, const _Alloc& __a = _Alloc());
3659
3660 basic_string(const basic_string& __str, const _Alloc& __a)
3661 : _M_dataplus(__str._M_rep()->_M_grab(__a, __str.get_allocator()), __a)
3662 { }
3663
3664 basic_string(basic_string&& __str, const _Alloc& __a)
3665 : _M_dataplus(__str._M_data(), __a)
3666 {
3667 if (__a == __str.get_allocator())
3668 {
3669#if _GLIBCXX_FULLY_DYNAMIC_STRING == 0
3670 __str._M_data(_S_empty_rep()._M_refdata());
3671#else
3672 __str._M_data(_S_construct(size_type(), _CharT(), __a));
3673#endif
3674 }
3675 else
3676 _M_dataplus._M_p = _S_construct(__str.begin(), __str.end(), __a);
3677 }
3678#endif // C++11
3679
3680 /**
3681 * @brief Construct string as copy of a range.
3682 * @param __beg Start of range.
3683 * @param __end End of range.
3684 * @param __a Allocator to use (default is default allocator).
3685 */
3686 template<class _InputIterator>
3687 basic_string(_InputIterator __beg, _InputIterator __end,
3688 const _Alloc& __a = _Alloc());
3689
3690#if __cplusplus >= 201703L
3691 /**
3692 * @brief Construct string from a substring of a string_view.
3693 * @param __t Source object convertible to string view.
3694 * @param __pos The index of the first character to copy from __t.
3695 * @param __n The number of characters to copy from __t.
3696 * @param __a Allocator to use.
3697 */
3698 template<typename _Tp, typename = _If_sv<_Tp, void>>
3699 basic_string(const _Tp& __t, size_type __pos, size_type __n,
3700 const _Alloc& __a = _Alloc())
3701 : basic_string(_S_to_string_view(__t).substr(__pos, __n), __a) { }
3702
3703 /**
3704 * @brief Construct string from a string_view.
3705 * @param __t Source object convertible to string view.
3706 * @param __a Allocator to use (default is default allocator).
3707 */
3708 template<typename _Tp, typename = _If_sv<_Tp, void>>
3709 explicit
3710 basic_string(const _Tp& __t, const _Alloc& __a = _Alloc())
3711 : basic_string(__sv_wrapper(_S_to_string_view(__t)), __a) { }
3712#endif // C++17
3713
3714 /**
3715 * @brief Destroy the string instance.
3716 */
3717 ~basic_string() _GLIBCXX_NOEXCEPT
3718 { _M_rep()->_M_dispose(this->get_allocator()); }
3719
3720 /**
3721 * @brief Assign the value of @a str to this string.
3722 * @param __str Source string.
3723 */
3724 basic_string&
3725 operator=(const basic_string& __str)
3726 { return this->assign(__str); }
3727
3728 /**
3729 * @brief Copy contents of @a s into this string.
3730 * @param __s Source null-terminated string.
3731 */
3732 basic_string&
3733 operator=(const _CharT* __s)
3734 { return this->assign(__s); }
3735
3736 /**
3737 * @brief Set value to string of length 1.
3738 * @param __c Source character.
3739 *
3740 * Assigning to a character makes this string length 1 and
3741 * (*this)[0] == @a c.
3742 */
3743 basic_string&
3744 operator=(_CharT __c)
3745 {
3746 this->assign(1, __c);
3747 return *this;
3748 }
3749
3750#if __cplusplus >= 201103L
3751 /**
3752 * @brief Move assign the value of @a str to this string.
3753 * @param __str Source string.
3754 *
3755 * The contents of @a str are moved into this string (without copying).
3756 * @a str is a valid, but unspecified string.
3757 **/
3758 basic_string&
3759 operator=(basic_string&& __str)
3760 _GLIBCXX_NOEXCEPT_IF(allocator_traits<_Alloc>::is_always_equal::value)
3761 {
3762 // NB: DR 1204.
3763 this->swap(__str);
3764 return *this;
3765 }
3766
3767 /**
3768 * @brief Set value to string constructed from initializer %list.
3769 * @param __l std::initializer_list.
3770 */
3771 basic_string&
3772 operator=(initializer_list<_CharT> __l)
3773 {
3774 this->assign(__l.begin(), __l.size());
3775 return *this;
3776 }
3777#endif // C++11
3778
3779#if __cplusplus >= 201703L
3780 /**
3781 * @brief Set value to string constructed from a string_view.
3782 * @param __svt An object convertible to string_view.
3783 */
3784 template<typename _Tp>
3785 _If_sv<_Tp, basic_string&>
3786 operator=(const _Tp& __svt)
3787 { return this->assign(__svt); }
3788
3789 /**
3790 * @brief Convert to a string_view.
3791 * @return A string_view.
3792 */
3793 operator __sv_type() const noexcept
3794 { return __sv_type(data(), size()); }
3795#endif // C++17
3796
3797 // Iterators:
3798 /**
3799 * Returns a read/write iterator that points to the first character in
3800 * the %string. Unshares the string.
3801 */
3802 iterator
3803 begin() // FIXME C++11: should be noexcept.
3804 {
3805 _M_leak();
3806 return iterator(_M_data());
3807 }
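// [Editor's note] A short demonstration of the "Unshares the string" note
// above, assuming this reference-counted string (the pre-C++11 GCC ABI);
// a sketch, not a test from the source.
#if 0
#include <string>

void __demo()
{
  std::string __a("shared");
  std::string __b(__a);  // __a and __b share one _Rep, refcount bumped
  *__b.begin() = 'S';    // non-const begin() runs _M_leak(): __b clones
                         // its rep first, so __a still reads "shared"
}
#endif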
3808
3809 /**
3810 * Returns a read-only (constant) iterator that points to the first
3811 * character in the %string.
3812 */
3813 const_iterator
3814 begin() const _GLIBCXX_NOEXCEPT
3815 { return const_iterator(_M_data()); }
3816
3817 /**
3818 * Returns a read/write iterator that points one past the last
3819 * character in the %string. Unshares the string.
3820 */
3821 iterator
3822 end() // FIXME C++11: should be noexcept.
3823 {
3824 _M_leak();
3825 return iterator(_M_data() + this->size());
3826 }
3827
3828 /**
3829 * Returns a read-only (constant) iterator that points one past the
3830 * last character in the %string.
3831 */
3832 const_iterator
3833 end() const _GLIBCXX_NOEXCEPT
3834 { return const_iterator(_M_data() + this->size()); }
3835
3836 /**
3837 * Returns a read/write reverse iterator that points to the last
3838 * character in the %string. Iteration is done in reverse element
3839 * order. Unshares the string.
3840 */
3841 reverse_iterator
3842 rbegin() // FIXME C++11: should be noexcept.
3843 { return reverse_iterator(this->end()); }
3844
3845 /**
3846 * Returns a read-only (constant) reverse iterator that points
3847 * to the last character in the %string. Iteration is done in
3848 * reverse element order.
3849 */
3850 const_reverse_iterator
3851 rbegin() const _GLIBCXX_NOEXCEPT
3852 { return const_reverse_iterator(this->end()); }
3853
3854 /**
3855 * Returns a read/write reverse iterator that points to one before the
3856 * first character in the %string. Iteration is done in reverse
3857 * element order. Unshares the string.
3858 */
3859 reverse_iterator
3860 rend() // FIXME C++11: should be noexcept.
3861 { return reverse_iterator(this->begin()); }
3862
3863 /**
3864 * Returns a read-only (constant) reverse iterator that points
3865 * to one before the first character in the %string. Iteration
3866 * is done in reverse element order.
3867 */
3868 const_reverse_iterator
3869 rend() const _GLIBCXX_NOEXCEPT
3870 { return const_reverse_iterator(this->begin()); }
3871
3872#if __cplusplus >= 201103L
3873 /**
3874 * Returns a read-only (constant) iterator that points to the first
3875 * character in the %string.
3876 */
3877 const_iterator
3878 cbegin() const noexcept
3879 { return const_iterator(this->_M_data()); }
3880
3881 /**
3882 * Returns a read-only (constant) iterator that points one past the
3883 * last character in the %string.
3884 */
3885 const_iterator
3886 cend() const noexcept
3887 { return const_iterator(this->_M_data() + this->size()); }
3888
3889 /**
3890 * Returns a read-only (constant) reverse iterator that points
3891 * to the last character in the %string. Iteration is done in
3892 * reverse element order.
3893 */
3894 const_reverse_iterator
3895 crbegin() const noexcept
3896 { return const_reverse_iterator(this->end()); }
3897
3898 /**
3899 * Returns a read-only (constant) reverse iterator that points
3900 * to one before the first character in the %string. Iteration
3901 * is done in reverse element order.
3902 */
3903 const_reverse_iterator
3904 crend() const noexcept
3905 { return const_reverse_iterator(this->begin()); }
3906#endif
3907
3908 public:
3909 // Capacity:
3910 /// Returns the number of characters in the string, not including any
3911 /// null-termination.
3912 size_type
3913 size() const _GLIBCXX_NOEXCEPT
3914 { return _M_rep()->_M_length; }
3915
3916 /// Returns the number of characters in the string, not including any
3917 /// null-termination.
3918 size_type
3919 length() const _GLIBCXX_NOEXCEPT
3920 { return _M_rep()->_M_length; }
3921
3922 /// Returns the size() of the largest possible %string.
3923 size_type
3924 max_size() const _GLIBCXX_NOEXCEPT
3925 { return _Rep::_S_max_size; }
3926
3927 /**
3928 * @brief Resizes the %string to the specified number of characters.
3929 * @param __n Number of characters the %string should contain.
3930 * @param __c Character to fill any new elements.
3931 *
3932 * This function will %resize the %string to the specified
3933 * number of characters. If the number is smaller than the
3934 * %string's current size the %string is truncated, otherwise
3935 * the %string is extended and new elements are %set to @a __c.
3936 */
3937 void
3938 resize(size_type __n, _CharT __c);
3939
3940 /**
3941 * @brief Resizes the %string to the specified number of characters.
3942 * @param __n Number of characters the %string should contain.
3943 *
3944 * This function will resize the %string to the specified length. If
3945 * the new size is smaller than the %string's current size the %string
3946 * is truncated, otherwise the %string is extended and new characters
3947 * are default-constructed. For basic types such as char, this means
3948 * setting them to 0.
3949 */
3950 void
3951 resize(size_type __n)
3952 { this->resize(__n, _CharT()); }
3953
3954#if __cplusplus >= 201103L
3955 /// A non-binding request to reduce capacity() to size().
3956 void
3957 shrink_to_fit() _GLIBCXX_NOEXCEPT
3958 {
3959#if __cpp_exceptions
3960 if (capacity() > size())
3961 {
3962 try
3963 { reserve(0); }
3964 catch(...)
3965 { }
3966 }
3967#endif
3968 }
3969#endif
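// [Editor's note] Note the idiom above: shrink_to_fit is spelled reserve(0).
// In this implementation reserve() clamps the request up to size(), so a
// zero request reallocates down to a capacity of size(); the catch(...)
// keeps the non-binding request from ever propagating an exception.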
3970
3971 /**
3972 * Returns the total number of characters that the %string can hold
3973 * before needing to allocate more memory.
3974 */
3975 size_type
3976 capacity() const _GLIBCXX_NOEXCEPT
3977 { return _M_rep()->_M_capacity; }
3978
3979 /**
3980 * @brief Attempt to preallocate enough memory for specified number of
3981 * characters.
3982 * @param __res_arg Number of characters required.
3983 * @throw std::length_error If @a __res_arg exceeds @c max_size().
3984 *
3985 * This function attempts to reserve enough memory for the
3986 * %string to hold the specified number of characters. If the
3987 * number requested is more than max_size(), length_error is
3988 * thrown.
3989 *
3990 * The advantage of this function is that if optimal code is a
3991 * necessity and the user can determine the string length that will be
3992 * required, the user can reserve the memory in %advance, and thus
3993 * prevent a possible reallocation of memory and copying of %string
3994 * data.
3995 */
3996 void
3997 reserve(size_type __res_arg = 0);
3998
3999 /**
4000 * Erases the string, making it empty.
4001 */
4002#if _GLIBCXX_FULLY_DYNAMIC_STRING == 0
4003 void
4004 clear() _GLIBCXX_NOEXCEPT
4005 {
4006 if (_M_rep()->_M_is_shared())
4007 {
4008 _M_rep()->_M_dispose(this->get_allocator());
4009 _M_data(_S_empty_rep()._M_refdata());
4010 }
4011 else
4012 _M_rep()->_M_set_length_and_sharable(0);
4013 }
4014#else
4015 // PR 56166: this should not throw.
4016 void
4017 clear()
4018 { _M_mutate(0, this->size(), 0); }
4019#endif
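// [Editor's note] clear() above exercises both copy-on-write paths: a
// shared rep is read-only, so the string just drops its reference and
// rebinds to the static empty rep (O(1), no allocation); a uniquely owned
// rep is relabelled as length 0 in place and its storage is kept.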
4020
4021 /**
4022 * Returns true if the %string is empty. Equivalent to
4023 * <code>*this == ""</code>.
4024 */
4025 _GLIBCXX_NODISCARD bool
4026 empty() const _GLIBCXX_NOEXCEPT
4027 { return this->size() == 0; }
4028
4029 // Element access:
4030 /**
4031 * @brief Subscript access to the data contained in the %string.
4032 * @param __pos The index of the character to access.
4033 * @return Read-only (constant) reference to the character.
4034 *
4035 * This operator allows for easy, array-style, data access.
4036 * Note that data access with this operator is unchecked and
4037 * out_of_range lookups are not defined. (For checked lookups
4038 * see at().)
4039 */
4040 const_reference
4041 operator[] (size_type __pos) const _GLIBCXX_NOEXCEPT
4042 {
4043 __glibcxx_assert(__pos <= size());
4044 return _M_data()[__pos];
4045 }
4046
4047 /**
4048 * @brief Subscript access to the data contained in the %string.
4049 * @param __pos The index of the character to access.
4050 * @return Read/write reference to the character.
4051 *
4052 * This operator allows for easy, array-style, data access.
4053 * Note that data access with this operator is unchecked and
4054 * out_of_range lookups are not defined. (For checked lookups
4055 * see at().) Unshares the string.
4056 */
4057 reference
4058 operator[](size_type __pos)
4059 {
4060 // Allow pos == size() both in C++98 mode, as v3 extension,
4061 // and in C++11 mode.
4062 __glibcxx_assert(__pos <= size());
4063 // In pedantic mode be strict in C++98 mode.
4064 _GLIBCXX_DEBUG_PEDASSERT(__cplusplus >= 201103L || __pos < size());
4065 _M_leak();
4066 return _M_data()[__pos];
4067 }
4068
4069 /**
4070 * @brief Provides access to the data contained in the %string.
4071 * @param __n The index of the character to access.
4072 * @return Read-only (const) reference to the character.
4073 * @throw std::out_of_range If @a n is an invalid index.
4074 *
4075 * This function provides for safer data access. The parameter is
4076 * first checked that it is in the range of the string. The function
4077 * throws out_of_range if the check fails.
4078 */
4079 const_reference
4080 at(size_type __n) const
4081 {
4082 if (__n >= this->size())
4083 __throw_out_of_range_fmt(__N("basic_string::at: __n "
4084 "(which is %zu) >= this->size() "
4085 "(which is %zu)"),
4086 __n, this->size());
4087 return _M_data()[__n];
4088 }
4089
4090 /**
4091 * @brief Provides access to the data contained in the %string.
4092 * @param __n The index of the character to access.
4093 * @return Read/write reference to the character.
4094 * @throw std::out_of_range If @a n is an invalid index.
4095 *
4096 * This function provides for safer data access. The parameter is
4097 * first checked that it is in the range of the string. The function
4098 * throws out_of_range if the check fails. Success results in
4099 * unsharing the string.
4100 */
4101 reference
4102 at(size_type __n)
4103 {
4104 if (__n >= size())
4105 __throw_out_of_range_fmt(__N("basic_string::at: __n "
4106 "(which is %zu) >= this->size() "
4107 "(which is %zu)"),
4108 __n, this->size());
4109 _M_leak();
4110 return _M_data()[__n];
4111 }
4112
4113#if __cplusplus >= 201103L
4114 /**
4115 * Returns a read/write reference to the data at the first
4116 * element of the %string.
4117 */
4118 reference
4119 front()
4120 {
4121 __glibcxx_assert(!empty());
4122 return operator[](0);
4123 }
4124
4125 /**
4126 * Returns a read-only (constant) reference to the data at the first
4127 * element of the %string.
4128 */
4129 const_reference
4130 front() const noexcept
4131 {
4132 __glibcxx_assert(!empty());
4133 return operator[](0);
4134 }
4135
4136 /**
4137 * Returns a read/write reference to the data at the last
4138 * element of the %string.
4139 */
4140 reference
4141 back()
4142 {
4143 __glibcxx_assert(!empty());
4144 return operator[](this->size() - 1);
4145 }
4146
4147 /**
4148 * Returns a read-only (constant) reference to the data at the
4149 * last element of the %string.
4150 */
4151 const_reference
4152 back() const noexcept
4153 {
4154 __glibcxx_assert(!empty());
4155 return operator[](this->size() - 1);
4156 }
4157#endif
4158
4159 // Modifiers:
4160 /**
4161 * @brief Append a string to this string.
4162 * @param __str The string to append.
4163 * @return Reference to this string.
4164 */
4165 basic_string&
4166 operator+=(const basic_string& __str)
4167 { return this->append(__str); }
4168
4169 /**
4170 * @brief Append a C string.
4171 * @param __s The C string to append.
4172 * @return Reference to this string.
4173 */
4174 basic_string&
4175 operator+=(const _CharT* __s)
4176 { return this->append(__s); }
4177
4178 /**
4179 * @brief Append a character.
4180 * @param __c The character to append.
4181 * @return Reference to this string.
4182 */
4183 basic_string&
4184 operator+=(_CharT __c)
4185 {
4186 this->push_back(__c);
4187 return *this;
4188 }
4189
4190#if __cplusplus >= 201103L
4191 /**
4192 * @brief Append an initializer_list of characters.
4193 * @param __l The initializer_list of characters to be appended.
4194 * @return Reference to this string.
4195 */
4196 basic_string&
4197 operator+=(initializer_list<_CharT> __l)
4198 { return this->append(__l.begin(), __l.size()); }
4199#endif // C++11
4200
4201#if __cplusplus >= 201703L
4202 /**
4203 * @brief Append a string_view.
4204 * @param __svt The object convertible to string_view to be appended.
4205 * @return Reference to this string.
4206 */
4207 template<typename _Tp>
4208 _If_sv<_Tp, basic_string&>
4209 operator+=(const _Tp& __svt)
4210 { return this->append(__svt); }
4211#endif // C++17
4212
4213 /**
4214 * @brief Append a string to this string.
4215 * @param __str The string to append.
4216 * @return Reference to this string.
4217 */
4218 basic_string&
4219 append(const basic_string& __str);
4220
4221 /**
4222 * @brief Append a substring.
4223 * @param __str The string to append.
4224 * @param __pos Index of the first character of str to append.
4225 * @param __n The number of characters to append.
4226 * @return Reference to this string.
4227 * @throw std::out_of_range if @a __pos is not a valid index.
4228 *
4229 * This function appends @a __n characters from @a __str
4230 * starting at @a __pos to this string. If @a __n is larger
4231 * than the number of available characters in @a __str, the
4232 * remainder of @a __str is appended.
4233 */
4234 basic_string&
4235 append(const basic_string& __str, size_type __pos, size_type __n = npos);
4236
4237 /**
4238 * @brief Append a C substring.
4239 * @param __s The C string to append.
4240 * @param __n The number of characters to append.
4241 * @return Reference to this string.
4242 */
4243 basic_string&
4244 append(const _CharT* __s, size_type __n);
4245
4246 /**
4247 * @brief Append a C string.
4248 * @param __s The C string to append.
4249 * @return Reference to this string.
4250 */
4251 basic_string&
4252 append(const _CharT* __s)
4253 {
4254 __glibcxx_requires_string(__s);
4255 return this->append(__s, traits_type::length(__s));
4256 }
4257
4258 /**
4259 * @brief Append multiple characters.
4260 * @param __n The number of characters to append.
4261 * @param __c The character to use.
4262 * @return Reference to this string.
4263 *
4264 * Appends __n copies of __c to this string.
4265 */
4266 basic_string&
4267 append(size_type __n, _CharT __c);
4268
4269#if __cplusplus >= 201103L
4270 /**
4271 * @brief Append an initializer_list of characters.
4272 * @param __l The initializer_list of characters to append.
4273 * @return Reference to this string.
4274 */
4275 basic_string&
4276 append(initializer_list<_CharT> __l)
4277 { return this->append(__l.begin(), __l.size()); }
4278#endif // C++11
4279
4280 /**
4281 * @brief Append a range of characters.
4282 * @param __first Iterator referencing the first character to append.
4283 * @param __last Iterator marking the end of the range.
4284 * @return Reference to this string.
4285 *
4286 * Appends characters in the range [__first,__last) to this string.
4287 */
4288 template<class _InputIterator>
4289 basic_string&
4290 append(_InputIterator __first, _InputIterator __last)
4291 { return this->replace(_M_iend(), _M_iend(), __first, __last); }
4292
4293#if __cplusplus >= 201703L
4294 /**
4295 * @brief Append a string_view.
4296 * @param __svt The object convertible to string_view to be appended.
4297 * @return Reference to this string.
4298 */
4299 template<typename _Tp>
4300 _If_sv<_Tp, basic_string&>
4301 append(const _Tp& __svt)
4302 {
4303 __sv_type __sv = __svt;
4304 return this->append(__sv.data(), __sv.size());
4305 }
4306
4307 /**
4308 * @brief Append a range of characters from a string_view.
4309 * @param __svt The object convertible to string_view to be appended
4310 * from.
4311 * @param __pos The position in the string_view to append from.
4312 * @param __n The number of characters to append from the string_view.
4313 * @return Reference to this string.
4314 */
4315 template<typename _Tp>
4316 _If_sv<_Tp, basic_string&>
4317 append(const _Tp& __svt, size_type __pos, size_type __n = npos)
4318 {
4319 __sv_type __sv = __svt;
4320 return append(__sv.data()
4321 + std::__sv_check(__sv.size(), __pos, "basic_string::append"),
4322 std::__sv_limit(__sv.size(), __pos, __n));
4323 }
4324#endif // C++17
4325
4326 /**
4327 * @brief Append a single character.
4328 * @param __c Character to append.
4329 */
4330 void
4331 push_back(_CharT __c)
4332 {
4333 const size_type __len = 1 + this->size();
4334 if (__len > this->capacity() || _M_rep()->_M_is_shared())
4335 this->reserve(__len);
4336 traits_type::assign(_M_data()[this->size()], __c);
4337 _M_rep()->_M_set_length_and_sharable(__len);
4338 }
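// [Editor's note] push_back above is the canonical COW write path: grow
// when out of room *or* whenever the rep is shared, since a shared rep
// must not be written through. A usage sketch under the same pre-C++11
// ABI assumption as the earlier annotations:
#if 0
#include <string>

void __demo()
{
  std::string __s("ab");
  std::string __t(__s); // shared rep
  __t.push_back('c');   // _M_is_shared() is true: __t reserves a private
                        // copy before writing, so __s stays "ab"
}
#endif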
4339
4340 /**
4341 * @brief Set value to contents of another string.
4342 * @param __str Source string to use.
4343 * @return Reference to this string.
4344 */
4345 basic_string&
4346 assign(const basic_string& __str);
4347
4348#if __cplusplus >= 201103L
4349 /**
4350 * @brief Set value to contents of another string.
4351 * @param __str Source string to use.
4352 * @return Reference to this string.
4353 *
4354 * This function sets this string to the exact contents of @a __str.
4355 * @a __str is a valid, but unspecified string.
4356 */
4357 basic_string&
4358 assign(basic_string&& __str)
4359 noexcept(allocator_traits<_Alloc>::is_always_equal::value)
4360 {
4361 this->swap(__str);
4362 return *this;
4363 }
4364#endif // C++11
4365
4366 /**
4367 * @brief Set value to a substring of a string.
4368 * @param __str The string to use.
4369 * @param __pos Index of the first character of str.
4370 * @param __n Number of characters to use.
4371 * @return Reference to this string.
4372 * @throw std::out_of_range if @a __pos is not a valid index.
4373 *
4374 * This function sets this string to the substring of @a __str
4375 * consisting of @a __n characters at @a __pos. If @a __n is
4376 * larger than the number of available characters in @a
4377 * __str, the remainder of @a __str is used.
4378 */
4379 basic_string&
4380 assign(const basic_string& __str, size_type __pos, size_type __n = npos)
4381 { return this->assign(__str._M_data()
4382 + __str._M_check(__pos, "basic_string::assign"),
4383 __str._M_limit(__pos, __n)); }
4384
4385 /**
4386 * @brief Set value to a C substring.
4387 * @param __s The C string to use.
4388 * @param __n Number of characters to use.
4389 * @return Reference to this string.
4390 *
4391 * This function sets the value of this string to the first @a __n
4392 * characters of @a __s. If @a __n is larger than the number of
4393 * available characters in @a __s, the remainder of @a __s is used.
4394 */
4395 basic_string&
4396 assign(const _CharT* __s, size_type __n);
4397
4398 /**
4399 * @brief Set value to contents of a C string.
4400 * @param __s The C string to use.
4401 * @return Reference to this string.
4402 *
4403 * This function sets the value of this string to the value of @a __s.
4404 * The data is copied, so there is no dependence on @a __s once the
4405 * function returns.
4406 */
4407 basic_string&
4408 assign(const _CharT* __s)
4409 {
4410 __glibcxx_requires_string(__s);
4411 return this->assign(__s, traits_type::length(__s));
4412 }
4413
4414 /**
4415 * @brief Set value to multiple characters.
4416 * @param __n Length of the resulting string.
4417 * @param __c The character to use.
4418 * @return Reference to this string.
4419 *
4420 * This function sets the value of this string to @a __n copies of
4421 * character @a __c.
4422 */
4423 basic_string&
4424 assign(size_type __n, _CharT __c)
4425 { return _M_replace_aux(size_type(0), this->size(), __n, __c); }
4426
4427 /**
4428 * @brief Set value to a range of characters.
4429 * @param __first Iterator referencing the first character to append.
4430 * @param __last Iterator marking the end of the range.
4431 * @return Reference to this string.
4432 *
4433 * Sets value of string to characters in the range [__first,__last).
4434 */
4435 template<class _InputIterator>
4436 basic_string&
4437 assign(_InputIterator __first, _InputIterator __last)
4438 { return this->replace(_M_ibegin(), _M_iend(), __first, __last); }
4439
4440#if __cplusplus >= 201103L
4441 /**
4442 * @brief Set value to an initializer_list of characters.
4443 * @param __l The initializer_list of characters to assign.
4444 * @return Reference to this string.
4445 */
4446 basic_string&
4447 assign(initializer_list<_CharT> __l)
4448 { return this->assign(__l.begin(), __l.size()); }
4449#endif // C++11
4450
4451#if __cplusplus >= 201703L
4452 /**
4453 * @brief Set value from a string_view.
4454 * @param __svt The source object convertible to string_view.
4455 * @return Reference to this string.
4456 */
4457 template<typename _Tp>
4458 _If_sv<_Tp, basic_string&>
4459 assign(const _Tp& __svt)
4460 {
4461 __sv_type __sv = __svt;
4462 return this->assign(__sv.data(), __sv.size());
4463 }
4464
4465 /**
4466 * @brief Set value from a range of characters in a string_view.
4467 * @param __svt The source object convertible to string_view.
4468 * @param __pos The position in the string_view to assign from.
4469 * @param __n The number of characters to assign.
4470 * @return Reference to this string.
4471 */
4472 template<typename _Tp>
4473 _If_sv<_Tp, basic_string&>
4474 assign(const _Tp& __svt, size_type __pos, size_type __n = npos)
4475 {
4476 __sv_type __sv = __svt;
4477 return assign(__sv.data()
4478 + std::__sv_check(__sv.size(), __pos, "basic_string::assign"),
4479 std::__sv_limit(__sv.size(), __pos, __n));
4480 }
4481#endif // C++17
4482
4483 /**
4484 * @brief Insert multiple characters.
4485 * @param __p Iterator referencing location in string to insert at.
4486 * @param __n Number of characters to insert
4487 * @param __c The character to insert.
4488 * @throw std::length_error If new length exceeds @c max_size().
4489 *
4490 * Inserts @a __n copies of character @a __c starting at the
4491 * position referenced by iterator @a __p. If adding
4492 * characters causes the length to exceed max_size(),
4493 * length_error is thrown. The value of the string doesn't
4494 * change if an error is thrown.
4495 */
4496 void
4497 insert(iterator __p, size_type __n, _CharT __c)
4498 { this->replace(__p, __p, __n, __c); }
4499
4500 /**
4501 * @brief Insert a range of characters.
4502 * @param __p Iterator referencing location in string to insert at.
4503 * @param __beg Start of range.
4504 * @param __end End of range.
4505 * @throw std::length_error If new length exceeds @c max_size().
4506 *
4507 * Inserts characters in range [__beg,__end). If adding
4508 * characters causes the length to exceed max_size(),
4509 * length_error is thrown. The value of the string doesn't
4510 * change if an error is thrown.
4511 */
4512 template<class _InputIterator>
4513 void
4514 insert(iterator __p, _InputIterator __beg, _InputIterator __end)
4515 { this->replace(__p, __p, __beg, __end); }
4516
4517#if __cplusplus >= 201103L
4518 /**
4519 * @brief Insert an initializer_list of characters.
4520 * @param __p Iterator referencing location in string to insert at.
4521 * @param __l The initializer_list of characters to insert.
4522 * @throw std::length_error If new length exceeds @c max_size().
4523 */
4524 void
4525 insert(iterator __p, initializer_list<_CharT> __l)
4526 {
4527 _GLIBCXX_DEBUG_PEDASSERT(__p >= _M_ibegin() && __p <= _M_iend());
4528 this->insert(__p - _M_ibegin(), __l.begin(), __l.size());
4529 }
4530#endif // C++11
4531
4532 /**
4533 * @brief Insert value of a string.
4534 * @param __pos1 Position in string to insert at.
4535 * @param __str The string to insert.
4536 * @return Reference to this string.
4537 * @throw std::length_error If new length exceeds @c max_size().
4538 *
4539 * Inserts value of @a __str starting at @a __pos1. If adding
4540 * characters causes the length to exceed max_size(),
4541 * length_error is thrown. The value of the string doesn't
4542 * change if an error is thrown.
4543 */
4544 basic_string&
4545 insert(size_type __pos1, const basic_string& __str)
4546 { return this->insert(__pos1, __str, size_type(0), __str.size()); }
4547
4548 /**
4549 * @brief Insert a substring.
4550 * @param __pos1 Position in string to insert at.
4551 * @param __str The string to insert.
4552 * @param __pos2 Start of characters in str to insert.
4553 * @param __n Number of characters to insert.
4554 * @return Reference to this string.
4555 * @throw std::length_error If new length exceeds @c max_size().
4556 * @throw std::out_of_range If @a __pos1 > size() or
4557 * @a __pos2 > @a __str.size().
4558 *
4559 * Starting at @a __pos1, insert @a __n characters of @a __str
4560 * beginning with @a __pos2. If adding characters causes the
4561 * length to exceed max_size(), length_error is thrown. If @a
4562 * __pos1 is beyond the end of this string or @a __pos2 is
4563 * beyond the end of @a __str, out_of_range is thrown. The
4564 * value of the string doesn't change if an error is thrown.
4565 */
4566 basic_string&
4567 insert(size_type __pos1, const basic_string& __str,
4568 size_type __pos2, size_type __n = npos)
4569 { return this->insert(__pos1, __str._M_data()
4570 + __str._M_check(__pos2, "basic_string::insert"),
4571 __str._M_limit(__pos2, __n)); }
4572
4573 /**
4574 * @brief Insert a C substring.
4575 * @param __pos Position in string to insert at.
4576 * @param __s The C string to insert.
4577 * @param __n The number of characters to insert.
4578 * @return Reference to this string.
4579 * @throw std::length_error If new length exceeds @c max_size().
4580 * @throw std::out_of_range If @a __pos is beyond the end of this
4581 * string.
4582 *
4583 * Inserts the first @a __n characters of @a __s starting at @a
4584 * __pos. If adding characters causes the length to exceed
4585 * max_size(), length_error is thrown. If @a __pos is beyond
4586 * end(), out_of_range is thrown. The value of the string
4587 * doesn't change if an error is thrown.
4588 */
4589 basic_string&
4590 insert(size_type __pos, const _CharT* __s, size_type __n);
4591
4592 /**
4593 * @brief Insert a C string.
4594 * @param __pos Position in string to insert at.
4595 * @param __s The C string to insert.
4596 * @return Reference to this string.
4597 * @throw std::length_error If new length exceeds @c max_size().
4598 * @throw std::out_of_range If @a pos is beyond the end of this
4599 * string.
4600 *
4601 * Inserts the characters of @a __s starting at @a __pos. If
4602 * adding characters causes the length to exceed max_size(),
4603 * length_error is thrown. If @a __pos is beyond end(), out_of_range is
4604 * thrown. The value of the string doesn't change if an error is
4605 * thrown.
4606 */
4607 basic_string&
4608 insert(size_type __pos, const _CharT* __s)
4609 {
4610 __glibcxx_requires_string(__s);
4611 return this->insert(__pos, __s, traits_type::length(__s));
4612 }
4613
4614 /**
4615 * @brief Insert multiple characters.
4616 * @param __pos Index in string to insert at.
4617 * @param __n Number of characters to insert
4618 * @param __c The character to insert.
4619 * @return Reference to this string.
4620 * @throw std::length_error If new length exceeds @c max_size().
4621 * @throw std::out_of_range If @a __pos is beyond the end of this
4622 * string.
4623 *
4624 * Inserts @a __n copies of character @a __c starting at index
4625 * @a __pos. If adding characters causes the length to exceed
4626 * max_size(), length_error is thrown. If @a __pos > length(),
4627 * out_of_range is thrown. The value of the string doesn't
4628 * change if an error is thrown.
4629 */
4630 basic_string&
4631 insert(size_type __pos, size_type __n, _CharT __c)
4632 { return _M_replace_aux(_M_check(__pos, "basic_string::insert"),
4633 size_type(0), __n, __c); }
4634
4635 /**
4636 * @brief Insert one character.
4637 * @param __p Iterator referencing position in string to insert at.
4638 * @param __c The character to insert.
4639 * @return Iterator referencing newly inserted char.
4640 * @throw std::length_error If new length exceeds @c max_size().
4641 *
4642 * Inserts character @a __c at position referenced by @a __p.
4643 * If adding character causes the length to exceed max_size(),
4644 * length_error is thrown. If @a __p is beyond end of string,
4645 * out_of_range is thrown. The value of the string doesn't
4646 * change if an error is thrown.
4647 */
4648 iterator
4649 insert(iterator __p, _CharT __c)
4650 {
4651 _GLIBCXX_DEBUG_PEDASSERT(__p >= _M_ibegin() && __p <= _M_iend());
4652 const size_type __pos = __p - _M_ibegin();
4653 _M_replace_aux(__pos, size_type(0), size_type(1), __c);
4654 _M_rep()->_M_set_leaked();
4655 return iterator(_M_data() + __pos);
4656 }
4657
4658#if __cplusplus >= 201703L
4659 /**
4660 * @brief Insert a string_view.
4661 * @param __pos Position in string to insert at.
4662 * @param __svt The object convertible to string_view to insert.
4663 * @return Reference to this string.
4664 */
4665 template<typename _Tp>
4666 _If_sv<_Tp, basic_string&>
4667 insert(size_type __pos, const _Tp& __svt)
4668 {
4669 __sv_type __sv = __svt;
4670 return this->insert(__pos, __sv.data(), __sv.size());
4671 }
4672
4673 /**
4674 * @brief Insert a string_view.
4675 * @param __pos1 Position in string to insert at.
4676 * @param __svt The object convertible to string_view to insert from.
4677 * @param __pos2 Position in string_view to insert
4678 * from.
4679 * @param __n The number of characters to insert.
4680 * @return Reference to this string.
4681 */
4682 template<typename _Tp>
4683 _If_sv<_Tp, basic_string&>
4684 insert(size_type __pos1, const _Tp& __svt,
4685 size_type __pos2, size_type __n = npos)
4686 {
4687 __sv_type __sv = __svt;
4688 return this->replace(__pos1, size_type(0), __sv.data()
4689 + std::__sv_check(__sv.size(), __pos2, "basic_string::insert"),
4690 std::__sv_limit(__sv.size(), __pos2, __n));
4691 }
4692#endif // C++17
4693
4694 /**
4695 * @brief Remove characters.
4696 * @param __pos Index of first character to remove (default 0).
4697 * @param __n Number of characters to remove (default remainder).
4698 * @return Reference to this string.
4699 * @throw std::out_of_range If @a __pos is beyond the end of this
4700 * string.
4701 *
4702 * Removes @a __n characters from this string starting at @a
4703 * __pos. The length of the string is reduced by @a __n. If
4704 * there are < @a __n characters to remove, the remainder of
4705 * the string is truncated. If @a __pos is beyond end of string,
4706 * out_of_range is thrown. The value of the string doesn't
4707 * change if an error is thrown.
4708 */
4709 basic_string&
4710 erase(size_type __pos = 0, size_type __n = npos)
4711 {
4712 _M_mutate(_M_check(__pos, "basic_string::erase"),
4713 _M_limit(__pos, __n), size_type(0));
4714 return *this;
4715 }
4716
4717 /**
4718 * @brief Remove one character.
4719 * @param __position Iterator referencing the character to remove.
4720 * @return iterator referencing same location after removal.
4721 *
4722 * Removes the character at @a __position from this string. The value
4723 * of the string doesn't change if an error is thrown.
4724 */
4725 iterator
4726 erase(iterator __position)
4727 {
4728 _GLIBCXX_DEBUG_PEDASSERT(__position >= _M_ibegin()
4729 && __position < _M_iend());
4730 const size_type __pos = __position - _M_ibegin();
4731 _M_mutate(__pos, size_type(1), size_type(0));
4732 _M_rep()->_M_set_leaked();
4733 return iterator(_M_data() + __pos);
4734 }
4735
4736 /**
4737 * @brief Remove a range of characters.
4738 * @param __first Iterator referencing the first character to remove.
4739 * @param __last Iterator referencing the end of the range.
4740 * @return Iterator referencing location of first after removal.
4741 *
4742 * Removes the characters in the range [__first,__last) from this string.
4743 * The value of the string doesn't change if an error is thrown.
4744 */
4745 iterator
4746 erase(iterator __first, iterator __last);
4747
4748#if __cplusplus >= 201103L
4749 /**
4750 * @brief Remove the last character.
4751 *
4752 * The string must be non-empty.
4753 */
4754 void
4755 pop_back() // FIXME C++11: should be noexcept.
4756 {
4757 __glibcxx_assert(!empty());
4758 erase(size() - 1, 1);
4759 }
4760#endif // C++11
4761
4762 /**
4763 * @brief Replace characters with value from another string.
4764 * @param __pos Index of first character to replace.
4765 * @param __n Number of characters to be replaced.
4766 * @param __str String to insert.
4767 * @return Reference to this string.
4768 * @throw std::out_of_range If @a __pos is beyond the end of this
4769 * string.
4770 * @throw std::length_error If new length exceeds @c max_size().
4771 *
4772 * Removes the characters in the range [__pos,__pos+__n) from
4773 * this string. In place, the value of @a __str is inserted.
4774 * If @a __pos is beyond end of string, out_of_range is thrown.
4775 * If the length of the result exceeds max_size(), length_error
4776 * is thrown. The value of the string doesn't change if an
4777 * error is thrown.
4778 */
4779 basic_string&
4780 replace(size_type __pos, size_type __n, const basic_string& __str)
4781 { return this->replace(__pos, __n, __str._M_data(), __str.size()); }
4782
4783 /**
4784 * @brief Replace characters with value from another string.
4785 * @param __pos1 Index of first character to replace.
4786 * @param __n1 Number of characters to be replaced.
4787 * @param __str String to insert.
4788 * @param __pos2 Index of first character of str to use.
4789 * @param __n2 Number of characters from str to use.
4790 * @return Reference to this string.
4791 * @throw std::out_of_range If @a __pos1 > size() or @a __pos2 >
4792 * __str.size().
4793 * @throw std::length_error If new length exceeds @c max_size().
4794 *
4795 * Removes the characters in the range [__pos1,__pos1 + __n1) from this
4796 * string. In place, the value of @a __str is inserted. If @a __pos1 is
4797 * beyond end of string, out_of_range is thrown. If the length of the
4798 * result exceeds max_size(), length_error is thrown. The value of the
4799 * string doesn't change if an error is thrown.
4800 */
4801 basic_string&
4802 replace(size_type __pos1, size_type __n1, const basic_string& __str,
4803 size_type __pos2, size_type __n2 = npos)
4804 { return this->replace(__pos1, __n1, __str._M_data()
4805 + __str._M_check(__pos2, "basic_string::replace"),
4806 __str._M_limit(__pos2, __n2)); }
4807
4808 /**
4809 * @brief Replace characters with value of a C substring.
4810 * @param __pos Index of first character to replace.
4811 * @param __n1 Number of characters to be replaced.
4812 * @param __s C string to insert.
4813 * @param __n2 Number of characters from @a s to use.
4814 * @return Reference to this string.
4815 * @throw std::out_of_range If @a __pos > size().
4816 * @throw std::length_error If new length exceeds @c max_size().
4817 *
4818 * Removes the characters in the range [__pos,__pos + __n1)
4819 * from this string. In place, the first @a __n2 characters of
4820 * @a __s are inserted, or all of @a __s if @a __n2 is too large. If
4821 * @a __pos is beyond end of string, out_of_range is thrown. If
4822 * the length of result exceeds max_size(), length_error is
4823 * thrown. The value of the string doesn't change if an error
4824 * is thrown.
4825 */
4826 basic_string&
4827 replace(size_type __pos, size_type __n1, const _CharT* __s,
4828 size_type __n2);
4829
4830 /**
4831 * @brief Replace characters with value of a C string.
4832 * @param __pos Index of first character to replace.
4833 * @param __n1 Number of characters to be replaced.
4834 * @param __s C string to insert.
4835 * @return Reference to this string.
4836 * @throw std::out_of_range If @a __pos > size().
4837 * @throw std::length_error If new length exceeds @c max_size().
4838 *
4839 * Removes the characters in the range [__pos,__pos + __n1)
4840 * from this string. In place, the characters of @a __s are
4841 * inserted. If @a __pos is beyond end of string, out_of_range
4842 * is thrown. If the length of result exceeds max_size(),
4843 * length_error is thrown. The value of the string doesn't
4844 * change if an error is thrown.
4845 */
4846 basic_string&
4847 replace(size_type __pos, size_type __n1, const _CharT* __s)
4848 {
4849 __glibcxx_requires_string(__s);
4850 return this->replace(__pos, __n1, __s, traits_type::length(__s));
4851 }
4852
4853 /**
4854 * @brief Replace characters with multiple characters.
4855 * @param __pos Index of first character to replace.
4856 * @param __n1 Number of characters to be replaced.
4857 * @param __n2 Number of characters to insert.
4858 * @param __c Character to insert.
4859 * @return Reference to this string.
4860 * @throw std::out_of_range If @a __pos > size().
4861 * @throw std::length_error If new length exceeds @c max_size().
4862 *
4863 * Removes the characters in the range [__pos,__pos + __n1) from this
4864 * string. In place, @a __n2 copies of @a __c are inserted.
4865 * If @a __pos is beyond end of string, out_of_range is thrown.
4866 * If the length of result exceeds max_size(), length_error is
4867 * thrown. The value of the string doesn't change if an error
4868 * is thrown.
4869 */
4870 basic_string&
4871 replace(size_type __pos, size_type __n1, size_type __n2, _CharT __c)
4872 { return _M_replace_aux(_M_check(__pos, "basic_string::replace"),
4873 _M_limit(__pos, __n1), __n2, __c); }
4874
4875 /**
4876 * @brief Replace range of characters with string.
4877 * @param __i1 Iterator referencing start of range to replace.
4878 * @param __i2 Iterator referencing end of range to replace.
4879 * @param __str String value to insert.
4880 * @return Reference to this string.
4881 * @throw std::length_error If new length exceeds @c max_size().
4882 *
4883 * Removes the characters in the range [__i1,__i2). In place,
4884 * the value of @a __str is inserted. If the length of result
4885 * exceeds max_size(), length_error is thrown. The value of
4886 * the string doesn't change if an error is thrown.
4887 */
4888 basic_string&
4889 replace(iterator __i1, iterator __i2, const basic_string& __str)
4890 { return this->replace(__i1, __i2, __str._M_data(), __str.size()); }
4891
4892 /**
4893 * @brief Replace range of characters with C substring.
4894 * @param __i1 Iterator referencing start of range to replace.
4895 * @param __i2 Iterator referencing end of range to replace.
4896 * @param __s C string value to insert.
4897 * @param __n Number of characters from @a __s to insert.
4898 * @return Reference to this string.
4899 * @throw std::length_error If new length exceeds @c max_size().
4900 *
4901 * Removes the characters in the range [__i1,__i2). In place,
4902 * the first @a __n characters of @a __s are inserted. If the
4903 * length of result exceeds max_size(), length_error is thrown.
4904 * The value of the string doesn't change if an error is
4905 * thrown.
4906 */
4907 basic_string&
4908 replace(iterator __i1, iterator __i2, const _CharT* __s, size_type __n)
4909 {
4910 _GLIBCXX_DEBUG_PEDASSERT(_M_ibegin() <= __i1 && __i1 <= __i2
4911 && __i2 <= _M_iend());
4912 return this->replace(__i1 - _M_ibegin(), __i2 - __i1, __s, __n);
4913 }
4914
4915 /**
4916 * @brief Replace range of characters with C string.
4917 * @param __i1 Iterator referencing start of range to replace.
4918 * @param __i2 Iterator referencing end of range to replace.
4919 * @param __s C string value to insert.
4920 * @return Reference to this string.
4921 * @throw std::length_error If new length exceeds @c max_size().
4922 *
4923 * Removes the characters in the range [__i1,__i2). In place,
4924 * the characters of @a __s are inserted. If the length of
4925 * result exceeds max_size(), length_error is thrown. The
4926 * value of the string doesn't change if an error is thrown.
4927 */
4928 basic_string&
4929 replace(iterator __i1, iterator __i2, const _CharT* __s)
4930 {
4931 __glibcxx_requires_string(__s);
4932 return this->replace(__i1, __i2, __s, traits_type::length(__s));
4933 }
4934
4935 /**
4936 * @brief Replace range of characters with multiple characters
4937 * @param __i1 Iterator referencing start of range to replace.
4938 * @param __i2 Iterator referencing end of range to replace.
4939 * @param __n Number of characters to insert.
4940 * @param __c Character to insert.
4941 * @return Reference to this string.
4942 * @throw std::length_error If new length exceeds @c max_size().
4943 *
4944 * Removes the characters in the range [__i1,__i2). In place,
4945 * @a __n copies of @a __c are inserted. If the length of
4946 * result exceeds max_size(), length_error is thrown. The
4947 * value of the string doesn't change if an error is thrown.
4948 */
4949 basic_string&
4950 replace(iterator __i1, iterator __i2, size_type __n, _CharT __c)
4951 {
4952 _GLIBCXX_DEBUG_PEDASSERT(_M_ibegin() <= __i1 && __i1 <= __i2
4953 && __i2 <= _M_iend());
4954 return _M_replace_aux(__i1 - _M_ibegin(), __i2 - __i1, __n, __c);
4955 }
4956
4957 /**
4958 * @brief Replace range of characters with range.
4959 * @param __i1 Iterator referencing start of range to replace.
4960 * @param __i2 Iterator referencing end of range to replace.
4961 * @param __k1 Iterator referencing start of range to insert.
4962 * @param __k2 Iterator referencing end of range to insert.
4963 * @return Reference to this string.
4964 * @throw std::length_error If new length exceeds @c max_size().
4965 *
4966 * Removes the characters in the range [__i1,__i2). In place,
4967 * characters in the range [__k1,__k2) are inserted. If the
4968 * length of result exceeds max_size(), length_error is thrown.
4969 * The value of the string doesn't change if an error is
4970 * thrown.
4971 */
4972 template<class _InputIterator>
4973 basic_string&
4974 replace(iterator __i1, iterator __i2,
4975 _InputIterator __k1, _InputIterator __k2)
4976 {
4977 _GLIBCXX_DEBUG_PEDASSERT(_M_ibegin() <= __i1 && __i1 <= __i2
4978 && __i2 <= _M_iend());
4979 __glibcxx_requires_valid_range(__k1, __k2);
4980 typedef typename std::__is_integer<_InputIterator>::__type _Integral;
4981 return _M_replace_dispatch(__i1, __i2, __k1, __k2, _Integral());
4982 }
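      // Illustrative usage sketch (editorial addition): replacing an
      // iterator-delimited range from another character sequence,
      // assuming std::string:
      //   std::string s("one two three");
      //   const char digit[] = "2";
      //   s.replace(s.begin() + 4, s.begin() + 7, digit, digit + 1);
      //   // s == "one 2 three"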
4983
4984 // Specializations for the common case of pointer and iterator:
4985 // useful to avoid the overhead of temporary buffering in _M_replace.
4986 basic_string&
4987 replace(iterator __i1, iterator __i2, _CharT* __k1, _CharT* __k2)
4988 {
4989 _GLIBCXX_DEBUG_PEDASSERT(_M_ibegin() <= __i1 && __i1 <= __i2
4990 && __i2 <= _M_iend());
4991 __glibcxx_requires_valid_range(__k1, __k2);
4992 return this->replace(__i1 - _M_ibegin(), __i2 - __i1,
4993 __k1, __k2 - __k1);
4994 }
4995
4996 basic_string&
4997 replace(iterator __i1, iterator __i2,
4998 const _CharT* __k1, const _CharT* __k2)
4999 {
5000 _GLIBCXX_DEBUG_PEDASSERT(_M_ibegin() <= __i1 && __i1 <= __i2
5001 && __i2 <= _M_iend());
5002 __glibcxx_requires_valid_range(__k1, __k2);
5003 return this->replace(__i1 - _M_ibegin(), __i2 - __i1,
5004 __k1, __k2 - __k1);
5005 }
5006
5007 basic_string&
5008 replace(iterator __i1, iterator __i2, iterator __k1, iterator __k2)
5009 {
5010 _GLIBCXX_DEBUG_PEDASSERT(_M_ibegin() <= __i1 && __i1 <= __i2
5011 && __i2 <= _M_iend());
5012 __glibcxx_requires_valid_range(__k1, __k2);
5013 return this->replace(__i1 - _M_ibegin(), __i2 - __i1,
5014 __k1.base(), __k2 - __k1);
5015 }
5016
5017 basic_string&
5018 replace(iterator __i1, iterator __i2,
5019 const_iterator __k1, const_iterator __k2)
5020 {
5021 _GLIBCXX_DEBUG_PEDASSERT(_M_ibegin() <= __i1 && __i1 <= __i2
5022 && __i2 <= _M_iend());
5023 __glibcxx_requires_valid_range(__k1, __k2);
5024 return this->replace(__i1 - _M_ibegin(), __i2 - __i1,
5025 __k1.base(), __k2 - __k1);
5026 }
5027
5028#if __cplusplus >= 201103L
5029 /**
5030 * @brief Replace range of characters with initializer_list.
5031 * @param __i1 Iterator referencing start of range to replace.
5032 * @param __i2 Iterator referencing end of range to replace.
5033 * @param __l The initializer_list of characters to insert.
5034 * @return Reference to this string.
5035 * @throw std::length_error If new length exceeds @c max_size().
5036 *
5037 * Removes the characters in the range [__i1,__i2). In place,
5038 * characters from @a __l are inserted. If the
5039 * length of result exceeds max_size(), length_error is thrown.
5040 * The value of the string doesn't change if an error is
5041 * thrown.
5042 */
5043 basic_string& replace(iterator __i1, iterator __i2,
5044 initializer_list<_CharT> __l)
5045 { return this->replace(__i1, __i2, __l.begin(), __l.end()); }
5046#endif // C++11
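      // Illustrative usage sketch (editorial addition) for the
      // initializer_list overload above, assuming std::string:
      //   std::string s("abcdef");
      //   s.replace(s.begin(), s.begin() + 3, {'x', 'y'}); // s == "xydef"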
5047
5048#if __cplusplus >= 201703L
5049 /**
5050 * @brief Replace range of characters with string_view.
5051 * @param __pos The position to replace at.
5052 * @param __n The number of characters to replace.
5053 * @param __svt The object convertible to string_view to insert.
5054 * @return Reference to this string.
5055 */
5056 template<typename _Tp>
5057 _If_sv<_Tp, basic_string&>
5058 replace(size_type __pos, size_type __n, const _Tp& __svt)
5059 {
5060 __sv_type __sv = __svt;
5061 return this->replace(__pos, __n, __sv.data(), __sv.size());
5062 }
5063
5064 /**
5065 * @brief Replace range of characters with string_view.
5066 * @param __pos1 The position to replace at.
5067 * @param __n1 The number of characters to replace.
5068 * @param __svt The object convertible to string_view to insert from.
5069 * @param __pos2 The position in the string_view to insert from.
5070 * @param __n2 The number of characters to insert.
5071 * @return Reference to this string.
5072 */
5073 template<typename _Tp>
5074 _If_sv<_Tp, basic_string&>
5075 replace(size_type __pos1, size_type __n1, const _Tp& __svt,
5076 size_type __pos2, size_type __n2 = npos)
5077 {
5078 __sv_type __sv = __svt;
5079 return this->replace(__pos1, __n1,
5080 __sv.data()
5081 + std::__sv_check(__sv.size(), __pos2, "basic_string::replace"),
5082 std::__sv_limit(__sv.size(), __pos2, __n2));
5083 }
5084
5085 /**
5086 * @brief Replace range of characters with string_view.
5087 * @param __i1 An iterator referencing the start position
5088 to replace at.
5089 * @param __i2 An iterator referencing the end position
5090 for the replace.
5091 * @param __svt The object convertible to string_view to insert from.
5092 * @return Reference to this string.
5093 */
5094 template<typename _Tp>
5095 _If_sv<_Tp, basic_string&>
5096 replace(const_iterator __i1, const_iterator __i2, const _Tp& __svt)
5097 {
5098 __sv_type __sv = __svt;
5099 return this->replace(__i1 - begin(), __i2 - __i1, __sv);
5100 }
5101#endif // C++17
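      // Illustrative usage sketch (editorial addition) for the C++17
      // string_view overloads above, assuming std::string:
      //   std::string s("name=value");
      //   std::string_view sv("key");
      //   s.replace(0, 4, sv);          // s == "key=value"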
5102
5103 private:
5104 template<class _Integer>
5105 basic_string&
5106 _M_replace_dispatch(iterator __i1, iterator __i2, _Integer __n,
5107 _Integer __val, __true_type)
5108 { return _M_replace_aux(__i1 - _M_ibegin(), __i2 - __i1, __n, __val); }
5109
5110 template<class _InputIterator>
5111 basic_string&
5112 _M_replace_dispatch(iterator __i1, iterator __i2, _InputIterator __k1,
5113 _InputIterator __k2, __false_type);
5114
5115 basic_string&
5116 _M_replace_aux(size_type __pos1, size_type __n1, size_type __n2,
5117 _CharT __c);
5118
5119 basic_string&
5120 _M_replace_safe(size_type __pos1, size_type __n1, const _CharT* __s,
5121 size_type __n2);
5122
5123 // _S_construct_aux is used to implement the 21.3.1 para 15 which
5124 // requires special behaviour if _InIter is an integral type
5125 template<class _InIterator>
5126 static _CharT*
5127 _S_construct_aux(_InIterator __beg, _InIterator __end,
5128 const _Alloc& __a, __false_type)
5129 {
5130 typedef typename iterator_traits<_InIterator>::iterator_category _Tag;
5131 return _S_construct(__beg, __end, __a, _Tag());
5132 }
5133
5134 // _GLIBCXX_RESOLVE_LIB_DEFECTS
5135 // 438. Ambiguity in the "do the right thing" clause
5136 template<class _Integer>
5137 static _CharT*
5138 _S_construct_aux(_Integer __beg, _Integer __end,
5139 const _Alloc& __a, __true_type)
5140 { return _S_construct_aux_2(static_cast<size_type>(__beg),
5141 __end, __a); }
5142
5143 static _CharT*
5144 _S_construct_aux_2(size_type __req, _CharT __c, const _Alloc& __a)
5145 { return _S_construct(__req, __c, __a); }
5146
5147 template<class _InIterator>
5148 static _CharT*
5149 _S_construct(_InIterator __beg, _InIterator __end, const _Alloc& __a)
5150 {
5151 typedef typename std::__is_integer<_InIterator>::__type _Integral;
5152 return _S_construct_aux(__beg, __end, __a, _Integral());
5153 }
5154
5155 // For Input Iterators, used in istreambuf_iterators, etc.
5156 template<class _InIterator>
5157 static _CharT*
5158 _S_construct(_InIterator __beg, _InIterator __end, const _Alloc& __a,
5159 input_iterator_tag);
5160
5161 // For forward_iterators up to random_access_iterators, used for
5162 // string::iterator, _CharT*, etc.
5163 template<class _FwdIterator>
5164 static _CharT*
5165 _S_construct(_FwdIterator __beg, _FwdIterator __end, const _Alloc& __a,
5166 forward_iterator_tag);
5167
5168 static _CharT*
5169 _S_construct(size_type __req, _CharT __c, const _Alloc& __a);
5170
5171 public:
5172
5173 /**
5174 * @brief Copy substring into C string.
5175 * @param __s C string to copy value into.
5176 * @param __n Number of characters to copy.
5177 * @param __pos Index of first character to copy.
5178 * @return Number of characters actually copied
5179 * @throw std::out_of_range If __pos > size().
5180 *
5181 * Copies up to @a __n characters starting at @a __pos into the
5182 * C string @a __s. If @a __pos is %greater than size(),
5183 * out_of_range is thrown.
5184 */
5185 size_type
5186 copy(_CharT* __s, size_type __n, size_type __pos = 0) const;
5187
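      // Illustrative usage sketch (editorial addition) for copy(),
      // assuming std::string; note that no null terminator is appended:
      //   std::string s("abcdef");
      //   char buf[3];
      //   std::string::size_type n = s.copy(buf, 3, 2);
      //   // buf holds 'c', 'd', 'e'; n == 3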
5188 /**
5189 * @brief Swap contents with another string.
5190 * @param __s String to swap with.
5191 *
5192 * Exchanges the contents of this string with that of @a __s in constant
5193 * time.
5194 */
5195 void
5196 swap(basic_string& __s)
5197 _GLIBCXX_NOEXCEPT_IF(allocator_traits<_Alloc>::is_always_equal::value);
5198
5199 // String operations:
5200 /**
5201 * @brief Return const pointer to null-terminated contents.
5202 *
5203 * This is a handle to internal data. Do not modify it, or dire
5204 * things may happen.
5205 */
5206 const _CharT*
5207 c_str() const _GLIBCXX_NOEXCEPT
5208 { return _M_data(); }
5209
5210 /**
5211 * @brief Return const pointer to contents.
5212 *
5213 * This is a pointer to internal data. It is undefined to modify
5214 * the contents through the returned pointer. To get a pointer that
5215 * allows modifying the contents use @c &str[0] instead,
5216 * (or in C++17 the non-const @c str.data() overload).
5217 */
5218 const _CharT*
5219 data() const _GLIBCXX_NOEXCEPT
5220 { return _M_data(); }
5221
5222#if __cplusplus >= 201703L
5223 /**
5224 * @brief Return non-const pointer to contents.
5225 *
5226 * This is a pointer to the character sequence held by the string.
5227 * Modifying the characters in the sequence is allowed.
5228 */
5229 _CharT*
5230 data() noexcept
5231 {
5232 _M_leak();
5233 return _M_data();
5234 }
5235#endif
5236
5237 /**
5238 * @brief Return copy of allocator used to construct this string.
5239 */
5240 allocator_type
5241 get_allocator() const _GLIBCXX_NOEXCEPT
5242 { return _M_dataplus; }
5243
5244 /**
5245 * @brief Find position of a C substring.
5246 * @param __s C string to locate.
5247 * @param __pos Index of character to search from.
5248 * @param __n Number of characters from @a __s to search for.
5249 * @return Index of start of first occurrence.
5250 *
5251 * Starting from @a __pos, searches forward for the first @a
5252 * __n characters in @a __s within this string. If found,
5253 * returns the index where it begins. If not found, returns
5254 * npos.
5255 */
5256 size_type
5257 find(const _CharT* __s, size_type __pos, size_type __n) const
5258 _GLIBCXX_NOEXCEPT;
5259
5260 /**
5261 * @brief Find position of a string.
5262 * @param __str String to locate.
5263 * @param __pos Index of character to search from (default 0).
5264 * @return Index of start of first occurrence.
5265 *
5266 * Starting from @a __pos, searches forward for value of @a __str within
5267 * this string. If found, returns the index where it begins. If not
5268 * found, returns npos.
5269 */
5270 size_type
5271 find(const basic_string& __str, size_type __pos = 0) const
5272 _GLIBCXX_NOEXCEPT
5273 { return this->find(__str.data(), __pos, __str.size()); }
5274
5275 /**
5276 * @brief Find position of a C string.
5277 * @param __s C string to locate.
5278 * @param __pos Index of character to search from (default 0).
5279 * @return Index of start of first occurrence.
5280 *
5281 * Starting from @a __pos, searches forward for the value of @a
5282 * __s within this string. If found, returns the index where
5283 * it begins. If not found, returns npos.
5284 */
5285 size_type
5286 find(const _CharT* __s, size_type __pos = 0) const _GLIBCXX_NOEXCEPT
5287 {
5288 __glibcxx_requires_string(__s);
5289 return this->find(__s, __pos, traits_type::length(__s));
5290 }
5291
5292 /**
5293 * @brief Find position of a character.
5294 * @param __c Character to locate.
5295 * @param __pos Index of character to search from (default 0).
5296 * @return Index of first occurrence.
5297 *
5298 * Starting from @a __pos, searches forward for @a __c within
5299 * this string. If found, returns the index where it was
5300 * found. If not found, returns npos.
5301 */
5302 size_type
5303 find(_CharT __c, size_type __pos = 0) const _GLIBCXX_NOEXCEPT;
5304
5305#if __cplusplus >= 201703L
5306 /**
5307 * @brief Find position of a string_view.
5308 * @param __svt The object convertible to string_view to locate.
5309 * @param __pos Index of character to search from (default 0).
5310 * @return Index of start of first occurrence.
5311 */
5312 template<typename _Tp>
5313 _If_sv<_Tp, size_type>
5314 find(const _Tp& __svt, size_type __pos = 0) const
5315 noexcept(is_same<_Tp, __sv_type>::value)
5316 {
5317 __sv_type __sv = __svt;
5318 return this->find(__sv.data(), __pos, __sv.size());
5319 }
5320#endif // C++17
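      // Illustrative usage sketch (editorial addition) for the find()
      // family above, assuming std::string:
      //   std::string s("liberty library");
      //   s.find("lib");      // 0: first occurrence
      //   s.find("lib", 1);   // 8: search resumes at index 1
      //   s.find("xyz");      // std::string::npos: not present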
5321
5322 /**
5323 * @brief Find last position of a string.
5324 * @param __str String to locate.
5325 * @param __pos Index of character to search back from (default end).
5326 * @return Index of start of last occurrence.
5327 *
5328 * Starting from @a __pos, searches backward for value of @a
5329 * __str within this string. If found, returns the index where
5330 * it begins. If not found, returns npos.
5331 */
5332 size_type
5333 rfind(const basic_string& __str, size_type __pos = npos) const
5334 _GLIBCXX_NOEXCEPT
5335 { return this->rfind(__str.data(), __pos, __str.size()); }
5336
5337 /**
5338 * @brief Find last position of a C substring.
5339 * @param __s C string to locate.
5340 * @param __pos Index of character to search back from.
5341 * @param __n Number of characters from @a __s to search for.
5342 * @return Index of start of last occurrence.
5343 *
5344 * Starting from @a __pos, searches backward for the first @a
5345 * __n characters in @a __s within this string. If found,
5346 * returns the index where it begins. If not found, returns
5347 * npos.
5348 */
5349 size_type
5350 rfind(const _CharT* __s, size_type __pos, size_type __n) const
5351 _GLIBCXX_NOEXCEPT;
5352
5353 /**
5354 * @brief Find last position of a C string.
5355 * @param __s C string to locate.
5356 * @param __pos Index of character to start search at (default end).
5357 * @return Index of start of last occurrence.
5358 *
5359 * Starting from @a __pos, searches backward for the value of
5360 * @a __s within this string. If found, returns the index
5361 * where it begins. If not found, returns npos.
5362 */
5363 size_type
5364 rfind(const _CharT* __s, size_type __pos = npos) const _GLIBCXX_NOEXCEPT
5365 {
5366 __glibcxx_requires_string(__s);
5367 return this->rfind(__s, __pos, traits_type::length(__s));
5368 }
5369
5370 /**
5371 * @brief Find last position of a character.
5372 * @param __c Character to locate.
5373 * @param __pos Index of character to search back from (default end).
5374 * @return Index of last occurrence.
5375 *
5376 * Starting from @a __pos, searches backward for @a __c within
5377 * this string. If found, returns the index where it was
5378 * found. If not found, returns npos.
5379 */
5380 size_type
5381 rfind(_CharT __c, size_type __pos = npos) const _GLIBCXX_NOEXCEPT;
5382
5383#if __cplusplus >= 201703L
5384 /**
5385 * @brief Find last position of a string_view.
5386 * @param __svt The object convertible to string_view to locate.
5387 * @param __pos Index of character to search back from (default end).
5388 * @return Index of start of last occurrence.
5389 */
5390 template<typename _Tp>
5391 _If_sv<_Tp, size_type>
5392 rfind(const _Tp& __svt, size_type __pos = npos) const
5393 noexcept(is_same<_Tp, __sv_type>::value)
5394 {
5395 __sv_type __sv = __svt;
5396 return this->rfind(__sv.data(), __pos, __sv.size());
5397 }
5398#endif // C++17
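      // Illustrative usage sketch (editorial addition) for rfind(),
      // assuming std::string:
      //   std::string s("abab");
      //   s.rfind("ab");      // 2: last occurrence
      //   s.rfind("ab", 1);   // 0: only matches starting at or before 1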
5399
5400 /**
5401 * @brief Find position of a character of string.
5402 * @param __str String containing characters to locate.
5403 * @param __pos Index of character to search from (default 0).
5404 * @return Index of first occurrence.
5405 *
5406 * Starting from @a __pos, searches forward for one of the
5407 * characters of @a __str within this string. If found,
5408 * returns the index where it was found. If not found, returns
5409 * npos.
5410 */
5411 size_type
5412 find_first_of(const basic_string& __str, size_type __pos = 0) const
5413 _GLIBCXX_NOEXCEPT
5414 { return this->find_first_of(__str.data(), __pos, __str.size()); }
5415
5416 /**
5417 * @brief Find position of a character of C substring.
5418 * @param __s String containing characters to locate.
5419 * @param __pos Index of character to search from.
5420 * @param __n Number of characters from @a __s to search for.
5421 * @return Index of first occurrence.
5422 *
5423 * Starting from @a __pos, searches forward for one of the
5424 * first @a __n characters of @a __s within this string. If
5425 * found, returns the index where it was found. If not found,
5426 * returns npos.
5427 */
5428 size_type
5429 find_first_of(const _CharT* __s, size_type __pos, size_type __n) const
5430 _GLIBCXX_NOEXCEPT;
5431
5432 /**
5433 * @brief Find position of a character of C string.
5434 * @param __s String containing characters to locate.
5435 * @param __pos Index of character to search from (default 0).
5436 * @return Index of first occurrence.
5437 *
5438 * Starting from @a __pos, searches forward for one of the
5439 * characters of @a __s within this string. If found, returns
5440 * the index where it was found. If not found, returns npos.
5441 */
5442 size_type
5443 find_first_of(const _CharT* __s, size_type __pos = 0) const
5444 _GLIBCXX_NOEXCEPT
5445 {
5446 __glibcxx_requires_string(__s);
5447 return this->find_first_of(__s, __pos, traits_type::length(__s));
5448 }
5449
5450 /**
5451 * @brief Find position of a character.
5452 * @param __c Character to locate.
5453 * @param __pos Index of character to search from (default 0).
5454 * @return Index of first occurrence.
5455 *
5456 * Starting from @a __pos, searches forward for the character
5457 * @a __c within this string. If found, returns the index
5458 * where it was found. If not found, returns npos.
5459 *
5460 * Note: equivalent to find(__c, __pos).
5461 */
5462 size_type
5463 find_first_of(_CharT __c, size_type __pos = 0) const _GLIBCXX_NOEXCEPT
5464 { return this->find(__c, __pos); }
5465
5466#if __cplusplus >= 201703L
5467 /**
5468 * @brief Find position of a character of a string_view.
5469 * @param __svt An object convertible to string_view containing
5470 * characters to locate.
5471 * @param __pos Index of character to search from (default 0).
5472 * @return Index of first occurrence.
5473 */
5474 template<typename _Tp>
5475 _If_sv<_Tp, size_type>
5476 find_first_of(const _Tp& __svt, size_type __pos = 0) const
5477 noexcept(is_same<_Tp, __sv_type>::value)
5478 {
5479 __sv_type __sv = __svt;
5480 return this->find_first_of(__sv.data(), __pos, __sv.size());
5481 }
5482#endif // C++17
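      // Illustrative usage sketch (editorial addition) for
      // find_first_of(), assuming std::string: unlike find(), it matches
      // any one character of the argument, not the whole sequence.
      //   std::string s("config.txt");
      //   s.find_first_of(".");      // 6: position of the dot
      //   s.find_first_of("aeiou");  // 1: the 'o' in "config"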
5483
5484 /**
5485 * @brief Find last position of a character of string.
5486 * @param __str String containing characters to locate.
5487 * @param __pos Index of character to search back from (default end).
5488 * @return Index of last occurrence.
5489 *
5490 * Starting from @a __pos, searches backward for one of the
5491 * characters of @a __str within this string. If found,
5492 * returns the index where it was found. If not found, returns
5493 * npos.
5494 */
5495 size_type
5496 find_last_of(const basic_string& __str, size_type __pos = npos) const
5497 _GLIBCXX_NOEXCEPT
5498 { return this->find_last_of(__str.data(), __pos, __str.size()); }
5499
5500 /**
5501 * @brief Find last position of a character of C substring.
5502 * @param __s C string containing characters to locate.
5503 * @param __pos Index of character to search back from.
5504 * @param __n Number of characters from @a __s to search for.
5505 * @return Index of last occurrence.
5506 *
5507 * Starting from @a __pos, searches backward for one of the
5508 * first @a __n characters of @a __s within this string. If
5509 * found, returns the index where it was found. If not found,
5510 * returns npos.
5511 */
5512 size_type
5513 find_last_of(const _CharT* __s, size_type __pos, size_type __n) const
5514 _GLIBCXX_NOEXCEPT;
5515
5516 /**
5517 * @brief Find last position of a character of C string.
5518 * @param __s C string containing characters to locate.
5519 * @param __pos Index of character to search back from (default end).
5520 * @return Index of last occurrence.
5521 *
5522 * Starting from @a __pos, searches backward for one of the
5523 * characters of @a __s within this string. If found, returns
5524 * the index where it was found. If not found, returns npos.
5525 */
5526 size_type
5527 find_last_of(const _CharT* __s, size_type __pos = npos) const
5528 _GLIBCXX_NOEXCEPT
5529 {
5530 __glibcxx_requires_string(__s);
5531 return this->find_last_of(__s, __pos, traits_type::length(__s));
5532 }
5533
5534 /**
5535 * @brief Find last position of a character.
5536 * @param __c Character to locate.
5537 * @param __pos Index of character to search back from (default end).
5538 * @return Index of last occurrence.
5539 *
5540 * Starting from @a __pos, searches backward for @a __c within
5541 * this string. If found, returns the index where it was
5542 * found. If not found, returns npos.
5543 *
5544 * Note: equivalent to rfind(__c, __pos).
5545 */
5546 size_type
5547 find_last_of(_CharT __c, size_type __pos = npos) const _GLIBCXX_NOEXCEPT
5548 { return this->rfind(__c, __pos); }
5549
5550#if __cplusplus >= 201703L
5551 /**
5552 * @brief Find last position of a character of string.
5553 * @param __svt An object convertible to string_view containing
5554 * characters to locate.
5555 * @param __pos Index of character to search back from (default end).
5556 * @return Index of last occurrence.
5557 */
5558 template<typename _Tp>
5559 _If_sv<_Tp, size_type>
5560 find_last_of(const _Tp& __svt, size_type __pos = npos) const
5561 noexcept(is_same<_Tp, __sv_type>::value)
5562 {
5563 __sv_type __sv = __svt;
5564 return this->find_last_of(__sv.data(), __pos, __sv.size());
5565 }
5566#endif // C++17
5567
5568 /**
5569 * @brief Find position of a character not in string.
5570 * @param __str String containing characters to avoid.
5571 * @param __pos Index of character to search from (default 0).
5572 * @return Index of first occurrence.
5573 *
5574 * Starting from @a __pos, searches forward for a character not contained
5575 * in @a __str within this string. If found, returns the index where it
5576 * was found. If not found, returns npos.
5577 */
5578 size_type
5579 find_first_not_of(const basic_string& __str, size_type __pos = 0) const
5580 _GLIBCXX_NOEXCEPT
5581 { return this->find_first_not_of(__str.data(), __pos, __str.size()); }
5582
5583 /**
5584 * @brief Find position of a character not in C substring.
5585 * @param __s C string containing characters to avoid.
5586 * @param __pos Index of character to search from.
5587 * @param __n Number of characters from __s to consider.
5588 * @return Index of first occurrence.
5589 *
5590 * Starting from @a __pos, searches forward for a character not
5591 * contained in the first @a __n characters of @a __s within
5592 * this string. If found, returns the index where it was
5593 * found. If not found, returns npos.
5594 */
5595 size_type
5596 find_first_not_of(const _CharT* __s, size_type __pos,
5597 size_type __n) const _GLIBCXX_NOEXCEPT;
5598
5599 /**
5600 * @brief Find position of a character not in C string.
5601 * @param __s C string containing characters to avoid.
5602 * @param __pos Index of character to search from (default 0).
5603 * @return Index of first occurrence.
5604 *
5605 * Starting from @a __pos, searches forward for a character not
5606 * contained in @a __s within this string. If found, returns
5607 * the index where it was found. If not found, returns npos.
5608 */
5609 size_type
5610 find_first_not_of(const _CharT* __s, size_type __pos = 0) const
5611 _GLIBCXX_NOEXCEPT
5612 {
5613 __glibcxx_requires_string(__s);
5614 return this->find_first_not_of(__s, __pos, traits_type::length(__s));
5615 }
5616
5617 /**
5618 * @brief Find position of a different character.
5619 * @param __c Character to avoid.
5620 * @param __pos Index of character to search from (default 0).
5621 * @return Index of first occurrence.
5622 *
5623 * Starting from @a __pos, searches forward for a character
5624 * other than @a __c within this string. If found, returns the
5625 * index where it was found. If not found, returns npos.
5626 */
5627 size_type
5628 find_first_not_of(_CharT __c, size_type __pos = 0) const
5629 _GLIBCXX_NOEXCEPT;
5630
5631#if __cplusplus >= 201703L
5632 /**
5633 * @brief Find position of a character not in a string_view.
5634 * @param __svt An object convertible to string_view containing
5635 * characters to avoid.
5636 * @param __pos Index of character to search from (default 0).
5637 * @return Index of first occurrence.
5638 */
5639 template<typename _Tp>
5640 _If_sv<_Tp, size_type>
5641 find_first_not_of(const _Tp& __svt, size_type __pos = 0) const
5642 noexcept(is_same<_Tp, __sv_type>::value)
5643 {
5644 __sv_type __sv = __svt;
5645 return this->find_first_not_of(__sv.data(), __pos, __sv.size());
5646 }
5647#endif // C++17
5648
5649 /**
5650 * @brief Find last position of a character not in string.
5651 * @param __str String containing characters to avoid.
5652 * @param __pos Index of character to search back from (default end).
5653 * @return Index of last occurrence.
5654 *
5655 * Starting from @a __pos, searches backward for a character
5656 * not contained in @a __str within this string. If found,
5657 * returns the index where it was found. If not found, returns
5658 * npos.
5659 */
5660 size_type
5661 find_last_not_of(const basic_string& __str, size_type __pos = npos) const
5662 _GLIBCXX_NOEXCEPT
5663 { return this->find_last_not_of(__str.data(), __pos, __str.size()); }
5664
5665 /**
5666 * @brief Find last position of a character not in C substring.
5667 * @param __s C string containing characters to avoid.
5668 * @param __pos Index of character to search back from.
5669 * @param __n Number of characters from @a __s to consider.
5670 * @return Index of last occurrence.
5671 *
5672 * Starting from @a __pos, searches backward for a character not
5673 * contained in the first @a __n characters of @a __s within this string.
5674 * If found, returns the index where it was found. If not found,
5675 * returns npos.
5676 */
5677 size_type
5678 find_last_not_of(const _CharT* __s, size_type __pos,
5679 size_type __n) const _GLIBCXX_NOEXCEPT;
5680 /**
5681 * @brief Find last position of a character not in C string.
5682 * @param __s C string containing characters to avoid.
5683 * @param __pos Index of character to search back from (default end).
5684 * @return Index of last occurrence.
5685 *
5686 * Starting from @a __pos, searches backward for a character
5687 * not contained in @a __s within this string. If found,
5688 * returns the index where it was found. If not found, returns
5689 * npos.
5690 */
5691 size_type
5692 find_last_not_of(const _CharT* __s, size_type __pos = npos) const
5693 _GLIBCXX_NOEXCEPT
5694 {
5695 __glibcxx_requires_string(__s);
5696 return this->find_last_not_of(__s, __pos, traits_type::length(__s));
5697 }
5698
5699 /**
5700 * @brief Find last position of a different character.
5701 * @param __c Character to avoid.
5702 * @param __pos Index of character to search back from (default end).
5703 * @return Index of last occurrence.
5704 *
5705 * Starting from @a __pos, searches backward for a character other than
5706 * @a __c within this string. If found, returns the index where it was
5707 * found. If not found, returns npos.
5708 */
5709 size_type
5710 find_last_not_of(_CharT __c, size_type __pos = npos) const
5711 _GLIBCXX_NOEXCEPT;
5712
5713#if __cplusplus >= 201703L
5714 /**
5715 * @brief Find last position of a character not in a string_view.
5716 * @param __svt An object convertible to string_view containing
5717 * characters to avoid.
5718 * @param __pos Index of character to search back from (default end).
5719 * @return Index of last occurrence.
5720 */
5721 template<typename _Tp>
5722 _If_sv<_Tp, size_type>
5723 find_last_not_of(const _Tp& __svt, size_type __pos = npos) const
5724 noexcept(is_same<_Tp, __sv_type>::value)
5725 {
5726 __sv_type __sv = __svt;
5727 return this->find_last_not_of(__sv.data(), __pos, __sv.size());
5728 }
5729#endif // C++17
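      // Illustrative usage sketch (editorial addition): the classic
      // whitespace-trim idiom built from the two *_not_of searches above,
      // assuming std::string:
      //   std::string s("  padded  ");
      //   std::string::size_type b = s.find_first_not_of(' ');  // 2
      //   std::string::size_type e = s.find_last_not_of(' ');   // 7
      //   std::string t = s.substr(b, e - b + 1);               // "padded"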
5730
5731 /**
5732 * @brief Get a substring.
5733 * @param __pos Index of first character (default 0).
5734 * @param __n Number of characters in substring (default remainder).
5735 * @return The new string.
5736 * @throw std::out_of_range If __pos > size().
5737 *
5738 * Construct and return a new string using the @a __n
5739 * characters starting at @a __pos. If the string is too
5740 * short, use the remainder of the characters. If @a __pos is
5741 * beyond the end of the string, out_of_range is thrown.
5742 */
5743 basic_string
5744 substr(size_type __pos = 0, size_type __n = npos) const
5745 { return basic_string(*this,
5746 _M_check(__pos, "basic_string::substr"), __n); }
5747
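      // Illustrative usage sketch (editorial addition) for substr(),
      // assuming std::string:
      //   std::string s("filename.cpp");
      //   std::string stem = s.substr(0, 8);  // "filename"
      //   std::string ext  = s.substr(9);     // "cpp", to end of string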
5748 /**
5749 * @brief Compare to a string.
5750 * @param __str String to compare against.
5751 * @return Integer < 0, 0, or > 0.
5752 *
5753 * Returns an integer < 0 if this string is ordered before @a
5754 * __str, 0 if their values are equivalent, or > 0 if this
5755 * string is ordered after @a __str. Determines the effective
5756 * length rlen of the strings to compare as the smallest of
5757 * size() and str.size(). The function then compares the two
5758 * strings by calling traits::compare(data(), str.data(),rlen).
5759 * If the result of the comparison is nonzero returns it,
5760 * otherwise the shorter one is ordered first.
5761 */
5762 int
5763 compare(const basic_string& __str) const
5764 {
5765 const size_type __size = this->size();
5766 const size_type __osize = __str.size();
5767 const size_type __len = std::min(__size, __osize);
5768
5769 int __r = traits_type::compare(_M_data(), __str.data(), __len);
5770 if (!__r)
5771 __r = _S_compare(__size, __osize);
5772 return __r;
5773 }
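      // Illustrative usage sketch (editorial addition) for compare(),
      // assuming std::string:
      //   std::string a("apple"), b("apricot");
      //   a.compare(b);             // < 0: "apple" orders before "apricot"
      //   a.compare(0, 2, b, 0, 2); // == 0: both two-char prefixes are "ap"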
5774
5775#if __cplusplus >= 201703L
5776 /**
5777 * @brief Compare to a string_view.
5778 * @param __svt An object convertible to string_view to compare against.
5779 * @return Integer < 0, 0, or > 0.
5780 */
5781 template<typename _Tp>
5782 _If_sv<_Tp, int>
5783 compare(const _Tp& __svt) const
5784 noexcept(is_same<_Tp, __sv_type>::value)
5785 {
5786 __sv_type __sv = __svt;
5787 const size_type __size = this->size();
5788 const size_type __osize = __sv.size();
5789 const size_type __len = std::min(__size, __osize);
5790
5791 int __r = traits_type::compare(_M_data(), __sv.data(), __len);
5792 if (!__r)
5793 __r = _S_compare(__size, __osize);
5794 return __r;
5795 }
5796
5797 /**
5798 * @brief Compare to a string_view.
5799 * @param __pos A position in the string to start comparing from.
5800 * @param __n The number of characters to compare.
5801 * @param __svt An object convertible to string_view to compare
5802 * against.
5803 * @return Integer < 0, 0, or > 0.
5804 */
5805 template<typename _Tp>
5806 _If_sv<_Tp, int>
5807 compare(size_type __pos, size_type __n, const _Tp& __svt) const
5808 noexcept(is_same<_Tp, __sv_type>::value)
5809 {
5810 __sv_type __sv = __svt;
5811 return __sv_type(*this).substr(__pos, __n).compare(__sv);
5812 }
5813
5814 /**
5815 * @brief Compare to a string_view.
5816 * @param __pos1 A position in the string to start comparing from.
5817 * @param __n1 The number of characters to compare.
5818 * @param __svt An object convertible to string_view to compare
5819 * against.
5820 * @param __pos2 A position in the string_view to start comparing from.
5821 * @param __n2 The number of characters to compare.
5822 * @return Integer < 0, 0, or > 0.
5823 */
5824 template<typename _Tp>
5825 _If_sv<_Tp, int>
5826 compare(size_type __pos1, size_type __n1, const _Tp& __svt,
5827 size_type __pos2, size_type __n2 = npos) const
5828 noexcept(is_same<_Tp, __sv_type>::value)
5829 {
5830 __sv_type __sv = __svt;
5831 return __sv_type(*this)
5832 .substr(__pos1, __n1).compare(__sv.substr(__pos2, __n2));
5833 }
5834#endif // C++17
5835
5836 /**
5837 * @brief Compare substring to a string.
5838 * @param __pos Index of first character of substring.
5839 * @param __n Number of characters in substring.
5840 * @param __str String to compare against.
5841 * @return Integer < 0, 0, or > 0.
5842 *
5843 * Form the substring of this string from the @a __n characters
5844 * starting at @a __pos. Returns an integer < 0 if the
5845 * substring is ordered before @a __str, 0 if their values are
5846 * equivalent, or > 0 if the substring is ordered after @a
5847 * __str. Determines the effective length rlen of the strings
5848 * to compare as the smallest of the length of the substring
5849 * and @a __str.size(). The function then compares the two
5850 * strings by calling
5851 * traits::compare(substring.data(),str.data(),rlen). If the
5852 * result of the comparison is nonzero returns it, otherwise
5853 * the shorter one is ordered first.
5854 */
5855 int
5856 compare(size_type __pos, size_type __n, const basic_string& __str) const;
5857
5858 /**
5859 * @brief Compare substring to a substring.
5860 * @param __pos1 Index of first character of substring.
5861 * @param __n1 Number of characters in substring.
5862 * @param __str String to compare against.
5863 * @param __pos2 Index of first character of substring of str.
5864 * @param __n2 Number of characters in substring of str.
5865 * @return Integer < 0, 0, or > 0.
5866 *
5867 * Form the substring of this string from the @a __n1
5868 * characters starting at @a __pos1. Form the substring of @a
5869 * __str from the @a __n2 characters starting at @a __pos2.
5870 * Returns an integer < 0 if this substring is ordered before
5871 * the substring of @a __str, 0 if their values are equivalent,
5872 * or > 0 if this substring is ordered after the substring of
5873 * @a __str. Determines the effective length rlen of the
5874 * strings to compare as the smallest of the lengths of the
5875 * substrings. The function then compares the two strings by
5876 * calling
5877 * traits::compare(substring.data(),str.substr(pos2,n2).data(),rlen).
5878 * If the result of the comparison is nonzero returns it,
5879 * otherwise the shorter one is ordered first.
5880 */
5881 int
5882 compare(size_type __pos1, size_type __n1, const basic_string& __str,
5883 size_type __pos2, size_type __n2 = npos) const;
5884
5885 /**
5886 * @brief Compare to a C string.
5887 * @param __s C string to compare against.
5888 * @return Integer < 0, 0, or > 0.
5889 *
5890 * Returns an integer < 0 if this string is ordered before @a __s, 0 if
5891 * their values are equivalent, or > 0 if this string is ordered after
5892 * @a __s. Determines the effective length rlen of the strings to
5893 * compare as the smallest of size() and the length of a string
5894 * constructed from @a __s. The function then compares the two strings
5895 * by calling traits::compare(data(),s,rlen). If the result of the
5896 * comparison is nonzero returns it, otherwise the shorter one is
5897 * ordered first.
5898 */
5899 int
5900 compare(const _CharT* __s) const _GLIBCXX_NOEXCEPT;
5901
5902 // _GLIBCXX_RESOLVE_LIB_DEFECTS
5903 // 5 String::compare specification questionable
5904 /**
5905 * @brief Compare substring to a C string.
5906 * @param __pos Index of first character of substring.
5907 * @param __n1 Number of characters in substring.
5908 * @param __s C string to compare against.
5909 * @return Integer < 0, 0, or > 0.
5910 *
5911 * Form the substring of this string from the @a __n1
5912 * characters starting at @a __pos. Returns an integer < 0 if
5913 * the substring is ordered before @a __s, 0 if their values
5914 * are equivalent, or > 0 if the substring is ordered after @a
5915 * __s. Determines the effective length rlen of the strings to
5916 * compare as the smallest of the length of the substring and
5917 * the length of a string constructed from @a __s. The
5918 * function then compares the two strings by calling
5919 * traits::compare(substring.data(),__s,rlen). If the result of
5920 * the comparison is nonzero returns it, otherwise the shorter
5921 * one is ordered first.
5922 */
5923 int
5924 compare(size_type __pos, size_type __n1, const _CharT* __s) const;
5925
5926 /**
5927 * @brief Compare substring against a character %array.
5928 * @param __pos Index of first character of substring.
5929 * @param __n1 Number of characters in substring.
5930 * @param __s character %array to compare against.
5931 * @param __n2 Number of characters of s.
5932 * @return Integer < 0, 0, or > 0.
5933 *
5934 * Form the substring of this string from the @a __n1
5935 * characters starting at @a __pos. Form a string from the
5936 * first @a __n2 characters of @a __s. Returns an integer < 0
5937 * if this substring is ordered before the string from @a __s,
5938 * 0 if their values are equivalent, or > 0 if this substring
5939 * is ordered after the string from @a __s. Determines the
5940 * effective length rlen of the strings to compare as the
5941 * smallest of the length of the substring and @a __n2. The
5942 * function then compares the two strings by calling
5943 * traits::compare(substring.data(),s,rlen). If the result of
5944 * the comparison is nonzero returns it, otherwise the shorter
5945 * one is ordered first.
5946 *
5947 * NB: @a __s must have at least @a __n2 characters; '\\0' has
5948 * no special meaning.
5949 */
5950 int
5951 compare(size_type __pos, size_type __n1, const _CharT* __s,
5952 size_type __n2) const;
5953
5954#if __cplusplus > 201703L
5955 bool
5956 starts_with(basic_string_view<_CharT, _Traits> __x) const noexcept
5957 { return __sv_type(this->data(), this->size()).starts_with(__x); }
5958
5959 bool
5960 starts_with(_CharT __x) const noexcept
5961 { return __sv_type(this->data(), this->size()).starts_with(__x); }
5962
5963 bool
5964 starts_with(const _CharT* __x) const noexcept
5965 { return __sv_type(this->data(), this->size()).starts_with(__x); }
5966
5967 bool
5968 ends_with(basic_string_view<_CharT, _Traits> __x) const noexcept
5969 { return __sv_type(this->data(), this->size()).ends_with(__x); }
5970
5971 bool
5972 ends_with(_CharT __x) const noexcept
5973 { return __sv_type(this->data(), this->size()).ends_with(__x); }
5974
5975 bool
5976 ends_with(const _CharT* __x) const noexcept
5977 { return __sv_type(this->data(), this->size()).ends_with(__x); }
5978#endif // C++20
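      // Illustrative usage sketch (editorial addition) for the C++20
      // prefix/suffix tests above, assuming std::string:
      //   std::string url("https://llvm.org");
      //   url.starts_with("https://"); // true
      //   url.ends_with(".org");       // true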
5979
5980# ifdef _GLIBCXX_TM_TS_INTERNAL
5981 friend void
5982 ::_txnal_cow_string_C1_for_exceptions(void* that, const char* s,
5983 void* exc);
5984 friend const char*
5985 ::_txnal_cow_string_c_str(const void *that);
5986 friend void
5987 ::_txnal_cow_string_D1(void *that);
5988 friend void
5989 ::_txnal_cow_string_D1_commit(void *that);
5990# endif
5991 };
5992#endif // !_GLIBCXX_USE_CXX11_ABI
5993
5994#if __cpp_deduction_guides >= 201606
5995_GLIBCXX_BEGIN_NAMESPACE_CXX11
5996 template<typename _InputIterator, typename _CharT
5997 = typename iterator_traits<_InputIterator>::value_type,
5998 typename _Allocator = allocator<_CharT>,
5999 typename = _RequireInputIter<_InputIterator>,
6000 typename = _RequireAllocator<_Allocator>>
6001 basic_string(_InputIterator, _InputIterator, _Allocator = _Allocator())
6002 -> basic_string<_CharT, char_traits<_CharT>, _Allocator>;
6003
6004 // _GLIBCXX_RESOLVE_LIB_DEFECTS
6005 // 3075. basic_string needs deduction guides from basic_string_view
6006 template<typename _CharT, typename _Traits,
6007 typename _Allocator = allocator<_CharT>,
6008 typename = _RequireAllocator<_Allocator>>
6009 basic_string(basic_string_view<_CharT, _Traits>, const _Allocator& = _Allocator())
6010 -> basic_string<_CharT, _Traits, _Allocator>;
6011
6012 template<typename _CharT, typename _Traits,
6013 typename _Allocator = allocator<_CharT>,
6014 typename = _RequireAllocator<_Allocator>>
6015 basic_string(basic_string_view<_CharT, _Traits>,
6016 typename basic_string<_CharT, _Traits, _Allocator>::size_type,
6017 typename basic_string<_CharT, _Traits, _Allocator>::size_type,
6018 const _Allocator& = _Allocator())
6019 -> basic_string<_CharT, _Traits, _Allocator>;
6020_GLIBCXX_END_NAMESPACE_CXX11
6021#endif
6022
6023 // operator+
6024 /**
6025 * @brief Concatenate two strings.
6026 * @param __lhs First string.
6027 * @param __rhs Last string.
6028 * @return New string with value of @a __lhs followed by @a __rhs.
6029 */
6030 template<typename _CharT, typename _Traits, typename _Alloc>
6031 basic_string<_CharT, _Traits, _Alloc>
6032 operator+(const basic_string<_CharT, _Traits, _Alloc>& __lhs,
6033 const basic_string<_CharT, _Traits, _Alloc>& __rhs)
6034 {
6035 basic_string<_CharT, _Traits, _Alloc> __str(__lhs);
6036 __str.append(__rhs);
6037 return __str;
6038 }
6039
6040 /**
6041 * @brief Concatenate C string and string.
6042 * @param __lhs First string.
6043 * @param __rhs Last string.
6044 * @return New string with value of @a __lhs followed by @a __rhs.
6045 */
6046 template<typename _CharT, typename _Traits, typename _Alloc>
6047 basic_string<_CharT,_Traits,_Alloc>
6048 operator+(const _CharT* __lhs,
6049 const basic_string<_CharT,_Traits,_Alloc>& __rhs);
6050
6051 /**
6052 * @brief Concatenate character and string.
6053 * @param __lhs First string.
6054 * @param __rhs Last string.
6055 * @return New string with @a __lhs followed by @a __rhs.
6056 */
6057 template<typename _CharT, typename _Traits, typename _Alloc>
6058 basic_string<_CharT,_Traits,_Alloc>
6059 operator+(_CharT __lhs, const basic_string<_CharT,_Traits,_Alloc>& __rhs);
6060
6061 /**
6062 * @brief Concatenate string and C string.
6063 * @param __lhs First string.
6064 * @param __rhs Last string.
6065 * @return New string with @a __lhs followed by @a __rhs.
6066 */
6067 template<typename _CharT, typename _Traits, typename _Alloc>
6068 inline basic_string<_CharT, _Traits, _Alloc>
6069 operator+(const basic_string<_CharT, _Traits, _Alloc>& __lhs,
6070 const _CharT* __rhs)
6071 {
6072 basic_string<_CharT, _Traits, _Alloc> __str(__lhs);
6073 __str.append(__rhs);
6074 return __str;
6075 }
6076
6077 /**
6078 * @brief Concatenate string and character.
6079 * @param __lhs First string.
6080 * @param __rhs Last string.
6081 * @return New string with @a __lhs followed by @a __rhs.
6082 */
6083 template<typename _CharT, typename _Traits, typename _Alloc>
6084 inline basic_string<_CharT, _Traits, _Alloc>
6085 operator+(const basic_string<_CharT, _Traits, _Alloc>& __lhs, _CharT __rhs)
6086 {
6087 typedef basic_string<_CharT, _Traits, _Alloc> __string_type;
6088 typedef typename __string_type::size_type __size_type;
6089 __string_type __str(__lhs);
6090 __str.append(__size_type(1), __rhs);
6091 return __str;
6092 }
6093
6094#if __cplusplus >= 201103L
6095 template<typename _CharT, typename _Traits, typename _Alloc>
6096 inline basic_string<_CharT, _Traits, _Alloc>
6097 operator+(basic_string<_CharT, _Traits, _Alloc>&& __lhs,
6098 const basic_string<_CharT, _Traits, _Alloc>& __rhs)
6099 { return std::move(__lhs.append(__rhs)); }
6100
6101 template<typename _CharT, typename _Traits, typename _Alloc>
6102 inline basic_string<_CharT, _Traits, _Alloc>
6103 operator+(const basic_string<_CharT, _Traits, _Alloc>& __lhs,
6104 basic_string<_CharT, _Traits, _Alloc>&& __rhs)
6105 { return std::move(__rhs.insert(0, __lhs)); }
6106
6107 template<typename _CharT, typename _Traits, typename _Alloc>
6108 inline basic_string<_CharT, _Traits, _Alloc>
6109 operator+(basic_string<_CharT, _Traits, _Alloc>&& __lhs,
6110 basic_string<_CharT, _Traits, _Alloc>&& __rhs)
6111 {
6112#if _GLIBCXX_USE_CXX11_ABI
6113 using _Alloc_traits = allocator_traits<_Alloc>;
6114 bool __use_rhs = false;
6115 if _GLIBCXX17_CONSTEXPR (typename _Alloc_traits::is_always_equal{})
6116 __use_rhs = true;
6117 else if (__lhs.get_allocator() == __rhs.get_allocator())
6118 __use_rhs = true;
6119 if (__use_rhs)
6120#endif
6121 {
6122 const auto __size = __lhs.size() + __rhs.size();
6123 if (__size > __lhs.capacity() && __size <= __rhs.capacity())
6124 return std::move(__rhs.insert(0, __lhs));
6125 }
6126 return std::move(__lhs.append(__rhs));
6127 }
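  // Illustrative sketch (editorial addition): the overload above builds
  // the result in whichever operand's buffer already has room, so a
  // pre-reserved right operand can avoid a reallocation, e.g.:
  //   std::string a = "short";
  //   std::string b = "with plenty of reserved space";
  //   b.reserve(64);
  //   auto c = std::move(a) + std::move(b); // builds into b's storage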
6128
6129 template<typename _CharT, typename _Traits, typename _Alloc>
6130 inline basic_string<_CharT, _Traits, _Alloc>
6131 operator+(const _CharT* __lhs,
6132 basic_string<_CharT, _Traits, _Alloc>&& __rhs)
6133 { return std::move(__rhs.insert(0, __lhs)); }
6134
6135 template<typename _CharT, typename _Traits, typename _Alloc>
6136 inline basic_string<_CharT, _Traits, _Alloc>
6137 operator+(_CharT __lhs,
6138 basic_string<_CharT, _Traits, _Alloc>&& __rhs)
6139 { return std::move(__rhs.insert(0, 1, __lhs)); }
6140
6141 template<typename _CharT, typename _Traits, typename _Alloc>
6142 inline basic_string<_CharT, _Traits, _Alloc>
6143 operator+(basic_string<_CharT, _Traits, _Alloc>&& __lhs,
6144 const _CharT* __rhs)
6145 { return std::move(__lhs.append(__rhs)); }
6146
6147 template<typename _CharT, typename _Traits, typename _Alloc>
6148 inline basic_string<_CharT, _Traits, _Alloc>
6149 operator+(basic_string<_CharT, _Traits, _Alloc>&& __lhs,
6150 _CharT __rhs)
6151 { return std::move(__lhs.append(1, __rhs)); }
6152#endif
6153
6154 // operator ==
6155 /**
6156 * @brief Test equivalence of two strings.
6157 * @param __lhs First string.
6158 * @param __rhs Second string.
6159 * @return True if @a __lhs.compare(@a __rhs) == 0. False otherwise.
6160 */
6161 template<typename _CharT, typename _Traits, typename _Alloc>
6162 inline bool
6163 operator==(const basic_string<_CharT, _Traits, _Alloc>& __lhs,
6164 const basic_string<_CharT, _Traits, _Alloc>& __rhs)
6165 _GLIBCXX_NOEXCEPT
6166 { return __lhs.compare(__rhs) == 0; }
6167
6168 template<typename _CharT>
6169 inline
6170 typename __gnu_cxx::__enable_if<__is_char<_CharT>::__value, bool>::__type
6171 operator==(const basic_string<_CharT>& __lhs,
6172 const basic_string<_CharT>& __rhs) _GLIBCXX_NOEXCEPT
6173 { return (__lhs.size() == __rhs.size()
6174 && !std::char_traits<_CharT>::compare(__lhs.data(), __rhs.data(),
6175 __lhs.size())); }
6176
6177 /**
6178 * @brief Test equivalence of string and C string.
6179 * @param __lhs String.
6180 * @param __rhs C string.
6181 * @return True if @a __lhs.compare(@a __rhs) == 0. False otherwise.
6182 */
6183 template<typename _CharT, typename _Traits, typename _Alloc>
6184 inline bool
6185 operator==(const basic_string<_CharT, _Traits, _Alloc>& __lhs,
6186 const _CharT* __rhs)
6187 { return __lhs.compare(__rhs) == 0; }
6188
6189#if __cpp_lib_three_way_comparison
6190 /**
6191 * @brief Three-way comparison of two strings.
6192 * @param __lhs A string.
6193 * @param __rhs A string.
6194 * @return A value indicating whether `__lhs` is less than, equal to,
6195 * greater than, or incomparable with `__rhs`.
6196 */
6197 template<typename _CharT, typename _Traits, typename _Alloc>
6198 inline auto
6199 operator<=>(const basic_string<_CharT, _Traits, _Alloc>& __lhs,
6200 const basic_string<_CharT, _Traits, _Alloc>& __rhs) noexcept
6201 -> decltype(__detail::__char_traits_cmp_cat<_Traits>(0))
6202 { return __detail::__char_traits_cmp_cat<_Traits>(__lhs.compare(__rhs)); }
6203
6204 /**
6205 * @brief Three-way comparison of a string and a C string.
6206 * @param __lhs A string.
6207 * @param __rhs A null-terminated string.
6208 * @return A value indicating whether `__lhs` is less than, equal to,
6209 * greater than, or incomparable with `__rhs`.
6210 */
6211 template<typename _CharT, typename _Traits, typename _Alloc>
6212 inline auto
6213 operator<=>(const basic_string<_CharT, _Traits, _Alloc>& __lhs,
6214 const _CharT* __rhs) noexcept
6215 -> decltype(__detail::__char_traits_cmp_cat<_Traits>(0))
6216 { return __detail::__char_traits_cmp_cat<_Traits>(__lhs.compare(__rhs)); }
6217#else
6218 /**
6219 * @brief Test equivalence of C string and string.
6220 * @param __lhs C string.
6221 * @param __rhs String.
6222 * @return True if @a __rhs.compare(@a __lhs) == 0. False otherwise.
6223 */
6224 template<typename _CharT, typename _Traits, typename _Alloc>
6225 inline bool
6226 operator==(const _CharT* __lhs,
6227 const basic_string<_CharT, _Traits, _Alloc>& __rhs)
6228 { return __rhs.compare(__lhs) == 0; }
6229
6230 // operator !=
6231 /**
6232 * @brief Test difference of two strings.
6233 * @param __lhs First string.
6234 * @param __rhs Second string.
6235 * @return True if @a __lhs.compare(@a __rhs) != 0. False otherwise.
6236 */
6237 template<typename _CharT, typename _Traits, typename _Alloc>
6238 inline bool
6239 operator!=(const basic_string<_CharT, _Traits, _Alloc>& __lhs,
6240 const basic_string<_CharT, _Traits, _Alloc>& __rhs)
6241 _GLIBCXX_NOEXCEPT
6242 { return !(__lhs == __rhs); }
6243
6244 /**
6245 * @brief Test difference of C string and string.
6246 * @param __lhs C string.
6247 * @param __rhs String.
6248 * @return True if @a __rhs.compare(@a __lhs) != 0. False otherwise.
6249 */
6250 template<typename _CharT, typename _Traits, typename _Alloc>
6251 inline bool
6252 operator!=(const _CharT* __lhs,
6253 const basic_string<_CharT, _Traits, _Alloc>& __rhs)
6254 { return !(__lhs == __rhs); }
6255
6256 /**
6257 * @brief Test difference of string and C string.
6258 * @param __lhs String.
6259 * @param __rhs C string.
6260 * @return True if @a __lhs.compare(@a __rhs) != 0. False otherwise.
6261 */
6262 template<typename _CharT, typename _Traits, typename _Alloc>
6263 inline bool
6264 operator!=(const basic_string<_CharT, _Traits, _Alloc>& __lhs,
6265 const _CharT* __rhs)
6266 { return !(__lhs == __rhs); }
6267
6268 // operator <
6269 /**
6270 * @brief Test if string precedes string.
6271 * @param __lhs First string.
6272 * @param __rhs Second string.
6273 * @return True if @a __lhs precedes @a __rhs. False otherwise.
6274 */
6275 template<typename _CharT, typename _Traits, typename _Alloc>
6276 inline bool
6277 operator<(const basic_string<_CharT, _Traits, _Alloc>& __lhs,
6278 const basic_string<_CharT, _Traits, _Alloc>& __rhs)
6279 _GLIBCXX_NOEXCEPT
6280 { return __lhs.compare(__rhs) < 0; }
6281
6282 /**
6283 * @brief Test if string precedes C string.
6284 * @param __lhs String.
6285 * @param __rhs C string.
6286 * @return True if @a __lhs precedes @a __rhs. False otherwise.
6287 */
6288 template<typename _CharT, typename _Traits, typename _Alloc>
6289 inline bool
6290 operator<(const basic_string<_CharT, _Traits, _Alloc>& __lhs,
6291 const _CharT* __rhs)
6292 { return __lhs.compare(__rhs) < 0; }
6293
6294 /**
6295 * @brief Test if C string precedes string.
6296 * @param __lhs C string.
6297 * @param __rhs String.
6298 * @return True if @a __lhs precedes @a __rhs. False otherwise.
6299 */
6300 template<typename _CharT, typename _Traits, typename _Alloc>
6301 inline bool
6302 operator<(const _CharT* __lhs,
6303 const basic_string<_CharT, _Traits, _Alloc>& __rhs)
6304 { return __rhs.compare(__lhs) > 0; }
6305
6306 // operator >
6307 /**
6308 * @brief Test if string follows string.
6309 * @param __lhs First string.
6310 * @param __rhs Second string.
6311 * @return True if @a __lhs follows @a __rhs. False otherwise.
6312 */
6313 template<typename _CharT, typename _Traits, typename _Alloc>
6314 inline bool
6315 operator>(const basic_string<_CharT, _Traits, _Alloc>& __lhs,
6316 const basic_string<_CharT, _Traits, _Alloc>& __rhs)
6317 _GLIBCXX_NOEXCEPT
6318 { return __lhs.compare(__rhs) > 0; }
6319
6320 /**
6321 * @brief Test if string follows C string.
6322 * @param __lhs String.
6323 * @param __rhs C string.
6324 * @return True if @a __lhs follows @a __rhs. False otherwise.
6325 */
6326 template<typename _CharT, typename _Traits, typename _Alloc>
6327 inline bool
6328 operator>(const basic_string<_CharT, _Traits, _Alloc>& __lhs,
6329 const _CharT* __rhs)
6330 { return __lhs.compare(__rhs) > 0; }
6331
6332 /**
6333 * @brief Test if C string follows string.
6334 * @param __lhs C string.
6335 * @param __rhs String.
6336 * @return True if @a __lhs follows @a __rhs. False otherwise.
6337 */
6338 template<typename _CharT, typename _Traits, typename _Alloc>
6339 inline bool
6340 operator>(const _CharT* __lhs,
6341 const basic_string<_CharT, _Traits, _Alloc>& __rhs)
6342 { return __rhs.compare(__lhs) < 0; }
6343
6344 // operator <=
6345 /**
6346 * @brief Test if string doesn't follow string.
6347 * @param __lhs First string.
6348 * @param __rhs Second string.
6349 * @return True if @a __lhs doesn't follow @a __rhs. False otherwise.
6350 */
6351 template<typename _CharT, typename _Traits, typename _Alloc>
6352 inline bool
6353 operator<=(const basic_string<_CharT, _Traits, _Alloc>& __lhs,
6354 const basic_string<_CharT, _Traits, _Alloc>& __rhs)
6355 _GLIBCXX_NOEXCEPT
6356 { return __lhs.compare(__rhs) <= 0; }
6357
6358 /**
6359 * @brief Test if string doesn't follow C string.
6360 * @param __lhs String.
6361 * @param __rhs C string.
6362 * @return True if @a __lhs doesn't follow @a __rhs. False otherwise.
6363 */
6364 template<typename _CharT, typename _Traits, typename _Alloc>
6365 inline bool
6366 operator<=(const basic_string<_CharT, _Traits, _Alloc>& __lhs,
6367 const _CharT* __rhs)
6368 { return __lhs.compare(__rhs) <= 0; }
6369
6370 /**
6371 * @brief Test if C string doesn't follow string.
6372 * @param __lhs C string.
6373 * @param __rhs String.
6374 * @return True if @a __lhs doesn't follow @a __rhs. False otherwise.
6375 */
6376 template<typename _CharT, typename _Traits, typename _Alloc>
6377 inline bool
6378 operator<=(const _CharT* __lhs,
6379 const basic_string<_CharT, _Traits, _Alloc>& __rhs)
6380 { return __rhs.compare(__lhs) >= 0; }
6381
6382 // operator >=
6383 /**
6384 * @brief Test if string doesn't precede string.
6385 * @param __lhs First string.
6386 * @param __rhs Second string.
6387 * @return True if @a __lhs doesn't precede @a __rhs. False otherwise.
6388 */
6389 template<typename _CharT, typename _Traits, typename _Alloc>
6390 inline bool
6391 operator>=(const basic_string<_CharT, _Traits, _Alloc>& __lhs,
6392 const basic_string<_CharT, _Traits, _Alloc>& __rhs)
6393     _GLIBCXX_NOEXCEPT
6394 { return __lhs.compare(__rhs) >= 0; }
6395
6396 /**
6397 * @brief Test if string doesn't precede C string.
6398 * @param __lhs String.
6399 * @param __rhs C string.
6400 * @return True if @a __lhs doesn't precede @a __rhs. False otherwise.
6401 */
6402 template<typename _CharT, typename _Traits, typename _Alloc>
6403 inline bool
6404 operator>=(const basic_string<_CharT, _Traits, _Alloc>& __lhs,
6405 const _CharT* __rhs)
6406 { return __lhs.compare(__rhs) >= 0; }
6407
6408 /**
6409 * @brief Test if C string doesn't precede string.
6410 * @param __lhs C string.
6411 * @param __rhs String.
6412 * @return True if @a __lhs doesn't precede @a __rhs. False otherwise.
6413 */
6414 template<typename _CharT, typename _Traits, typename _Alloc>
6415 inline bool
6416 operator>=(const _CharT* __lhs,
6417 const basic_string<_CharT, _Traits, _Alloc>& __rhs)
6418 { return __rhs.compare(__lhs) <= 0; }
6419#endif // three-way comparison
6420
6421 /**
6422 * @brief Swap contents of two strings.
6423 * @param __lhs First string.
6424 * @param __rhs Second string.
6425 *
6426 * Exchanges the contents of @a __lhs and @a __rhs in constant time.
6427 */
6428 template<typename _CharT, typename _Traits, typename _Alloc>
6429 inline void
6430 swap(basic_string<_CharT, _Traits, _Alloc>& __lhs,
6431 basic_string<_CharT, _Traits, _Alloc>& __rhs)
6432     _GLIBCXX_NOEXCEPT_IF(noexcept(__lhs.swap(__rhs)))
6433 { __lhs.swap(__rhs); }
6434
6435
6436 /**
6437 * @brief Read stream into a string.
6438 * @param __is Input stream.
6439 * @param __str Buffer to store into.
6440 * @return Reference to the input stream.
6441 *
6442 * Stores characters from @a __is into @a __str until whitespace is
6443 * found, the end of the stream is encountered, or str.max_size()
6444 * is reached. If is.width() is non-zero, that is the limit on the
6445 * number of characters stored into @a __str. Any previous
6446 * contents of @a __str are erased.
6447 */
6448 template<typename _CharT, typename _Traits, typename _Alloc>
6449 basic_istream<_CharT, _Traits>&
6450 operator>>(basic_istream<_CharT, _Traits>& __is,
6451 basic_string<_CharT, _Traits, _Alloc>& __str);
6452
6453 template<>
6454 basic_istream<char>&
6455 operator>>(basic_istream<char>& __is, basic_string<char>& __str);
6456
6457 /**
6458 * @brief Write string to a stream.
6459 * @param __os Output stream.
6460 * @param __str String to write out.
6461 * @return Reference to the output stream.
6462 *
6463 * Output characters of @a __str into os following the same rules as for
6464 * writing a C string.
6465 */
6466 template<typename _CharT, typename _Traits, typename _Alloc>
6467 inline basic_ostream<_CharT, _Traits>&
6468 operator<<(basic_ostream<_CharT, _Traits>& __os,
6469 const basic_string<_CharT, _Traits, _Alloc>& __str)
6470 {
6471 // _GLIBCXX_RESOLVE_LIB_DEFECTS
6472 // 586. string inserter not a formatted function
6473 return __ostream_insert(__os, __str.data(), __str.size());
6474 }
6475
6476 /**
6477 * @brief Read a line from stream into a string.
6478 * @param __is Input stream.
6479 * @param __str Buffer to store into.
6480 * @param __delim Character marking end of line.
6481 * @return Reference to the input stream.
6482 *
6483 * Stores characters from @a __is into @a __str until @a __delim is
6484 * found, the end of the stream is encountered, or str.max_size()
6485 * is reached. Any previous contents of @a __str are erased. If
6486 * @a __delim is encountered, it is extracted but not stored into
6487 * @a __str.
6488 */
6489 template<typename _CharT, typename _Traits, typename _Alloc>
6490 basic_istream<_CharT, _Traits>&
6491 getline(basic_istream<_CharT, _Traits>& __is,
6492 basic_string<_CharT, _Traits, _Alloc>& __str, _CharT __delim);
6493
6494 /**
6495 * @brief Read a line from stream into a string.
6496 * @param __is Input stream.
6497 * @param __str Buffer to store into.
6498 * @return Reference to the input stream.
6499 *
6500    * Stores characters from is into @a __str until '\n' is
6501 * found, the end of the stream is encountered, or str.max_size()
6502 * is reached. Any previous contents of @a __str are erased. If
6503 * end of line is encountered, it is extracted but not stored into
6504 * @a __str.
6505 */
6506 template<typename _CharT, typename _Traits, typename _Alloc>
6507 inline basic_istream<_CharT, _Traits>&
6508 getline(basic_istream<_CharT, _Traits>& __is,
6509 basic_string<_CharT, _Traits, _Alloc>& __str)
6510 { return std::getline(__is, __str, __is.widen('\n')); }
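
// Illustrative usage sketch (editor's addition, not part of the original
// header): operator>> stops at whitespace, while getline consumes characters
// up to and including the delimiter but does not store the delimiter.
#include <iostream>
#include <sstream>
#include <string>

int getline_example() {
  std::istringstream in("alpha beta\ngamma");
  std::string word, rest;
  in >> word;             // word == "alpha"; extraction stops at the space
  std::getline(in, rest); // rest == " beta"; the '\n' is extracted, not stored
  std::cout << word << '|' << rest << '\n'; // prints "alpha| beta"
  return 0;
}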
6511
6512#if __cplusplus >= 201103L
6513 /// Read a line from an rvalue stream into a string.
6514 template<typename _CharT, typename _Traits, typename _Alloc>
6515 inline basic_istream<_CharT, _Traits>&
6516 getline(basic_istream<_CharT, _Traits>&& __is,
6517 basic_string<_CharT, _Traits, _Alloc>& __str, _CharT __delim)
6518 { return std::getline(__is, __str, __delim); }
6519
6520 /// Read a line from an rvalue stream into a string.
6521 template<typename _CharT, typename _Traits, typename _Alloc>
6522 inline basic_istream<_CharT, _Traits>&
6523 getline(basic_istream<_CharT, _Traits>&& __is,
6524 basic_string<_CharT, _Traits, _Alloc>& __str)
6525 { return std::getline(__is, __str); }
6526#endif
6527
6528 template<>
6529 basic_istream<char>&
6530 getline(basic_istream<char>& __in, basic_string<char>& __str,
6531 char __delim);
6532
6533#ifdef _GLIBCXX_USE_WCHAR_T
6534 template<>
6535 basic_istream<wchar_t>&
6536 getline(basic_istream<wchar_t>& __in, basic_string<wchar_t>& __str,
6537 wchar_t __delim);
6538#endif
6539
6540_GLIBCXX_END_NAMESPACE_VERSION
6541} // namespace
6542
6543#if __cplusplus >= 201103L
6544
6545#include <ext/string_conversions.h>
6546#include <bits/charconv.h>
6547
6548namespace std _GLIBCXX_VISIBILITY(default)
6549{
6550_GLIBCXX_BEGIN_NAMESPACE_VERSION
6551_GLIBCXX_BEGIN_NAMESPACE_CXX11
6552
6553#if _GLIBCXX_USE_C99_STDLIB
6554 // 21.4 Numeric Conversions [string.conversions].
6555 inline int
6556 stoi(const string& __str, size_t* __idx = 0, int __base = 10)
6557 { return __gnu_cxx::__stoa<long, int>(&std::strtol, "stoi", __str.c_str(),
6558 __idx, __base); }
6559
6560 inline long
6561 stol(const string& __str, size_t* __idx = 0, int __base = 10)
6562 { return __gnu_cxx::__stoa(&std::strtol, "stol", __str.c_str(),
6563 __idx, __base); }
6564
6565 inline unsigned long
6566 stoul(const string& __str, size_t* __idx = 0, int __base = 10)
6567 { return __gnu_cxx::__stoa(&std::strtoul, "stoul", __str.c_str(),
6568 __idx, __base); }
6569
6570 inline long long
6571 stoll(const string& __str, size_t* __idx = 0, int __base = 10)
6572 { return __gnu_cxx::__stoa(&std::strtoll, "stoll", __str.c_str(),
6573 __idx, __base); }
6574
6575 inline unsigned long long
6576 stoull(const string& __str, size_t* __idx = 0, int __base = 10)
6577 { return __gnu_cxx::__stoa(&std::strtoull, "stoull", __str.c_str(),
6578 __idx, __base); }
6579
6580 // NB: strtof vs strtod.
6581 inline float
6582 stof(const string& __str, size_t* __idx = 0)
6583 { return __gnu_cxx::__stoa(&std::strtof, "stof", __str.c_str(), __idx); }
6584
6585 inline double
6586 stod(const string& __str, size_t* __idx = 0)
6587 { return __gnu_cxx::__stoa(&std::strtod, "stod", __str.c_str(), __idx); }
6588
6589 inline long double
6590 stold(const string& __str, size_t* __idx = 0)
6591 { return __gnu_cxx::__stoa(&std::strtold, "stold", __str.c_str(), __idx); }
6592#endif // _GLIBCXX_USE_C99_STDLIB
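
// Illustrative sketch (editor's addition): each sto* wrapper above forwards to
// the corresponding C strto* function through __gnu_cxx::__stoa, which adds
// the checks that make these functions throw std::invalid_argument or
// std::out_of_range instead of failing silently.
#include <cassert>
#include <cstddef>
#include <string>

void sto_examples() {
  std::size_t idx = 0;
  int v = std::stoi("2a", &idx, 16);     // base 16: "2a" -> 42, idx == 2
  assert(v == 42 && idx == 2);
  double d = std::stod("3.5rest", &idx); // parses "3.5" -> 3.5, idx == 3
  assert(d == 3.5 && idx == 3);
  // std::stoi("") throws std::invalid_argument;
  // std::stoi("99999999999999") throws std::out_of_range.
}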
6593
6594   // DR 1261. Insufficient overloads for to_string / to_wstring
6595
6596 inline string
6597 to_string(int __val)
6598 {
6599 const bool __neg = __val < 0;
6600 const unsigned __uval = __neg ? (unsigned)~__val + 1u : __val;
6601 const auto __len = __detail::__to_chars_len(__uval);
6602 string __str(__neg + __len, '-');
6603 __detail::__to_chars_10_impl(&__str[__neg], __len, __uval);
6604 return __str;
6605 }
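
// Worked trace (editor's addition) of the sign handling above, for
// __val == -42:
//   __neg  == true
//   __uval == (unsigned)~(-42) + 1u == 42, computed in unsigned arithmetic,
//             so the negation is well defined even for INT_MIN
//   __str  is created as __neg + __len == 3 copies of '-', so index 0 already
//          holds the sign, and __to_chars_10_impl writes the digits starting
//          at &__str[1], yielding "-42".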
6606
6607 inline string
6608 to_string(unsigned __val)
6609 {
6610 string __str(__detail::__to_chars_len(__val), '\0');
6611 __detail::__to_chars_10_impl(&__str[0], __str.size(), __val);
6612 return __str;
6613 }
6614
6615 inline string
6616 to_string(long __val)
6617 {
6618 const bool __neg = __val < 0;
6619 const unsigned long __uval = __neg ? (unsigned long)~__val + 1ul : __val;
6620 const auto __len = __detail::__to_chars_len(__uval);
6621 string __str(__neg + __len, '-');
6622 __detail::__to_chars_10_impl(&__str[__neg], __len, __uval);
6623 return __str;
6624 }
6625
6626 inline string
6627 to_string(unsigned long __val)
6628 {
6629 string __str(__detail::__to_chars_len(__val), '\0');
6630 __detail::__to_chars_10_impl(&__str[0], __str.size(), __val);
6631 return __str;
6632 }
6633
6634 inline string
6635 to_string(long long __val)
6636 {
6637 const bool __neg = __val < 0;
6638 const unsigned long long __uval
6639 = __neg ? (unsigned long long)~__val + 1ull : __val;
6640 const auto __len = __detail::__to_chars_len(__uval);
6641 string __str(__neg + __len, '-');
6642 __detail::__to_chars_10_impl(&__str[__neg], __len, __uval);
6643 return __str;
6644 }
6645
6646 inline string
6647 to_string(unsigned long long __val)
6648 {
6649 string __str(__detail::__to_chars_len(__val), '\0');
6650 __detail::__to_chars_10_impl(&__str[0], __str.size(), __val);
6651 return __str;
6652 }
6653
6654#if _GLIBCXX_USE_C99_STDIO
6655 // NB: (v)snprintf vs sprintf.
6656
6657 inline string
6658 to_string(float __val)
6659 {
6660 const int __n =
6661 __gnu_cxx::__numeric_traits<float>::__max_exponent10 + 20;
6662 return __gnu_cxx::__to_xstring<string>(&std::vsnprintf, __n,
6663 "%f", __val);
6664 }
6665
6666 inline string
6667 to_string(double __val)
6668 {
6669 const int __n =
6670 __gnu_cxx::__numeric_traits<double>::__max_exponent10 + 20;
6671 return __gnu_cxx::__to_xstring<string>(&std::vsnprintf, __n,
6672 "%f", __val);
6673 }
6674
6675 inline string
6676 to_string(long double __val)
6677 {
6678 const int __n =
6679 __gnu_cxx::__numeric_traits<long double>::__max_exponent10 + 20;
6680 return __gnu_cxx::__to_xstring<string>(&std::vsnprintf, __n,
6681 "%Lf", __val);
6682 }
6683#endif // _GLIBCXX_USE_C99_STDIO
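
// Sizing note (editor's addition): with "%f" the longest plausible output is
// about __max_exponent10 digits before the decimal point plus a sign, the
// point, and six fractional digits; the "+ 20" leaves slack for those. For
// double, __max_exponent10 == 308, so __n == 328, comfortably above the
// roughly 316 characters "%f" can produce for DBL_MAX.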
6684
6685#if defined(_GLIBCXX_USE_WCHAR_T) && _GLIBCXX_USE_C99_WCHAR
6686 inline int
6687 stoi(const wstring& __str, size_t* __idx = 0, int __base = 10)
6688 { return __gnu_cxx::__stoa<long, int>(&std::wcstol, "stoi", __str.c_str(),
6689 __idx, __base); }
6690
6691 inline long
6692 stol(const wstring& __str, size_t* __idx = 0, int __base = 10)
6693 { return __gnu_cxx::__stoa(&std::wcstol, "stol", __str.c_str(),
6694 __idx, __base); }
6695
6696 inline unsigned long
6697 stoul(const wstring& __str, size_t* __idx = 0, int __base = 10)
6698 { return __gnu_cxx::__stoa(&std::wcstoul, "stoul", __str.c_str(),
6699 __idx, __base); }
6700
6701 inline long long
6702 stoll(const wstring& __str, size_t* __idx = 0, int __base = 10)
6703 { return __gnu_cxx::__stoa(&std::wcstoll, "stoll", __str.c_str(),
6704 __idx, __base); }
6705
6706 inline unsigned long long
6707 stoull(const wstring& __str, size_t* __idx = 0, int __base = 10)
6708 { return __gnu_cxx::__stoa(&std::wcstoull, "stoull", __str.c_str(),
6709 __idx, __base); }
6710
6711 // NB: wcstof vs wcstod.
6712 inline float
6713 stof(const wstring& __str, size_t* __idx = 0)
6714 { return __gnu_cxx::__stoa(&std::wcstof, "stof", __str.c_str(), __idx); }
6715
6716 inline double
6717 stod(const wstring& __str, size_t* __idx = 0)
6718 { return __gnu_cxx::__stoa(&std::wcstod, "stod", __str.c_str(), __idx); }
6719
6720 inline long double
6721 stold(const wstring& __str, size_t* __idx = 0)
6722 { return __gnu_cxx::__stoa(&std::wcstold, "stold", __str.c_str(), __idx); }
6723
6724#ifndef _GLIBCXX_HAVE_BROKEN_VSWPRINTF
6725 // DR 1261.
6726 inline wstring
6727 to_wstring(int __val)
6728 { return __gnu_cxx::__to_xstring<wstring>(&std::vswprintf, 4 * sizeof(int),
6729 L"%d", __val); }
6730
6731 inline wstring
6732 to_wstring(unsigned __val)
6733 { return __gnu_cxx::__to_xstring<wstring>(&std::vswprintf,
6734 4 * sizeof(unsigned),
6735 L"%u", __val); }
6736
6737 inline wstring
6738 to_wstring(long __val)
6739 { return __gnu_cxx::__to_xstring<wstring>(&std::vswprintf, 4 * sizeof(long),
6740 L"%ld", __val); }
6741
6742 inline wstring
6743 to_wstring(unsigned long __val)
6744 { return __gnu_cxx::__to_xstring<wstring>(&std::vswprintf,
6745 4 * sizeof(unsigned long),
6746 L"%lu", __val); }
6747
6748 inline wstring
6749 to_wstring(long long __val)
6750 { return __gnu_cxx::__to_xstring<wstring>(&std::vswprintf,
6751 4 * sizeof(long long),
6752 L"%lld", __val); }
6753
6754 inline wstring
6755 to_wstring(unsigned long long __val)
6756 { return __gnu_cxx::__to_xstring<wstring>(&std::vswprintf,
6757 4 * sizeof(unsigned long long),
6758 L"%llu", __val); }
6759
6760 inline wstring
6761 to_wstring(float __val)
6762 {
6763 const int __n =
6764 __gnu_cxx::__numeric_traits<float>::__max_exponent10 + 20;
6765 return __gnu_cxx::__to_xstring<wstring>(&std::vswprintf, __n,
6766 L"%f", __val);
6767 }
6768
6769 inline wstring
6770 to_wstring(double __val)
6771 {
6772 const int __n =
6773 __gnu_cxx::__numeric_traits<double>::__max_exponent10 + 20;
6774 return __gnu_cxx::__to_xstring<wstring>(&std::vswprintf, __n,
6775 L"%f", __val);
6776 }
6777
6778 inline wstring
6779 to_wstring(long double __val)
6780 {
6781 const int __n =
6782 __gnu_cxx::__numeric_traits<long double>::__max_exponent10 + 20;
6783 return __gnu_cxx::__to_xstring<wstring>(&std::vswprintf, __n,
6784 L"%Lf", __val);
6785 }
6786#endif // _GLIBCXX_HAVE_BROKEN_VSWPRINTF
6787#endif // _GLIBCXX_USE_WCHAR_T && _GLIBCXX_USE_C99_WCHAR
6788
6789_GLIBCXX_END_NAMESPACE_CXX11
6790_GLIBCXX_END_NAMESPACE_VERSION
6791} // namespace
6792
6793#endif /* C++11 */
6794
6795#if __cplusplus >= 201103L
6796
6797#include <bits/functional_hash.h>
6798
6799namespace std _GLIBCXX_VISIBILITY(default)
6800{
6801_GLIBCXX_BEGIN_NAMESPACE_VERSION
6802
6803 // DR 1182.
6804
6805#ifndef _GLIBCXX_COMPATIBILITY_CXX0X
6806 /// std::hash specialization for string.
6807 template<>
6808 struct hash<string>
6809 : public __hash_base<size_t, string>
6810 {
6811 size_t
6812 operator()(const string& __s) const noexcept
6813 { return std::_Hash_impl::hash(__s.data(), __s.length()); }
6814 };
6815
6816 template<>
6817 struct __is_fast_hash<hash<string>> : std::false_type
6818 { };
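
// Illustrative sketch (editor's addition): the specialization above is what
// lets std::string be used directly as an unordered-container key, and
// __is_fast_hash is false because hashing walks the whole buffer (O(length)),
// a hint the library uses when deciding whether to cache hash codes.
#include <string>
#include <unordered_set>

bool hash_example() {
  std::unordered_set<std::string> seen;
  seen.insert("hello");           // hashes data()/length() via _Hash_impl
  return seen.count("hello") == 1;
}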
6819
6820#ifdef _GLIBCXX_USE_WCHAR_T
6821 /// std::hash specialization for wstring.
6822 template<>
6823 struct hash<wstring>
6824 : public __hash_base<size_t, wstring>
6825 {
6826 size_t
6827 operator()(const wstring& __s) const noexcept
6828 { return std::_Hash_impl::hash(__s.data(),
6829 __s.length() * sizeof(wchar_t)); }
6830 };
6831
6832 template<>
6833 struct __is_fast_hash<hash<wstring>> : std::false_type
6834 { };
6835#endif
6836#endif /* _GLIBCXX_COMPATIBILITY_CXX0X */
6837
6838#ifdef _GLIBCXX_USE_CHAR8_T
6839 /// std::hash specialization for u8string.
6840 template<>
6841 struct hash<u8string>
6842 : public __hash_base<size_t, u8string>
6843 {
6844 size_t
6845 operator()(const u8string& __s) const noexcept
6846 { return std::_Hash_impl::hash(__s.data(),
6847 __s.length() * sizeof(char8_t)); }
6848 };
6849
6850 template<>
6851 struct __is_fast_hash<hash<u8string>> : std::false_type
6852 { };
6853#endif
6854
6855 /// std::hash specialization for u16string.
6856 template<>
6857 struct hash<u16string>
6858 : public __hash_base<size_t, u16string>
6859 {
6860 size_t
6861 operator()(const u16string& __s) const noexcept
6862 { return std::_Hash_impl::hash(__s.data(),
6863 __s.length() * sizeof(char16_t)); }
6864 };
6865
6866 template<>
6867 struct __is_fast_hash<hash<u16string>> : std::false_type
6868 { };
6869
6870 /// std::hash specialization for u32string.
6871 template<>
6872 struct hash<u32string>
6873 : public __hash_base<size_t, u32string>
6874 {
6875 size_t
6876 operator()(const u32string& __s) const noexcept
6877 { return std::_Hash_impl::hash(__s.data(),
6878 __s.length() * sizeof(char32_t)); }
6879 };
6880
6881 template<>
6882 struct __is_fast_hash<hash<u32string>> : std::false_type
6883 { };
6884
6885#if __cplusplus >= 201402L
6886
6887#define __cpp_lib_string_udls 201304
6888
6889 inline namespace literals
6890 {
6891 inline namespace string_literals
6892 {
6893#pragma GCC diagnostic push
6894#pragma GCC diagnostic ignored "-Wliteral-suffix"
6895     _GLIBCXX_DEFAULT_ABI_TAG
6896 inline basic_string<char>
6897 operator""s(const char* __str, size_t __len)
6898 { return basic_string<char>{__str, __len}; }
6899
6900#ifdef _GLIBCXX_USE_WCHAR_T
6901     _GLIBCXX_DEFAULT_ABI_TAG
6902 inline basic_string<wchar_t>
6903 operator""s(const wchar_t* __str, size_t __len)
6904 { return basic_string<wchar_t>{__str, __len}; }
6905#endif
6906
6907#ifdef _GLIBCXX_USE_CHAR8_T
6908     _GLIBCXX_DEFAULT_ABI_TAG
6909 inline basic_string<char8_t>
6910 operator""s(const char8_t* __str, size_t __len)
6911 { return basic_string<char8_t>{__str, __len}; }
6912#endif
6913
6914     _GLIBCXX_DEFAULT_ABI_TAG
6915 inline basic_string<char16_t>
6916 operator""s(const char16_t* __str, size_t __len)
6917 { return basic_string<char16_t>{__str, __len}; }
6918
6919     _GLIBCXX_DEFAULT_ABI_TAG
6920 inline basic_string<char32_t>
6921 operator""s(const char32_t* __str, size_t __len)
6922 { return basic_string<char32_t>{__str, __len}; }
6923
6924#pragma GCC diagnostic pop
6925 } // inline namespace string_literals
6926 } // inline namespace literals
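
// Illustrative sketch (editor's addition): because operator""s receives an
// explicit length, embedded NUL characters survive, unlike construction from
// a plain const char*.
#include <string>

inline bool udl_example() {
  using namespace std::string_literals;
  auto s = "a\0b"s;             // size() == 3; the embedded NUL is kept
  std::string t = "a\0b";       // size() == 1; const char* stops at the NUL
  return s.size() == 3 && t.size() == 1;
}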
6927
6928#if __cplusplus >= 201703L
6929 namespace __detail::__variant
6930 {
6931 template<typename> struct _Never_valueless_alt; // see <variant>
6932
6933 // Provide the strong exception-safety guarantee when emplacing a
6934 // basic_string into a variant, but only if moving the string cannot throw.
6935 template<typename _Tp, typename _Traits, typename _Alloc>
6936 struct _Never_valueless_alt<std::basic_string<_Tp, _Traits, _Alloc>>
6937 : __and_<
6938 is_nothrow_move_constructible<std::basic_string<_Tp, _Traits, _Alloc>>,
6939 is_nothrow_move_assignable<std::basic_string<_Tp, _Traits, _Alloc>>
6940 >::type
6941 { };
6942 } // namespace __detail::__variant
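
// Quick check (editor's addition): with the default allocator the trait above
// holds, because basic_string's move operations are noexcept, so emplacing a
// std::string into a std::variant cannot leave the variant valueless.
#include <string>
#include <type_traits>
static_assert(std::is_nothrow_move_constructible<std::string>::value, "");
static_assert(std::is_nothrow_move_assignable<std::string>::value, "");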
6943#endif // C++17
6944#endif // C++14
6945
6946_GLIBCXX_END_NAMESPACE_VERSION
6947} // namespace std
6948
6949#endif // C++11
6950
6951#endif /* _BASIC_STRING_H */

/build/source/llvm/lib/Transforms/Vectorize/VPlan.h

1//===- VPlan.h - Represent A Vectorizer Plan --------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file contains the declarations of the Vectorization Plan base classes:
11/// 1. VPBasicBlock and VPRegionBlock that inherit from a common pure virtual
12/// VPBlockBase, together implementing a Hierarchical CFG;
13/// 2. Specializations of GraphTraits that allow VPBlockBase graphs to be
14/// treated as proper graphs for generic algorithms;
15/// 3. Pure virtual VPRecipeBase serving as the base class for recipes contained
16/// within VPBasicBlocks;
17/// 4. VPInstruction, a concrete Recipe and VPUser modeling a single planned
18/// instruction;
19/// 5. The VPlan class holding a candidate for vectorization;
20/// 6. The VPlanPrinter class providing a way to print a plan in dot format;
21/// These are documented in docs/VectorizationPlan.rst.
22//
23//===----------------------------------------------------------------------===//
24
25#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
26#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
27
28#include "VPlanValue.h"
29#include "llvm/ADT/DenseMap.h"
30#include "llvm/ADT/DepthFirstIterator.h"
31#include "llvm/ADT/GraphTraits.h"
32#include "llvm/ADT/MapVector.h"
33#include "llvm/ADT/SmallBitVector.h"
34#include "llvm/ADT/SmallPtrSet.h"
35#include "llvm/ADT/SmallVector.h"
36#include "llvm/ADT/Twine.h"
37#include "llvm/ADT/ilist.h"
38#include "llvm/ADT/ilist_node.h"
39#include "llvm/Analysis/LoopInfo.h"
40#include "llvm/Analysis/VectorUtils.h"
41#include "llvm/IR/DebugLoc.h"
42#include "llvm/IR/FMF.h"
43#include "llvm/Transforms/Utils/LoopVersioning.h"
44#include <algorithm>
45#include <cassert>
46#include <cstddef>
47#include <string>
48
49namespace llvm {
50
51class BasicBlock;
52class DominatorTree;
53class InductionDescriptor;
54class InnerLoopVectorizer;
55class IRBuilderBase;
56class LoopInfo;
57 class PredicatedScalarEvolution;
58class raw_ostream;
59class RecurrenceDescriptor;
60class SCEV;
61class Type;
62class VPBasicBlock;
63class VPRegionBlock;
64class VPlan;
65class VPReplicateRecipe;
66class VPlanSlp;
67class Value;
68
69namespace Intrinsic {
70typedef unsigned ID;
71}
72
73/// Returns a calculation for the total number of elements for a given \p VF.
74/// For fixed width vectors this value is a constant, whereas for scalable
75/// vectors it is an expression determined at runtime.
76Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);
77
78/// Return a value for Step multiplied by VF.
79Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
80 int64_t Step);
81
82const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE);
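
// Illustrative sketch (editor's addition, simplified): for a fixed VF such as
// <4 x i32>, getRuntimeVF folds to the constant 4; for a scalable VF such as
// <vscale x 4 x i32> it materializes a runtime expression along the lines of
// `mul i64 vscale, 4`. `Builder` below is assumed to be a live IRBuilder.
//
//   Value *RT = getRuntimeVF(Builder, Builder.getInt64Ty(),
//                            ElementCount::getScalable(4)); // ~ vscale * 4
//   Value *Step = createStepForVF(Builder, Builder.getInt64Ty(),
//                                 ElementCount::getFixed(4), 2); // constant 8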
83
84/// A range of powers-of-2 vectorization factors with fixed start and
85/// adjustable end. The range includes start and excludes end, e.g.,:
86/// [1, 9) = {1, 2, 4, 8}
87struct VFRange {
88 // A power of 2.
89 const ElementCount Start;
90
91   // Need not be a power of 2. If End <= Start, the range is empty.
92 ElementCount End;
93
94 bool isEmpty() const {
95 return End.getKnownMinValue() <= Start.getKnownMinValue();
96 }
97
98 VFRange(const ElementCount &Start, const ElementCount &End)
99 : Start(Start), End(End) {
100     assert(Start.isScalable() == End.isScalable() &&
101            "Both Start and End should have the same scalable flag");
102     assert(isPowerOf2_32(Start.getKnownMinValue()) &&
103            "Expected Start to be a power of 2");
104 }
105};
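
// Example (editor's addition): since the range includes Start and excludes
// End, the VFRange below enumerates the vectorization factors {4, 8}:
//
//   VFRange R(ElementCount::getFixed(4), ElementCount::getFixed(16));
//   assert(!R.isEmpty());              // 4 < 16
//   R.End = ElementCount::getFixed(4); // clamping: now End <= Start,
//   assert(R.isEmpty());               // so the range is empty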
106
107using VPlanPtr = std::unique_ptr<VPlan>;
108
109/// In what follows, the term "input IR" refers to code that is fed into the
110/// vectorizer whereas the term "output IR" refers to code that is generated by
111/// the vectorizer.
112
113/// VPLane provides a way to access lanes in both fixed width and scalable
114/// vectors, where for the latter the lane index sometimes needs calculating
115/// as a runtime expression.
116class VPLane {
117public:
118 /// Kind describes how to interpret Lane.
119 enum class Kind : uint8_t {
120 /// For First, Lane is the index into the first N elements of a
121 /// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>.
122 First,
123 /// For ScalableLast, Lane is the offset from the start of the last
124 /// N-element subvector in a scalable vector <vscale x N x <ElTy>>. For
125 /// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of
126 /// 1 corresponds to `((vscale - 1) * N) + 1`, etc.
127 ScalableLast
128 };
129
130private:
131 /// in [0..VF)
132 unsigned Lane;
133
134 /// Indicates how the Lane should be interpreted, as described above.
135 Kind LaneKind;
136
137public:
138 VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {}
139
140 static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); }
141
142 static VPLane getLastLaneForVF(const ElementCount &VF) {
143 unsigned LaneOffset = VF.getKnownMinValue() - 1;
144 Kind LaneKind;
145 if (VF.isScalable())
146 // In this case 'LaneOffset' refers to the offset from the start of the
147 // last subvector with VF.getKnownMinValue() elements.
148 LaneKind = VPLane::Kind::ScalableLast;
149 else
150 LaneKind = VPLane::Kind::First;
151 return VPLane(LaneOffset, LaneKind);
152 }
153
154 /// Returns a compile-time known value for the lane index and asserts if the
155 /// lane can only be calculated at runtime.
156 unsigned getKnownLane() const {
157     assert(LaneKind == Kind::First);
158 return Lane;
159 }
160
161 /// Returns an expression describing the lane index that can be used at
162 /// runtime.
163 Value *getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const;
164
165 /// Returns the Kind of lane offset.
166 Kind getKind() const { return LaneKind; }
167
168 /// Returns true if this is the first lane of the whole vector.
169 bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; }
170
171 /// Maps the lane to a cache index based on \p VF.
172 unsigned mapToCacheIndex(const ElementCount &VF) const {
173 switch (LaneKind) {
174 case VPLane::Kind::ScalableLast:
175       assert(VF.isScalable() && Lane < VF.getKnownMinValue());
176 return VF.getKnownMinValue() + Lane;
177 default:
178       assert(Lane < VF.getKnownMinValue());
179 return Lane;
180 }
181 }
182
183   /// Returns the maximum number of lanes that we are able to consider
184 /// caching for \p VF.
185 static unsigned getNumCachedLanes(const ElementCount &VF) {
186 return VF.getKnownMinValue() * (VF.isScalable() ? 2 : 1);
187 }
188};
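
// Worked example (editor's addition) of the scalable-lane mapping above, for
// VF = <vscale x 4 x ...>:
//
//   VPLane L = VPLane::getLastLaneForVF(ElementCount::getScalable(4));
//   // Kind::ScalableLast with offset 3, i.e. runtime lane (vscale - 1)*4 + 3.
//   unsigned Idx = L.mapToCacheIndex(ElementCount::getScalable(4)); // 4+3 == 7
//   // getNumCachedLanes(...) == 8: indices [0,4) cache Kind::First lanes and
//   // indices [4,8) cache Kind::ScalableLast lanes.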
189
190/// VPIteration represents a single point in the iteration space of the output
191/// (vectorized and/or unrolled) IR loop.
192struct VPIteration {
193 /// in [0..UF)
194 unsigned Part;
195
196 VPLane Lane;
197
198 VPIteration(unsigned Part, unsigned Lane,
199 VPLane::Kind Kind = VPLane::Kind::First)
200 : Part(Part), Lane(Lane, Kind) {}
201
202 VPIteration(unsigned Part, const VPLane &Lane) : Part(Part), Lane(Lane) {}
203
204 bool isFirstIteration() const { return Part == 0 && Lane.isFirstLane(); }
205};
206
207/// VPTransformState holds information passed down when "executing" a VPlan,
208/// needed for generating the output IR.
209struct VPTransformState {
210 VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
211 DominatorTree *DT, IRBuilderBase &Builder,
212 InnerLoopVectorizer *ILV, VPlan *Plan)
213 : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan),
214 LVer(nullptr) {}
215
216 /// The chosen Vectorization and Unroll Factors of the loop being vectorized.
217 ElementCount VF;
218 unsigned UF;
219
220 /// Hold the indices to generate specific scalar instructions. Null indicates
221 /// that all instances are to be generated, using either scalar or vector
222 /// instructions.
223 std::optional<VPIteration> Instance;
224
225 struct DataState {
226 /// A type for vectorized values in the new loop. Each value from the
227 /// original loop, when vectorized, is represented by UF vector values in
228 /// the new unrolled loop, where UF is the unroll factor.
229 typedef SmallVector<Value *, 2> PerPartValuesTy;
230
231 DenseMap<VPValue *, PerPartValuesTy> PerPartOutput;
232
233 using ScalarsPerPartValuesTy = SmallVector<SmallVector<Value *, 4>, 2>;
234 DenseMap<VPValue *, ScalarsPerPartValuesTy> PerPartScalars;
235 } Data;
236
237 /// Get the generated Value for a given VPValue and a given Part. Note that
238 /// as some Defs are still created by ILV and managed in its ValueMap, this
239 /// method will delegate the call to ILV in such cases in order to provide
240 /// callers a consistent API.
241 /// \see set.
242 Value *get(VPValue *Def, unsigned Part);
243
244 /// Get the generated Value for a given VPValue and given Part and Lane.
245 Value *get(VPValue *Def, const VPIteration &Instance);
246
247 bool hasVectorValue(VPValue *Def, unsigned Part) {
248 auto I = Data.PerPartOutput.find(Def);
249 return I != Data.PerPartOutput.end() && Part < I->second.size() &&
250 I->second[Part];
251 }
252
253 bool hasAnyVectorValue(VPValue *Def) const {
254 return Data.PerPartOutput.find(Def) != Data.PerPartOutput.end();
255 }
256
257 bool hasScalarValue(VPValue *Def, VPIteration Instance) {
258 auto I = Data.PerPartScalars.find(Def);
259 if (I == Data.PerPartScalars.end())
260 return false;
261 unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
262 return Instance.Part < I->second.size() &&
263 CacheIdx < I->second[Instance.Part].size() &&
264 I->second[Instance.Part][CacheIdx];
265 }
266
267 /// Set the generated Value for a given VPValue and a given Part.
268 void set(VPValue *Def, Value *V, unsigned Part) {
269 if (!Data.PerPartOutput.count(Def)) {
270 DataState::PerPartValuesTy Entry(UF);
271 Data.PerPartOutput[Def] = Entry;
272 }
273 Data.PerPartOutput[Def][Part] = V;
274 }
275 /// Reset an existing vector value for \p Def and a given \p Part.
276 void reset(VPValue *Def, Value *V, unsigned Part) {
277 auto Iter = Data.PerPartOutput.find(Def);
278     assert(Iter != Data.PerPartOutput.end() &&
279            "need to overwrite existing value");
280 Iter->second[Part] = V;
281 }
282
283 /// Set the generated scalar \p V for \p Def and the given \p Instance.
284 void set(VPValue *Def, Value *V, const VPIteration &Instance) {
285 auto Iter = Data.PerPartScalars.insert({Def, {}});
286 auto &PerPartVec = Iter.first->second;
287 while (PerPartVec.size() <= Instance.Part)
288 PerPartVec.emplace_back();
289 auto &Scalars = PerPartVec[Instance.Part];
290 unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
291 while (Scalars.size() <= CacheIdx)
292 Scalars.push_back(nullptr);
293     assert(!Scalars[CacheIdx] && "should overwrite existing value");
294 Scalars[CacheIdx] = V;
295 }
296
297 /// Reset an existing scalar value for \p Def and a given \p Instance.
298 void reset(VPValue *Def, Value *V, const VPIteration &Instance) {
299 auto Iter = Data.PerPartScalars.find(Def);
300     assert(Iter != Data.PerPartScalars.end() &&
301            "need to overwrite existing value");
302     assert(Instance.Part < Iter->second.size() &&
303            "need to overwrite existing value");
304 unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
305     assert(CacheIdx < Iter->second[Instance.Part].size() &&
306            "need to overwrite existing value");
307 Iter->second[Instance.Part][CacheIdx] = V;
308 }
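
// Usage sketch (editor's addition, simplified): a recipe's execute() normally
// produces one vector value per unrolled part and publishes it with set(), so
// later recipes can look it up with get():
//
//   for (unsigned Part = 0; Part < State.UF; ++Part) {
//     Value *V = ...;            // output IR generated for this part
//     State.set(Def, V, Part);   // cached in Data.PerPartOutput[Def][Part]
//   }
//   Value *Op0 = State.get(SomeOperand, /*Part=*/0); // consumer side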
309
310 /// Add additional metadata to \p To that was not present on \p Orig.
311 ///
312 /// Currently this is used to add the noalias annotations based on the
313 /// inserted memchecks. Use this for instructions that are *cloned* into the
314 /// vector loop.
315 void addNewMetadata(Instruction *To, const Instruction *Orig);
316
317 /// Add metadata from one instruction to another.
318 ///
319 /// This includes both the original MDs from \p From and additional ones (\see
320 /// addNewMetadata). Use this for *newly created* instructions in the vector
321 /// loop.
322 void addMetadata(Instruction *To, Instruction *From);
323
324 /// Similar to the previous function but it adds the metadata to a
325 /// vector of instructions.
326 void addMetadata(ArrayRef<Value *> To, Instruction *From);
327
328 /// Set the debug location in the builder using the debug location in \p V.
329 void setDebugLocFromInst(const Value *V);
330
331 /// Hold state information used when constructing the CFG of the output IR,
332 /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
333 struct CFGState {
334 /// The previous VPBasicBlock visited. Initially set to null.
335 VPBasicBlock *PrevVPBB = nullptr;
336
337 /// The previous IR BasicBlock created or used. Initially set to the new
338 /// header BasicBlock.
339 BasicBlock *PrevBB = nullptr;
340
341 /// The last IR BasicBlock in the output IR. Set to the exit block of the
342 /// vector loop.
343 BasicBlock *ExitBB = nullptr;
344
345 /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
346 /// of replication, maps the BasicBlock of the last replica created.
347 SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;
348
349 CFGState() = default;
350
351 /// Returns the BasicBlock* mapped to the pre-header of the loop region
352 /// containing \p R.
353 BasicBlock *getPreheaderBBFor(VPRecipeBase *R);
354 } CFG;
355
356 /// Hold a pointer to LoopInfo to register new basic blocks in the loop.
357 LoopInfo *LI;
358
359 /// Hold a pointer to Dominator Tree to register new basic blocks in the loop.
360 DominatorTree *DT;
361
362 /// Hold a reference to the IRBuilder used to generate output IR code.
363 IRBuilderBase &Builder;
364
365 VPValue2ValueTy VPValue2Value;
366
367 /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF).
368 Value *CanonicalIV = nullptr;
369
370 /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
371 InnerLoopVectorizer *ILV;
372
373   /// Pointer to the VPlan for which code is generated.
374 VPlan *Plan;
375
376 /// Holds recipes that may generate a poison value that is used after
377 /// vectorization, even when their operands are not poison.
378 SmallPtrSet<VPRecipeBase *, 16> MayGeneratePoisonRecipes;
379
380 /// The loop object for the current parent region, or nullptr.
381 Loop *CurrentVectorLoop = nullptr;
382
383 /// LoopVersioning. It's only set up (non-null) if memchecks were
384 /// used.
385 ///
386 /// This is currently only used to add no-alias metadata based on the
387   /// memchecks. The actual versioning is performed manually.
388 std::unique_ptr<LoopVersioning> LVer;
389};
390
391/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
392/// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock.
393class VPBlockBase {
394 friend class VPBlockUtils;
395
396 const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
397
398 /// An optional name for the block.
399 std::string Name;
400
401 /// The immediate VPRegionBlock which this VPBlockBase belongs to, or null if
402 /// it is a topmost VPBlockBase.
403 VPRegionBlock *Parent = nullptr;
404
405 /// List of predecessor blocks.
406 SmallVector<VPBlockBase *, 1> Predecessors;
407
408 /// List of successor blocks.
409 SmallVector<VPBlockBase *, 1> Successors;
410
411 /// VPlan containing the block. Can only be set on the entry block of the
412 /// plan.
413 VPlan *Plan = nullptr;
414
415 /// Add \p Successor as the last successor to this block.
416 void appendSuccessor(VPBlockBase *Successor) {
417     assert(Successor && "Cannot add nullptr successor!");
418 Successors.push_back(Successor);
419 }
420
421 /// Add \p Predecessor as the last predecessor to this block.
422 void appendPredecessor(VPBlockBase *Predecessor) {
423     assert(Predecessor && "Cannot add nullptr predecessor!");
424 Predecessors.push_back(Predecessor);
425 }
426
427 /// Remove \p Predecessor from the predecessors of this block.
428 void removePredecessor(VPBlockBase *Predecessor) {
429 auto Pos = find(Predecessors, Predecessor);
430     assert(Pos && "Predecessor does not exist");
431 Predecessors.erase(Pos);
432 }
433
434 /// Remove \p Successor from the successors of this block.
435 void removeSuccessor(VPBlockBase *Successor) {
436 auto Pos = find(Successors, Successor);
437     assert(Pos && "Successor does not exist");
438 Successors.erase(Pos);
439 }
440
441protected:
442 VPBlockBase(const unsigned char SC, const std::string &N)
443 : SubclassID(SC), Name(N) {}
444
445public:
446 /// An enumeration for keeping track of the concrete subclass of VPBlockBase
447 /// that are actually instantiated. Values of this enumeration are kept in the
448 /// SubclassID field of the VPBlockBase objects. They are used for concrete
449 /// type identification.
450 using VPBlockTy = enum { VPBasicBlockSC, VPRegionBlockSC };
451
452 using VPBlocksTy = SmallVectorImpl<VPBlockBase *>;
453
454 virtual ~VPBlockBase() = default;
455
456 const std::string &getName() const { return Name; }
457
458 void setName(const Twine &newName) { Name = newName.str(); }
459
460 /// \return an ID for the concrete type of this object.
461 /// This is used to implement the classof checks. This should not be used
462 /// for any other purpose, as the values may change as LLVM evolves.
463 unsigned getVPBlockID() const { return SubclassID; }
464
465 VPRegionBlock *getParent() { return Parent; }
466 const VPRegionBlock *getParent() const { return Parent; }
467
468 /// \return A pointer to the plan containing the current block.
469 VPlan *getPlan();
470 const VPlan *getPlan() const;
471
472 /// Sets the pointer of the plan containing the block. The block must be the
473 /// entry block into the VPlan.
474 void setPlan(VPlan *ParentPlan);
475
476 void setParent(VPRegionBlock *P) { Parent = P; }
477
478 /// \return the VPBasicBlock that is the entry of this VPBlockBase,
479 /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
480 /// VPBlockBase is a VPBasicBlock, it is returned.
481 const VPBasicBlock *getEntryBasicBlock() const;
482 VPBasicBlock *getEntryBasicBlock();
483
484   /// \return the VPBasicBlock that is the exiting block of this VPBlockBase,
485 /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
486 /// VPBlockBase is a VPBasicBlock, it is returned.
487 const VPBasicBlock *getExitingBasicBlock() const;
488 VPBasicBlock *getExitingBasicBlock();
489
490 const VPBlocksTy &getSuccessors() const { return Successors; }
491 VPBlocksTy &getSuccessors() { return Successors; }
492
493 iterator_range<VPBlockBase **> successors() { return Successors; }
494
495 const VPBlocksTy &getPredecessors() const { return Predecessors; }
496 VPBlocksTy &getPredecessors() { return Predecessors; }
497
498 /// \return the successor of this VPBlockBase if it has a single successor.
499 /// Otherwise return a null pointer.
500 VPBlockBase *getSingleSuccessor() const {
501 return (Successors.size() == 1 ? *Successors.begin() : nullptr);
502 }
503
504 /// \return the predecessor of this VPBlockBase if it has a single
505 /// predecessor. Otherwise return a null pointer.
506 VPBlockBase *getSinglePredecessor() const {
507 return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr);
508 }
509
510 size_t getNumSuccessors() const { return Successors.size(); }
511 size_t getNumPredecessors() const { return Predecessors.size(); }
512
513 /// An Enclosing Block of a block B is any block containing B, including B
514 /// itself. \return the closest enclosing block starting from "this", which
515 /// has successors. \return the root enclosing block if all enclosing blocks
516 /// have no successors.
517 VPBlockBase *getEnclosingBlockWithSuccessors();
518
519 /// \return the closest enclosing block starting from "this", which has
520 /// predecessors. \return the root enclosing block if all enclosing blocks
521 /// have no predecessors.
522 VPBlockBase *getEnclosingBlockWithPredecessors();
523
524 /// \return the successors either attached directly to this VPBlockBase or, if
525 /// this VPBlockBase is the exit block of a VPRegionBlock and has no
526 /// successors of its own, search recursively for the first enclosing
527 /// VPRegionBlock that has successors and return them. If no such
528 /// VPRegionBlock exists, return the (empty) successors of the topmost
529 /// VPBlockBase reached.
530 const VPBlocksTy &getHierarchicalSuccessors() {
531 return getEnclosingBlockWithSuccessors()->getSuccessors();
532 }
533
534 /// \return the hierarchical successor of this VPBlockBase if it has a single
535 /// hierarchical successor. Otherwise return a null pointer.
536 VPBlockBase *getSingleHierarchicalSuccessor() {
537 return getEnclosingBlockWithSuccessors()->getSingleSuccessor();
538 }
539
540 /// \return the predecessors either attached directly to this VPBlockBase or,
541 /// if this VPBlockBase is the entry block of a VPRegionBlock and has no
542 /// predecessors of its own, search recursively for the first enclosing
543 /// VPRegionBlock that has predecessors and return them. If no such
544 /// VPRegionBlock exists, return the (empty) predecessors of the topmost
545 /// VPBlockBase reached.
546 const VPBlocksTy &getHierarchicalPredecessors() {
547 return getEnclosingBlockWithPredecessors()->getPredecessors();
548 }
549
550 /// \return the hierarchical predecessor of this VPBlockBase if it has a
551 /// single hierarchical predecessor. Otherwise return a null pointer.
552 VPBlockBase *getSingleHierarchicalPredecessor() {
553 return getEnclosingBlockWithPredecessors()->getSinglePredecessor();
554 }
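
// Sketch (editor's addition): for the exiting block of a region the direct
// successor list is empty, so traversal climbs to the enclosing region:
//
//   VPBlockBase *Next = Exiting->getSingleHierarchicalSuccessor();
//   // Equivalent to
//   //   Exiting->getEnclosingBlockWithSuccessors()->getSingleSuccessor(),
//   // i.e. the block following the innermost region that has successors.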
555
556 /// Set a given VPBlockBase \p Successor as the single successor of this
557 /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor.
558 /// This VPBlockBase must have no successors.
559 void setOneSuccessor(VPBlockBase *Successor) {
560     assert(Successors.empty() && "Setting one successor when others exist.");
561 appendSuccessor(Successor);
562 }
563
564 /// Set two given VPBlockBases \p IfTrue and \p IfFalse to be the two
565 /// successors of this VPBlockBase. This VPBlockBase is not added as
566 /// predecessor of \p IfTrue or \p IfFalse. This VPBlockBase must have no
567 /// successors.
568 void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse) {
569     assert(Successors.empty() && "Setting two successors when others exist.");
570 appendSuccessor(IfTrue);
571 appendSuccessor(IfFalse);
572 }
573
574 /// Set each VPBasicBlock in \p NewPreds as predecessor of this VPBlockBase.
575 /// This VPBlockBase must have no predecessors. This VPBlockBase is not added
576 /// as successor of any VPBasicBlock in \p NewPreds.
577 void setPredecessors(ArrayRef<VPBlockBase *> NewPreds) {
578     assert(Predecessors.empty() && "Block predecessors already set.");
579 for (auto *Pred : NewPreds)
580 appendPredecessor(Pred);
581 }
582
583   /// Remove all the predecessors of this block.
584 void clearPredecessors() { Predecessors.clear(); }
585
586 /// Remove all the successors of this block.
587 void clearSuccessors() { Successors.clear(); }
588
589   /// The method which generates the output IR that corresponds to this
590 /// VPBlockBase, thereby "executing" the VPlan.
591 virtual void execute(VPTransformState *State) = 0;
592
593 /// Delete all blocks reachable from a given VPBlockBase, inclusive.
594 static void deleteCFG(VPBlockBase *Entry);
595
596 /// Return true if it is legal to hoist instructions into this block.
597 bool isLegalToHoistInto() {
598 // There are currently no constraints that prevent an instruction to be
599 // hoisted into a VPBlockBase.
600 return true;
601 }
602
603 /// Replace all operands of VPUsers in the block with \p NewValue and also
604   /// replace all uses of VPValues defined in the block with NewValue.
605 virtual void dropAllReferences(VPValue *NewValue) = 0;
606
607#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
608 void printAsOperand(raw_ostream &OS, bool PrintType) const {
609 OS << getName();
610 }
611
612 /// Print plain-text dump of this VPBlockBase to \p O, prefixing all lines
613 /// with \p Indent. \p SlotTracker is used to print unnamed VPValue's using
614   /// consecutive numbers.
615 ///
616 /// Note that the numbering is applied to the whole VPlan, so printing
617 /// individual blocks is consistent with the whole VPlan printing.
618 virtual void print(raw_ostream &O, const Twine &Indent,
619 VPSlotTracker &SlotTracker) const = 0;
620
621 /// Print plain-text dump of this VPlan to \p O.
622 void print(raw_ostream &O) const {
623 VPSlotTracker SlotTracker(getPlan());
624 print(O, "", SlotTracker);
625 }
626
627 /// Print the successors of this block to \p O, prefixing all lines with \p
628 /// Indent.
629 void printSuccessors(raw_ostream &O, const Twine &Indent) const;
630
631 /// Dump this VPBlockBase to dbgs().
632   LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
633#endif
634};
635
636/// A value that is used outside the VPlan. The operand of the user needs to be
637/// added to the associated LCSSA phi node.
638class VPLiveOut : public VPUser {
639 PHINode *Phi;
640
641public:
642 VPLiveOut(PHINode *Phi, VPValue *Op)
643 : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi) {}
644
645 /// Fixup the wrapped LCSSA phi node in the unique exit block. This simply
646 /// means we need to add the appropriate incoming value from the middle
647 /// block as exiting edges from the scalar epilogue loop (if present) are
648 /// already in place, and we exit the vector loop exclusively to the middle
649 /// block.
650 void fixPhi(VPlan &Plan, VPTransformState &State);
651
652 /// Returns true if the VPLiveOut uses scalars of operand \p Op.
653 bool usesScalars(const VPValue *Op) const override {
654     assert(is_contained(operands(), Op) &&
655            "Op must be an operand of the recipe");
656 return true;
657 }
658
659 PHINode *getPhi() const { return Phi; }
660};
661
662/// VPRecipeBase is a base class modeling a sequence of one or more output IR
663   /// instructions. VPRecipeBase owns the VPValues it defines through VPDef
664/// and is responsible for deleting its defined values. Single-value
665/// VPRecipeBases that also inherit from VPValue must make sure to inherit from
666/// VPRecipeBase before VPValue.
667class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
668 public VPDef,
669 public VPUser {
670 friend VPBasicBlock;
671 friend class VPBlockUtils;
672
673 /// Each VPRecipe belongs to a single VPBasicBlock.
674 VPBasicBlock *Parent = nullptr;
675
676public:
677 VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands)
678 : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe) {}
679
680 template <typename IterT>
681 VPRecipeBase(const unsigned char SC, iterator_range<IterT> Operands)
682 : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe) {}
683 virtual ~VPRecipeBase() = default;
684
685 /// \return the VPBasicBlock which this VPRecipe belongs to.
686 VPBasicBlock *getParent() { return Parent; }
687 const VPBasicBlock *getParent() const { return Parent; }
688
689 /// The method which generates the output IR instructions that correspond to
690 /// this VPRecipe, thereby "executing" the VPlan.
691 virtual void execute(VPTransformState &State) = 0;
692
693 /// Insert an unlinked recipe into a basic block immediately before
694 /// the specified recipe.
695 void insertBefore(VPRecipeBase *InsertPos);
696 /// Insert an unlinked recipe into \p BB immediately before the insertion
697   /// point \p IP.
698 void insertBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator IP);
699
700 /// Insert an unlinked Recipe into a basic block immediately after
701 /// the specified Recipe.
702 void insertAfter(VPRecipeBase *InsertPos);
703
704 /// Unlink this recipe from its current VPBasicBlock and insert it into
705 /// the VPBasicBlock that MovePos lives in, right after MovePos.
706 void moveAfter(VPRecipeBase *MovePos);
707
708 /// Unlink this recipe and insert into BB before I.
709 ///
710 /// \pre I is a valid iterator into BB.
711 void moveBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator I);
712
713 /// This method unlinks 'this' from the containing basic block, but does not
714 /// delete it.
715 void removeFromParent();
716
717 /// This method unlinks 'this' from the containing basic block and deletes it.
718 ///
719 /// \returns an iterator pointing to the element after the erased one
720 iplist<VPRecipeBase>::iterator eraseFromParent();
721
722 /// Returns the underlying instruction, if the recipe is a VPValue or nullptr
723 /// otherwise.
724 Instruction *getUnderlyingInstr() {
725 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue());
726 }
727 const Instruction *getUnderlyingInstr() const {
728 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue());
729 }
730
731 /// Method to support type inquiry through isa, cast, and dyn_cast.
732 static inline bool classof(const VPDef *D) {
733 // All VPDefs are also VPRecipeBases.
734 return true;
735 }
736
737 static inline bool classof(const VPUser *U) {
738 return U->getVPUserID() == VPUser::VPUserID::Recipe;
739 }
740
741 /// Returns true if the recipe may have side-effects.
742 bool mayHaveSideEffects() const;
743
744 /// Returns true for PHI-like recipes.
745 bool isPhi() const {
746 return getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC;
747 }
748
749 /// Returns true if the recipe may read from memory.
750 bool mayReadFromMemory() const;
751
752 /// Returns true if the recipe may write to memory.
753 bool mayWriteToMemory() const;
754
755 /// Returns true if the recipe may read from or write to memory.
756 bool mayReadOrWriteMemory() const {
757 return mayReadFromMemory() || mayWriteToMemory();
758 }
759};
760
761inline bool VPUser::classof(const VPDef *Def) {
762 return Def->getVPDefID() == VPRecipeBase::VPInstructionSC ||
763 Def->getVPDefID() == VPRecipeBase::VPWidenSC ||
764 Def->getVPDefID() == VPRecipeBase::VPWidenCallSC ||
765 Def->getVPDefID() == VPRecipeBase::VPWidenSelectSC ||
766 Def->getVPDefID() == VPRecipeBase::VPWidenGEPSC ||
767 Def->getVPDefID() == VPRecipeBase::VPBlendSC ||
768 Def->getVPDefID() == VPRecipeBase::VPInterleaveSC ||
769 Def->getVPDefID() == VPRecipeBase::VPReplicateSC ||
770 Def->getVPDefID() == VPRecipeBase::VPReductionSC ||
771 Def->getVPDefID() == VPRecipeBase::VPBranchOnMaskSC ||
772 Def->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC;
773}
774
775// Helper macro to define common classof implementations for recipes.
776#define VP_CLASSOF_IMPL(VPDefID, VPValueID)                                    \
777 static inline bool classof(const VPDef *D) { \
778 return D->getVPDefID() == VPDefID; \
779 } \
780 static inline bool classof(const VPValue *V) { \
781 return V->getVPValueID() == VPValueID; \
782 } \
783 static inline bool classof(const VPUser *U) { \
784 auto *R = dyn_cast<VPRecipeBase>(U); \
785 return R && R->getVPDefID() == VPDefID; \
786 } \
787 static inline bool classof(const VPRecipeBase *R) { \
788 return R->getVPDefID() == VPDefID; \
789 }
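// The macro above stamps out the classof overloads that let LLVM's RTTI
// helpers (isa<>, cast<>, dyn_cast<>) dispatch on recipes through any of
// their base classes. A hedged sketch, assuming 'R' points at some recipe
// type declared further below:
static bool isWidenSketch(const VPRecipeBase *R) {
  return isa<VPWidenRecipe>(R); // resolved via a generated classof overload
}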
790
791/// This is a concrete Recipe that models a single VPlan-level instruction.
792/// While, like any Recipe, it may generate a sequence of IR instructions when
793/// executed, these instructions always form a single-def expression, as the
794/// VPInstruction is itself a single def-use vertex.
795class VPInstruction : public VPRecipeBase, public VPValue {
796 friend class VPlanSlp;
797
798public:
799 /// VPlan opcodes, extending LLVM IR with idiomatic instructions.
800 enum {
801 FirstOrderRecurrenceSplice =
802 Instruction::OtherOpsEnd + 1, // Combines the incoming and previous
803 // values of a first-order recurrence.
804 Not,
805 ICmpULE,
806 SLPLoad,
807 SLPStore,
808 ActiveLaneMask,
809 CanonicalIVIncrement,
810 CanonicalIVIncrementNUW,
811 // The next two are similar to the above, but instead increment the
812 // canonical IV separately for each unrolled part.
813 CanonicalIVIncrementForPart,
814 CanonicalIVIncrementForPartNUW,
815 BranchOnCount,
816 BranchOnCond
817 };
818
819private:
820 typedef unsigned char OpcodeTy;
821 OpcodeTy Opcode;
822 FastMathFlags FMF;
823 DebugLoc DL;
824
825 /// An optional name that can be used for the generated IR instruction.
826 const std::string Name;
827
828 /// Utility method serving execute(): generates a single instance of the
829 /// modeled instruction.
830 void generateInstruction(VPTransformState &State, unsigned Part);
831
832protected:
833 void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); }
834
835public:
836 VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL,
837 const Twine &Name = "")
838 : VPRecipeBase(VPRecipeBase::VPInstructionSC, Operands),
839 VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode),
840 DL(DL), Name(Name.str()) {}
841
842 VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
843 DebugLoc DL = {}, const Twine &Name = "")
844 : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name) {}
845
846 VP_CLASSOF_IMPL(VPRecipeBase::VPInstructionSC, VPValue::VPVInstructionSC)
847
848 VPInstruction *clone() const {
849 SmallVector<VPValue *, 2> Operands(operands());
850 return new VPInstruction(Opcode, Operands, DL, Name);
851 }
852
853 unsigned getOpcode() const { return Opcode; }
854
855 /// Generate the instruction.
856 /// TODO: We currently execute only per-part unless a specific instance is
857 /// provided.
858 void execute(VPTransformState &State) override;
859
860#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
861 /// Print the VPInstruction to \p O.
862 void print(raw_ostream &O, const Twine &Indent,
863 VPSlotTracker &SlotTracker) const override;
864
865 /// Print the VPInstruction to dbgs() (for debugging).
866 LLVM_DUMP_METHOD void dump() const;
867#endif
868
869 /// Return true if this instruction may modify memory.
870 bool mayWriteToMemory() const {
871 // TODO: we can use attributes of the called function to rule out memory
872 // modifications.
873 return Opcode == Instruction::Store || Opcode == Instruction::Call ||
874 Opcode == Instruction::Invoke || Opcode == SLPStore;
875 }
876
877 bool hasResult() const {
878 // CallInst may or may not have a result, depending on the called function.
879 // Conservatively assume that calls have results for now.
880 switch (getOpcode()) {
881 case Instruction::Ret:
882 case Instruction::Br:
883 case Instruction::Store:
884 case Instruction::Switch:
885 case Instruction::IndirectBr:
886 case Instruction::Resume:
887 case Instruction::CatchRet:
888 case Instruction::Unreachable:
889 case Instruction::Fence:
890 case Instruction::AtomicRMW:
891 case VPInstruction::BranchOnCond:
892 case VPInstruction::BranchOnCount:
893 return false;
894 default:
895 return true;
896 }
897 }
898
899 /// Set the fast-math flags.
900 void setFastMathFlags(FastMathFlags FMFNew);
901
902 /// Returns true if the recipe only uses the first lane of operand \p Op.
903 bool onlyFirstLaneUsed(const VPValue *Op) const override {
904 assert(is_contained(operands(), Op) &&
905        "Op must be an operand of the recipe");
906 if (getOperand(0) != Op)
907 return false;
908 switch (getOpcode()) {
909 default:
910 return false;
911 case VPInstruction::ActiveLaneMask:
912 case VPInstruction::CanonicalIVIncrement:
913 case VPInstruction::CanonicalIVIncrementNUW:
914 case VPInstruction::CanonicalIVIncrementForPart:
915 case VPInstruction::CanonicalIVIncrementForPartNUW:
916 case VPInstruction::BranchOnCount:
917 return true;
918 };
919 llvm_unreachable("switch should return");
920 }
921};
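// Illustrative sketch (not part of the header): materializing a VPlan-level
// 'Not' over a hypothetical mask value. Per hasResult(), Not produces a
// value, while e.g. BranchOnCount does not.
static VPInstruction *makeNotSketch(VPValue *Mask) {
  auto *NotMask = new VPInstruction(VPInstruction::Not, {Mask}, DebugLoc());
  assert(NotMask->hasResult() && !NotMask->mayWriteToMemory());
  return NotMask; // still unlinked; place it via insertBefore()/insertAfter()
}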
922
923/// VPWidenRecipe is a recipe for producing a vector-type copy of its
924/// ingredient. This recipe covers most of the traditional vectorization cases
925/// where each ingredient transforms into a vectorized version of itself.
926class VPWidenRecipe : public VPRecipeBase, public VPValue {
927public:
928 template <typename IterT>
929 VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
930 : VPRecipeBase(VPRecipeBase::VPWidenSC, Operands),
931 VPValue(VPValue::VPVWidenSC, &I, this) {}
932
933 ~VPWidenRecipe() override = default;
934
935 VP_CLASSOF_IMPL(VPRecipeBase::VPWidenSC, VPValue::VPVWidenSC)
936
937 /// Produce widened copies of all Ingredients.
938 void execute(VPTransformState &State) override;
939
940#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
941 /// Print the recipe.
942 void print(raw_ostream &O, const Twine &Indent,
943 VPSlotTracker &SlotTracker) const override;
944#endif
945};
946
947/// A recipe for widening Call instructions.
948class VPWidenCallRecipe : public VPRecipeBase, public VPValue {
949 /// ID of the vector intrinsic to call when widening the call. If set to
950 /// Intrinsic::not_intrinsic, a library call will be used instead.
951 Intrinsic::ID VectorIntrinsicID;
952
953public:
954 template <typename IterT>
955 VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments,
956 Intrinsic::ID VectorIntrinsicID)
957 : VPRecipeBase(VPRecipeBase::VPWidenCallSC, CallArguments),
958 VPValue(VPValue::VPVWidenCallSC, &I, this),
959 VectorIntrinsicID(VectorIntrinsicID) {}
960
961 ~VPWidenCallRecipe() override = default;
962
963 VP_CLASSOF_IMPL(VPRecipeBase::VPWidenCallSC, VPValue::VPVWidenCallSC)
964
965 /// Produce a widened version of the call instruction.
966 void execute(VPTransformState &State) override;
967
968#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
969 /// Print the recipe.
970 void print(raw_ostream &O, const Twine &Indent,
971 VPSlotTracker &SlotTracker) const override;
972#endif
973};
974
975/// A recipe for widening select instructions.
976class VPWidenSelectRecipe : public VPRecipeBase, public VPValue {
977
978 /// Is the condition of the select loop invariant?
979 bool InvariantCond;
980
981public:
982 template <typename IterT>
983 VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands,
984 bool InvariantCond)
985 : VPRecipeBase(VPRecipeBase::VPWidenSelectSC, Operands),
986 VPValue(VPValue::VPVWidenSelectSC, &I, this),
987 InvariantCond(InvariantCond) {}
988
989 ~VPWidenSelectRecipe() override = default;
990
991 VP_CLASSOF_IMPL(VPRecipeBase::VPWidenSelectSC, VPValue::VPVWidenSelectSC)
992
993 /// Produce a widened version of the select instruction.
994 void execute(VPTransformState &State) override;
995
996#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
997 /// Print the recipe.
998 void print(raw_ostream &O, const Twine &Indent,
999 VPSlotTracker &SlotTracker) const override;
1000#endif
1001};
1002
1003/// A recipe for handling GEP instructions.
1004class VPWidenGEPRecipe : public VPRecipeBase, public VPValue {
1005 bool IsPtrLoopInvariant;
1006 SmallBitVector IsIndexLoopInvariant;
1007
1008public:
1009 template <typename IterT>
1010 VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands)
1011 : VPRecipeBase(VPRecipeBase::VPWidenGEPSC, Operands),
1012 VPValue(VPWidenGEPSC, GEP, this),
1013 IsIndexLoopInvariant(GEP->getNumIndices(), false) {}
1014
1015 template <typename IterT>
1016 VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands,
1017 Loop *OrigLoop)
1018 : VPRecipeBase(VPRecipeBase::VPWidenGEPSC, Operands),
1019 VPValue(VPValue::VPVWidenGEPSC, GEP, this),
1020 IsIndexLoopInvariant(GEP->getNumIndices(), false) {
1021 IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand());
1022 for (auto Index : enumerate(GEP->indices()))
1023 IsIndexLoopInvariant[Index.index()] =
1024 OrigLoop->isLoopInvariant(Index.value().get());
1025 }
1026 ~VPWidenGEPRecipe() override = default;
1027
1028 VP_CLASSOF_IMPL(VPRecipeBase::VPWidenGEPSC, VPValue::VPVWidenGEPSC)
1029
1030 /// Generate the gep nodes.
1031 void execute(VPTransformState &State) override;
1032
1033#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1034 /// Print the recipe.
1035 void print(raw_ostream &O, const Twine &Indent,
1036 VPSlotTracker &SlotTracker) const override;
1037#endif
1038};
1039
1040/// A recipe for handling phi nodes of integer and floating-point inductions,
1041/// producing their vector values.
1042class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue {
1043 PHINode *IV;
1044 const InductionDescriptor &IndDesc;
1045 bool NeedsVectorIV;
1046
1047public:
1048 VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
1049 const InductionDescriptor &IndDesc,
1050 bool NeedsVectorIV)
1051 : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start, Step}),
1052 VPValue(IV, this), IV(IV), IndDesc(IndDesc),
1053 NeedsVectorIV(NeedsVectorIV) {}
1054
1055 VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
1056 const InductionDescriptor &IndDesc,
1057 TruncInst *Trunc, bool NeedsVectorIV)
1058 : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start, Step}),
1059 VPValue(Trunc, this), IV(IV), IndDesc(IndDesc),
1060 NeedsVectorIV(NeedsVectorIV) {}
1061
1062 ~VPWidenIntOrFpInductionRecipe() override = default;
1063
1064 VP_CLASSOF_IMPL(VPRecipeBase::VPWidenIntOrFpInductionSC,
1065                 VPValue::VPVWidenIntOrFpInductionSC)
1066
1067 /// Generate the vectorized and scalarized versions of the phi node as
1068 /// needed by their users.
1069 void execute(VPTransformState &State) override;
1070
1071#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1072 /// Print the recipe.
1073 void print(raw_ostream &O, const Twine &Indent,
1074 VPSlotTracker &SlotTracker) const override;
1075#endif
1076
1077 /// Returns the start value of the induction.
1078 VPValue *getStartValue() { return getOperand(0); }
1079 const VPValue *getStartValue() const { return getOperand(0); }
1080
1081 /// Returns the step value of the induction.
1082 VPValue *getStepValue() { return getOperand(1); }
1083 const VPValue *getStepValue() const { return getOperand(1); }
1084
1085 /// Returns the first defined value as TruncInst, if it is one or nullptr
1086 /// otherwise.
1087 TruncInst *getTruncInst() {
1088 return dyn_cast_or_null<TruncInst>(getVPValue(0)->getUnderlyingValue());
1089 }
1090 const TruncInst *getTruncInst() const {
1091 return dyn_cast_or_null<TruncInst>(getVPValue(0)->getUnderlyingValue());
1092 }
1093
1094 PHINode *getPHINode() { return IV; }
1095
1096 /// Returns the induction descriptor for the recipe.
1097 const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
1098
1099 /// Returns true if the induction is canonical, i.e. starting at 0 and
1100 /// incremented by UF * VF (= the original IV is incremented by 1).
1101 bool isCanonical() const;
1102
1103 /// Returns the scalar type of the induction.
1104 const Type *getScalarType() const {
1105 const TruncInst *TruncI = getTruncInst();
1106 return TruncI ? TruncI->getType() : IV->getType();
1107 }
1108
1109 /// Returns true if a vector phi needs to be created for the induction.
1110 bool needsVectorIV() const { return NeedsVectorIV; }
1111};
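// Illustrative sketch: querying the scalar type of a widened induction. With
// a truncating use, getTruncInst() is non-null and the truncated type wins;
// e.g. an i64 IV truncated to i32 reports i32 here.
static const Type *indScalarTypeSketch(const VPWidenIntOrFpInductionRecipe &R) {
  const TruncInst *TI = R.getTruncInst(); // non-null only for truncated IVs
  return TI ? TI->getType() : R.getScalarType();
}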
1112
1113/// A pure virtual base class for all recipes modeling header phis, including
1114/// phis for first order recurrences, pointer inductions and reductions. The
1115/// start value is the first operand of the recipe and the incoming value from
1116/// the backedge is the second operand.
1117///
1118/// Inductions are modeled using the following sub-classes:
1119/// * VPCanonicalIVPHIRecipe: Canonical scalar induction of the vector loop,
1120/// starting at a specified value (zero for the main vector loop, the resume
1121/// value for the epilogue vector loop) and stepping by 1. The induction
1122/// controls exiting of the vector loop by comparing against the vector trip
1123/// count. Produces a single scalar PHI for the induction value per
1124/// iteration.
1125/// * VPWidenIntOrFpInductionRecipe: Generates vector values for integer and
1126/// floating point inductions with arbitrary start and step values. Produces
1127/// a vector PHI per-part.
1128/// * VPDerivedIVRecipe: Converts the canonical IV value to the corresponding
1129/// value of an IV with different start and step values. Produces a single
1130/// scalar value per iteration
1131/// * VPScalarIVStepsRecipe: Generates scalar values per-lane based on a
1132/// canonical or derived induction.
1133/// * VPWidenPointerInductionRecipe: Generate vector and scalar values for a
1134/// pointer induction. Produces either a vector PHI per-part or scalar values
1135/// per-lane based on the canonical induction.
1136class VPHeaderPHIRecipe : public VPRecipeBase, public VPValue {
1137protected:
1138 VPHeaderPHIRecipe(unsigned char VPVID, unsigned char VPDefID, PHINode *Phi,
1139 VPValue *Start = nullptr)
1140 : VPRecipeBase(VPDefID, {}), VPValue(VPVID, Phi, this) {
1141 if (Start)
1142 addOperand(Start);
1143 }
1144
1145public:
1146 ~VPHeaderPHIRecipe() override = default;
1147
1148 /// Method to support type inquiry through isa, cast, and dyn_cast.
1149 static inline bool classof(const VPRecipeBase *B) {
1150 return B->getVPDefID() >= VPRecipeBase::VPFirstHeaderPHISC &&
1151 B->getVPDefID() <= VPRecipeBase::VPLastPHISC;
1152 }
1153 static inline bool classof(const VPValue *V) {
1154 return V->getVPValueID() >= VPValue::VPVFirstHeaderPHISC &&
1155 V->getVPValueID() <= VPValue::VPVLastPHISC;
1156 }
1157
1158 /// Generate the phi nodes.
1159 void execute(VPTransformState &State) override = 0;
1160
1161#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1162 /// Print the recipe.
1163 void print(raw_ostream &O, const Twine &Indent,
1164 VPSlotTracker &SlotTracker) const override = 0;
1165#endif
1166
1167 /// Returns the start value of the phi, if one is set.
1168 VPValue *getStartValue() {
1169 return getNumOperands() == 0 ? nullptr : getOperand(0);
1170 }
1171 VPValue *getStartValue() const {
1172 return getNumOperands() == 0 ? nullptr : getOperand(0);
1173 }
1174
1175 /// Update the start value of the recipe.
1176 void setStartValue(VPValue *V) { setOperand(0, V); }
1177
1178 /// Returns the incoming value from the loop backedge.
1179 VPValue *getBackedgeValue() {
1180 return getOperand(1);
1181 }
1182
1183 /// Returns the backedge value as a recipe. The backedge value is guaranteed
1184 /// to be a recipe.
1185 VPRecipeBase &getBackedgeRecipe() {
1186 return *getBackedgeValue()->getDefiningRecipe();
1187 }
1188};
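// Operand-layout sketch (illustrative): for every header phi recipe, operand
// 0 is the start value and operand 1 the value incoming over the backedge.
static void headerPhiOperandsSketch(VPHeaderPHIRecipe &PhiR) {
  VPValue *Start = PhiR.getStartValue();       // nullptr until one is set
  VPValue *Backedge = PhiR.getBackedgeValue(); // valid once the loop is wired
  (void)Start;
  (void)Backedge;
}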
1189
1190class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
1191 const InductionDescriptor &IndDesc;
1192
1193 bool IsScalarAfterVectorization;
1194
1195public:
1196 /// Create a new VPWidenPointerInductionRecipe for \p Phi with start value \p
1197 /// Start.
1198 VPWidenPointerInductionRecipe(PHINode *Phi, VPValue *Start, VPValue *Step,
1199 const InductionDescriptor &IndDesc,
1200 bool IsScalarAfterVectorization)
1201 : VPHeaderPHIRecipe(VPVWidenPointerInductionSC, VPWidenPointerInductionSC,
1202 Phi),
1203 IndDesc(IndDesc),
1204 IsScalarAfterVectorization(IsScalarAfterVectorization) {
1205 addOperand(Start);
1206 addOperand(Step);
1207 }
1208
1209 ~VPWidenPointerInductionRecipe() override = default;
1210
1211 VP_CLASSOF_IMPL(VPRecipeBase::VPWidenPointerInductionSC,
1212                 VPValue::VPVWidenPointerInductionSC)
1213
1214 /// Generate vector values for the pointer induction.
1215 void execute(VPTransformState &State) override;
1216
1217 /// Returns true if only scalar values will be generated.
1218 bool onlyScalarsGenerated(ElementCount VF);
1219
1220 /// Returns the induction descriptor for the recipe.
1221 const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
1222
1223#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1224 /// Print the recipe.
1225 void print(raw_ostream &O, const Twine &Indent,
1226 VPSlotTracker &SlotTracker) const override;
1227#endif
1228};
1229
1230/// A recipe for handling header phis that are widened in the vector loop.
1231/// In the VPlan native path, all incoming VPValues & VPBasicBlock pairs are
1232/// managed in the recipe directly.
1233class VPWidenPHIRecipe : public VPHeaderPHIRecipe {
1234 /// List of incoming blocks. Only used in the VPlan native path.
1235 SmallVector<VPBasicBlock *, 2> IncomingBlocks;
1236
1237public:
1238 /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start.
1239 VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr)
1240 : VPHeaderPHIRecipe(VPVWidenPHISC, VPWidenPHISC, Phi) {
1241 if (Start)
1242 addOperand(Start);
1243 }
1244
1245 ~VPWidenPHIRecipe() override = default;
1246
1247 VP_CLASSOF_IMPL(VPRecipeBase::VPWidenPHISC, VPValue::VPVWidenPHISC)
1248
1249 /// Generate the phi/select nodes.
1250 void execute(VPTransformState &State) override;
1251
1252#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1253 /// Print the recipe.
1254 void print(raw_ostream &O, const Twine &Indent,
1255 VPSlotTracker &SlotTracker) const override;
1256#endif
1257
1258 /// Adds a pair (\p IncomingV, \p IncomingBlock) to the phi.
1259 void addIncoming(VPValue *IncomingV, VPBasicBlock *IncomingBlock) {
1260 addOperand(IncomingV);
1261 IncomingBlocks.push_back(IncomingBlock);
1262 }
1263
1264 /// Returns the \p I th incoming VPBasicBlock.
1265 VPBasicBlock *getIncomingBlock(unsigned I) { return IncomingBlocks[I]; }
1266
1267 /// Returns the \p I th incoming VPValue.
1268 VPValue *getIncomingValue(unsigned I) { return getOperand(I); }
1269};
1270
1271/// A recipe for handling first-order recurrence phis. The start value is the
1272/// first operand of the recipe and the incoming value from the backedge is the
1273/// second operand.
1274struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe {
1275 VPFirstOrderRecurrencePHIRecipe(PHINode *Phi, VPValue &Start)
1276 : VPHeaderPHIRecipe(VPVFirstOrderRecurrencePHISC,
1277 VPFirstOrderRecurrencePHISC, Phi, &Start) {}
1278
1279 VP_CLASSOF_IMPL(VPRecipeBase::VPFirstOrderRecurrencePHISC,
1280                 VPValue::VPVFirstOrderRecurrencePHISC)
1281
1282 static inline bool classof(const VPHeaderPHIRecipe *R) {
1283 return R->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC;
1284 }
1285
1286 void execute(VPTransformState &State) override;
1287
1288#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1289 /// Print the recipe.
1290 void print(raw_ostream &O, const Twine &Indent,
1291 VPSlotTracker &SlotTracker) const override;
1292#endif
1293};
1294
1295/// A recipe for handling reduction phis. The start value is the first operand
1296/// of the recipe and the incoming value from the backedge is the second
1297/// operand.
1298class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
1299 /// Descriptor for the reduction.
1300 const RecurrenceDescriptor &RdxDesc;
1301
1302 /// The phi is part of an in-loop reduction.
1303 bool IsInLoop;
1304
1305 /// The phi is part of an ordered reduction. Requires IsInLoop to be true.
1306 bool IsOrdered;
1307
1308public:
1309 /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
1310 /// RdxDesc.
1311 VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
1312 VPValue &Start, bool IsInLoop = false,
1313 bool IsOrdered = false)
1314 : VPHeaderPHIRecipe(VPVReductionPHISC, VPReductionPHISC, Phi, &Start),
1315 RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) {
1316 assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
1317 }
1318
1319 ~VPReductionPHIRecipe() override = default;
1320
1321 VP_CLASSOF_IMPL(VPRecipeBase::VPReductionPHISC, VPValue::VPVReductionPHISC)
1322
1323 static inline bool classof(const VPHeaderPHIRecipe *R) {
1324 return R->getVPDefID() == VPRecipeBase::VPReductionPHISC;
1325 }
1326
1327 /// Generate the phi/select nodes.
1328 void execute(VPTransformState &State) override;
1329
1330#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1331 /// Print the recipe.
1332 void print(raw_ostream &O, const Twine &Indent,
1333 VPSlotTracker &SlotTracker) const override;
1334#endif
1335
1336 const RecurrenceDescriptor &getRecurrenceDescriptor() const {
1337 return RdxDesc;
1338 }
1339
1340 /// Returns true, if the phi is part of an ordered reduction.
1341 bool isOrdered() const { return IsOrdered; }
1342
1343 /// Returns true, if the phi is part of an in-loop reduction.
1344 bool isInLoop() const { return IsInLoop; }
1345};
1346
1347/// A recipe for vectorizing a phi-node as a sequence of mask-based select
1348/// instructions.
1349class VPBlendRecipe : public VPRecipeBase, public VPValue {
1350 PHINode *Phi;
1351
1352public:
1353 /// The blend operation is a User of the incoming values and of their
1354 /// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value
1355 /// might be incoming with a full mask for which there is no VPValue.
1356 VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands)
1357 : VPRecipeBase(VPBlendSC, Operands),
1358 VPValue(VPValue::VPVBlendSC, Phi, this), Phi(Phi) {
1359 assert(Operands.size() > 0 &&
1360        ((Operands.size() == 1) || (Operands.size() % 2 == 0)) &&
1361        "Expected either a single incoming value or a positive even number "
1362        "of operands");
1363 }
1364
1365 VP_CLASSOF_IMPL(VPRecipeBase::VPBlendSC, VPValue::VPVBlendSC)
1366
1367 /// Return the number of incoming values, taking into account that a single
1368 /// incoming value has no mask.
1369 unsigned getNumIncomingValues() const { return (getNumOperands() + 1) / 2; }
1370
1371 /// Return incoming value number \p Idx.
1372 VPValue *getIncomingValue(unsigned Idx) const { return getOperand(Idx * 2); }
1373
1374 /// Return mask number \p Idx.
1375 VPValue *getMask(unsigned Idx) const { return getOperand(Idx * 2 + 1); }
1376
1377 /// Generate the phi/select nodes.
1378 void execute(VPTransformState &State) override;
1379
1380#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1381 /// Print the recipe.
1382 void print(raw_ostream &O, const Twine &Indent,
1383 VPSlotTracker &SlotTracker) const override;
1384#endif
1385
1386 /// Returns true if the recipe only uses the first lane of operand \p Op.
1387 bool onlyFirstLaneUsed(const VPValue *Op) const override {
1388 assert(is_contained(operands(), Op) &&
1389        "Op must be an operand of the recipe");
1390 // Recursing through Blend recipes only; this must terminate at header
1391 // phis at the latest.
1392 return all_of(users(),
1393 [this](VPUser *U) { return U->onlyFirstLaneUsed(this); });
1394 }
1395};
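// Worked example (illustrative): with operands [I0, M0, I1, M1],
// getNumOperands() == 4 and getNumIncomingValues() == (4 + 1) / 2 == 2;
// getIncomingValue(1) is operand 2 (I1) and getMask(1) is operand 3 (M1).
// A single full-mask incoming value yields the operand list [I0] alone, and
// getNumIncomingValues() == (1 + 1) / 2 == 1 with no mask operand at all.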
1396
1397/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
1398/// or stores into one wide load/store and shuffles. The first operand of a
1399/// VPInterleave recipe is the address, followed by the stored values, followed
1400/// by an optional mask.
1401class VPInterleaveRecipe : public VPRecipeBase {
1402 const InterleaveGroup<Instruction> *IG;
1403
1404 bool HasMask = false;
1405
1406public:
1407 VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
1408 ArrayRef<VPValue *> StoredValues, VPValue *Mask)
1409 : VPRecipeBase(VPInterleaveSC, {Addr}), IG(IG) {
1410 for (unsigned i = 0; i < IG->getFactor(); ++i)
1411 if (Instruction *I = IG->getMember(i)) {
1412 if (I->getType()->isVoidTy())
1413 continue;
1414 new VPValue(I, this);
1415 }
1416
1417 for (auto *SV : StoredValues)
1418 addOperand(SV);
1419 if (Mask) {
1420 HasMask = true;
1421 addOperand(Mask);
1422 }
1423 }
1424 ~VPInterleaveRecipe() override = default;
1425
1426 static inline bool classof(const VPDef *D) {
1427 return D->getVPDefID() == VPRecipeBase::VPInterleaveSC;
1428 }
1429
1430 /// Return the address accessed by this recipe.
1431 VPValue *getAddr() const {
1432 return getOperand(0); // Address is the 1st, mandatory operand.
1433 }
1434
1435 /// Return the mask used by this recipe. Note that a full mask is represented
1436 /// by a nullptr.
1437 VPValue *getMask() const {
1438 // Mask is optional and therefore the last, currently 2nd operand.
1439 return HasMask ? getOperand(getNumOperands() - 1) : nullptr;
1440 }
1441
1442 /// Return the VPValues stored by this interleave group. If it is a load
1443 /// interleave group, return an empty ArrayRef.
1444 ArrayRef<VPValue *> getStoredValues() const {
1445 // The first operand is the address, followed by the stored values, followed
1446 // by an optional mask.
1447 return ArrayRef<VPValue *>(op_begin(), getNumOperands())
1448 .slice(1, getNumStoreOperands());
1449 }
1450
1451 /// Generate the wide load or store, and shuffles.
1452 void execute(VPTransformState &State) override;
1453
1454#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1455 /// Print the recipe.
1456 void print(raw_ostream &O, const Twine &Indent,
1457 VPSlotTracker &SlotTracker) const override;
1458#endif
1459
1460 const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
1461
1462 /// Returns the number of stored operands of this interleave group. Returns 0
1463 /// for load interleave groups.
1464 unsigned getNumStoreOperands() const {
1465 return getNumOperands() - (HasMask ? 2 : 1);
1466 }
1467
1468 /// The recipe only uses the first lane of the address.
1469 bool onlyFirstLaneUsed(const VPValue *Op) const override {
1470 assert(is_contained(operands(), Op) &&
1471        "Op must be an operand of the recipe");
1472 return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
1473 }
1474};
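// Worked example (illustrative): a masked store group with two stored values
// holds operands [Addr, SV0, SV1, Mask], so getNumStoreOperands() == 4 - 2 ==
// 2, getStoredValues() slices out {SV0, SV1}, and getMask() returns the
// trailing operand. A load group carries only [Addr] plus an optional mask,
// so getStoredValues() is empty and getNumStoreOperands() == 0.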
1475
1476 /// A recipe to represent in-loop reduction operations, performing a reduction on
1477/// a vector operand into a scalar value, and adding the result to a chain.
1478/// The Operands are {ChainOp, VecOp, [Condition]}.
1479class VPReductionRecipe : public VPRecipeBase, public VPValue {
1480 /// The recurrence descriptor for the reduction in question.
1481 const RecurrenceDescriptor *RdxDesc;
1482 /// Pointer to the TTI, needed to create the target reduction.
1483 const TargetTransformInfo *TTI;
1484
1485public:
1486 VPReductionRecipe(const RecurrenceDescriptor *R, Instruction *I,
1487 VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
1488 const TargetTransformInfo *TTI)
1489 : VPRecipeBase(VPRecipeBase::VPReductionSC, {ChainOp, VecOp}),
1490 VPValue(VPValue::VPVReductionSC, I, this), RdxDesc(R), TTI(TTI) {
1491 if (CondOp)
1492 addOperand(CondOp);
1493 }
1494
1495 ~VPReductionRecipe() override = default;
1496
1497 VP_CLASSOF_IMPL(VPRecipeBase::VPReductionSC, VPValue::VPVReductionSC)
1498
1499 /// Generate the reduction in the loop
1500 void execute(VPTransformState &State) override;
1501
1502#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1503 /// Print the recipe.
1504 void print(raw_ostream &O, const Twine &Indent,
1505 VPSlotTracker &SlotTracker) const override;
1506#endif
1507
1508 /// The VPValue of the scalar Chain being accumulated.
1509 VPValue *getChainOp() const { return getOperand(0); }
1510 /// The VPValue of the vector value to be reduced.
1511 VPValue *getVecOp() const { return getOperand(1); }
1512 /// The VPValue of the condition for the block.
1513 VPValue *getCondOp() const {
1514 return getNumOperands() > 2 ? getOperand(2) : nullptr;
1515 }
1516};
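// Worked example (illustrative): an unconditional reduction carries only the
// two mandatory operands, so getChainOp() is operand 0, getVecOp() operand 1,
// and getCondOp() returns nullptr; a block condition, when present, is
// appended as operand 2 and returned by getCondOp().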
1517
1518/// VPReplicateRecipe replicates a given instruction producing multiple scalar
1519/// copies of the original scalar type, one per lane, instead of producing a
1520/// single copy of widened type for all lanes. If the instruction is known to be
1521 /// uniform, only one copy, for lane zero, will be generated.
1522class VPReplicateRecipe : public VPRecipeBase, public VPValue {
1523 /// Indicator if only a single replica per lane is needed.
1524 bool IsUniform;
1525
1526 /// Indicator if the replicas are also predicated.
1527 bool IsPredicated;
1528
1529 /// Indicator if the scalar values should also be packed into a vector.
1530 bool AlsoPack;
1531
1532public:
1533 template <typename IterT>
1534 VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands,
1535 bool IsUniform, bool IsPredicated = false)
1536 : VPRecipeBase(VPReplicateSC, Operands), VPValue(VPVReplicateSC, I, this),
1537 IsUniform(IsUniform), IsPredicated(IsPredicated) {
1538 // Retain the previous behavior of predicateInstructions(), where an
1539 // insert-element of a predicated instruction got hoisted into the
1540 // predicated basic block iff it was its only user. This is achieved by
1541 // having predicated instructions also pack their values into a vector by
1542 // default unless they have a replicated user which uses their scalar value.
1543 AlsoPack = IsPredicated && !I->use_empty();
1544 }
1545
1546 ~VPReplicateRecipe() override = default;
1547
1548 VP_CLASSOF_IMPL(VPRecipeBase::VPReplicateSC, VPValue::VPVReplicateSC)
1549
1550 /// Generate replicas of the desired Ingredient. Replicas will be generated
1551 /// for all parts and lanes unless a specific part and lane are specified in
1552 /// the \p State.
1553 void execute(VPTransformState &State) override;
1554
1555 void setAlsoPack(bool Pack) { AlsoPack = Pack; }
1556
1557#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1558 /// Print the recipe.
1559 void print(raw_ostream &O, const Twine &Indent,
1560 VPSlotTracker &SlotTracker) const override;
1561#endif
1562
1563 bool isUniform() const { return IsUniform; }
1564
1565 bool isPacked() const { return AlsoPack; }
1566
1567 bool isPredicated() const { return IsPredicated; }
1568
1569 /// Returns true if the recipe only uses the first lane of operand \p Op.
1570 bool onlyFirstLaneUsed(const VPValue *Op) const override {
1571 assert(is_contained(operands(), Op) &&
1572        "Op must be an operand of the recipe");
1573 return isUniform();
1574 }
1575
1576 /// Returns true if the recipe uses scalars of operand \p Op.
1577 bool usesScalars(const VPValue *Op) const override {
1578 assert(is_contained(operands(), Op) &&
1579        "Op must be an operand of the recipe");
1580 return true;
1581 }
1582};
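// Illustrative note: for a uniform replicate recipe only the lane-0 copy is
// emitted, which is why onlyFirstLaneUsed() above simply returns isUniform().
// Predicated replicas additionally pack their scalar results into a vector
// (AlsoPack) whenever the original instruction still has users.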
1583
1584/// A recipe for generating conditional branches on the bits of a mask.
1585class VPBranchOnMaskRecipe : public VPRecipeBase {
1586public:
1587 VPBranchOnMaskRecipe(VPValue *BlockInMask)
1588 : VPRecipeBase(VPBranchOnMaskSC, {}) {
1589 if (BlockInMask) // nullptr means all-one mask.
1590 addOperand(BlockInMask);
1591 }
1592
1593 /// Method to support type inquiry through isa, cast, and dyn_cast.
1594 static inline bool classof(const VPDef *D) {
1595 return D->getVPDefID() == VPRecipeBase::VPBranchOnMaskSC;
1596 }
1597
1598 /// Generate the extraction of the appropriate bit from the block mask and the
1599 /// conditional branch.
1600 void execute(VPTransformState &State) override;
1601
1602#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1603 /// Print the recipe.
1604 void print(raw_ostream &O, const Twine &Indent,
1605 VPSlotTracker &SlotTracker) const override {
1606 O << Indent << "BRANCH-ON-MASK ";
1607 if (VPValue *Mask = getMask())
1608 Mask->printAsOperand(O, SlotTracker);
1609 else
1610 O << " All-One";
1611 }
1612#endif
1613
1614 /// Return the mask used by this recipe. Note that a full mask is represented
1615 /// by a nullptr.
1616 VPValue *getMask() const {
1617 assert(getNumOperands() <= 1 && "should have either 0 or 1 operands");
1618 // Mask is optional.
1619 return getNumOperands() == 1 ? getOperand(0) : nullptr;
1620 }
1621
1622 /// Returns true if the recipe uses scalars of operand \p Op.
1623 bool usesScalars(const VPValue *Op) const override {
1624 assert(is_contained(operands(), Op) &&
1625        "Op must be an operand of the recipe");
1626 return true;
1627 }
1628};
1629
1630/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
1631/// control converges back from a Branch-on-Mask. The phi nodes are needed in
1632/// order to merge values that are set under such a branch and feed their uses.
1633/// The phi nodes can be scalar or vector depending on the users of the value.
1634/// This recipe works in concert with VPBranchOnMaskRecipe.
1635class VPPredInstPHIRecipe : public VPRecipeBase, public VPValue {
1636public:
1637 /// Construct a VPPredInstPHIRecipe given \p PredV, whose value needs phi
1638 /// nodes after merging back from a Branch-on-Mask.
1639 VPPredInstPHIRecipe(VPValue *PredV)
1640 : VPRecipeBase(VPPredInstPHISC, PredV),
1641 VPValue(VPValue::VPVPredInstPHI, nullptr, this) {}
1642 ~VPPredInstPHIRecipe() override = default;
1643
1644 /// Method to support type inquiry through isa, cast, and dyn_cast.
1645 static inline bool classof(const VPDef *D) {
1646 return D->getVPDefID() == VPRecipeBase::VPPredInstPHISC;
1647 }
1648
1649 /// Generates phi nodes for live-outs as needed to retain SSA form.
1650 void execute(VPTransformState &State) override;
1651
1652#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1653 /// Print the recipe.
1654 void print(raw_ostream &O, const Twine &Indent,
1655 VPSlotTracker &SlotTracker) const override;
1656#endif
1657
1658 /// Returns true if the recipe uses scalars of operand \p Op.
1659 bool usesScalars(const VPValue *Op) const override {
1660 assert(is_contained(operands(), Op) &&
1661        "Op must be an operand of the recipe");
1662 return true;
1663 }
1664};
1665
1666/// A Recipe for widening load/store operations.
1667/// The recipe uses the following VPValues:
1668/// - For load: Address, optional mask
1669/// - For store: Address, stored value, optional mask
1670/// TODO: We currently execute only per-part unless a specific instance is
1671/// provided.
1672class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
1673 Instruction &Ingredient;
1674
1675 // Whether the loaded-from / stored-to addresses are consecutive.
1676 bool Consecutive;
1677
1678 // Whether the consecutive loaded/stored addresses are in reverse order.
1679 bool Reverse;
1680
1681 void setMask(VPValue *Mask) {
1682 if (!Mask)
1683 return;
1684 addOperand(Mask);
1685 }
1686
1687 bool isMasked() const {
1688 return isStore() ? getNumOperands() == 3 : getNumOperands() == 2;
1689 }
1690
1691public:
1692 VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
1693 bool Consecutive, bool Reverse)
1694 : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}), Ingredient(Load),
1695 Consecutive(Consecutive), Reverse(Reverse) {
1696 assert((Consecutive || !Reverse) && "Reverse implies consecutive");
1697 new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this);
1698 setMask(Mask);
1699 }
1700
1701 VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr,
1702 VPValue *StoredValue, VPValue *Mask,
1703 bool Consecutive, bool Reverse)
1704 : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr, StoredValue}),
1705 Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) {
1706 assert((Consecutive || !Reverse) && "Reverse implies consecutive");
1707 setMask(Mask);
1708 }
1709
1710 /// Method to support type inquiry through isa, cast, and dyn_cast.
1711 static inline bool classof(const VPDef *D) {
1712 return D->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC;
1713 }
1714
1715 /// Return the address accessed by this recipe.
1716 VPValue *getAddr() const {
1717 return getOperand(0); // Address is the 1st, mandatory operand.
1718 }
1719
1720 /// Return the mask used by this recipe. Note that a full mask is represented
1721 /// by a nullptr.
1722 VPValue *getMask() const {
1723 // Mask is optional and therefore the last operand.
1724 return isMasked() ? getOperand(getNumOperands() - 1) : nullptr;
1725 }
1726
1727 /// Returns true if this recipe is a store.
1728 bool isStore() const { return isa<StoreInst>(Ingredient); }
1729
1730 /// Return the value stored by this recipe.
1731 VPValue *getStoredValue() const {
1732 assert(isStore() && "Stored value only available for store instructions");
1733 return getOperand(1); // Stored value is the 2nd, mandatory operand.
1734 }
1735
1736 // Return whether the loaded-from / stored-to addresses are consecutive.
1737 bool isConsecutive() const { return Consecutive; }
1738
1739 // Return whether the consecutive loaded/stored addresses are in reverse
1740 // order.
1741 bool isReverse() const { return Reverse; }
1742
1743 /// Generate the wide load/store.
1744 void execute(VPTransformState &State) override;
1745
1746#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1747 /// Print the recipe.
1748 void print(raw_ostream &O, const Twine &Indent,
1749 VPSlotTracker &SlotTracker) const override;
1750#endif
1751
1752 /// Returns true if the recipe only uses the first lane of operand \p Op.
1753 bool onlyFirstLaneUsed(const VPValue *Op) const override {
1754 assert(is_contained(operands(), Op) &&
1755        "Op must be an operand of the recipe");
1756
1757 // Widened, consecutive memory operations only demand the first lane of
1758 // their address, unless the same operand is also stored. That latter can
1759 // happen with opaque pointers.
1760 return Op == getAddr() && isConsecutive() &&
1761 (!isStore() || Op != getStoredValue());
1762 }
1763
1764 Instruction &getIngredient() const { return Ingredient; }
1765};
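// Worked example (illustrative): a masked store holds [Addr, StoredValue,
// Mask] (3 operands) and a masked load holds [Addr, Mask] (2 operands);
// that is exactly the arithmetic isMasked() checks. For an unmasked,
// consecutive load, onlyFirstLaneUsed(getAddr()) is true, since the wide
// access needs only lane 0 of the address.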
1766
1767/// Recipe to expand a SCEV expression.
1768class VPExpandSCEVRecipe : public VPRecipeBase, public VPValue {
1769 const SCEV *Expr;
1770 ScalarEvolution &SE;
1771
1772public:
1773 VPExpandSCEVRecipe(const SCEV *Expr, ScalarEvolution &SE)
1774 : VPRecipeBase(VPExpandSCEVSC, {}), VPValue(nullptr, this), Expr(Expr),
1775 SE(SE) {}
1776
1777 ~VPExpandSCEVRecipe() override = default;
1778
1779 /// Method to support type inquiry through isa, cast, and dyn_cast.
1780 static inline bool classof(const VPDef *D) {
1781 return D->getVPDefID() == VPExpandSCEVSC;
1782 }
1783
1784 /// Generate code to expand the SCEV expression.
1785 void execute(VPTransformState &State) override;
1786
1787#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1788 /// Print the recipe.
1789 void print(raw_ostream &O, const Twine &Indent,
1790 VPSlotTracker &SlotTracker) const override;
1791#endif
1792
1793 const SCEV *getSCEV() const { return Expr; }
1794};
1795
1796 /// Canonical scalar induction phi of the vector loop, starting at the specified
1797/// start value (either 0 or the resume value when vectorizing the epilogue
1798/// loop). VPWidenCanonicalIVRecipe represents the vector version of the
1799/// canonical induction variable.
1800class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
1801 DebugLoc DL;
1802
1803public:
1804 VPCanonicalIVPHIRecipe(VPValue *StartV, DebugLoc DL)
1805 : VPHeaderPHIRecipe(VPValue::VPVCanonicalIVPHISC, VPCanonicalIVPHISC,
1806 nullptr, StartV),
1807 DL(DL) {}
1808
1809 ~VPCanonicalIVPHIRecipe() override = default;
1810
1811 VP_CLASSOF_IMPL(VPRecipeBase::VPCanonicalIVPHISC,
1812                 VPValue::VPVCanonicalIVPHISC)
1813
1814 static inline bool classof(const VPHeaderPHIRecipe *D) {
1815 return D->getVPDefID() == VPCanonicalIVPHISC;
1816 }
1817
1818 /// Generate the canonical scalar induction phi of the vector loop.
1819 void execute(VPTransformState &State) override;
1820
1821#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1822 /// Print the recipe.
1823 void print(raw_ostream &O, const Twine &Indent,
1824 VPSlotTracker &SlotTracker) const override;
1825#endif
1826
1827 /// Returns the scalar type of the induction.
1828 const Type *getScalarType() const {
1829 return getOperand(0)->getLiveInIRValue()->getType();
1830 }
1831
1832 /// Returns true if the recipe only uses the first lane of operand \p Op.
1833 bool onlyFirstLaneUsed(const VPValue *Op) const override {
1834 assert(is_contained(operands(), Op) &&
1835        "Op must be an operand of the recipe");
1836 return true;
1837 }
1838
1839 /// Check if the induction described by \p ID is canonical, i.e. has the same
1840 /// start, step (of 1), and type as the canonical IV.
1841 bool isCanonical(const InductionDescriptor &ID, Type *Ty) const;
1842};
1843
1844/// A recipe for generating the active lane mask for the vector loop that is
1845/// used to predicate the vector operations.
1846/// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
1847/// remove VPActiveLaneMaskPHIRecipe.
1848class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
1849 DebugLoc DL;
1850
1851public:
1852 VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL)
1853 : VPHeaderPHIRecipe(VPValue::VPVActiveLaneMaskPHISC,
1854 VPActiveLaneMaskPHISC, nullptr, StartMask),
1855 DL(DL) {}
1856
1857 ~VPActiveLaneMaskPHIRecipe() override = default;
1858
1859 VP_CLASSOF_IMPL(VPRecipeBase::VPActiveLaneMaskPHISC,
1860 VPValue::VPVActiveLaneMaskPHISC)
1861
1862 static inline bool classof(const VPHeaderPHIRecipe *D) {
1863 return D->getVPDefID() == VPActiveLaneMaskPHISC;
1864 }
1865
1866 /// Generate the active lane mask phi of the vector loop.
1867 void execute(VPTransformState &State) override;
1868
1869#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1870 /// Print the recipe.
1871 void print(raw_ostream &O, const Twine &Indent,
1872 VPSlotTracker &SlotTracker) const override;
1873#endif
1874};
1875
1876/// A Recipe for widening the canonical induction variable of the vector loop.
1877class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue {
1878public:
1879 VPWidenCanonicalIVRecipe(VPCanonicalIVPHIRecipe *CanonicalIV)
1880 : VPRecipeBase(VPWidenCanonicalIVSC, {CanonicalIV}),
1881 VPValue(VPValue::VPVWidenCanonicalIVSC, nullptr, this) {}
1882
1883 ~VPWidenCanonicalIVRecipe() override = default;
1884
1885 /// Method to support type inquiry through isa, cast, and dyn_cast.
1886 static inline bool classof(const VPDef *D) {
1887 return D->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC;
1888 }
1889
1890 /// Extra classof implementations to allow directly casting from VPUser ->
1891 /// VPWidenCanonicalIVRecipe.
1892 static inline bool classof(const VPUser *U) {
1893 auto *R = dyn_cast<VPRecipeBase>(U);
1894 return R && R->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC;
1895 }
1896 static inline bool classof(const VPRecipeBase *R) {
1897 return R->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC;
1898 }
1899
1900 /// Generate a canonical vector induction variable of the vector loop, with
1901 /// start = {<Part*VF, Part*VF+1, ..., Part*VF+VF-1> for 0 <= Part < UF}, and
1902 /// step = <VF*UF, VF*UF, ..., VF*UF>.
1903 void execute(VPTransformState &State) override;
1904
1905#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1906 /// Print the recipe.
1907 void print(raw_ostream &O, const Twine &Indent,
1908 VPSlotTracker &SlotTracker) const override;
1909#endif
1910
1911 /// Returns the scalar type of the induction.
1912 const Type *getScalarType() const {
1913 return cast<VPCanonicalIVPHIRecipe>(getOperand(0)->getDefiningRecipe())
1914 ->getScalarType();
1915 }
1916};
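
As a concrete reading of the comment on execute() above, here is a small standalone sketch (plain C++ with hypothetical VF and UF values, not LLVM code) of the lane values a widened canonical IV produces:

  #include <cstdio>

  int main() {
    const unsigned VF = 4, UF = 2; // hypothetical vectorization/unroll factors
    // First vector iteration: each part holds <Part*VF, ..., Part*VF+VF-1>.
    for (unsigned Part = 0; Part < UF; ++Part) {
      std::printf("part %u:", Part);
      for (unsigned Lane = 0; Lane < VF; ++Lane)
        std::printf(" %u", Part * VF + Lane);
      std::printf("\n");
    }
    // Every subsequent vector iteration adds the splat step <VF*UF, ...>.
    std::printf("per-iteration step: %u\n", VF * UF);
  }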
1917
1918/// A recipe for converting the canonical IV value to the corresponding value of
1919/// an IV with different start and step values, using Start + CanonicalIV *
1920/// Step.
1921class VPDerivedIVRecipe : public VPRecipeBase, public VPValue {
1922 /// The type of the result value. It may be smaller than the type of the
1923 /// induction and in this case it will get truncated to ResultTy.
1924 Type *ResultTy;
1925
1926 /// Induction descriptor for the induction the canonical IV is transformed to.
1927 const InductionDescriptor &IndDesc;
1928
1929public:
1930 VPDerivedIVRecipe(const InductionDescriptor &IndDesc, VPValue *Start,
1931 VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step,
1932 Type *ResultTy)
1933 : VPRecipeBase(VPDerivedIVSC, {Start, CanonicalIV, Step}),
1934 VPValue(VPVDerivedIVSC, nullptr, this), ResultTy(ResultTy),
1935 IndDesc(IndDesc) {}
1936
1937 ~VPDerivedIVRecipe() override = default;
1938
1939 VP_CLASSOF_IMPL(VPRecipeBase::VPDerivedIVSC, VPValue::VPVInstructionSC)
1940
1941 /// Generate the transformed value of the induction at offset StartValue (1.
1942 /// operand) + IV (2. operand) * StepValue (3. operand).
1943 void execute(VPTransformState &State) override;
1944
1945#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1946 /// Print the recipe.
1947 void print(raw_ostream &O, const Twine &Indent,
1948 VPSlotTracker &SlotTracker) const override;
1949#endif
1950
1951 VPValue *getStartValue() const { return getOperand(0); }
1952 VPValue *getCanonicalIV() const { return getOperand(1); }
1953 VPValue *getStepValue() const { return getOperand(2); }
1954
1955 /// Returns true if the recipe only uses the first lane of operand \p Op.
1956 bool onlyFirstLaneUsed(const VPValue *Op) const override {
1957 assert(is_contained(operands(), Op) &&
1958 "Op must be an operand of the recipe");
1959 return true;
1960 }
1961};
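
The transformation this recipe models is the affine map named in the class comment. A minimal scalar sketch (hypothetical helper, not LLVM code), ignoring the optional truncation to ResultTy:

  #include <cassert>
  #include <cstdint>

  // DerivedIV = Start + CanonicalIV * Step, evaluated per scalar iteration.
  int64_t deriveIV(int64_t Start, int64_t CanonicalIV, int64_t Step) {
    return Start + CanonicalIV * Step;
  }

  int main() {
    // An IV starting at 100 with step -2: canonical IV 0, 1, 2, ... maps to
    // 100, 98, 96, ...
    assert(deriveIV(100, 0, -2) == 100);
    assert(deriveIV(100, 3, -2) == 94);
  }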
1962
1963/// A recipe for handling phi nodes of integer and floating-point inductions,
1964/// producing their scalar values.
1965class VPScalarIVStepsRecipe : public VPRecipeBase, public VPValue {
1966 const InductionDescriptor &IndDesc;
1967
1968public:
1969 VPScalarIVStepsRecipe(const InductionDescriptor &IndDesc, VPValue *IV,
1970 VPValue *Step)
1971 : VPRecipeBase(VPScalarIVStepsSC, {IV, Step}), VPValue(nullptr, this),
1972 IndDesc(IndDesc) {}
1973
1974 ~VPScalarIVStepsRecipe() override = default;
1975
1976 /// Method to support type inquiry through isa, cast, and dyn_cast.
1977 static inline bool classof(const VPDef *D) {
1978 return D->getVPDefID() == VPRecipeBase::VPScalarIVStepsSC;
1979 }
1980 /// Extra classof implementations to allow directly casting from VPUser ->
1981 /// VPScalarIVStepsRecipe.
1982 static inline bool classof(const VPUser *U) {
1983 auto *R = dyn_cast<VPRecipeBase>(U);
1984 return R && R->getVPDefID() == VPRecipeBase::VPScalarIVStepsSC;
1985 }
1986 static inline bool classof(const VPRecipeBase *R) {
1987 return R->getVPDefID() == VPRecipeBase::VPScalarIVStepsSC;
1988 }
1989
1990 /// Generate the scalarized versions of the phi node as needed by their users.
1991 void execute(VPTransformState &State) override;
1992
1993#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1994 /// Print the recipe.
1995 void print(raw_ostream &O, const Twine &Indent,
1996 VPSlotTracker &SlotTracker) const override;
1997#endif
1998
1999 VPValue *getStepValue() const { return getOperand(1); }
2000
2001 /// Returns true if the recipe only uses the first lane of operand \p Op.
2002 bool onlyFirstLaneUsed(const VPValue *Op) const override {
2003 assert(is_contained(operands(), Op) &&
2004 "Op must be an operand of the recipe");
2005 return true;
2006 }
2007};
2008
2009/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
2010/// holds a sequence of zero or more VPRecipes, each representing a sequence of
2011/// output IR instructions. All PHI-like recipes must come before any non-PHI recipes.
2012class VPBasicBlock : public VPBlockBase {
2013public:
2014 using RecipeListTy = iplist<VPRecipeBase>;
2015
2016private:
2017 /// The VPRecipes held in the order of output instructions to generate.
2018 RecipeListTy Recipes;
2019
2020public:
2021 VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr)
2022 : VPBlockBase(VPBasicBlockSC, Name.str()) {
2023 if (Recipe)
2024 appendRecipe(Recipe);
2025 }
2026
2027 ~VPBasicBlock() override {
2028 while (!Recipes.empty())
2029 Recipes.pop_back();
2030 }
2031
2032 /// Instruction iterators...
2033 using iterator = RecipeListTy::iterator;
2034 using const_iterator = RecipeListTy::const_iterator;
2035 using reverse_iterator = RecipeListTy::reverse_iterator;
2036 using const_reverse_iterator = RecipeListTy::const_reverse_iterator;
2037
2038 //===--------------------------------------------------------------------===//
2039 /// Recipe iterator methods
2040 ///
2041 inline iterator begin() { return Recipes.begin(); }
2042 inline const_iterator begin() const { return Recipes.begin(); }
2043 inline iterator end() { return Recipes.end(); }
2044 inline const_iterator end() const { return Recipes.end(); }
2045
2046 inline reverse_iterator rbegin() { return Recipes.rbegin(); }
2047 inline const_reverse_iterator rbegin() const { return Recipes.rbegin(); }
2048 inline reverse_iterator rend() { return Recipes.rend(); }
2049 inline const_reverse_iterator rend() const { return Recipes.rend(); }
2050
2051 inline size_t size() const { return Recipes.size(); }
2052 inline bool empty() const { return Recipes.empty(); }
2053 inline const VPRecipeBase &front() const { return Recipes.front(); }
2054 inline VPRecipeBase &front() { return Recipes.front(); }
2055 inline const VPRecipeBase &back() const { return Recipes.back(); }
2056 inline VPRecipeBase &back() { return Recipes.back(); }
2057
2058 /// Returns a reference to the list of recipes.
2059 RecipeListTy &getRecipeList() { return Recipes; }
2060
2061 /// Returns a pointer to a member of the recipe list.
2062 static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) {
2063 return &VPBasicBlock::Recipes;
2064 }
2065
2066 /// Method to support type inquiry through isa, cast, and dyn_cast.
2067 static inline bool classof(const VPBlockBase *V) {
2068 return V->getVPBlockID() == VPBlockBase::VPBasicBlockSC;
2069 }
2070
2071 void insert(VPRecipeBase *Recipe, iterator InsertPt) {
2072 assert(Recipe && "No recipe to append.");
2073 assert(!Recipe->Parent && "Recipe already in VPlan");
2074 Recipe->Parent = this;
2075 Recipes.insert(InsertPt, Recipe);
2076 }
2077
2078 /// Augment the existing recipes of a VPBasicBlock with an additional
2079 /// \p Recipe as the last recipe.
2080 void appendRecipe(VPRecipeBase *Recipe) { insert(Recipe, end()); }
2081
2082 /// The method which generates the output IR instructions that correspond to
2083 /// this VPBasicBlock, thereby "executing" the VPlan.
2084 void execute(VPTransformState *State) override;
2085
2086 /// Return the position of the first non-phi node recipe in the block.
2087 iterator getFirstNonPhi();
2088
2089 /// Returns an iterator range over the PHI-like recipes in the block.
2090 iterator_range<iterator> phis() {
2091 return make_range(begin(), getFirstNonPhi());
2092 }
2093
2094 void dropAllReferences(VPValue *NewValue) override;
2095
2096 /// Split current block at \p SplitAt by inserting a new block between the
2097 /// current block and its successors and moving all recipes starting at
2098 /// SplitAt to the new block. Returns the new block.
2099 VPBasicBlock *splitAt(iterator SplitAt);
2100
2101 VPRegionBlock *getEnclosingLoopRegion();
2102
2103#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2104 /// Print this VPBasicBlock to \p O, prefixing all lines with \p Indent. \p
2105 /// SlotTracker is used to print unnamed VPValues using consecutive numbers.
2106 ///
2107 /// Note that the numbering is applied to the whole VPlan, so printing
2108 /// individual blocks is consistent with the whole VPlan printing.
2109 void print(raw_ostream &O, const Twine &Indent,
2110 VPSlotTracker &SlotTracker) const override;
2111 using VPBlockBase::print; // Get the print(raw_stream &O) version.
2112#endif
2113
2114 /// If the block has multiple successors, return the branch recipe terminating
2115 /// the block. If the block has zero or one successor, return nullptr.
2116 VPRecipeBase *getTerminator();
2117 const VPRecipeBase *getTerminator() const;
2118
2119 /// Returns true if the block is exiting its parent region.
2120 bool isExiting() const;
2121
2122private:
2123 /// Create an IR BasicBlock to hold the output instructions generated by this
2124 /// VPBasicBlock, and return it. Update the CFGState accordingly.
2125 BasicBlock *createEmptyBasicBlock(VPTransformState::CFGState &CFG);
2126};
2127
2128/// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks
2129/// which form a Single-Entry-Single-Exiting subgraph of the output IR CFG.
2130/// A VPRegionBlock may indicate that its contents are to be replicated several
2131/// times. This is designed to support predicated scalarization, in which a
2132/// scalar if-then code structure needs to be generated VF * UF times. Having
2133/// this replication indicator helps to keep a single model for multiple
2134/// candidate VF's. The actual replication takes place only once the desired VF
2135/// and UF have been determined.
2136class VPRegionBlock : public VPBlockBase {
2137 /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock.
2138 VPBlockBase *Entry;
2139
2140 /// Hold the Single Exiting block of the SESE region modelled by the
2141 /// VPRegionBlock.
2142 VPBlockBase *Exiting;
2143
2144 /// An indicator whether this region is to generate multiple replicated
2145 /// instances of output IR corresponding to its VPBlockBases.
2146 bool IsReplicator;
2147
2148public:
2149 VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
2150 const std::string &Name = "", bool IsReplicator = false)
2151 : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting),
2152 IsReplicator(IsReplicator) {
2153 assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
2154 assert(Exiting->getSuccessors().empty() && "Exit block has successors.");
2155 Entry->setParent(this);
2156 Exiting->setParent(this);
2157 }
2158 VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
2159 : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr),
2160 IsReplicator(IsReplicator) {}
2161
2162 ~VPRegionBlock() override {
2163 if (Entry) {
2164 VPValue DummyValue;
2165 Entry->dropAllReferences(&DummyValue);
2166 deleteCFG(Entry);
2167 }
2168 }
2169
2170 /// Method to support type inquiry through isa, cast, and dyn_cast.
2171 static inline bool classof(const VPBlockBase *V) {
2172 return V->getVPBlockID() == VPBlockBase::VPRegionBlockSC;
2173 }
2174
2175 const VPBlockBase *getEntry() const { return Entry; }
2176 VPBlockBase *getEntry() { return Entry; }
2177
2178 /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
2179 /// EntryBlock must have no predecessors.
2180 void setEntry(VPBlockBase *EntryBlock) {
2181 assert(EntryBlock->getPredecessors().empty() &&
2182 "Entry block cannot have predecessors.");
2183 Entry = EntryBlock;
2184 EntryBlock->setParent(this);
2185 }
2186
2187 // FIXME: DominatorTreeBase is doing 'A->getParent()->front()'. 'front' is a
2188 // specific interface of llvm::Function, instead of using
2189 // GraphTraits::getEntryNode. We should add a new template parameter to
2190 // DominatorTreeBase representing the Graph type.
2191 VPBlockBase &front() const { return *Entry; }
2192
2193 const VPBlockBase *getExiting() const { return Exiting; }
2194 VPBlockBase *getExiting() { return Exiting; }
2195
2196 /// Set \p ExitingBlock as the exiting VPBlockBase of this VPRegionBlock. \p
2197 /// ExitingBlock must have no successors.
2198 void setExiting(VPBlockBase *ExitingBlock) {
2199 assert(ExitingBlock->getSuccessors().empty() &&
2200 "Exit block cannot have successors.");
2201 Exiting = ExitingBlock;
2202 ExitingBlock->setParent(this);
2203 }
2204
2205 /// Returns the pre-header VPBasicBlock of the loop region.
2206 VPBasicBlock *getPreheaderVPBB() {
2207 assert(!isReplicator() && "should only get pre-header of loop regions");
2208 return getSinglePredecessor()->getExitingBasicBlock();
2209 }
2210
2211 /// An indicator whether this region is to generate multiple replicated
2212 /// instances of output IR corresponding to its VPBlockBases.
2213 bool isReplicator() const { return IsReplicator; }
2214
2215 /// The method which generates the output IR instructions that correspond to
2216 /// this VPRegionBlock, thereby "executing" the VPlan.
2217 void execute(VPTransformState *State) override;
2218
2219 void dropAllReferences(VPValue *NewValue) override;
2220
2221#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2222 /// Print this VPRegionBlock to \p O (recursively), prefixing all lines with
2223 /// \p Indent. \p SlotTracker is used to print unnamed VPValues using
2224 /// consecutive numbers.
2225 ///
2226 /// Note that the numbering is applied to the whole VPlan, so printing
2227 /// individual regions is consistent with the whole VPlan printing.
2228 void print(raw_ostream &O, const Twine &Indent,
2229 VPSlotTracker &SlotTracker) const override;
2230 using VPBlockBase::print; // Get the print(raw_stream &O) version.
2231#endif
2232};
2233
2234//===----------------------------------------------------------------------===//
2235// GraphTraits specializations for VPlan Hierarchical Control-Flow Graphs //
2236//===----------------------------------------------------------------------===//
2237
2238// The following set of template specializations implement GraphTraits to treat
2239// any VPBlockBase as a node in a graph of VPBlockBases. It's important to note
2240// that VPBlockBase traits don't recurse into VPRegionBlocks, i.e., if the
2241// VPBlockBase is a VPRegionBlock, this specialization provides access to its
2242// successors/predecessors but not to the blocks inside the region.
2243
2244template <> struct GraphTraits<VPBlockBase *> {
2245 using NodeRef = VPBlockBase *;
2246 using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator;
2247
2248 static NodeRef getEntryNode(NodeRef N) { return N; }
2249
2250 static inline ChildIteratorType child_begin(NodeRef N) {
2251 return N->getSuccessors().begin();
2252 }
2253
2254 static inline ChildIteratorType child_end(NodeRef N) {
2255 return N->getSuccessors().end();
2256 }
2257};
2258
2259template <> struct GraphTraits<const VPBlockBase *> {
2260 using NodeRef = const VPBlockBase *;
2261 using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::const_iterator;
2262
2263 static NodeRef getEntryNode(NodeRef N) { return N; }
2264
2265 static inline ChildIteratorType child_begin(NodeRef N) {
2266 return N->getSuccessors().begin();
2267 }
2268
2269 static inline ChildIteratorType child_end(NodeRef N) {
2270 return N->getSuccessors().end();
2271 }
2272};
2273
2274// Inverse order specialization for VPBasicBlocks. Predecessors are used instead
2275// of successors for the inverse traversal.
2276template <> struct GraphTraits<Inverse<VPBlockBase *>> {
2277 using NodeRef = VPBlockBase *;
2278 using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator;
2279
2280 static NodeRef getEntryNode(Inverse<NodeRef> B) { return B.Graph; }
2281
2282 static inline ChildIteratorType child_begin(NodeRef N) {
2283 return N->getPredecessors().begin();
2284 }
2285
2286 static inline ChildIteratorType child_end(NodeRef N) {
2287 return N->getPredecessors().end();
2288 }
2289};
2290
2291// The following set of template specializations implement GraphTraits to
2292// treat VPRegionBlock as a graph and recurse inside its nodes. It's important
2293// to note that the blocks inside the VPRegionBlock are treated as VPBlockBases
2294// (i.e., no dyn_cast is performed, VPBlockBases specialization is used), so
2295// there won't be automatic recursion into other VPBlockBases that turn out
2296// to be VPRegionBlocks.
2297
2298template <>
2299struct GraphTraits<VPRegionBlock *> : public GraphTraits<VPBlockBase *> {
2300 using GraphRef = VPRegionBlock *;
2301 using nodes_iterator = df_iterator<NodeRef>;
2302
2303 static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); }
2304
2305 static nodes_iterator nodes_begin(GraphRef N) {
2306 return nodes_iterator::begin(N->getEntry());
2307 }
2308
2309 static nodes_iterator nodes_end(GraphRef N) {
2310 // df_iterator::end() returns an empty iterator so the node used doesn't
2311 // matter.
2312 return nodes_iterator::end(N);
2313 }
2314};
2315
2316template <>
2317struct GraphTraits<const VPRegionBlock *>
2318 : public GraphTraits<const VPBlockBase *> {
2319 using GraphRef = const VPRegionBlock *;
2320 using nodes_iterator = df_iterator<NodeRef>;
2321
2322 static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); }
2323
2324 static nodes_iterator nodes_begin(GraphRef N) {
2325 return nodes_iterator::begin(N->getEntry());
2326 }
2327
2328 static nodes_iterator nodes_end(GraphRef N) {
2329 // df_iterator::end() returns an empty iterator so the node used doesn't
2330 // matter.
2331 return nodes_iterator::end(N);
2332 }
2333};
2334
2335template <>
2336struct GraphTraits<Inverse<VPRegionBlock *>>
2337 : public GraphTraits<Inverse<VPBlockBase *>> {
2338 using GraphRef = VPRegionBlock *;
2339 using nodes_iterator = df_iterator<NodeRef>;
2340
2341 static NodeRef getEntryNode(Inverse<GraphRef> N) {
2342 return N.Graph->getExiting();
2343 }
2344
2345 static nodes_iterator nodes_begin(GraphRef N) {
2346 return nodes_iterator::begin(N->getExiting());
2347 }
2348
2349 static nodes_iterator nodes_end(GraphRef N) {
2350 // df_iterator::end() returns an empty iterator so the node used doesn't
2351 // matter.
2352 return nodes_iterator::end(N);
2353 }
2354};
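
With these specializations in place, LLVM's generic graph algorithms apply directly to VPlan blocks; the VPlan destructor further below relies on exactly this when it iterates depth_first(Entry). A minimal usage sketch (assuming the LLVM ADT headers; this is a shallow traversal that does not descend into region bodies):

  #include "llvm/ADT/DepthFirstIterator.h"

  // Sketch only: VPBlockBase is the type declared in this header. depth_first
  // drives GraphTraits<VPBlockBase *>::child_begin/child_end, so a region's
  // successors are visited but the blocks nested inside it are not.
  static void visitShallow(VPBlockBase *Entry) {
    for (VPBlockBase *VPB : llvm::depth_first(Entry))
      (void)VPB; // process each reachable block once
  }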
2355
2356/// Iterator to traverse all successors of a VPBlockBase node. This includes the
2357/// entry node of VPRegionBlocks. Exit blocks of a region implicitly have their
2358/// parent region's successors. This ensures all blocks in a region are visited
2359/// before any blocks in a successor region when doing a reverse post-order
2360/// traversal of the graph.
2361template <typename BlockPtrTy>
2362class VPAllSuccessorsIterator
2363 : public iterator_facade_base<VPAllSuccessorsIterator<BlockPtrTy>,
2364 std::forward_iterator_tag, VPBlockBase> {
2365 BlockPtrTy Block;
2366 /// Index of the current successor. For VPBasicBlock nodes, this simply is the
2367 /// index for the successor array. For VPRegionBlock, SuccessorIdx == 0 is
2368 /// used for the region's entry block, and SuccessorIdx - 1 are the indices
2369 /// for the successor array.
2370 size_t SuccessorIdx;
2371
2372 static BlockPtrTy getBlockWithSuccs(BlockPtrTy Current) {
2373 while (Current && Current->getNumSuccessors() == 0)
2374 Current = Current->getParent();
2375 return Current;
2376 }
2377
2378 /// Templated helper to dereference successor \p SuccIdx of \p Block. Used by
2379 /// both the const and non-const operator* implementations.
2380 template <typename T1> static T1 deref(T1 Block, unsigned SuccIdx) {
2381 if (auto *R = dyn_cast<VPRegionBlock>(Block)) {
2382 if (SuccIdx == 0)
2383 return R->getEntry();
2384 SuccIdx--;
2385 }
2386
2387 // For exit blocks, use the next parent region with successors.
2388 return getBlockWithSuccs(Block)->getSuccessors()[SuccIdx];
2389 }
2390
2391public:
2392 VPAllSuccessorsIterator(BlockPtrTy Block, size_t Idx = 0)
2393 : Block(Block), SuccessorIdx(Idx) {}
2394 VPAllSuccessorsIterator(const VPAllSuccessorsIterator &Other)
2395 : Block(Other.Block), SuccessorIdx(Other.SuccessorIdx) {}
2396
2397 VPAllSuccessorsIterator &operator=(const VPAllSuccessorsIterator &R) {
2398 Block = R.Block;
2399 SuccessorIdx = R.SuccessorIdx;
2400 return *this;
2401 }
2402
2403 static VPAllSuccessorsIterator end(BlockPtrTy Block) {
2404 BlockPtrTy ParentWithSuccs = getBlockWithSuccs(Block);
2405 unsigned NumSuccessors = ParentWithSuccs
2406 ? ParentWithSuccs->getNumSuccessors()
2407 : Block->getNumSuccessors();
2408
2409 if (auto *R = dyn_cast<VPRegionBlock>(Block))
2410 return {R, NumSuccessors + 1};
2411 return {Block, NumSuccessors};
2412 }
2413
2414 bool operator==(const VPAllSuccessorsIterator &R) const {
2415 return Block == R.Block && SuccessorIdx == R.SuccessorIdx;
2416 }
2417
2418 const VPBlockBase *operator*() const { return deref(Block, SuccessorIdx); }
2419
2420 BlockPtrTy operator*() { return deref(Block, SuccessorIdx); }
2421
2422 VPAllSuccessorsIterator &operator++() {
2423 SuccessorIdx++;
2424 return *this;
2425 }
2426
2427 VPAllSuccessorsIterator operator++(int X) {
2428 VPAllSuccessorsIterator Orig = *this;
2429 SuccessorIdx++;
2430 return Orig;
2431 }
2432};
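
The SuccessorIdx encoding is the subtle part of this iterator. A standalone sketch (hypothetical Block type, not LLVM code) of the same dereference rule:

  #include <cassert>
  #include <vector>

  struct Block {
    bool IsRegion = false;
    Block *Parent = nullptr;
    Block *Entry = nullptr; // set for regions only
    std::vector<Block *> Successors;
  };

  // Mirror of getBlockWithSuccs: walk up to the nearest enclosing block that
  // has successors of its own.
  Block *blockWithSuccs(Block *B) {
    while (B && B->Successors.empty())
      B = B->Parent;
    return B;
  }

  // Mirror of deref: for regions, index 0 is the entry block and index i+1 is
  // successor i; exit blocks borrow the successors of their parent region.
  Block *deref(Block *B, unsigned Idx) {
    if (B->IsRegion) {
      if (Idx == 0)
        return B->Entry;
      --Idx;
    }
    return blockWithSuccs(B)->Successors[Idx];
  }

  int main() {
    Block Region, Entry, Exit, Next;
    Region.IsRegion = true;
    Region.Entry = &Entry;
    Region.Successors = {&Next};
    Exit.Parent = &Region;               // region exit, no own successors
    assert(deref(&Region, 0) == &Entry); // index 0 -> region entry
    assert(deref(&Region, 1) == &Next);  // index 1 -> successor 0
    assert(deref(&Exit, 0) == &Next);    // exit borrows parent's successors
  }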
2433
2434/// Helper for GraphTraits specialization that traverses through VPRegionBlocks.
2435template <typename BlockTy> class VPBlockRecursiveTraversalWrapper {
2436 BlockTy Entry;
2437
2438public:
2439 VPBlockRecursiveTraversalWrapper(BlockTy Entry) : Entry(Entry) {}
2440 BlockTy getEntry() { return Entry; }
2441};
2442
2443/// GraphTraits specialization to recursively traverse VPBlockBase nodes,
2444/// including traversing through VPRegionBlocks. Exit blocks of a region
2445/// implicitly have their parent region's successors. This ensures all blocks in
2446/// a region are visited before any blocks in a successor region when doing a
2447/// reverse post-order traversal of the graph.
2448template <>
2449struct GraphTraits<VPBlockRecursiveTraversalWrapper<VPBlockBase *>> {
2450 using NodeRef = VPBlockBase *;
2451 using ChildIteratorType = VPAllSuccessorsIterator<VPBlockBase *>;
2452
2453 static NodeRef
2454 getEntryNode(VPBlockRecursiveTraversalWrapper<VPBlockBase *> N) {
2455 return N.getEntry();
2456 }
2457
2458 static inline ChildIteratorType child_begin(NodeRef N) {
2459 return ChildIteratorType(N);
2460 }
2461
2462 static inline ChildIteratorType child_end(NodeRef N) {
2463 return ChildIteratorType::end(N);
2464 }
2465};
2466
2467template <>
2468struct GraphTraits<VPBlockRecursiveTraversalWrapper<const VPBlockBase *>> {
2469 using NodeRef = const VPBlockBase *;
2470 using ChildIteratorType = VPAllSuccessorsIterator<const VPBlockBase *>;
2471
2472 static NodeRef
2473 getEntryNode(VPBlockRecursiveTraversalWrapper<const VPBlockBase *> N) {
2474 return N.getEntry();
2475 }
2476
2477 static inline ChildIteratorType child_begin(NodeRef N) {
2478 return ChildIteratorType(N);
2479 }
2480
2481 static inline ChildIteratorType child_end(NodeRef N) {
2482 return ChildIteratorType::end(N);
2483 }
2484};
2485
2486/// VPlan models a candidate for vectorization, encoding various decisions taken
2487/// to produce efficient output IR, including which branches, basic-blocks and
2488/// output IR instructions to generate, and their cost. VPlan holds a
2489/// Hierarchical-CFG of VPBasicBlocks and VPRegionBlocks rooted at an Entry
2490/// VPBlock.
2491class VPlan {
2492 friend class VPlanPrinter;
2493 friend class VPSlotTracker;
2494
2495 /// Hold the single entry to the Hierarchical CFG of the VPlan.
2496 VPBlockBase *Entry;
2497
2498 /// Holds the VFs applicable to this VPlan.
2499 SmallSetVector<ElementCount, 2> VFs;
2500
2501 /// Holds the UFs applicable to this VPlan. If empty, the VPlan is valid for
2502 /// any UF.
2503 SmallSetVector<unsigned, 2> UFs;
2504
2505 /// Holds the name of the VPlan, for printing.
2506 std::string Name;
2507
2508 /// Holds all the external definitions created for this VPlan. External
2509 /// definitions must be immutable and hold a pointer to their underlying IR.
2510 DenseMap<Value *, VPValue *> VPExternalDefs;
2511
2512 /// Represents the trip count of the original loop, for folding
2513 /// the tail.
2514 VPValue *TripCount = nullptr;
2515
2516 /// Represents the backedge taken count of the original loop, for folding
2517 /// the tail. It equals TripCount - 1.
2518 VPValue *BackedgeTakenCount = nullptr;
2519
2520 /// Represents the vector trip count.
2521 VPValue VectorTripCount;
2522
2523 /// Holds a mapping between Values and their corresponding VPValue inside
2524 /// VPlan.
2525 Value2VPValueTy Value2VPValue;
2526
2527 /// Contains all VPValues that have been allocated by addVPValue directly
2528 /// and need to be freed when the plan's destructor is called.
2529 SmallVector<VPValue *, 16> VPValuesToFree;
2530
2531 /// Indicates whether it is safe to use the Value2VPValue mapping or if the
2532 /// mapping cannot be used any longer, because it is stale.
2533 bool Value2VPValueEnabled = true;
2534
2535 /// Values used outside the plan.
2536 MapVector<PHINode *, VPLiveOut *> LiveOuts;
2537
2538public:
2539 VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {
2540 if (Entry)
2541 Entry->setPlan(this);
2542 }
2543
2544 ~VPlan() {
2545 clearLiveOuts();
2546
2547 if (Entry) {
2548 VPValue DummyValue;
2549 for (VPBlockBase *Block : depth_first(Entry))
2550 Block->dropAllReferences(&DummyValue);
2551
2552 VPBlockBase::deleteCFG(Entry);
2553 }
2554 for (VPValue *VPV : VPValuesToFree)
2555 delete VPV;
2556 if (TripCount)
2557 delete TripCount;
2558 if (BackedgeTakenCount)
2559 delete BackedgeTakenCount;
2560 for (auto &P : VPExternalDefs)
2561 delete P.second;
2562 }
2563
2564 /// Prepare the plan for execution, setting up the required live-in values.
2565 void prepareToExecute(Value *TripCount, Value *VectorTripCount,
2566 Value *CanonicalIVStartValue, VPTransformState &State,
2567 bool IsEpilogueVectorization);
2568
2569 /// Generate the IR code for this VPlan.
2570 void execute(VPTransformState *State);
2571
2572 VPBlockBase *getEntry() { return Entry; }
2573 const VPBlockBase *getEntry() const { return Entry; }
2574
2575 VPBlockBase *setEntry(VPBlockBase *Block) {
2576 Entry = Block;
2577 Block->setPlan(this);
2578 return Entry;
2579 }
2580
2581 /// The trip count of the original loop.
2582 VPValue *getOrCreateTripCount() {
2583 if (!TripCount)
2584 TripCount = new VPValue();
2585 return TripCount;
2586 }
2587
2588 /// The backedge taken count of the original loop.
2589 VPValue *getOrCreateBackedgeTakenCount() {
2590 if (!BackedgeTakenCount)
2591 BackedgeTakenCount = new VPValue();
2592 return BackedgeTakenCount;
2593 }
2594
2595 /// The vector trip count.
2596 VPValue &getVectorTripCount() { return VectorTripCount; }
2597
2598 /// Mark the plan to indicate that using Value2VPValue is not safe any
2599 /// longer, because it may be stale.
2600 void disableValue2VPValue() { Value2VPValueEnabled = false; }
2601
2602 void addVF(ElementCount VF) { VFs.insert(VF); }
2603
2604 void setVF(ElementCount VF) {
2605 assert(hasVF(VF) && "Cannot set VF not already in plan");
2606 VFs.clear();
2607 VFs.insert(VF);
2608 }
2609
2610 bool hasVF(ElementCount VF) { return VFs.count(VF); }
2611
2612 bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); }
2613
2614 bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); }
2615
2616 void setUF(unsigned UF) {
2617 assert(hasUF(UF) && "Cannot set the UF not already in plan");
2618 UFs.clear();
2619 UFs.insert(UF);
2620 }
2621
2622 /// Return a string with the name of the plan and the applicable VFs and UFs.
2623 std::string getName() const;
2624
2625 void setName(const Twine &newName) { Name = newName.str(); }
2626
2627 /// Get the existing or add a new external definition for \p V.
2628 VPValue *getOrAddExternalDef(Value *V) {
2629 auto I = VPExternalDefs.insert({V, nullptr});
2630 if (I.second)
2631 I.first->second = new VPValue(V);
2632 return I.first->second;
2633 }
2634
2635 void addVPValue(Value *V) {
2636 assert(Value2VPValueEnabled &&
2637 "IR value to VPValue mapping may be out of date!");
2638 assert(V && "Trying to add a null Value to VPlan");
2639 assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
2640 VPValue *VPV = new VPValue(V);
2641 Value2VPValue[V] = VPV;
2642 VPValuesToFree.push_back(VPV);
2643 }
2644
2645 void addVPValue(Value *V, VPValue *VPV) {
2646 assert(Value2VPValueEnabled && "Value2VPValue mapping may be out of date!");
2647 assert(V && "Trying to add a null Value to VPlan");
2648 assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
2649 Value2VPValue[V] = VPV;
2650 }
2651
2652 /// Returns the VPValue for \p V. \p OverrideAllowed can be used to disable
2653 /// checking whether it is safe to query VPValues using IR Values.
2654 VPValue *getVPValue(Value *V, bool OverrideAllowed = false) {
2655 assert((OverrideAllowed || isa<Constant>(V) || Value2VPValueEnabled) &&
2656 "Value2VPValue mapping may be out of date!");
2657 assert(V && "Trying to get the VPValue of a null Value");
2658 assert(Value2VPValue.count(V) && "Value does not exist in VPlan");
2659 return Value2VPValue[V];
2660 }
2661
2662 /// Gets the VPValue or adds a new one (if none exists yet) for \p V. \p
2663 /// OverrideAllowed can be used to disable checking whether it is safe to
2664 /// query VPValues using IR Values.
2665 VPValue *getOrAddVPValue(Value *V, bool OverrideAllowed = false) {
2666 assert((OverrideAllowed || isa<Constant>(V) || Value2VPValueEnabled) &&
2667 "Value2VPValue mapping may be out of date!");
2668 assert(V && "Trying to get or add the VPValue of a null Value");
2669 if (!Value2VPValue.count(V))
2670 addVPValue(V);
2671 return getVPValue(V);
2672 }
2673
2674 void removeVPValueFor(Value *V) {
2675 assert(Value2VPValueEnabled &&
2676 "IR value to VPValue mapping may be out of date!");
2677 Value2VPValue.erase(V);
2678 }
2679
2680#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2681 /// Print this VPlan to \p O.
2682 void print(raw_ostream &O) const;
2683
2684 /// Print this VPlan in DOT format to \p O.
2685 void printDOT(raw_ostream &O) const;
2686
2687 /// Dump the plan to stderr (for debugging).
2688 LLVM_DUMP_METHOD void dump() const;
2689#endif
2690
2691 /// Returns a range mapping the values in the range \p Operands to their
2692 /// corresponding VPValues.
2693 iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
2694 mapToVPValues(User::op_range Operands) {
2695 std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
2696 return getOrAddVPValue(Op);
2697 };
2698 return map_range(Operands, Fn);
2699 }
2700
2701 /// Returns the VPRegionBlock of the vector loop.
2702 VPRegionBlock *getVectorLoopRegion() {
2703 return cast<VPRegionBlock>(getEntry()->getSingleSuccessor());
2704 }
2705 const VPRegionBlock *getVectorLoopRegion() const {
2706 return cast<VPRegionBlock>(getEntry()->getSingleSuccessor());
2707 }
2708
2709 /// Returns the canonical induction recipe of the vector loop.
2710 VPCanonicalIVPHIRecipe *getCanonicalIV() {
2711 VPBasicBlock *EntryVPBB = getVectorLoopRegion()->getEntryBasicBlock();
2712 if (EntryVPBB->empty()) {
2713 // VPlan native path.
2714 EntryVPBB = cast<VPBasicBlock>(EntryVPBB->getSingleSuccessor());
2715 }
2716 return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin());
2717 }
2718
2719 /// Find and return the VPActiveLaneMaskPHIRecipe from the header - there
2720 /// can be at most one. If there isn't one, return nullptr.
2721 VPActiveLaneMaskPHIRecipe *getActiveLaneMaskPhi();
2722
2723 void addLiveOut(PHINode *PN, VPValue *V);
2724
2725 void clearLiveOuts() {
2726 for (auto &KV : LiveOuts)
2727 delete KV.second;
2728 LiveOuts.clear();
2729 }
2730
2731 void removeLiveOut(PHINode *PN) {
2732 delete LiveOuts[PN];
2733 LiveOuts.erase(PN);
2734 }
2735
2736 const MapVector<PHINode *, VPLiveOut *> &getLiveOuts() const {
2737 return LiveOuts;
2738 }
2739
2740private:
2741 /// Add to the given dominator tree the header block and every new basic block
2742 /// that was created between it and the latch block, inclusive.
2743 static void updateDominatorTree(DominatorTree *DT, BasicBlock *LoopLatchBB,
2744 BasicBlock *LoopPreHeaderBB,
2745 BasicBlock *LoopExitBB);
2746};
2747
2748#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2749/// VPlanPrinter prints a given VPlan to a given output stream. The printing is
2750/// indented and follows the dot format.
2751class VPlanPrinter {
2752 raw_ostream &OS;
2753 const VPlan &Plan;
2754 unsigned Depth = 0;
2755 unsigned TabWidth = 2;
2756 std::string Indent;
2757 unsigned BID = 0;
2758 SmallDenseMap<const VPBlockBase *, unsigned> BlockID;
2759
2760 VPSlotTracker SlotTracker;
2761
2762 /// Handle indentation.
2763 void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
2764
2765 /// Print a given \p Block of the Plan.
2766 void dumpBlock(const VPBlockBase *Block);
2767
2768 /// Print the information related to the CFG edges going out of a given
2769 /// \p Block, followed by printing the successor blocks themselves.
2770 void dumpEdges(const VPBlockBase *Block);
2771
2772 /// Print a given \p BasicBlock, including its VPRecipes, followed by printing
2773 /// its successor blocks.
2774 void dumpBasicBlock(const VPBasicBlock *BasicBlock);
2775
2776 /// Print a given \p Region of the Plan.
2777 void dumpRegion(const VPRegionBlock *Region);
2778
2779 unsigned getOrCreateBID(const VPBlockBase *Block) {
2780 return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++;
2781 }
2782
2783 Twine getOrCreateName(const VPBlockBase *Block);
2784
2785 Twine getUID(const VPBlockBase *Block);
2786
2787 /// Print the information related to a CFG edge between two VPBlockBases.
2788 void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden,
2789 const Twine &Label);
2790
2791public:
2792 VPlanPrinter(raw_ostream &O, const VPlan &P)
2793 : OS(O), Plan(P), SlotTracker(&P) {}
2794
2795 LLVM_DUMP_METHOD void dump();
2796};
2797
2798struct VPlanIngredient {
2799 const Value *V;
2800
2801 VPlanIngredient(const Value *V) : V(V) {}
2802
2803 void print(raw_ostream &O) const;
2804};
2805
2806inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) {
2807 I.print(OS);
2808 return OS;
2809}
2810
2811inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) {
2812 Plan.print(OS);
2813 return OS;
2814}
2815#endif
2816
2817//===----------------------------------------------------------------------===//
2818// VPlan Utilities
2819//===----------------------------------------------------------------------===//
2820
2821/// Class that provides utilities for VPBlockBases in VPlan.
2822class VPBlockUtils {
2823public:
2824 VPBlockUtils() = delete;
2825
2826 /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p
2827 /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p
2828 /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. \p BlockPtr's
2829 /// successors are moved from \p BlockPtr to \p NewBlock. \p NewBlock must
2830 /// have neither successors nor predecessors.
2831 static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
2832 assert(NewBlock->getSuccessors().empty() &&
2833 NewBlock->getPredecessors().empty() &&
2834 "Can't insert new block with predecessors or successors.");
2835 NewBlock->setParent(BlockPtr->getParent());
2836 SmallVector<VPBlockBase *> Succs(BlockPtr->successors());
2837 for (VPBlockBase *Succ : Succs) {
2838 disconnectBlocks(BlockPtr, Succ);
2839 connectBlocks(NewBlock, Succ);
2840 }
2841 connectBlocks(BlockPtr, NewBlock);
2842 }
2843
2844 /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p
2845/// BlockPtr. Add \p IfTrue and \p IfFalse as successors of \p BlockPtr and \p
2846 /// BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p BlockPtr
2847 /// parent to \p IfTrue and \p IfFalse. \p BlockPtr must have no successors
2848 /// and \p IfTrue and \p IfFalse must have neither successors nor
2849 /// predecessors.
2850 static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
2851 VPBlockBase *BlockPtr) {
2852 assert(IfTrue->getSuccessors().empty() &&
2853 "Can't insert IfTrue with successors.");
2854 assert(IfFalse->getSuccessors().empty() &&
2855 "Can't insert IfFalse with successors.");
2856 BlockPtr->setTwoSuccessors(IfTrue, IfFalse);
2857 IfTrue->setPredecessors({BlockPtr});
2858 IfFalse->setPredecessors({BlockPtr});
2859 IfTrue->setParent(BlockPtr->getParent());
2860 IfFalse->setParent(BlockPtr->getParent());
2861 }
2862
2863 /// Connect VPBlockBases \p From and \p To bi-directionally. Append \p To to
2864 /// the successors of \p From and \p From to the predecessors of \p To. Both
2865 /// VPBlockBases must have the same parent, which can be null. Both
2866 /// VPBlockBases can be already connected to other VPBlockBases.
2867 static void connectBlocks(VPBlockBase *From, VPBlockBase *To) {
2868 assert((From->getParent() == To->getParent()) &&
2869 "Can't connect two blocks with different parents");
2870 assert(From->getNumSuccessors() < 2 &&
2871 "Blocks can't have more than two successors.");
2872 From->appendSuccessor(To);
2873 To->appendPredecessor(From);
2874 }
2875
2876 /// Disconnect VPBlockBases \p From and \p To bi-directionally. Remove \p To
2877 /// from the successors of \p From and \p From from the predecessors of \p To.
2878 static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To) {
2879 assert(To && "Successor to disconnect is null.");
2880 From->removeSuccessor(To);
2881 To->removePredecessor(From);
2882 }
2883
2884 /// Try to merge \p Block into its single predecessor, if \p Block is a
2885 /// VPBasicBlock and its predecessor has a single successor. Returns a pointer
2886 /// to the predecessor \p Block was merged into or nullptr otherwise.
2887 static VPBasicBlock *tryToMergeBlockIntoPredecessor(VPBlockBase *Block) {
2888 auto *VPBB = dyn_cast<VPBasicBlock>(Block);
6: Assuming 'Block' is a 'CastReturnType'
2889 auto *PredVPBB =
2890 dyn_cast_or_null<VPBasicBlock>(Block->getSinglePredecessor());
7: Assuming the object is a 'CastReturnType'
2891 if (!VPBB || !PredVPBB || PredVPBB->getNumSuccessors() != 1)
7.1: 'VPBB' is non-null
7.2: 'PredVPBB' is non-null
8: Assuming the condition is false
9: Taking false branch
2892 return nullptr;
2893
2894 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
2895 R.moveBefore(*PredVPBB, PredVPBB->end());
2896 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
2897 auto *ParentRegion = cast<VPRegionBlock>(Block->getParent());
10: The object is a 'CastReturnType'
2898 if (ParentRegion->getExiting() == Block)
11: Taking false branch
2899 ParentRegion->setExiting(PredVPBB);
2900 SmallVector<VPBlockBase *> Successors(Block->successors());
2901 for (auto *Succ : Successors) {
12: Assuming '__begin2' is equal to '__end2'
2902 VPBlockUtils::disconnectBlocks(Block, Succ);
2903 VPBlockUtils::connectBlocks(PredVPBB, Succ);
2904 }
2905 delete Block;
13: Memory is released
2906 return PredVPBB;
2907 }
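
Note how this helper relates to the warning this report documents ("Use of memory after it is freed"): on a successful merge, Block is deleted (step 13 above) and only the returned predecessor remains valid. A hypothetical caller sketch of the hazard the analyzer's path leads to:

  // Hypothetical caller, for illustration only: after a successful merge the
  // original pointer dangles, so only the return value may be used.
  static void mergeAndContinue(VPBlockBase *Block) {
    if (VPBasicBlock *Pred =
            VPBlockUtils::tryToMergeBlockIntoPredecessor(Block)) {
      (void)Pred;            // fine: the surviving, merged-into block
      // Block->getParent(); // use-after-free: Block was deleted in the call
    }
  }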
2908
2909 /// Return an iterator range over \p Range which only includes \p BlockTy
2910/// blocks. The accesses are cast to \p BlockTy.
2911 template <typename BlockTy, typename T>
2912 static auto blocksOnly(const T &Range) {
2913 // Create BaseTy with correct const-ness based on BlockTy.
2914 using BaseTy = std::conditional_t<std::is_const<BlockTy>::value,
2915 const VPBlockBase, VPBlockBase>;
2916
2917 // We first need to create an iterator range over (const) BlockTy & instead
2918 // of (const) BlockTy * for filter_range to work properly.
2919 auto Mapped =
2920 map_range(Range, [](BaseTy *Block) -> BaseTy & { return *Block; });
2921 auto Filter = make_filter_range(
2922 Mapped, [](BaseTy &Block) { return isa<BlockTy>(&Block); });
2923 return map_range(Filter, [](BaseTy &Block) -> BlockTy * {
2924 return cast<BlockTy>(&Block);
2925 });
2926 }
2927};
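A hedged usage sketch for blocksOnly (variable names hypothetical): given
a mixed range of VPBlockBase pointers, it yields only the requested block
subtype, already cast, so callers can iterate without writing dyn_cast
checks by hand.

    // Hypothetical: visit only the VPBasicBlocks among a region entry's
    // successors, silently skipping nested VPRegionBlocks.
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
             Region->getEntry()->getSuccessors()))
      dbgs() << VPBB->getName() << "\n";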
2928
2929class VPInterleavedAccessInfo {
2930 DenseMap<VPInstruction *, InterleaveGroup<VPInstruction> *>
2931 InterleaveGroupMap;
2932
2933 /// Type for mapping instruction-based interleave groups to VPInstruction
2934 /// interleave groups.
2935 using Old2NewTy = DenseMap<InterleaveGroup<Instruction> *,
2936 InterleaveGroup<VPInstruction> *>;
2937
2938 /// Recursively traverse \p Region and populate VPlan based interleave
2939 /// groups based on \p IAI.
2940 void visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New,
2941 InterleavedAccessInfo &IAI);
2942 /// Recursively traverse \p Block and populate VPlan based interleave groups
2943 /// based on \p IAI.
2944 void visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
2945 InterleavedAccessInfo &IAI);
2946
2947public:
2948 VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI);
2949
2950 ~VPInterleavedAccessInfo() {
2951 SmallPtrSet<InterleaveGroup<VPInstruction> *, 4> DelSet;
2952 // Avoid releasing a pointer twice.
2953 for (auto &I : InterleaveGroupMap)
2954 DelSet.insert(I.second);
2955 for (auto *Ptr : DelSet)
2956 delete Ptr;
2957 }
2958
2959 /// Get the interleave group that \p Instr belongs to.
2960 ///
2961 /// \returns nullptr if \p Instr does not belong to any group.
2962 InterleaveGroup<VPInstruction> *
2963 getInterleaveGroup(VPInstruction *Instr) const {
2964 return InterleaveGroupMap.lookup(Instr);
2965 }
2966};
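The SmallPtrSet detour in the destructor above is load-bearing:
InterleaveGroupMap is many-to-one, since every member VPInstruction of a
group maps to the same InterleaveGroup object, so deleting each mapped
value directly would free the same group several times. A minimal sketch
of the idiom, with hypothetical types:

    struct Group { /* shared by several map entries */ };
    DenseMap<int, Group *> Map;
    SmallPtrSet<Group *, 4> Unique;
    for (auto &Entry : Map)
      Unique.insert(Entry.second); // collapse duplicate pointers first
    for (Group *G : Unique)
      delete G;                    // each group freed exactly once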
2967
2968/// Class that maps (parts of) an existing VPlan to trees of combined
2969/// VPInstructions.
2970class VPlanSlp {
2971 enum class OpMode { Failed, Load, Opcode };
2972
2973 /// A DenseMapInfo implementation for using SmallVector<VPValue *, 4> as
2974 /// DenseMap keys.
2975 struct BundleDenseMapInfo {
2976 static SmallVector<VPValue *, 4> getEmptyKey() {
2977 return {reinterpret_cast<VPValue *>(-1)};
2978 }
2979
2980 static SmallVector<VPValue *, 4> getTombstoneKey() {
2981 return {reinterpret_cast<VPValue *>(-2)};
2982 }
2983
2984 static unsigned getHashValue(const SmallVector<VPValue *, 4> &V) {
2985 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
2986 }
2987
2988 static bool isEqual(const SmallVector<VPValue *, 4> &LHS,
2989 const SmallVector<VPValue *, 4> &RHS) {
2990 return LHS == RHS;
2991 }
2992 };
2993
2994 /// Mapping of values in the original VPlan to a combined VPInstruction.
2995 DenseMap<SmallVector<VPValue *, 4>, VPInstruction *, BundleDenseMapInfo>
2996 BundleToCombined;
2997
2998 VPInterleavedAccessInfo &IAI;
2999
3000 /// Basic block to operate on. For now, only instructions in a single BB are
3001 /// considered.
3002 const VPBasicBlock &BB;
3003
3004 /// Indicates whether we managed to combine all visited instructions or not.
3005 bool CompletelySLP = true;
3006
3007 /// Width of the widest combined bundle in bits.
3008 unsigned WidestBundleBits = 0;
3009
3010 using MultiNodeOpTy =
3011 typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>;
3012
3013 // Input operand bundles for the current multi node. Each multi node operand
3014 // bundle contains values not matching the multi node's opcode. They will
3015 // be reordered in reorderMultiNodeOps once building of the multi node
3016 // has completed.
3017 SmallVector<MultiNodeOpTy, 4> MultiNodeOps;
3018
3019 /// Indicates whether we are currently building a multi node.
3020 bool MultiNodeActive = false;
3021
3022 /// Check if we can vectorize Operands together.
3023 bool areVectorizable(ArrayRef<VPValue *> Operands) const;
3024
3025 /// Add combined instruction \p New for the bundle \p Operands.
3026 void addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New);
3027
3028 /// Indicate we hit a bundle we failed to combine. Returns nullptr for now.
3029 VPInstruction *markFailed();
3030
3031 /// Reorder operands in the multi node to maximize sequential memory access
3032 /// and commutative operations.
3033 SmallVector<MultiNodeOpTy, 4> reorderMultiNodeOps();
3034
3035 /// Choose the best candidate to use for the lane after \p Last. The set of
3036 /// candidates consists of values with an opcode matching \p Last's or of
3037 /// loads consecutive to \p Last.
3038 std::pair<OpMode, VPValue *> getBest(OpMode Mode, VPValue *Last,
3039 SmallPtrSetImpl<VPValue *> &Candidates,
3040 VPInterleavedAccessInfo &IAI);
3041
3042#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3043 /// Print bundle \p Values to dbgs().
3044 void dumpBundle(ArrayRef<VPValue *> Values);
3045#endif
3046
3047public:
3048 VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {}
3049
3050 ~VPlanSlp() = default;
3051
3052 /// Tries to build an SLP tree rooted at \p Operands and returns a
3053 /// VPInstruction combining \p Operands, if they can be combined.
3054 VPInstruction *buildGraph(ArrayRef<VPValue *> Operands);
3055
3056 /// Return the width of the widest combined bundle in bits.
3057 unsigned getWidestBundleBits() const { return WidestBundleBits; }
3058
3059 /// Return true if all visited instructions can be combined.
3060 bool isCompletelySLP() const { return CompletelySLP; }
3061};
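BundleDenseMapInfo above follows LLVM's DenseMapInfo contract: DenseMap
reserves two sentinel keys for its internal bookkeeping, so a custom key
type must provide them along with hashing and equality. A sketch of the
four required hooks (the struct and key type here are hypothetical; the
member names are the real DenseMapInfo requirements):

    struct MyKeyInfo {
      static MyKey getEmptyKey();     // sentinel for never-used buckets
      static MyKey getTombstoneKey(); // sentinel for erased buckets
      static unsigned getHashValue(const MyKey &K);
      static bool isEqual(const MyKey &A, const MyKey &B);
    };
    // DenseMap<MyKey, Value, MyKeyInfo> then calls these four hooks. The
    // sentinels must never collide with a real key, which is why
    // BundleDenseMapInfo uses one-element vectors holding the impossible
    // pointer values -1 and -2.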
3062
3063namespace vputils {
3064
3065/// Returns true if only the first lane of \p Def is used.
3066bool onlyFirstLaneUsed(VPValue *Def);
3067
3068/// Get or create a VPValue that corresponds to the expansion of \p Expr. If \p
3069/// Expr is a SCEVConstant or SCEVUnknown, return a VPValue wrapping the live-in
3070/// value. Otherwise return a VPExpandSCEVRecipe to expand \p Expr. If \p Plan's
3071/// pre-header already contains a recipe expanding \p Expr, return it. If not,
3072/// create a new one.
3073VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
3074 ScalarEvolution &SE);
3075
3076/// Returns true if \p VPV is uniform after vectorization.
3077inline bool isUniformAfterVectorization(VPValue *VPV) {
3078 // A value defined outside the vector region must be uniform after
3079 // vectorization inside a vector region.
3080 if (VPV->isDefinedOutsideVectorRegions())
3081 return true;
3082 VPRecipeBase *Def = VPV->getDefiningRecipe();
3083   assert(Def && "Must have definition for value defined inside vector region");
3084 if (auto Rep = dyn_cast<VPReplicateRecipe>(Def))
3085 return Rep->isUniform();
3086 return false;
3087}
3088} // end namespace vputils
3089
3090} // end namespace llvm
3091
3092#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H