/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Bug Summary

File:	llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Warning:	line 6767, column 35 Potential leak of memory pointed to by 'BlockMask'

Annotated Source Code

Press '?' to see keyboard shortcuts

Show analyzer invocation

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopVectorize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -mframe-pointer=none -fmath-errno -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -target-cpu x86-64 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-11/lib/clang/11.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/build-llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/build-llvm/include -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward 
-internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-11/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/build-llvm/lib/Transforms/Vectorize -fdebug-prefix-map=/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347=. -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2020-03-09-184146-41876-1 -x c++ /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

→

1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21//    of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23//    widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25//    of vectorization. It decides on the optimal vector width, which
26//    can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/Proposal/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45//  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46//  Data for SIMD
47//
48// Other ideas/concepts are from:
49//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52//  Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//

56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanHCFGBuilder.h"
61#include "VPlanPredicator.h"
62#include "VPlanTransforms.h"
63#include "llvm/ADT/APInt.h"
64#include "llvm/ADT/ArrayRef.h"
65#include "llvm/ADT/DenseMap.h"
66#include "llvm/ADT/DenseMapInfo.h"
67#include "llvm/ADT/Hashing.h"
68#include "llvm/ADT/MapVector.h"
69#include "llvm/ADT/None.h"
70#include "llvm/ADT/Optional.h"
71#include "llvm/ADT/STLExtras.h"
72#include "llvm/ADT/SetVector.h"
73#include "llvm/ADT/SmallPtrSet.h"
74#include "llvm/ADT/SmallVector.h"
75#include "llvm/ADT/Statistic.h"
76#include "llvm/ADT/StringRef.h"
77#include "llvm/ADT/Twine.h"
78#include "llvm/ADT/iterator_range.h"
79#include "llvm/Analysis/AssumptionCache.h"
80#include "llvm/Analysis/BasicAliasAnalysis.h"
81#include "llvm/Analysis/BlockFrequencyInfo.h"
82#include "llvm/Analysis/CFG.h"
83#include "llvm/Analysis/CodeMetrics.h"
84#include "llvm/Analysis/DemandedBits.h"
85#include "llvm/Analysis/GlobalsModRef.h"
86#include "llvm/Analysis/LoopAccessAnalysis.h"
87#include "llvm/Analysis/LoopAnalysisManager.h"
88#include "llvm/Analysis/LoopInfo.h"
89#include "llvm/Analysis/LoopIterator.h"
90#include "llvm/Analysis/MemorySSA.h"
91#include "llvm/Analysis/OptimizationRemarkEmitter.h"
92#include "llvm/Analysis/ProfileSummaryInfo.h"
93#include "llvm/Analysis/ScalarEvolution.h"
94#include "llvm/Analysis/ScalarEvolutionExpander.h"
95#include "llvm/Analysis/ScalarEvolutionExpressions.h"
96#include "llvm/Analysis/TargetLibraryInfo.h"
97#include "llvm/Analysis/TargetTransformInfo.h"
98#include "llvm/Analysis/VectorUtils.h"
99#include "llvm/IR/Attributes.h"
100#include "llvm/IR/BasicBlock.h"
101#include "llvm/IR/CFG.h"
102#include "llvm/IR/Constant.h"
103#include "llvm/IR/Constants.h"
104#include "llvm/IR/DataLayout.h"
105#include "llvm/IR/DebugInfoMetadata.h"
106#include "llvm/IR/DebugLoc.h"
107#include "llvm/IR/DerivedTypes.h"
108#include "llvm/IR/DiagnosticInfo.h"
109#include "llvm/IR/Dominators.h"
110#include "llvm/IR/Function.h"
111#include "llvm/IR/IRBuilder.h"
112#include "llvm/IR/InstrTypes.h"
113#include "llvm/IR/Instruction.h"
114#include "llvm/IR/Instructions.h"
115#include "llvm/IR/IntrinsicInst.h"
116#include "llvm/IR/Intrinsics.h"
117#include "llvm/IR/LLVMContext.h"
118#include "llvm/IR/Metadata.h"
119#include "llvm/IR/Module.h"
120#include "llvm/IR/Operator.h"
121#include "llvm/IR/Type.h"
122#include "llvm/IR/Use.h"
123#include "llvm/IR/User.h"
124#include "llvm/IR/Value.h"
125#include "llvm/IR/ValueHandle.h"
126#include "llvm/IR/Verifier.h"
127#include "llvm/InitializePasses.h"
128#include "llvm/Pass.h"
129#include "llvm/Support/Casting.h"
130#include "llvm/Support/CommandLine.h"
131#include "llvm/Support/Compiler.h"
132#include "llvm/Support/Debug.h"
133#include "llvm/Support/ErrorHandling.h"
134#include "llvm/Support/MathExtras.h"
135#include "llvm/Support/raw_ostream.h"
136#include "llvm/Transforms/Utils/BasicBlockUtils.h"
137#include "llvm/Transforms/Utils/InjectTLIMappings.h"
138#include "llvm/Transforms/Utils/LoopSimplify.h"
139#include "llvm/Transforms/Utils/LoopUtils.h"
140#include "llvm/Transforms/Utils/LoopVersioning.h"
141#include "llvm/Transforms/Utils/SizeOpts.h"
142#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143#include <algorithm>
144#include <cassert>
145#include <cstdint>
146#include <cstdlib>
147#include <functional>
148#include <iterator>
149#include <limits>
150#include <memory>
151#include <string>
152#include <tuple>
153#include <utility>

155using namespace llvm;

// Name of this pass; also used as the debug/statistics category below.
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

170STATISTIC(LoopsVectorized, "Number of loops vectorized")static llvm::Statistic LoopsVectorized = {"loop-vectorize", "LoopsVectorized"
, "Number of loops vectorized"};
171STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization")static llvm::Statistic LoopsAnalyzed = {"loop-vectorize", "LoopsAnalyzed"
, "Number of loops analyzed for vectorization"};

173/// Loops with a known constant trip count below this number are vectorized only
174/// if no scalar iteration overheads are incurred.
175static cl::opt<unsigned> TinyTripCountVectorThreshold(
  "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
  cl::desc("Loops with a constant trip count that is smaller than this "
           "value are vectorized only if no scalar iteration overheads "
           "are incurred."));

181// Indicates that an epilogue is undesired, predication is preferred.
182// This means that the vectorizer will try to fold the loop-tail (epilogue)
183// into the loop and predicate the loop body accordingly.
184static cl::opt<bool> PreferPredicateOverEpilog(
  "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
  cl::desc("Indicate that an epilogue is undesired, predication should be "
           "used instead."));

189static cl::opt<bool> MaximizeBandwidth(
  "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
  cl::desc("Maximize bandwidth when selecting vectorization factor which "
           "will be determined by the smallest type in loop."));

194static cl::opt<bool> EnableInterleavedMemAccesses(
  "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
  cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

198/// An interleave-group may need masking if it resides in a block that needs
199/// predication, or in order to mask away gaps. 
200static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
  "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
  cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

204static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
  "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
  cl::desc("We don't interleave loops with a estimated constant trip count "
           "below this number"));

209static cl::opt<unsigned> ForceTargetNumScalarRegs(
  "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
  cl::desc("A flag that overrides the target's number of scalar registers."));

213static cl::opt<unsigned> ForceTargetNumVectorRegs(
  "force-target-num-vector-regs", cl::init(0), cl::Hidden,
  cl::desc("A flag that overrides the target's number of vector registers."));

217static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
  "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
  cl::desc("A flag that overrides the target's max interleave factor for "
           "scalar loops."));

222static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
  "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
  cl::desc("A flag that overrides the target's max interleave factor for "
           "vectorized loops."));

227static cl::opt<unsigned> ForceTargetInstructionCost(
  "force-target-instruction-cost", cl::init(0), cl::Hidden,
  cl::desc("A flag that overrides the target's expected cost for "
           "an instruction to a single constant value. Mostly "
           "useful for getting consistent testing."));

233static cl::opt<unsigned> SmallLoopCost(
  "small-loop-cost", cl::init(20), cl::Hidden,
  cl::desc(
      "The cost of a loop that is considered 'small' by the interleaver."));

238static cl::opt<bool> LoopVectorizeWithBlockFrequency(
  "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
  cl::desc("Enable the use of the block frequency analysis to access PGO "
           "heuristics minimizing code growth in cold regions and being more "
           "aggressive in hot regions."));

244// Runtime interleave loops for load/store throughput.
245static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
  "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
  cl::desc(
      "Enable runtime interleaving until load/store ports are saturated"));

250/// The number of stores in a loop that are allowed to need predication.
251static cl::opt<unsigned> NumberOfStoresToPredicate(
  "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
  cl::desc("Max number of stores to be predicated behind an if."));

255static cl::opt<bool> EnableIndVarRegisterHeur(
  "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
  cl::desc("Count the induction variable only once when interleaving"));

259static cl::opt<bool> EnableCondStoresVectorization(
  "enable-cond-stores-vec", cl::init(true), cl::Hidden,
  cl::desc("Enable if predication of stores during vectorization."));

263static cl::opt<unsigned> MaxNestedScalarReductionIC(
  "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
  cl::desc("The maximum interleave count to use when interleaving a scalar "
           "reduction in a nested loop."));

268cl::opt<bool> EnableVPlanNativePath(
  "enable-vplan-native-path", cl::init(false), cl::Hidden,
  cl::desc("Enable VPlan-native vectorization path with "
           "support for outer loop vectorization."));

273// FIXME: Remove this switch once we have divergence analysis. Currently we
274// assume divergent non-backedge branches when this switch is true.
275cl::opt<bool> EnableVPlanPredication(
  "enable-vplan-predication", cl::init(false), cl::Hidden,
  cl::desc("Enable VPlan-native vectorization path predicator with "
           "support for outer loop vectorization."));

280// This flag enables the stress testing of the VPlan H-CFG construction in the
281// VPlan-native vectorization path. It must be used in conjuction with
282// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
283// verification of the H-CFGs built.
284static cl::opt<bool> VPlanBuildStressTest(
  "vplan-build-stress-test", cl::init(false), cl::Hidden,
  cl::desc(
      "Build VPlan for every supported loop nest in the function and bail "
      "out right after the build (stress test the VPlan H-CFG construction "
      "in the VPlan-native vectorization path)."));

291cl::opt<bool> llvm::EnableLoopInterleaving(
  "interleave-loops", cl::init(true), cl::Hidden,
  cl::desc("Enable loop interleaving in Loop vectorization passes"));
294cl::opt<bool> llvm::EnableLoopVectorization(
  "vectorize-loops", cl::init(true), cl::Hidden,
  cl::desc("Run the Loop vectorization passes"));

298/// A helper function that returns the type of loaded or stored value.
299static Type *getMemInstValueType(Value *I) {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&(((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
 "Expected Load or Store instruction") ? static_cast<void>
 (0) : __assert_fail ("(isa<LoadInst>(I) || isa<StoreInst>(I)) && \"Expected Load or Store instruction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 301, __PRETTY_FUNCTION__))
       "Expected Load or Store instruction")(((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
 "Expected Load or Store instruction") ? static_cast<void>
 (0) : __assert_fail ("(isa<LoadInst>(I) || isa<StoreInst>(I)) && \"Expected Load or Store instruction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 301, __PRETTY_FUNCTION__));
if (auto *LI = dyn_cast<LoadInst>(I))
  return LI->getType();
return cast<StoreInst>(I)->getValueOperand()->getType();
305}

307/// A helper function that returns true if the given type is irregular. The
308/// type is irregular if its allocated size doesn't equal the store size of an
309/// element of the corresponding vector type at the given vectorization factor.
310static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
// Determine if an array of VF elements of type Ty is "bitcast compatible"
// with a <VF x Ty> vector.
if (VF > 1) {
  auto *VectorTy = VectorType::get(Ty, VF);
  return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
}

// If the vectorization factor is one, we just check if an array of type Ty
// requires padding between elements.
return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
321}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
///       we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

331/// A helper function that adds a 'fast' flag to floating-point operations.
332static Value *addFastMathFlag(Value *V) {
if (isa<FPMathOperator>(V))
  cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
return V;
336}

338static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
if (isa<FPMathOperator>(V))
  cast<Instruction>(V)->setFastMathFlags(FMF);
return V;
342}

344/// A helper function that returns an integer or floating-point constant with
345/// value C.
346static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                         : ConstantFP::get(Ty, C);
349}

351/// Returns "best known" trip count for the specified loop \p L as defined by
352/// the following procedure:
353///   1) Returns exact trip count if it is known.
354///   2) Returns expected trip count according to profile data if any.
355///   3) Returns upper bound estimate if it is known.
356///   4) Returns None if all of the above failed.
357static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
// Check if exact trip count is known.
if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
  return ExpectedTC;

// Check if there is an expected trip count available from profile data.
if (LoopVectorizeWithBlockFrequency)
  if (auto EstimatedTC = getLoopEstimatedTripCount(L))
    return EstimatedTC;

// Check if upper bound estimate is known.
if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
  return ExpectedTC;

return None;
372}

374namespace llvm {

376/// InnerLoopVectorizer vectorizes loops which contain only one basic
377/// block to a specified vectorization factor (VF).
378/// This class performs the widening of scalars into vectors, or multiple
379/// scalars. This class also implements the following features:
380/// * It inserts an epilogue loop for handling loops that don't have iteration
381///   counts that are known to be a multiple of the vectorization factor.
382/// * It handles the code generation for reduction variables.
383/// * Scalarization (implementation using scalars) of un-vectorizable
384///   instructions.
385/// InnerLoopVectorizer does not perform any vectorization-legality
386/// checks, and relies on the caller to check for the different legality
387/// aspects. The InnerLoopVectorizer relies on the
388/// LoopVectorizationLegality class to provide information about the induction
389/// and reduction variables that were found to a given vectorization factor.
390class InnerLoopVectorizer {
391public:
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                    unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
    : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
      AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
      Builder(PSE.getSE()->getContext()),
      VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
virtual ~InnerLoopVectorizer() = default;

/// Create a new empty loop. Unlink the old loop and connect the new one.
/// Return the pre-header block of the new loop.
BasicBlock *createVectorizedLoopSkeleton();

/// Widen a single instruction within the innermost loop.
void widenInstruction(Instruction &I);

/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
void fixVectorizedLoop();

// Return true if any runtime check is added.
bool areSafetyChecksAdded() { return AddedSafetyChecks; }

/// A type for vectorized values in the new loop. Each value from the
/// original loop, when vectorized, is represented by UF vector values in the
/// new unrolled loop, where UF is the unroll factor.
using VectorParts = SmallVector<Value *, 2>;

/// Vectorize a single GetElementPtrInst based on information gathered and
/// decisions taken during planning.
void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
              bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);

/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
/// arbitrary length vectors.
void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for each lane between \p MinLane
/// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
/// inclusive..
void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                          bool IfPredicateInstr);

/// Widen an integer or floating-point induction variable \p IV. If \p Trunc
/// is provided, the integer induction variable will first be truncated to
/// the corresponding type.
void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

/// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
/// vector or scalar value on-demand if one is not yet available. When
/// vectorizing a loop, we visit the definition of an instruction before its
/// uses. When visiting the definition, we either vectorize or scalarize the
/// instruction, creating an entry for it in the corresponding map. (In some
/// cases, such as induction variables, we will create both vector and scalar
/// entries.) Then, as we encounter uses of the definition, we derive values
/// for each scalar or vector use unless such a value is already available.
/// For example, if we scalarize a definition and one of its uses is vector,
/// we build the required vector on-demand with an insertelement sequence
/// when visiting the use. Otherwise, if the use is scalar, we can use the
/// existing scalar definition.
///
/// Return a value in the new loop corresponding to \p V from the original
/// loop at unroll index \p Part. If the value has already been vectorized,
/// the corresponding vector entry in VectorLoopValueMap is returned. If,
/// however, the value has a scalar entry in VectorLoopValueMap, we construct
/// a new vector value on-demand by inserting the scalar values into a vector
/// with an insertelement sequence. If the value has been neither vectorized
/// nor scalarized, it must be loop invariant, so we simply broadcast the
/// value into a vector.
Value *getOrCreateVectorValue(Value *V, unsigned Part);

/// Return a value in the new loop corresponding to \p V from the original
/// loop at unroll and vector indices \p Instance. If the value has been
/// vectorized but not scalarized, the necessary extractelement instruction
/// will be generated.
Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

/// Construct the vector value of a scalarized value \p V one lane at a time.
void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

/// Try to vectorize the interleaved access group that \p Instr belongs to
/// with the base address given in \p Addr, optionally masking the vector
/// operations if \p BlockInMask is non-null. Use \p State to translate given
/// VPValues to IR values in the vectorized loop.
void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
                              VPValue *Addr, VPValue *BlockInMask = nullptr);

/// Vectorize Load and Store instructions with the base address given in \p
/// Addr, optionally masking the vector operations if \p BlockInMask is
/// non-null. Use \p State to translate given VPValues to IR values in the
/// vectorized loop.
void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                VPValue *Addr,
                                VPValue *BlockInMask = nullptr);

/// Set the debug location in the builder using the debug location in
/// the instruction.
void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

/// Fix the non-induction PHIs in the OrigPHIsToFix vector.
void fixNonInductionPHIs(void);

499protected:
friend class LoopVectorizationPlanner;

/// A small list of PHINodes.
using PhiVector = SmallVector<PHINode *, 4>;

/// A type for scalarized values in the new loop. Each value from the
/// original loop, when scalarized, is represented by UF x VF scalar values
/// in the new unrolled loop, where UF is the unroll factor and VF is the
/// vectorization factor.
using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

/// Set up the values of the IVs correctly when exiting the vector loop.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                  Value *CountRoundDown, Value *EndValue,
                  BasicBlock *MiddleBlock);

/// Create a new induction variable inside L.
PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                 Value *Step, Instruction *DL);

/// Handle all cross-iteration phis in the header.
void fixCrossIterationPHIs();

/// Fix a first-order recurrence. This is the second phase of vectorizing
/// this phi node.
void fixFirstOrderRecurrence(PHINode *Phi);

/// Fix a reduction cross-iteration phi. This is the second phase of
/// vectorizing this phi node.
void fixReduction(PHINode *Phi);

/// Clear NSW/NUW flags from reduction instructions if necessary.
void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

/// The Loop exit block may have single value PHI nodes with some
/// incoming value. While vectorizing we only handled real values
/// that were defined inside the loop and we should have one value for
/// each predecessor of its parent basic block. See PR14725.
void fixLCSSAPHIs();

/// Iteratively sink the scalarized operands of a predicated instruction into
/// the block that was created for it.
void sinkScalarOperands(Instruction *PredInst);

/// Shrinks vector element sizes to the smallest bitwidth they can be legally
/// represented as.
void truncateToMinimalBitwidths();

/// Create a broadcast instruction. This method generates a broadcast
/// instruction (shuffle) for loop invariant values and for the induction
/// value. If this is the induction variable then we extend it to N, N+1, ...
/// this is needed because each iteration in the loop corresponds to a SIMD
/// element.
virtual Value *getBroadcastInstrs(Value *V);

/// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
/// to each vector element of Val. The sequence starts at StartIndex.
/// \p Opcode is relevant for FP induction variable.
virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                             Instruction::BinaryOps Opcode =
                             Instruction::BinaryOpsEnd);

/// Compute scalar induction steps. \p ScalarIV is the scalar induction
/// variable on which to base the steps, \p Step is the size of the step, and
/// \p EntryVal is the value from the original loop that maps to the steps.
/// Note that \p EntryVal doesn't have to be an induction variable - it
/// can also be a truncate instruction.
void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                      const InductionDescriptor &ID);

/// Create a vector induction phi node based on an existing scalar one. \p
/// EntryVal is the value from the original loop that maps to the vector phi
/// node, and \p Step is the loop-invariant step. If \p EntryVal is a
/// truncate instruction, instead of widening the original IV, we widen a
/// version of the IV truncated to \p EntryVal's type.
void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                     Value *Step, Instruction *EntryVal);

/// Returns true if an instruction \p I should be scalarized instead of
/// vectorized for the chosen vectorization factor.
bool shouldScalarizeInstruction(Instruction *I) const;

/// Returns true if we should generate a scalar version of \p IV.
bool needsScalarInduction(Instruction *IV) const;

/// If there is a cast involved in the induction variable \p ID, which should
/// be ignored in the vectorized loop body, this function records the
/// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
/// cast. We had already proved that the casted Phi is equal to the uncasted
/// Phi in the vectorized loop (under a runtime guard), and therefore
/// there is no need to vectorize the cast - the same value can be used in the
/// vector loop for both the Phi and the cast.
/// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
/// Otherwise, \p VectorLoopValue is a widened/vectorized value and \p Lane
/// keeps its default of UINT_MAX.
///
/// \p EntryVal is the value from the original loop that maps to the vector
/// phi node and is used to distinguish what is the IV currently being
/// processed - original one (if \p EntryVal is a phi corresponding to the
/// original IV) or the "newly-created" one based on the proof mentioned above
/// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
/// latter case \p EntryVal is a TruncInst and we must not record anything for
/// that IV, but it's error-prone to expect callers of this routine to care
/// about that, hence this explicit parameter.
void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                           const Instruction *EntryVal,
                                           Value *VectorLoopValue,
                                           unsigned Part,
                                           unsigned Lane = UINT_MAX);

/// Generate a shuffle sequence that will reverse the vector Vec.
virtual Value *reverseVector(Value *Vec);

/// Returns (and creates if needed) the original loop trip count.
Value *getOrCreateTripCount(Loop *NewLoop);

/// Returns (and creates if needed) the trip count of the widened loop.
Value *getOrCreateVectorTripCount(Loop *NewLoop);

/// Returns a bitcasted value to the requested vector type.
/// Also handles bitcasts of vector<float> <-> vector<pointer> types.
Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                              const DataLayout &DL);

/// Emit a bypass check to see if the vector trip count is zero, including if
/// it overflows.
void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

/// Emit a bypass check to see if all of the SCEV assumptions we've
/// had to make are correct.
void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

/// Emit bypass checks to check any memory assumptions we may have made.
void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

/// Compute the transformed value of Index at offset StartValue using step
/// StepValue.
/// For integer induction, returns StartValue + Index * StepValue.
/// For pointer induction, returns StartValue[Index * StepValue].
/// FIXME: The newly created binary instructions should contain nsw/nuw
/// flags, which can be found from the original scalar operations.
Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                            const DataLayout &DL,
                            const InductionDescriptor &ID) const;

/// Add additional metadata to \p To that was not present on \p Orig.
///
/// Currently this is used to add the noalias annotations based on the
/// inserted memchecks.  Use this for instructions that are *cloned* into the
/// vector loop.
void addNewMetadata(Instruction *To, const Instruction *Orig);

/// Add metadata from one instruction to another.
///
/// This includes both the original MDs from \p From and additional ones (\see
/// addNewMetadata).  Use this for *newly created* instructions in the vector
/// loop.
void addMetadata(Instruction *To, Instruction *From);

/// Similar to the previous function but it adds the metadata to a
/// vector of instructions.
void addMetadata(ArrayRef<Value *> To, Instruction *From);

/// The original loop.
Loop *OrigLoop;

/// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
/// dynamic knowledge to simplify SCEV expressions and converts them to a
/// more usable form.
PredicatedScalarEvolution &PSE;

/// Loop Info.
LoopInfo *LI;

/// Dominator Tree.
DominatorTree *DT;

/// Alias Analysis.
AliasAnalysis *AA;

/// Target Library Info.
const TargetLibraryInfo *TLI;

/// Target Transform Info.
const TargetTransformInfo *TTI;

/// Assumption Cache.
AssumptionCache *AC;

/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;

/// LoopVersioning.  It's only set up (non-null) if memchecks were
/// used.
///
/// This is currently only used to add no-alias metadata based on the
/// memchecks.  The actual versioning is performed manually.
std::unique_ptr<LoopVersioning> LVer;

/// The vectorization SIMD factor to use. Each vector will have this many
/// vector elements.
unsigned VF;

/// The vectorization unroll factor to use. Each scalar is vectorized to this
/// many different vector instructions.
unsigned UF;

/// The builder that we use
IRBuilder<> Builder;

// --- Vectorization state ---

/// The vector-loop preheader.
BasicBlock *LoopVectorPreHeader;

/// The scalar-loop preheader.
BasicBlock *LoopScalarPreHeader;

/// Middle Block between the vector and the scalar.
BasicBlock *LoopMiddleBlock;

/// The ExitBlock of the scalar loop.
BasicBlock *LoopExitBlock;

/// The vector loop body.
BasicBlock *LoopVectorBody;

/// The scalar loop body.
BasicBlock *LoopScalarBody;

/// A list of all bypass blocks. The first block is the entry of the loop.
SmallVector<BasicBlock *, 4> LoopBypassBlocks;

/// The new Induction variable which was added to the new block.
PHINode *Induction = nullptr;

/// The induction variable of the old basic block.
PHINode *OldInduction = nullptr;

/// Maps values from the original loop to their corresponding values in the
/// vectorized loop. A key value can map to either vector values, scalar
/// values or both kinds of values, depending on whether the key was
/// vectorized and scalarized.
VectorizerValueMap VectorLoopValueMap;

/// Store instructions that were predicated.
SmallVector<Instruction *, 4> PredicatedInstructions;

/// Trip count of the original loop.
Value *TripCount = nullptr;

/// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
Value *VectorTripCount = nullptr;

/// The legality analysis.
LoopVectorizationLegality *Legal;

/// The profitability analysis.
LoopVectorizationCostModel *Cost;

// Record whether runtime checks are added.
bool AddedSafetyChecks = false;

// Holds the end values for each induction variable. We save the end values
// so we can later fix-up the external users of the induction variables.
DenseMap<PHINode *, Value *> IVEndValues;

// Vector of original scalar PHIs whose corresponding widened PHIs need to be
// fixed up at the end of vector code generation.
SmallVector<PHINode *, 8> OrigPHIsToFix;
769};

771class InnerLoopUnroller : public InnerLoopVectorizer {
772public:
InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                  LoopInfo *LI, DominatorTree *DT,
                  const TargetLibraryInfo *TLI,
                  const TargetTransformInfo *TTI, AssumptionCache *AC,
                  OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                  LoopVectorizationLegality *LVL,
                  LoopVectorizationCostModel *CM)
    : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                          UnrollFactor, LVL, CM) {}

783private:
Value *getBroadcastInstrs(Value *V) override;
Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                     Instruction::BinaryOps Opcode =
                     Instruction::BinaryOpsEnd) override;
Value *reverseVector(Value *Vec) override;
789};

791} // end namespace llvm

793/// Look for a meaningful debug location on the instruction or it's
794/// operands.
795static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
if (!I)
  return I;

DebugLoc Empty;
if (I->getDebugLoc() != Empty)
  return I;

for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
  if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
    if (OpInst->getDebugLoc() != Empty)
      return OpInst;
}

return I;
810}

812void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
  const DILocation *DIL = Inst->getDebugLoc();
  if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
      !isa<DbgInfoIntrinsic>(Inst)) {
    auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
    if (NewDIL)
      B.SetCurrentDebugLocation(NewDIL.getValue());
    else
      LLVM_DEBUG(dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Failed to create new discriminator: "
 << DIL->getFilename() << " Line: " << DIL
->getLine(); } } while (false)
                 << "Failed to create new discriminator: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Failed to create new discriminator: "
 << DIL->getFilename() << " Line: " << DIL
->getLine(); } } while (false)
                 << DIL->getFilename() << " Line: " << DIL->getLine())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Failed to create new discriminator: "
 << DIL->getFilename() << " Line: " << DIL
->getLine(); } } while (false);
  }
  else
    B.SetCurrentDebugLocation(DIL);
} else
  B.SetCurrentDebugLocation(DebugLoc());
829}

831/// Write a record \p DebugMsg about vectorization failure to the debug
832/// output stream. If \p I is passed, it is an instruction that prevents
833/// vectorization.
834#ifndef NDEBUG
835static void debugVectorizationFailure(const StringRef DebugMsg,
  Instruction *I) {
dbgs() << "LV: Not vectorizing: " << DebugMsg;
if (I != nullptr)
  dbgs() << " " << *I;
else
  dbgs() << '.';
dbgs() << '\n';
843}
844#endif

846/// Create an analysis remark that explains why vectorization failed
847///
848/// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
849/// RemarkName is the identifier for the remark.  If \p I is passed it is an
850/// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
851/// the location of the remark.  \return the remark object that can be
852/// streamed to.
853static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
  StringRef RemarkName, Loop *TheLoop, Instruction *I) {
Value *CodeRegion = TheLoop->getHeader();
DebugLoc DL = TheLoop->getStartLoc();

if (I) {
  CodeRegion = I->getParent();
  // If there is no debug location attached to the instruction, revert back to
  // using the loop's.
  if (I->getDebugLoc())
    DL = I->getDebugLoc();
}

OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
R << "loop not vectorized: ";
return R;
869}

871namespace llvm {

873void reportVectorizationFailure(const StringRef DebugMsg,
  const StringRef OREMsg, const StringRef ORETag,
  OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { debugVectorizationFailure(DebugMsg, I);
 } } while (false);
LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
              ORETag, TheLoop, I) << OREMsg);
880}

882} // end namespace llvm

884#ifndef NDEBUG
885/// \return string containing a file name and a line # for the given loop.
886static std::string getDebugLocString(const Loop *L) {
std::string Result;
if (L) {
  raw_string_ostream OS(Result);
  if (const DebugLoc LoopDbgLoc = L->getStartLoc())
    LoopDbgLoc.print(OS);
  else
    // Just print the module name.
    OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
  OS.flush();
}
return Result;
898}
899#endif

901void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                       const Instruction *Orig) {
// If the loop was versioned with memchecks, add the corresponding no-alias
// metadata.
if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
  LVer->annotateInstWithNoAlias(To, Orig);
907}

909void InnerLoopVectorizer::addMetadata(Instruction *To,
                                    Instruction *From) {
propagateMetadata(To, From);
addNewMetadata(To, From);
913}

915void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                    Instruction *From) {
for (Value *V : To) {
  if (Instruction *I = dyn_cast<Instruction>(V))
    addMetadata(I, From);
}
921}

923namespace llvm {

// Loop vectorization cost-model hints describing how the scalar epilogue loop
// should be lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

945/// LoopVectorizationCostModel - estimates the expected speedups due to
946/// vectorization.
947/// In many cases vectorization is not profitable. This can happen because of
948/// a number of reasons. In this class we mainly attempt to predict the
949/// expected speedup/slowdowns due to the supported instruction set. We use the
950/// TargetTransformInfo to query the different backends for the cost of
951/// different operations.
952class LoopVectorizationCostModel {
953public:
/// Construct a cost model. Every argument is stored in the member of the
/// matching name (\p SEL in ScalarEpilogueStatus, \p L in TheLoop, \p F in
/// TheFunction, \p IAI in InterleaveInfo); the constructor body does no
/// further work.
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                           PredicatedScalarEvolution &PSE, LoopInfo *LI,
                           LoopVectorizationLegality *Legal,
                           const TargetTransformInfo &TTI,
                           const TargetLibraryInfo *TLI, DemandedBits *DB,
                           AssumptionCache *AC,
                           OptimizationRemarkEmitter *ORE, const Function *F,
                           const LoopVectorizeHints *Hints,
                           InterleavedAccessInfo &IAI)
    : ScalarEpilogueStatus(SEL),
      TheLoop(L),
      PSE(PSE),
      LI(LI),
      Legal(Legal),
      TTI(TTI),
      TLI(TLI),
      DB(DB),
      AC(AC),
      ORE(ORE),
      TheFunction(F),
      Hints(Hints),
      InterleaveInfo(IAI) {
}

/// \return An upper bound for the vectorization factor, or None if
/// vectorization and interleaving should be avoided up front.
Optional<unsigned> computeMaxVF();

/// \return True if runtime checks are required for vectorization, and false
/// otherwise.
bool runtimeChecksRequired();

/// \return The most profitable vectorization factor and the cost of that VF.
/// This method checks every power of two up to MaxVF. If UserVF is not ZERO
/// then this vectorization factor will be selected if vectorization is
/// possible.
VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

/// Set up the cost-based decisions for a user-provided vectorization factor
/// \p UserVF: collect the uniform and scalar values first, then the
/// instructions to scalarize.
void selectUserVectorizationFactor(unsigned UserVF) {
  // Uniforms/scalars are collected before the scalarization candidates.
  collectUniformsAndScalars(UserVF);
  collectInstsToScalarize(UserVF);
}

/// \return The size (in bits) of the smallest and widest types in the code
/// that needs to be vectorized. We ignore values that remain scalar such as
/// 64 bit loop indices.
std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

/// \return The desired interleave count.
/// If interleave count has been specified by metadata it will be returned.
/// Otherwise, the interleave count is computed and returned. VF and LoopCost
/// are the selected vectorization factor and the cost of the selected VF.
unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

/// Memory access instruction may be vectorized in more than one way.
/// Form of instruction after vectorization depends on cost.
/// This function takes cost-based decisions for Load/Store instructions
/// and collects them in a map. This decisions map is used for building
/// the lists of loop-uniform and loop-scalar instructions.
/// The calculated cost is saved with widening decision in order to
/// avoid redundant calculations.
void setCostBasedWideningDecision(unsigned VF);

/// Properties of the register usage of a loop. Both maps are keyed by the
/// ClassID of a target-provided register class.
struct RegisterUsage {
  /// Number of loop invariant values that are used in the loop, per register
  /// class.
  SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;

  /// Maximum number of concurrent live intervals in the loop, per register
  /// class.
  SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
};

/// \return Returns information about the register usages of the loop for the
/// given vectorization factors.
SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

/// Collect values we want to ignore in the cost model.
void collectValuesToIgnore();

/// \returns A read-only reference to the map holding, per instruction, the
/// smallest bitwidth it can be represented with. The vector equivalents of
/// these instructions should be truncated to this type.
const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
  const MapVector<Instruction *, uint64_t> &Widths = MinBWs;
  return Widths;
}

/// \returns True if it is more profitable to scalarize instruction \p I for
/// vectorization factor \p VF.
///
/// \p VF must be greater than 1, and (outside the VPlan-native path) the
/// scalarization analysis for \p VF must already have been recorded in
/// InstsToScalarize; both are checked with assertions.
bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
  assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

  // Cost model is not run in the VPlan-native path - return conservative
  // result until this changes.
  if (EnableVPlanNativePath)
    return false;

  auto InstsPerVF = InstsToScalarize.find(VF);
  assert(InstsPerVF != InstsToScalarize.end() &&
         "VF not yet analyzed for scalarization profitability");
  return InstsPerVF->second.find(I) != InstsPerVF->second.end();
}

/// Returns true if \p I is known to be uniform after vectorization.
///
/// Always true for \p VF == 1 and always false in the VPlan-native path.
/// Otherwise the uniform set for \p VF must already have been collected,
/// which is checked with an assertion.
bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
  if (VF == 1)
    return true;

  // Cost model is not run in the VPlan-native path - return conservative
  // result until this changes.
  if (EnableVPlanNativePath)
    return false;

  auto UniformsPerVF = Uniforms.find(VF);
  assert(UniformsPerVF != Uniforms.end() &&
         "VF not yet analyzed for uniformity");
  return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
}

/// Returns true if \p I is known to be scalar after vectorization.
///
/// Always true for \p VF == 1 and always false in the VPlan-native path.
/// Otherwise the scalar set for \p VF must already have been collected,
/// which is checked with an assertion.
bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
  if (VF == 1)
    return true;

  // Cost model is not run in the VPlan-native path - return conservative
  // result until this changes.
  if (EnableVPlanNativePath)
    return false;

  auto ScalarsPerVF = Scalars.find(VF);
  assert(ScalarsPerVF != Scalars.end() &&
         "Scalar values are not calculated for VF");
  return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
}

/// \returns True if instruction \p I can be truncated to a smaller bitwidth
/// for vectorization factor \p VF, i.e. \p VF is greater than 1, a minimal
/// bitwidth is recorded for \p I, and \p I is neither profitable to scalarize
/// nor scalar after vectorization.
bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
  // The checks run in this order because the scalarization queries expect
  // VF > 1.
  if (VF <= 1)
    return false;
  if (MinBWs.find(I) == MinBWs.end())
    return false;
  if (isProfitableToScalarize(I, VF))
    return false;
  return !isScalarAfterVectorization(I, VF);
}

/// Widening decision taken during cost calculation for a memory instruction.
enum InstWidening {
  CM_Unknown,
  /// For consecutive accesses with stride +1.
  CM_Widen,
  /// For consecutive accesses with stride -1.
  CM_Widen_Reverse,
  CM_Interleave,
  CM_GatherScatter,
  CM_Scalarize
};

/// Save vectorization decision \p W and \p Cost taken by the cost model for
/// instruction \p I and vector width \p VF. \p VF must be at least 2. Any
/// decision previously stored for the same (\p I, \p VF) pair is overwritten.
void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                         unsigned Cost) {
  assert(VF >= 2 && "Expected VF >=2");
  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
}

/// Save vectorization decision \p W and \p Cost taken by the cost model for
/// interleaving group \p Grp and vector width \p VF. \p VF must be at least 2.
void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                         InstWidening W, unsigned Cost) {
  assert(VF >= 2 && "Expected VF >=2");
  // Broadcast this decision to all instructions inside the group.
  // But the cost will be assigned to one instruction only: the member at the
  // group's insert position. Every other member gets a cost of 0.
  for (unsigned i = 0; i < Grp->getFactor(); ++i) {
    auto *I = Grp->getMember(i);
    if (!I)
      continue; // No member at this index.
    const unsigned MemberCost = Grp->getInsertPos() == I ? Cost : 0;
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, MemberCost);
  }
}

/// Return the cost model decision for the given instruction \p I and vector
/// width \p VF. Return CM_Unknown if this instruction did not pass
/// through the cost modeling. In the VPlan-native path CM_GatherScatter is
/// always returned. \p VF must be at least 2.
InstWidening getWideningDecision(Instruction *I, unsigned VF) {
  assert(VF >= 2 && "Expected VF >=2");

  // Cost model is not run in the VPlan-native path - return conservative
  // result until this changes.
  if (EnableVPlanNativePath)
    return CM_GatherScatter;

  auto Itr = WideningDecisions.find(std::make_pair(I, VF));
  return Itr == WideningDecisions.end() ? CM_Unknown : Itr->second.first;
}

/// Return the vectorization cost for the given instruction \p I and vector
/// width \p VF. \p VF must be at least 2 and a decision for (\p I, \p VF)
/// must already have been recorded; both are checked with assertions.
unsigned getWideningCost(Instruction *I, unsigned VF) {
  assert(VF >= 2 && "Expected VF >=2");
  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
  assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
         "The cost is not calculated");
  return WideningDecisions[InstOnVF].second;
}

/// Return True if instruction \p I is an optimizable truncate whose operand
/// is an induction variable. Such a truncate will be removed by adding a new
/// induction variable with the destination type.
bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
  // Anything that is not a truncate is rejected right away.
  auto *Trunc = dyn_cast<TruncInst>(I);
  if (!Trunc)
    return false;

  // Source and destination types of the truncate, converted with ToVectorTy
  // for VF.
  Type *SrcTy = ToVectorTy(Trunc->getSrcTy(), VF);
  Type *DestTy = ToVectorTy(Trunc->getDestTy(), VF);

  // A free truncate is not worth replacing: doing so would add an induction
  // variable update instruction to each iteration of the loop. The primary
  // induction variable is exempt from this check since it needs an update
  // instruction regardless.
  Value *Op = Trunc->getOperand(0);
  const bool IsPrimaryIV = Op == Legal->getPrimaryInduction();
  if (!IsPrimaryIV && TTI.isTruncateFree(SrcTy, DestTy))
    return false;

  // Finally, the truncated value has to be an induction variable.
  return Legal->isInductionPhi(Op);
}

/// Collects the instructions to scalarize for each predicated instruction in
/// the loop.
void collectInstsToScalarize(unsigned VF);

/// Collect Uniform and Scalar values for the given \p VF.
/// The sets depend on CM decision for Load/Store instructions
/// that may be vectorized as interleave, gather-scatter or scalarized.
/// Nothing is done for \p VF == 1 or when \p VF was already analyzed.
void collectUniformsAndScalars(unsigned VF) {
  if (VF == 1)
    return;
  // Do the analysis once: an entry in Uniforms means VF was handled before.
  if (Uniforms.find(VF) != Uniforms.end())
    return;

  setCostBasedWideningDecision(VF);
  collectLoopUniforms(VF);
  collectLoopScalars(VF);
}

/// Returns true if the target machine supports masked store operation
/// for the given \p DataType and kind of access to \p Ptr, i.e. \p Ptr is
/// reported as consecutive by Legal and the target accepts a masked store of
/// \p DataType with \p Alignment.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
  if (!Legal->isConsecutivePtr(Ptr))
    return false;
  return TTI.isLegalMaskedStore(DataType, Alignment);
}

/// Returns true if the target machine supports masked load operation
/// for the given \p DataType and kind of access to \p Ptr, i.e. \p Ptr is
/// reported as consecutive by Legal and the target accepts a masked load of
/// \p DataType with \p Alignment.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
  if (!Legal->isConsecutivePtr(Ptr))
    return false;
  return TTI.isLegalMaskedLoad(DataType, Alignment);
}

/// Returns true if the target machine supports masked scatter operation
/// for the given \p DataType and \p Alignment. The query is forwarded to TTI.
bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
  const bool Supported = TTI.isLegalMaskedScatter(DataType, Alignment);
  return Supported;
}

/// Returns true if the target machine supports masked gather operation
/// for the given \p DataType and \p Alignment. The query is forwarded to TTI.
bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
  const bool Supported = TTI.isLegalMaskedGather(DataType, Alignment);
  return Supported;
}

/// Returns true if the target machine can represent \p V as a masked gather
/// or scatter operation. Values that are neither loads nor stores are
/// rejected.
bool isLegalGatherOrScatter(Value *V) {
  const bool IsLoad = isa<LoadInst>(V);
  const bool IsStore = isa<StoreInst>(V);
  if (!IsLoad && !IsStore)
    return false;

  auto *Ty = getMemInstValueType(V);
  MaybeAlign Align = getLoadStoreAlignment(V);
  // A value is either a load or a store here, never both: loads are checked
  // against gather support and stores against scatter support.
  if (IsLoad)
    return isLegalMaskedGather(Ty, Align);
  return isLegalMaskedScatter(Ty, Align);
}

/// Returns true if \p I is an instruction that will be scalarized with
/// predication. Such instructions include conditional stores and
/// instructions that may divide by zero.
/// If a non-zero VF has been calculated, we check if \p I will be scalarized
/// with predication for that VF.
bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

// Returns true if \p I is an instruction that will be predicated either
// through scalar predication or masked load/store or masked gather/scatter.
// Superset of instructions that return true for isScalarWithPredication.
// Instructions in blocks that need no predication are never predicated.
bool isPredicatedInst(Instruction *I) {
  if (blockNeedsPredication(I->getParent())) {
    // Loads and stores that need some form of masked operation are
    // predicated instructions; everything else is decided by
    // isScalarWithPredication.
    const bool IsMemAccess = isa<LoadInst>(I) || isa<StoreInst>(I);
    return IsMemAccess ? Legal->isMaskRequired(I) : isScalarWithPredication(I);
  }
  return false;
}

/// Returns true if \p I is a memory instruction with consecutive memory
/// access that can be widened.
bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

/// Returns true if \p I is a memory instruction in an interleaved-group
/// of memory accesses that can be vectorized with wide vector loads/stores
/// and shuffles.
bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

/// Check if \p Instr belongs to any interleaved access group, as reported by
/// InterleaveInfo.
bool isAccessInterleaved(Instruction *Instr) {
  const bool InGroup = InterleaveInfo.isInterleaved(Instr);
  return InGroup;
}

/// Get the interleaved access group that \p Instr belongs to, as reported by
/// InterleaveInfo.
const InterleaveGroup<Instruction> *
getInterleavedAccessGroup(Instruction *Instr) {
  const InterleaveGroup<Instruction> *Group =
      InterleaveInfo.getInterleaveGroup(Instr);
  return Group;
}

/// Returns true if an interleaved group requires a scalar iteration
/// to handle accesses with gaps, and there is nothing preventing us from
/// creating a scalar epilogue.
bool requiresScalarEpilogue() const {
  // Without permission to create a scalar epilogue the answer is always no.
  if (!isScalarEpilogueAllowed())
    return false;
  return InterleaveInfo.requiresScalarEpilogue();
}

/// Returns true if a scalar epilogue is allowed, i.e. the epilogue lowering
/// status is CM_ScalarEpilogueAllowed. Returns false for every other status
/// (the other ScalarEpilogueLowering values cover optsize, low trip count and
/// loop-hint predication).
bool isScalarEpilogueAllowed() const {
  return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
}

/// Returns true if all loop blocks should be masked to fold tail loop.
/// This simply reports the FoldTailByMasking member.
bool foldTailByMasking() const {
  return FoldTailByMasking;
}

/// Returns true if \p BB needs predication: either the tail is folded by
/// masking, or Legal reports that the block needs predication.
bool blockNeedsPredication(BasicBlock *BB) {
  if (foldTailByMasking())
    return true;
  return Legal->blockNeedsPredication(BB);
}

/// Estimate cost of an intrinsic call instruction CI if it were vectorized
/// with factor VF.  Return the cost of the instruction, including
/// scalarization overhead if it's needed.
unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

/// Estimate cost of a call instruction CI if it were vectorized with factor
/// VF. Return the cost of the instruction, including scalarization overhead
/// if it's needed. The flag NeedToScalarize shows if the call needs to be
/// scalarized -
/// i.e. either vector version isn't available, or is too expensive.
unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

1304private:
unsigned NumPredStores = 0;

/// \return An upper bound for the vectorization factor, larger than zero.
/// One is returned if vectorization should best be avoided due to cost.
unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
/// operate on vector values after type legalization in the backend. If this
/// latter value is false, then all operations will be scalarized (i.e. no
/// vectorization has actually taken place).
using VectorizationCostTy = std::pair<unsigned, bool>;

/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
/// vector widths. The cost that is returned is *not* normalized by
/// the factor width.
VectorizationCostTy expectedCost(unsigned VF);

/// Returns the execution time cost of an instruction for a given vector
/// width. Vector width of one means scalar.
VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

/// The cost-computation logic from getInstructionCost which provides
/// the vector type as an output parameter.
unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

/// Calculate vectorization cost of memory instruction \p I.
unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

/// The cost computation for scalarized memory instruction.
unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

/// The cost computation for interleaving group of memory instructions.
unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);

/// The cost computation for Gather/Scatter instruction.
unsigned getGatherScatterCost(Instruction *I, unsigned VF);

/// The cost computation for widening instruction \p I with consecutive
/// memory access.
unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);

/// The cost calculation for Load/Store instruction \p I with uniform pointer -
/// Load: scalar load + broadcast.
/// Store: scalar store + (loop invariant value stored? 0 : extract of last
/// element)
unsigned getUniformMemOpCost(Instruction *I, unsigned VF);

/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
unsigned getScalarizationOverhead(Instruction *I, unsigned VF);

/// Returns whether the instruction is a load or store and will be emitted
/// as a vector operation.
bool isConsecutiveLoadOrStore(Instruction *I);

/// Returns true if an artificially high cost for emulated masked memrefs
/// should be used.
bool useEmulatedMaskMemRefHack(Instruction *I);

/// Map of scalar integer values to the smallest bitwidth they can be legally
/// represented as. The vector equivalents of these values should be truncated
/// to this type.
MapVector<Instruction *, uint64_t> MinBWs;

/// A type representing the costs for instructions if they were to be
/// scalarized rather than vectorized. The entries are Instruction-Cost
/// pairs.
using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

/// A set containing all BasicBlocks that are known to be present after
/// vectorization as predicated blocks.
SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

/// Records whether it is allowed to have the original scalar loop execute at
/// least once. This may be needed as a fallback loop in case runtime
/// aliasing/dependence checks fail, or to handle the tail/remainder
/// iterations when the trip count is unknown or doesn't divide by the VF,
/// or as a peel-loop to handle gaps in interleave-groups.
/// Under optsize and when the trip count is very small we don't allow any
/// iterations to execute in the scalar loop.
ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

/// All blocks of loop are to be masked to fold tail of scalar iterations.
bool FoldTailByMasking = false;

/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
/// instruction will be scalarized when vectorizing with the associated
/// vectorization factor. The entries are VF-ScalarCostTy pairs.
DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;

/// Holds the instructions known to be uniform after vectorization.
/// The data is collected per VF.
DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;

/// Holds the instructions known to be scalar after vectorization.
/// The data is collected per VF.
DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;

/// Holds the instructions (address computations) that are forced to be
/// scalarized.
DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;

/// Returns the expected difference in cost from scalarizing the expression
/// feeding a predicated instruction \p PredInst. The instructions to
/// scalarize and their scalar costs are collected in \p ScalarCosts. A
/// non-negative return value implies the expression will be scalarized.
/// Currently, only single-use chains are considered for scalarization.
int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                            unsigned VF);

/// Collect the instructions that are uniform after vectorization. An
/// instruction is uniform if we represent it with a single scalar value in
/// the vectorized loop corresponding to each vector iteration. Examples of
/// uniform instructions include pointer operands of consecutive or
/// interleaved memory accesses. Note that although uniformity implies an
/// instruction will be scalar, the reverse is not true. In general, a
/// scalarized instruction will be represented by VF scalar values in the
/// vectorized loop, each corresponding to an iteration of the original
/// scalar loop.
void collectLoopUniforms(unsigned VF);

/// Collect the instructions that are scalar after vectorization. An
/// instruction is scalar if it is known to be uniform or will be scalarized
/// during vectorization. Non-uniform scalarized instructions will be
/// represented by VF values in the vectorized loop, each corresponding to an
/// iteration of the original scalar loop.
void collectLoopScalars(unsigned VF);

/// Keeps cost model vectorization decision and cost for instructions.
/// Right now it is used for memory instructions only.
using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
                              std::pair<InstWidening, unsigned>>;

DecisionList WideningDecisions;

/// Returns true if \p V is expected to be vectorized at factor \p VF and
/// therefore has to be extracted when used by a scalarized instruction.
bool needsExtract(Value *V, unsigned VF) const {
  Instruction *Inst = dyn_cast<Instruction>(V);

  // Nothing to extract for a scalar VF or for a non-instruction value.
  if (VF == 1 || !Inst)
    return false;

  // Values defined outside the loop, or invariant in it, are not extracted.
  if (!TheLoop->contains(Inst) || TheLoop->isLoopInvariant(Inst))
    return false;

  // Assume we can vectorize V (and hence we need extraction) if the
  // scalars are not computed yet. This can happen, because it is called
  // via getScalarizationOverhead from setCostBasedWideningDecision, before
  // the scalars are collected. That should be a safe assumption in most
  // cases, because we check if the operands have vectorizable types
  // beforehand in LoopVectorizationLegality.
  if (Scalars.find(VF) == Scalars.end())
    return true;

  return !isScalarAfterVectorization(Inst, VF);
}

/// Returns a vector holding only those operands in \p Ops for which
/// needsExtract() is true at vectorization factor \p VF.
SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                 unsigned VF) {
  auto NeedsExtraction = [this, VF](Value *Operand) {
    return this->needsExtract(Operand, VF);
  };
  return SmallVector<Value *, 4>(make_filter_range(Ops, NeedsExtraction));
}

1469public:
/// The loop that we evaluate.
Loop *TheLoop;

/// Predicated scalar evolution analysis.
PredicatedScalarEvolution &PSE;

/// Loop Info analysis.
LoopInfo *LI;

/// Vectorization legality.
LoopVectorizationLegality *Legal;

/// Vector target information.
const TargetTransformInfo &TTI;

/// Target Library Info.
const TargetLibraryInfo *TLI;

/// Demanded bits analysis.
DemandedBits *DB;

/// Assumption cache.
AssumptionCache *AC;

/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;

const Function *TheFunction;

/// Loop Vectorize Hint.
const LoopVectorizeHints *Hints;

/// The interleave access information contains groups of interleaved accesses
/// with the same stride and close to each other.
InterleavedAccessInfo &InterleaveInfo;

/// Values to ignore in the cost model.
SmallPtrSet<const Value *, 16> ValuesToIgnore;

/// Values to ignore in the cost model when VF > 1.
SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1511};

1513} // end namespace llvm

1515// Return true if \p OuterLp is an outer loop annotated with hints for explicit
1516// vectorization. The loop needs to be annotated with #pragma omp simd
1517// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1518// vector length information is not provided, vectorization is not considered
1519// explicit. Interleave hints are not allowed either. These limitations will be
1520// relaxed in the future.
1521// Please, note that we are currently forced to abuse the pragma 'clang
1522// vectorize' semantics. This pragma provides *auto-vectorization hints*
1523// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1524// provides *explicit vectorization hints* (LV can bypass legal checks and
1525// assume that vectorization is legal). However, both hints are implemented
1526// using the same metadata (llvm.loop.vectorize, processed by
1527// LoopVectorizeHints). This will be fixed in the future when the native IR
1528// representation for pragma 'omp simd' is introduced.
1529static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                 OptimizationRemarkEmitter *ORE) {
assert(!OuterLp->empty() && "This is not an outer loop")((!OuterLp->empty() && "This is not an outer loop"
) ? static_cast<void> (0) : __assert_fail ("!OuterLp->empty() && \"This is not an outer loop\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1531, __PRETTY_FUNCTION__));
LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

// Only outer loops with an explicit vectorization hint are supported.
// Unannotated outer loops are ignored.
if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
  return false;

Function *Fn = OuterLp->getHeader()->getParent();
if (!Hints.allowVectorization(Fn, OuterLp,
                              true /*VectorizeOnlyWhenForced*/)) {
  LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"
; } } while (false);
  return false;
}

if (Hints.getInterleave() > 1) {
  // TODO: Interleave support is future work.
  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing: Interleave is not supported for "
 "outer loops.\n"; } } while (false)
                       "outer loops.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing: Interleave is not supported for "
 "outer loops.\n"; } } while (false);
  Hints.emitRemarkWithHints();
  return false;
}

return true;
1555}

1557static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                OptimizationRemarkEmitter *ORE,
                                SmallVectorImpl<Loop *> &V) {
// Collect inner loops and outer loops without irreducible control flow. For
// now, only collect outer loops that have explicit vectorization hints. If we
// are stress testing the VPlan H-CFG construction, we collect the outermost
// loop of every loop nest.
if (L.empty() || VPlanBuildStressTest ||
    (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
  LoopBlocksRPO RPOT(&L);
  RPOT.perform(LI);
  if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
    V.push_back(&L);
    // TODO: Collect inner loops inside marked outer loops in case
    // vectorization fails for the outer loop. Do not invoke
    // 'containsIrreducibleCFG' again for inner loops when the outer loop is
    // already known to be reducible. We can use an inherited attribute for
    // that.
    return;
  }
}
for (Loop *InnerL : L)
  collectSupportedLoops(*InnerL, LI, ORE, V);
1580}

1582namespace {

1584/// The LoopVectorize Pass.
1585struct LoopVectorize : public FunctionPass {
/// Pass identification, replacement for typeid
static char ID;

LoopVectorizePass Impl;

explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                       bool VectorizeOnlyWhenForced = false)
    : FunctionPass(ID) {
  Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
  Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
  initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
}

bool runOnFunction(Function &F) override {
  if (skipFunction(F))
    return false;

  auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
  auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
  auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
  auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
  auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

  return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                      GetLAA, *ORE, PSI);
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
  AU.addRequired<AssumptionCacheTracker>();
  AU.addRequired<BlockFrequencyInfoWrapperPass>();
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<LoopInfoWrapperPass>();
  AU.addRequired<ScalarEvolutionWrapperPass>();
  AU.addRequired<TargetTransformInfoWrapperPass>();
  AU.addRequired<AAResultsWrapperPass>();
  AU.addRequired<LoopAccessLegacyAnalysis>();
  AU.addRequired<DemandedBitsWrapperPass>();
  AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
  AU.addRequired<InjectTLIMappingsLegacy>();

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
  }

  AU.addPreserved<BasicAAWrapperPass>();
  AU.addPreserved<GlobalsAAWrapperPass>();
  AU.addRequired<ProfileSummaryInfoWrapperPass>();
}
1650};

1652} // end anonymous namespace

1654//===----------------------------------------------------------------------===//
1655// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1656// LoopVectorizationCostModel and LoopVectorizationPlanner.
1657//===----------------------------------------------------------------------===//

1659Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
// We need to place the broadcast of invariant variables outside the loop,
// but only if it's proven safe to do so. Else, broadcast will be inside
// vector loop body.
Instruction *Instr = dyn_cast<Instruction>(V);
bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
                   (!Instr ||
                    DT->dominates(Instr->getParent(), LoopVectorPreHeader));
// Place the code for broadcasting invariant variables in the new preheader.
IRBuilder<>::InsertPointGuard Guard(Builder);
if (SafeToHoist)
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

// Broadcast the scalar into all locations in the vector.
Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

return Shuf;
1676}

1678void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
  const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&(((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal
)) && "Expected either an induction phi-node or a truncate of it!"
) ? static_cast<void> (0) : __assert_fail ("(isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && \"Expected either an induction phi-node or a truncate of it!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1681, __PRETTY_FUNCTION__))
       "Expected either an induction phi-node or a truncate of it!")(((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal
)) && "Expected either an induction phi-node or a truncate of it!"
) ? static_cast<void> (0) : __assert_fail ("(isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && \"Expected either an induction phi-node or a truncate of it!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1681, __PRETTY_FUNCTION__));
Value *Start = II.getStartValue();

// Construct the initial value of the vector IV in the vector loop preheader
auto CurrIP = Builder.saveIP();
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
if (isa<TruncInst>(EntryVal)) {
  assert(Start->getType()->isIntegerTy() &&((Start->getType()->isIntegerTy() && "Truncation requires an integer type"
) ? static_cast<void> (0) : __assert_fail ("Start->getType()->isIntegerTy() && \"Truncation requires an integer type\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1689, __PRETTY_FUNCTION__))
         "Truncation requires an integer type")((Start->getType()->isIntegerTy() && "Truncation requires an integer type"
) ? static_cast<void> (0) : __assert_fail ("Start->getType()->isIntegerTy() && \"Truncation requires an integer type\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1689, __PRETTY_FUNCTION__));
  auto *TruncType = cast<IntegerType>(EntryVal->getType());
  Step = Builder.CreateTrunc(Step, TruncType);
  Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
}
Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
Value *SteppedStart =
    getStepVector(SplatStart, 0, Step, II.getInductionOpcode());

// We create vector phi nodes for both integer and floating-point induction
// variables. Here, we determine the kind of arithmetic we will perform.
Instruction::BinaryOps AddOp;
Instruction::BinaryOps MulOp;
if (Step->getType()->isIntegerTy()) {
  AddOp = Instruction::Add;
  MulOp = Instruction::Mul;
} else {
  AddOp = II.getInductionOpcode();
  MulOp = Instruction::FMul;
}

// Multiply the vectorization factor by the step using integer or
// floating-point arithmetic as appropriate.
Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));

// Create a vector splat to use in the induction update.
//
// FIXME: If the step is non-constant, we create the vector splat with
//        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
//        handle a constant vector splat.
Value *SplatVF = isa<Constant>(Mul)
                     ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                     : Builder.CreateVectorSplat(VF, Mul);
Builder.restoreIP(CurrIP);

// We may need to add the step a number of times, depending on the unroll
// factor. The last of those goes into the PHI.
PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                  &*LoopVectorBody->getFirstInsertionPt());
VecInd->setDebugLoc(EntryVal->getDebugLoc());
Instruction *LastInduction = VecInd;
for (unsigned Part = 0; Part < UF; ++Part) {
  VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);

  if (isa<TruncInst>(EntryVal))
    addMetadata(LastInduction, EntryVal);
  recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);

  LastInduction = cast<Instruction>(addFastMathFlag(
      Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
  LastInduction->setDebugLoc(EntryVal->getDebugLoc());
}

// Move the last step to the end of the latch block. This ensures consistent
// placement of all induction updates.
auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
auto *ICmp = cast<Instruction>(Br->getCondition());
LastInduction->moveBefore(ICmp);
LastInduction->setName("vec.ind.next");

VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
VecInd->addIncoming(LastInduction, LoopVectorLatch);
1753}

1755bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
return Cost->isScalarAfterVectorization(I, VF) ||
       Cost->isProfitableToScalarize(I, VF);
1758}

1760bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
if (shouldScalarizeInstruction(IV))
  return true;
auto isScalarInst = [&](User *U) -> bool {
  auto *I = cast<Instruction>(U);
  return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
};
return llvm::any_of(IV->users(), isScalarInst);
1768}

1770void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
  const InductionDescriptor &ID, const Instruction *EntryVal,
  Value *VectorLoopVal, unsigned Part, unsigned Lane) {
assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&(((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal
)) && "Expected either an induction phi-node or a truncate of it!"
) ? static_cast<void> (0) : __assert_fail ("(isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && \"Expected either an induction phi-node or a truncate of it!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1774, __PRETTY_FUNCTION__))
       "Expected either an induction phi-node or a truncate of it!")(((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal
)) && "Expected either an induction phi-node or a truncate of it!"
) ? static_cast<void> (0) : __assert_fail ("(isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && \"Expected either an induction phi-node or a truncate of it!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1774, __PRETTY_FUNCTION__));

// This induction variable is not the phi from the original loop but the
// newly-created IV based on the proof that casted Phi is equal to the
// uncasted Phi in the vectorized loop (under a runtime guard possibly). It
// re-uses the same InductionDescriptor that original IV uses but we don't
// have to do any recording in this case - that is done when original IV is
// processed.
if (isa<TruncInst>(EntryVal))
  return;

const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
if (Casts.empty())
  return;
// Only the first Cast instruction in the Casts vector is of interest.
// The rest of the Casts (if exist) have no uses outside the
// induction update chain itself.
Instruction *CastInst = *Casts.begin();
if (Lane < UINT_MAX(2147483647 *2U +1U))
  VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
else
  VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1796}

1798void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&(((IV->getType()->isIntegerTy() || IV != OldInduction) &&
 "Primary induction variable must have an integer type") ? static_cast
<void> (0) : __assert_fail ("(IV->getType()->isIntegerTy() || IV != OldInduction) && \"Primary induction variable must have an integer type\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1800, __PRETTY_FUNCTION__))
       "Primary induction variable must have an integer type")(((IV->getType()->isIntegerTy() || IV != OldInduction) &&
 "Primary induction variable must have an integer type") ? static_cast
<void> (0) : __assert_fail ("(IV->getType()->isIntegerTy() || IV != OldInduction) && \"Primary induction variable must have an integer type\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1800, __PRETTY_FUNCTION__));

auto II = Legal->getInductionVars().find(IV);
assert(II != Legal->getInductionVars().end() && "IV is not an induction")((II != Legal->getInductionVars().end() && "IV is not an induction"
) ? static_cast<void> (0) : __assert_fail ("II != Legal->getInductionVars().end() && \"IV is not an induction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1803, __PRETTY_FUNCTION__));

auto ID = II->second;
assert(IV->getType() == ID.getStartValue()->getType() && "Types must match")((IV->getType() == ID.getStartValue()->getType() &&
 "Types must match") ? static_cast<void> (0) : __assert_fail
 ("IV->getType() == ID.getStartValue()->getType() && \"Types must match\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1806, __PRETTY_FUNCTION__));

// The scalar value to broadcast. This will be derived from the canonical
// induction variable.
Value *ScalarIV = nullptr;

// The value from the original loop to which we are mapping the new induction
// variable.
Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

// True if we have vectorized the induction variable.
auto VectorizedIV = false;

// Determine if we want a scalar version of the induction variable. This is
// true if the induction variable itself is not widened, or if it has at
// least one user in the loop that is not widened.
auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);

// Generate code for the induction step. Note that induction steps are
// required to be loop-invariant
assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&((PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
 "Induction step should be loop invariant") ? static_cast<
void> (0) : __assert_fail ("PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && \"Induction step should be loop invariant\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1827, __PRETTY_FUNCTION__))
       "Induction step should be loop invariant")((PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
 "Induction step should be loop invariant") ? static_cast<
void> (0) : __assert_fail ("PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && \"Induction step should be loop invariant\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1827, __PRETTY_FUNCTION__));
auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
Value *Step = nullptr;
if (PSE.getSE()->isSCEVable(IV->getType())) {
  SCEVExpander Exp(*PSE.getSE(), DL, "induction");
  Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
                           LoopVectorPreHeader->getTerminator());
} else {
  Step = cast<SCEVUnknown>(ID.getStep())->getValue();
}

// Try to create a new independent vector induction variable. If we can't
// create the phi node, we will splat the scalar induction variable in each
// loop iteration.
if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
  createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
  VectorizedIV = true;
}

// If we haven't yet vectorized the induction variable, or if we will create
// a scalar one, we need to define the scalar induction variable and step
// values. If we were given a truncation type, truncate the canonical
// induction variable and step. Otherwise, derive these values from the
// induction descriptor.
if (!VectorizedIV || NeedsScalarIV) {
  ScalarIV = Induction;
  if (IV != OldInduction) {
    ScalarIV = IV->getType()->isIntegerTy()
                   ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                   : Builder.CreateCast(Instruction::SIToFP, Induction,
                                        IV->getType());
    ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
    ScalarIV->setName("offset.idx");
  }
  if (Trunc) {
    auto *TruncType = cast<IntegerType>(Trunc->getType());
    assert(Step->getType()->isIntegerTy() &&((Step->getType()->isIntegerTy() && "Truncation requires an integer step"
) ? static_cast<void> (0) : __assert_fail ("Step->getType()->isIntegerTy() && \"Truncation requires an integer step\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1864, __PRETTY_FUNCTION__))
           "Truncation requires an integer step")((Step->getType()->isIntegerTy() && "Truncation requires an integer step"
) ? static_cast<void> (0) : __assert_fail ("Step->getType()->isIntegerTy() && \"Truncation requires an integer step\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1864, __PRETTY_FUNCTION__));
    ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
    Step = Builder.CreateTrunc(Step, TruncType);
  }
}

// If we haven't yet vectorized the induction variable, splat the scalar
// induction variable, and build the necessary step vectors.
// TODO: Don't do it unless the vectorized IV is really required.
if (!VectorizedIV) {
  Value *Broadcasted = getBroadcastInstrs(ScalarIV);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *EntryPart =
        getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
    VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
    if (Trunc)
      addMetadata(EntryPart, Trunc);
    recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
  }
}

// If an induction variable is only used for counting loop iterations or
// calculating addresses, it doesn't need to be widened. Create scalar steps
// that can be used by instructions we will later scalarize. Note that the
// addition of the scalar steps will not increase the number of instructions
// in the loop in the common case prior to InstCombine. We will be trading
// one vector extract for each scalar step.
if (NeedsScalarIV)
  buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1893}

1895Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
                                        Instruction::BinaryOps BinOp) {
// Create and check the types.
assert(Val->getType()->isVectorTy() && "Must be a vector")((Val->getType()->isVectorTy() && "Must be a vector"
) ? static_cast<void> (0) : __assert_fail ("Val->getType()->isVectorTy() && \"Must be a vector\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1898, __PRETTY_FUNCTION__));
int VLen = Val->getType()->getVectorNumElements();

Type *STy = Val->getType()->getScalarType();
assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&(((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
 "Induction Step must be an integer or FP") ? static_cast<
void> (0) : __assert_fail ("(STy->isIntegerTy() || STy->isFloatingPointTy()) && \"Induction Step must be an integer or FP\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1903, __PRETTY_FUNCTION__))
       "Induction Step must be an integer or FP")(((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
 "Induction Step must be an integer or FP") ? static_cast<
void> (0) : __assert_fail ("(STy->isIntegerTy() || STy->isFloatingPointTy()) && \"Induction Step must be an integer or FP\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1903, __PRETTY_FUNCTION__));
assert(Step->getType() == STy && "Step has wrong type")((Step->getType() == STy && "Step has wrong type")
 ? static_cast<void> (0) : __assert_fail ("Step->getType() == STy && \"Step has wrong type\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1904, __PRETTY_FUNCTION__));

SmallVector<Constant *, 8> Indices;

if (STy->isIntegerTy()) {
  // Create a vector of consecutive numbers from zero to VF.
  for (int i = 0; i < VLen; ++i)
    Indices.push_back(ConstantInt::get(STy, StartIdx + i));

  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);
  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec")((Cv->getType() == Val->getType() && "Invalid consecutive vec"
) ? static_cast<void> (0) : __assert_fail ("Cv->getType() == Val->getType() && \"Invalid consecutive vec\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1915, __PRETTY_FUNCTION__));
  Step = Builder.CreateVectorSplat(VLen, Step);
  assert(Step->getType() == Val->getType() && "Invalid step vec")((Step->getType() == Val->getType() && "Invalid step vec"
) ? static_cast<void> (0) : __assert_fail ("Step->getType() == Val->getType() && \"Invalid step vec\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1917, __PRETTY_FUNCTION__));
  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
  // which can be found from the original scalar operations.
  Step = Builder.CreateMul(Cv, Step);
  return Builder.CreateAdd(Val, Step, "induction");
}

// Floating point induction.
assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&(((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
 "Binary Opcode should be specified for FP induction") ? static_cast
<void> (0) : __assert_fail ("(BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && \"Binary Opcode should be specified for FP induction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1926, __PRETTY_FUNCTION__))
       "Binary Opcode should be specified for FP induction")(((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
 "Binary Opcode should be specified for FP induction") ? static_cast
<void> (0) : __assert_fail ("(BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && \"Binary Opcode should be specified for FP induction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1926, __PRETTY_FUNCTION__));
// Create a vector of consecutive numbers from zero to VF.
for (int i = 0; i < VLen; ++i)
  Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));

// Add the consecutive indices to the vector value.
Constant *Cv = ConstantVector::get(Indices);

Step = Builder.CreateVectorSplat(VLen, Step);

// Floating point operations had to be 'fast' to enable the induction.
FastMathFlags Flags;
Flags.setFast();

Value *MulOp = Builder.CreateFMul(Cv, Step);
if (isa<Instruction>(MulOp))
  // Have to check, MulOp may be a constant
  cast<Instruction>(MulOp)->setFastMathFlags(Flags);

Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
if (isa<Instruction>(BOp))
  cast<Instruction>(BOp)->setFastMathFlags(Flags);
return BOp;
1949}

1951void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
                                         Instruction *EntryVal,
                                         const InductionDescriptor &ID) {
// We shouldn't have to build scalar steps if we aren't vectorizing.
assert(VF > 1 && "VF should be greater than one")((VF > 1 && "VF should be greater than one") ? static_cast
<void> (0) : __assert_fail ("VF > 1 && \"VF should be greater than one\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1955, __PRETTY_FUNCTION__));

// Get the value type and ensure it and the step have the same integer type.
Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
assert(ScalarIVTy == Step->getType() &&((ScalarIVTy == Step->getType() && "Val and Step should have the same type"
) ? static_cast<void> (0) : __assert_fail ("ScalarIVTy == Step->getType() && \"Val and Step should have the same type\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1960, __PRETTY_FUNCTION__))
       "Val and Step should have the same type")((ScalarIVTy == Step->getType() && "Val and Step should have the same type"
) ? static_cast<void> (0) : __assert_fail ("ScalarIVTy == Step->getType() && \"Val and Step should have the same type\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1960, __PRETTY_FUNCTION__));

// We build scalar steps for both integer and floating-point induction
// variables. Here, we determine the kind of arithmetic we will perform.
Instruction::BinaryOps AddOp;
Instruction::BinaryOps MulOp;
if (ScalarIVTy->isIntegerTy()) {
  AddOp = Instruction::Add;
  MulOp = Instruction::Mul;
} else {
  AddOp = ID.getInductionOpcode();
  MulOp = Instruction::FMul;
}

// Determine the number of scalars we need to generate for each unroll
// iteration. If EntryVal is uniform, we only need to generate the first
// lane. Otherwise, we generate all VF values.
unsigned Lanes =
    Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
                                                                       : VF;
// Compute the scalar steps and save the results in VectorLoopValueMap.
for (unsigned Part = 0; Part < UF; ++Part) {
  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
    auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
    auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
    auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
    VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
    recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
  }
}
1990}

1992Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
assert(V != Induction && "The new induction variable should not be used.")((V != Induction && "The new induction variable should not be used."
) ? static_cast<void> (0) : __assert_fail ("V != Induction && \"The new induction variable should not be used.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1993, __PRETTY_FUNCTION__));
assert(!V->getType()->isVectorTy() && "Can't widen a vector")((!V->getType()->isVectorTy() && "Can't widen a vector"
) ? static_cast<void> (0) : __assert_fail ("!V->getType()->isVectorTy() && \"Can't widen a vector\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1994, __PRETTY_FUNCTION__));
assert(!V->getType()->isVoidTy() && "Type does not produce a value")((!V->getType()->isVoidTy() && "Type does not produce a value"
) ? static_cast<void> (0) : __assert_fail ("!V->getType()->isVoidTy() && \"Type does not produce a value\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1995, __PRETTY_FUNCTION__));

// If we have a stride that is replaced by one, do it here. Defer this for
// the VPlan-native path until we start running Legal checks in that path.
if (!EnableVPlanNativePath && Legal->hasStride(V))
  V = ConstantInt::get(V->getType(), 1);

// If we have a vector mapped to this value, return it.
if (VectorLoopValueMap.hasVectorValue(V, Part))
  return VectorLoopValueMap.getVectorValue(V, Part);

// If the value has not been vectorized, check if it has been scalarized
// instead. If it has been scalarized, and we actually need the value in
// vector form, we will construct the vector values on demand.
if (VectorLoopValueMap.hasAnyScalarValue(V)) {
  Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});

  // If we've scalarized a value, that value should be an instruction.
  auto *I = cast<Instruction>(V);

  // If we aren't vectorizing, we can just copy the scalar map values over to
  // the vector map.
  if (VF == 1) {
    VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
    return ScalarValue;
  }

  // Get the last scalar instruction we generated for V and Part. If the value
  // is known to be uniform after vectorization, this corresponds to lane zero
  // of the Part unroll iteration. Otherwise, the last instruction is the one
  // we created for the last vector lane of the Part unroll iteration.
  unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
  auto *LastInst = cast<Instruction>(
      VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));

  // Set the insert point after the last scalarized instruction. This ensures
  // the insertelement sequence will directly follow the scalar definitions.
  auto OldIP = Builder.saveIP();
  auto NewIP = std::next(BasicBlock::iterator(LastInst));
  Builder.SetInsertPoint(&*NewIP);

  // However, if we are vectorizing, we need to construct the vector values.
  // If the value is known to be uniform after vectorization, we can just
  // broadcast the scalar value corresponding to lane zero for each unroll
  // iteration. Otherwise, we construct the vector values using insertelement
  // instructions. Since the resulting vectors are stored in
  // VectorLoopValueMap, we will only generate the insertelements once.
  Value *VectorValue = nullptr;
  if (Cost->isUniformAfterVectorization(I, VF)) {
    VectorValue = getBroadcastInstrs(ScalarValue);
    VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
  } else {
    // Initialize packing with insertelements to start from undef.
    Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
    VectorLoopValueMap.setVectorValue(V, Part, Undef);
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      packScalarIntoVectorValue(V, {Part, Lane});
    VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
  }
  Builder.restoreIP(OldIP);
  return VectorValue;
}

// If this scalar is unknown, assume that it is a constant or that it is
// loop invariant. Broadcast V and save the value for future uses.
Value *B = getBroadcastInstrs(V);
VectorLoopValueMap.setVectorValue(V, Part, B);
return B;
2063}

2065Value *
2066InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
                                          const VPIteration &Instance) {
// If the value is not an instruction contained in the loop, it should
// already be scalar.
if (OrigLoop->isLoopInvariant(V))
  return V;

assert(Instance.Lane > 0((Instance.Lane > 0 ? !Cost->isUniformAfterVectorization
(cast<Instruction>(V), VF) : true && "Uniform values only have lane zero"
) ? static_cast<void> (0) : __assert_fail ("Instance.Lane > 0 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) : true && \"Uniform values only have lane zero\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2075, __PRETTY_FUNCTION__))
           ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)((Instance.Lane > 0 ? !Cost->isUniformAfterVectorization
(cast<Instruction>(V), VF) : true && "Uniform values only have lane zero"
) ? static_cast<void> (0) : __assert_fail ("Instance.Lane > 0 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) : true && \"Uniform values only have lane zero\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2075, __PRETTY_FUNCTION__))
           : true && "Uniform values only have lane zero")((Instance.Lane > 0 ? !Cost->isUniformAfterVectorization
(cast<Instruction>(V), VF) : true && "Uniform values only have lane zero"
) ? static_cast<void> (0) : __assert_fail ("Instance.Lane > 0 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) : true && \"Uniform values only have lane zero\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2075, __PRETTY_FUNCTION__));

// If the value from the original loop has not been vectorized, it is
// represented by UF x VF scalar values in the new loop. Return the requested
// scalar value.
if (VectorLoopValueMap.hasScalarValue(V, Instance))
  return VectorLoopValueMap.getScalarValue(V, Instance);

// If the value has not been scalarized, get its entry in VectorLoopValueMap
// for the given unroll part. If this entry is not a vector type (i.e., the
// vectorization factor is one), there is no need to generate an
// extractelement instruction.
auto *U = getOrCreateVectorValue(V, Instance.Part);
if (!U->getType()->isVectorTy()) {
  assert(VF == 1 && "Value not scalarized has non-vector type")((VF == 1 && "Value not scalarized has non-vector type"
) ? static_cast<void> (0) : __assert_fail ("VF == 1 && \"Value not scalarized has non-vector type\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2089, __PRETTY_FUNCTION__));
  return U;
}

// Otherwise, the value from the original loop has been vectorized and is
// represented by UF vector values. Extract and return the requested scalar
// value from the appropriate vector lane.
return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2097}

2099void InnerLoopVectorizer::packScalarIntoVectorValue(
  Value *V, const VPIteration &Instance) {
assert(V != Induction && "The new induction variable should not be used.")((V != Induction && "The new induction variable should not be used."
) ? static_cast<void> (0) : __assert_fail ("V != Induction && \"The new induction variable should not be used.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2101, __PRETTY_FUNCTION__));
assert(!V->getType()->isVectorTy() && "Can't pack a vector")((!V->getType()->isVectorTy() && "Can't pack a vector"
) ? static_cast<void> (0) : __assert_fail ("!V->getType()->isVectorTy() && \"Can't pack a vector\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2102, __PRETTY_FUNCTION__));
assert(!V->getType()->isVoidTy() && "Type does not produce a value")((!V->getType()->isVoidTy() && "Type does not produce a value"
) ? static_cast<void> (0) : __assert_fail ("!V->getType()->isVoidTy() && \"Type does not produce a value\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2103, __PRETTY_FUNCTION__));

Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
                                          Builder.getInt32(Instance.Lane));
VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2110}

2112Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
assert(Vec->getType()->isVectorTy() && "Invalid type")((Vec->getType()->isVectorTy() && "Invalid type"
) ? static_cast<void> (0) : __assert_fail ("Vec->getType()->isVectorTy() && \"Invalid type\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2113, __PRETTY_FUNCTION__));
SmallVector<Constant *, 8> ShuffleMask;
for (unsigned i = 0; i < VF; ++i)
  ShuffleMask.push_back(Builder.getInt32(VF - i - 1));

return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
                                   ConstantVector::get(ShuffleMask),
                                   "reverse");
2121}

2123// Return whether we allow using masked interleave-groups (for dealing with
2124// strided loads/stores that reside in predicated blocks, or for dealing
2125// with gaps).
2126static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
// If an override option has been passed in for interleaved accesses, use it.
if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
  return EnableMaskedInterleavedMemAccesses;

return TTI.enableMaskedInterleavedAccessVectorization();
2132}

2134// Try to vectorize the interleave group that \p Instr belongs to.
2135//
2136// E.g. Translate following interleaved load group (factor = 3):
2137//   for (i = 0; i < N; i+=3) {
2138//     R = Pic[i];             // Member of index 0
2139//     G = Pic[i+1];           // Member of index 1
2140//     B = Pic[i+2];           // Member of index 2
2141//     ... // do something to R, G, B
2142//   }
2143// To:
2144//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2145//   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2146//   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2147//   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2148//
2149// Or translate following interleaved store group (factor = 3):
2150//   for (i = 0; i < N; i+=3) {
2151//     ... do something to R, G, B
2152//     Pic[i]   = R;           // Member of index 0
2153//     Pic[i+1] = G;           // Member of index 1
2154//     Pic[i+2] = B;           // Member of index 2
2155//   }
2156// To:
2157//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2158//   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2159//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2160//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2161//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2162void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
                                                 VPTransformState &State,
                                                 VPValue *Addr,
                                                 VPValue *BlockInMask) {
const InterleaveGroup<Instruction> *Group =
    Cost->getInterleavedAccessGroup(Instr);
assert(Group && "Fail to get an interleaved access group.")((Group && "Fail to get an interleaved access group."
) ? static_cast<void> (0) : __assert_fail ("Group && \"Fail to get an interleaved access group.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2168, __PRETTY_FUNCTION__));

// Skip if current instruction is not the insert position.
if (Instr != Group->getInsertPos())
  return;

const DataLayout &DL = Instr->getModule()->getDataLayout();

// Prepare for the vector type of the interleaved load/store.
Type *ScalarTy = getMemInstValueType(Instr);
unsigned InterleaveFactor = Group->getFactor();
Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);

// Prepare for the new pointers.
SmallVector<Value *, 2> AddrParts;
unsigned Index = Group->getIndex(Instr);

// TODO: extend the masked interleaved-group support to reversed access.
assert((!BlockInMask || !Group->isReverse()) &&(((!BlockInMask || !Group->isReverse()) && "Reversed masked interleave-group not supported."
) ? static_cast<void> (0) : __assert_fail ("(!BlockInMask || !Group->isReverse()) && \"Reversed masked interleave-group not supported.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2187, __PRETTY_FUNCTION__))
       "Reversed masked interleave-group not supported.")(((!BlockInMask || !Group->isReverse()) && "Reversed masked interleave-group not supported."
) ? static_cast<void> (0) : __assert_fail ("(!BlockInMask || !Group->isReverse()) && \"Reversed masked interleave-group not supported.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2187, __PRETTY_FUNCTION__));

// If the group is reverse, adjust the index to refer to the last vector lane
// instead of the first. We adjust the index from the first vector lane,
// rather than directly getting the pointer for lane VF - 1, because the
// pointer operand of the interleaved access is supposed to be uniform. For
// uniform instructions, we're only required to generate a value for the
// first vector lane in each unroll iteration.
if (Group->isReverse())
  Index += (VF - 1) * Group->getFactor();

for (unsigned Part = 0; Part < UF; Part++) {
  Value *AddrPart = State.get(Addr, {Part, 0});
  setDebugLocFromInst(Builder, AddrPart);

  // Notice current instruction could be any index. Need to adjust the address
  // to the member of index 0.
  //
  // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
  //       b = A[i];       // Member of index 0
  // Current pointer is pointed to A[i+1], adjust it to A[i].
  //
  // E.g.  A[i+1] = a;     // Member of index 1
  //       A[i]   = b;     // Member of index 0
  //       A[i+2] = c;     // Member of index 2 (Current instruction)
  // Current pointer is pointed to A[i+2], adjust it to A[i].

  bool InBounds = false;
  if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
    InBounds = gep->isInBounds();
  AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
  cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);

  // Cast to the vector pointer type.
  unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
  Type *PtrTy = VecTy->getPointerTo(AddressSpace);
  AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
}

setDebugLocFromInst(Builder, Instr);
Value *UndefVec = UndefValue::get(VecTy);

Value *MaskForGaps = nullptr;
if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
  MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
  assert(MaskForGaps && "Mask for Gaps is required but it is null")((MaskForGaps && "Mask for Gaps is required but it is null"
) ? static_cast<void> (0) : __assert_fail ("MaskForGaps && \"Mask for Gaps is required but it is null\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2232, __PRETTY_FUNCTION__));
}

// Vectorize the interleaved load group.
if (isa<LoadInst>(Instr)) {
  // For each unroll part, create a wide load for the group.
  SmallVector<Value *, 2> NewLoads;
  for (unsigned Part = 0; Part < UF; Part++) {
    Instruction *NewLoad;
    if (BlockInMask || MaskForGaps) {
      assert(useMaskedInterleavedAccesses(*TTI) &&((useMaskedInterleavedAccesses(*TTI) && "masked interleaved groups are not allowed."
) ? static_cast<void> (0) : __assert_fail ("useMaskedInterleavedAccesses(*TTI) && \"masked interleaved groups are not allowed.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2243, __PRETTY_FUNCTION__))
             "masked interleaved groups are not allowed.")((useMaskedInterleavedAccesses(*TTI) && "masked interleaved groups are not allowed."
) ? static_cast<void> (0) : __assert_fail ("useMaskedInterleavedAccesses(*TTI) && \"masked interleaved groups are not allowed.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2243, __PRETTY_FUNCTION__));
      Value *GroupMask = MaskForGaps;
      if (BlockInMask) {
        Value *BlockInMaskPart = State.get(BlockInMask, Part);
        auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
        auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
        Value *ShuffledMask = Builder.CreateShuffleVector(
            BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
        GroupMask = MaskForGaps
                        ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
                                              MaskForGaps)
                        : ShuffledMask;
      }
      NewLoad =
          Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
                                   GroupMask, UndefVec, "wide.masked.vec");
    }
    else
      NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
                                          Group->getAlign(), "wide.vec");
    Group->addMetadata(NewLoad);
    NewLoads.push_back(NewLoad);
  }

  // For each member in the group, shuffle out the appropriate data from the
  // wide loads.
  for (unsigned I = 0; I < InterleaveFactor; ++I) {
    Instruction *Member = Group->getMember(I);

    // Skip the gaps in the group.
    if (!Member)
      continue;

    Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
    for (unsigned Part = 0; Part < UF; Part++) {
      Value *StridedVec = Builder.CreateShuffleVector(
          NewLoads[Part], UndefVec, StrideMask, "strided.vec");

      // If this member has different type, cast the result type.
      if (Member->getType() != ScalarTy) {
        VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
        StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
      }

      if (Group->isReverse())
        StridedVec = reverseVector(StridedVec);

      VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
    }
  }
  return;
}

// The sub vector type for current instruction.
VectorType *SubVT = VectorType::get(ScalarTy, VF);

// Vectorize the interleaved store group.
for (unsigned Part = 0; Part < UF; Part++) {
  // Collect the stored vector from each member.
  SmallVector<Value *, 4> StoredVecs;
  for (unsigned i = 0; i < InterleaveFactor; i++) {
    // Interleaved store group doesn't allow a gap, so each index has a member
    Instruction *Member = Group->getMember(i);
    assert(Member && "Fail to get a member from an interleaved store group")((Member && "Fail to get a member from an interleaved store group"
) ? static_cast<void> (0) : __assert_fail ("Member && \"Fail to get a member from an interleaved store group\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2306, __PRETTY_FUNCTION__));

    Value *StoredVec = getOrCreateVectorValue(
        cast<StoreInst>(Member)->getValueOperand(), Part);
    if (Group->isReverse())
      StoredVec = reverseVector(StoredVec);

    // If this member has different type, cast it to a unified type.

    if (StoredVec->getType() != SubVT)
      StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

    StoredVecs.push_back(StoredVec);
  }

  // Concatenate all vectors into a wide vector.
  Value *WideVec = concatenateVectors(Builder, StoredVecs);

  // Interleave the elements in the wide vector.
  Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
  Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
                                            "interleaved.vec");

  Instruction *NewStoreInstr;
  if (BlockInMask) {
    Value *BlockInMaskPart = State.get(BlockInMask, Part);
    auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
    auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
    Value *ShuffledMask = Builder.CreateShuffleVector(
        BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
    NewStoreInstr = Builder.CreateMaskedStore(
        IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
  }
  else
    NewStoreInstr =
        Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());

  Group->addMetadata(NewStoreInstr);
}
2345}

2347void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
                                                   VPTransformState &State,
                                                   VPValue *Addr,
                                                   VPValue *BlockInMask) {
// Attempt to issue a wide load.
LoadInst *LI = dyn_cast<LoadInst>(Instr);
StoreInst *SI = dyn_cast<StoreInst>(Instr);

assert((LI || SI) && "Invalid Load/Store instruction")(((LI || SI) && "Invalid Load/Store instruction") ? static_cast
<void> (0) : __assert_fail ("(LI || SI) && \"Invalid Load/Store instruction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2355, __PRETTY_FUNCTION__));

LoopVectorizationCostModel::InstWidening Decision =
    Cost->getWideningDecision(Instr, VF);
assert(Decision != LoopVectorizationCostModel::CM_Unknown &&((Decision != LoopVectorizationCostModel::CM_Unknown &&
 "CM decision should be taken at this point") ? static_cast<
void> (0) : __assert_fail ("Decision != LoopVectorizationCostModel::CM_Unknown && \"CM decision should be taken at this point\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2360, __PRETTY_FUNCTION__))
       "CM decision should be taken at this point")((Decision != LoopVectorizationCostModel::CM_Unknown &&
 "CM decision should be taken at this point") ? static_cast<
void> (0) : __assert_fail ("Decision != LoopVectorizationCostModel::CM_Unknown && \"CM decision should be taken at this point\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2360, __PRETTY_FUNCTION__));
if (Decision == LoopVectorizationCostModel::CM_Interleave)
  return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask);

Type *ScalarDataTy = getMemInstValueType(Instr);
Type *DataTy = VectorType::get(ScalarDataTy, VF);
// An alignment of 0 means target abi alignment. We need to use the scalar's
// target abi alignment in such a case.
const DataLayout &DL = Instr->getModule()->getDataLayout();
const Align Alignment =
    DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);

// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.
bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
bool ConsecutiveStride =
    Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
bool CreateGatherScatter =
    (Decision == LoopVectorizationCostModel::CM_GatherScatter);

// Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
// gather/scatter. Otherwise Decision should have been to Scalarize.
assert((ConsecutiveStride || CreateGatherScatter) &&(((ConsecutiveStride || CreateGatherScatter) && "The instruction should be scalarized"
) ? static_cast<void> (0) : __assert_fail ("(ConsecutiveStride || CreateGatherScatter) && \"The instruction should be scalarized\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2383, __PRETTY_FUNCTION__))
       "The instruction should be scalarized")(((ConsecutiveStride || CreateGatherScatter) && "The instruction should be scalarized"
) ? static_cast<void> (0) : __assert_fail ("(ConsecutiveStride || CreateGatherScatter) && \"The instruction should be scalarized\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2383, __PRETTY_FUNCTION__));
(void)ConsecutiveStride;

VectorParts BlockInMaskParts(UF);
bool isMaskRequired = BlockInMask;
if (isMaskRequired)
  for (unsigned Part = 0; Part < UF; ++Part)
    BlockInMaskParts[Part] = State.get(BlockInMask, Part);

const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
  // Calculate the pointer for the specific unroll-part.
  GetElementPtrInst *PartPtr = nullptr;

  bool InBounds = false;
  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
    InBounds = gep->isInBounds();

  if (Reverse) {
    // If the address is consecutive but reversed, then the
    // wide store needs to start at the last vector element.
    PartPtr = cast<GetElementPtrInst>(
        Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
    PartPtr->setIsInBounds(InBounds);
    PartPtr = cast<GetElementPtrInst>(
        Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
    PartPtr->setIsInBounds(InBounds);
    if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
      BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
  } else {
    PartPtr = cast<GetElementPtrInst>(
        Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
    PartPtr->setIsInBounds(InBounds);
  }

  unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
  return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
};

// Handle Stores:
if (SI) {
  setDebugLocFromInst(Builder, SI);

  for (unsigned Part = 0; Part < UF; ++Part) {
    Instruction *NewSI = nullptr;
    Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
      Value *VectorGep = State.get(Addr, Part);
      NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                          MaskPart);
    } else {
      if (Reverse) {
        // If we store to reverse consecutive memory locations, then we need
        // to reverse the order of elements in the stored value.
        StoredVal = reverseVector(StoredVal);
        // We don't want to update the value in the map as it might be used in
        // another expression. So don't call resetVectorValue(StoredVal).
      }
      auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
      if (isMaskRequired)
        NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                          BlockInMaskParts[Part]);
      else
        NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
    }
    addMetadata(NewSI, SI);
  }
  return;
}

// Handle loads.
assert(LI && "Must have a load instruction")((LI && "Must have a load instruction") ? static_cast
<void> (0) : __assert_fail ("LI && \"Must have a load instruction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2454, __PRETTY_FUNCTION__));
setDebugLocFromInst(Builder, LI);
for (unsigned Part = 0; Part < UF; ++Part) {
  Value *NewLI;
  if (CreateGatherScatter) {
    Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
    Value *VectorGep = State.get(Addr, Part);
    NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
                                       nullptr, "wide.masked.gather");
    addMetadata(NewLI, LI);
  } else {
    auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
    if (isMaskRequired)
      NewLI = Builder.CreateMaskedLoad(
          VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
          "wide.masked.load");
    else
      NewLI =
          Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");

    // Add metadata to the load, but setVectorValue to the reverse shuffle.
    addMetadata(NewLI, LI);
    if (Reverse)
      NewLI = reverseVector(NewLI);
  }
  VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
}
2481}

2483void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
                                             const VPIteration &Instance,
                                             bool IfPredicateInstr) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors")((!Instr->getType()->isAggregateType() && "Can't handle vectors"
) ? static_cast<void> (0) : __assert_fail ("!Instr->getType()->isAggregateType() && \"Can't handle vectors\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2486, __PRETTY_FUNCTION__));

setDebugLocFromInst(Builder, Instr);

// Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy();

Instruction *Cloned = Instr->clone();
if (!IsVoidRetTy)
  Cloned->setName(Instr->getName() + ".cloned");

// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
  auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
  Cloned->setOperand(op, NewOp);
}
addNewMetadata(Cloned, Instr);

// Place the cloned scalar in the new loop.
Builder.Insert(Cloned);

// Add the cloned scalar to the scalar map entry.
VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);

// If we just cloned a new assumption, add it the assumption cache.
if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
  if (II->getIntrinsicID() == Intrinsic::assume)
    AC->registerAssumption(II);

// End if-block.
if (IfPredicateInstr)
  PredicatedInstructions.push_back(Cloned);
2519}

2521PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                    Value *End, Value *Step,
                                                    Instruction *DL) {
BasicBlock *Header = L->getHeader();
BasicBlock *Latch = L->getLoopLatch();
// As we're just creating this loop, it's possible no latch exists
// yet. If so, use the header as this will be a single block loop.
if (!Latch)
  Latch = Header;

IRBuilder<> Builder(&*Header->getFirstInsertionPt());
Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
setDebugLocFromInst(Builder, OldInst);
auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");

Builder.SetInsertPoint(Latch->getTerminator());
setDebugLocFromInst(Builder, OldInst);

// Create i+1 and fill the PHINode.
Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
Induction->addIncoming(Start, L->getLoopPreheader());
Induction->addIncoming(Next, Latch);
// Create the compare.
Value *ICmp = Builder.CreateICmpEQ(Next, End);
Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);

// Now we have two terminators. Remove the old one from the block.
Latch->getTerminator()->eraseFromParent();

return Induction;
2551}

2553Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
if (TripCount)
  return TripCount;

assert(L && "Create Trip Count for null loop.")((L && "Create Trip Count for null loop.") ? static_cast
<void> (0) : __assert_fail ("L && \"Create Trip Count for null loop.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2557, __PRETTY_FUNCTION__));
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
// Find the loop boundaries.
ScalarEvolution *SE = PSE.getSE();
const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
assert(BackedgeTakenCount != SE->getCouldNotCompute() &&((BackedgeTakenCount != SE->getCouldNotCompute() &&
 "Invalid loop count") ? static_cast<void> (0) : __assert_fail
 ("BackedgeTakenCount != SE->getCouldNotCompute() && \"Invalid loop count\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2563, __PRETTY_FUNCTION__))
       "Invalid loop count")((BackedgeTakenCount != SE->getCouldNotCompute() &&
 "Invalid loop count") ? static_cast<void> (0) : __assert_fail
 ("BackedgeTakenCount != SE->getCouldNotCompute() && \"Invalid loop count\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2563, __PRETTY_FUNCTION__));

Type *IdxTy = Legal->getWidestInductionType();
assert(IdxTy && "No type for induction")((IdxTy && "No type for induction") ? static_cast<
void> (0) : __assert_fail ("IdxTy && \"No type for induction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2566, __PRETTY_FUNCTION__));

// The exit count might have the type of i64 while the phi is i32. This can
// happen if we have an induction variable that is sign extended before the
// compare. The only way that we get a backedge taken count is that the
// induction variable was signed and as such will not overflow. In such a case
// truncation is legal.
if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
    IdxTy->getPrimitiveSizeInBits())
  BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

// Get the total trip count from the count by adding 1.
const SCEV *ExitCount = SE->getAddExpr(
    BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

// Expand the trip count and place the new instructions in the preheader.
// Notice that the pre-header does not change, only the loop body.
SCEVExpander Exp(*SE, DL, "induction");

// Count holds the overall loop count (N).
TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                              L->getLoopPreheader()->getTerminator());

if (TripCount->getType()->isPointerTy())
  TripCount =
      CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
                                  L->getLoopPreheader()->getTerminator());

return TripCount;
2598}

2600Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
if (VectorTripCount)
  return VectorTripCount;

Value *TC = getOrCreateTripCount(L);
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());

Type *Ty = TC->getType();
Constant *Step = ConstantInt::get(Ty, VF * UF);

// If the tail is to be folded by masking, round the number of iterations N
// up to a multiple of Step instead of rounding down. This is done by first
// adding Step-1 and then rounding down. Note that it's ok if this addition
// overflows: the vector induction variable will eventually wrap to zero given
// that it starts at zero and its Step is a power of two; the loop will then
// exit, with the last early-exit vector comparison also producing all-true.
if (Cost->foldTailByMasking()) {
  assert(isPowerOf2_32(VF * UF) &&((isPowerOf2_32(VF * UF) && "VF*UF must be a power of 2 when folding tail by masking"
) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(VF * UF) && \"VF*UF must be a power of 2 when folding tail by masking\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2618, __PRETTY_FUNCTION__))
         "VF*UF must be a power of 2 when folding tail by masking")((isPowerOf2_32(VF * UF) && "VF*UF must be a power of 2 when folding tail by masking"
) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(VF * UF) && \"VF*UF must be a power of 2 when folding tail by masking\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2618, __PRETTY_FUNCTION__));
  TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
}

// Now we need to generate the expression for the part of the loop that the
// vectorized body will execute. This is equal to N - (N % Step) if scalar
// iterations are not required for correctness, or N - Step, otherwise. Step
// is equal to the vectorization factor (number of SIMD elements) times the
// unroll factor (number of SIMD instructions).
Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

// If there is a non-reversed interleaved group that may speculatively access
// memory out-of-bounds, we need to ensure that there will be at least one
// iteration of the scalar epilogue loop. Thus, if the step evenly divides
// the trip count, we set the remainder to be equal to the step. If the step
// does not evenly divide the trip count, no adjustment is necessary since
// there will already be scalar iterations. Note that the minimum iterations
// check ensures that N >= Step.
if (VF > 1 && Cost->requiresScalarEpilogue()) {
  auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
  R = Builder.CreateSelect(IsZero, Step, R);
}

VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

return VectorTripCount;
2644}

2646Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                                 const DataLayout &DL) {
// Verify that V is a vector type with same number of elements as DstVTy.
unsigned VF = DstVTy->getNumElements();
VectorType *SrcVecTy = cast<VectorType>(V->getType());
assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match")(((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match"
) ? static_cast<void> (0) : __assert_fail ("(VF == SrcVecTy->getNumElements()) && \"Vector dimensions do not match\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2651, __PRETTY_FUNCTION__));
Type *SrcElemTy = SrcVecTy->getElementType();
Type *DstElemTy = DstVTy->getElementType();
assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&(((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy
)) && "Vector elements must have same size") ? static_cast
<void> (0) : __assert_fail ("(DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && \"Vector elements must have same size\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2655, __PRETTY_FUNCTION__))
       "Vector elements must have same size")(((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy
)) && "Vector elements must have same size") ? static_cast
<void> (0) : __assert_fail ("(DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && \"Vector elements must have same size\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2655, __PRETTY_FUNCTION__));

// Do a direct cast if element types are castable.
if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
  return Builder.CreateBitOrPointerCast(V, DstVTy);
}
// V cannot be directly casted to desired vector type.
// May happen when V is a floating point vector but DstVTy is a vector of
// pointers or vice-versa. Handle this using a two-step bitcast using an
// intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&(((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()
) && "Only one type should be a pointer type") ? static_cast
<void> (0) : __assert_fail ("(DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && \"Only one type should be a pointer type\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2666, __PRETTY_FUNCTION__))
       "Only one type should be a pointer type")(((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()
) && "Only one type should be a pointer type") ? static_cast
<void> (0) : __assert_fail ("(DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && \"Only one type should be a pointer type\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2666, __PRETTY_FUNCTION__));
assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&(((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy
()) && "Only one type should be a floating point type"
) ? static_cast<void> (0) : __assert_fail ("(DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && \"Only one type should be a floating point type\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2668, __PRETTY_FUNCTION__))
       "Only one type should be a floating point type")(((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy
()) && "Only one type should be a floating point type"
) ? static_cast<void> (0) : __assert_fail ("(DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && \"Only one type should be a floating point type\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2668, __PRETTY_FUNCTION__));
Type *IntTy =
    IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
VectorType *VecIntTy = VectorType::get(IntTy, VF);
Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2674}

2676void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                       BasicBlock *Bypass) {
Value *Count = getOrCreateTripCount(L);
// Reuse existing vector loop preheader for TC checks.
// Note that new preheader block is generated for vector loop.
BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
IRBuilder<> Builder(TCCheckBlock->getTerminator());

// Generate code to check if the loop's trip count is less than VF * UF, or
// equal to it in case a scalar epilogue is required; this implies that the
// vector trip count is zero. This check also covers the case where adding one
// to the backedge-taken count overflowed leading to an incorrect trip count
// of zero. In this case we will also jump to the scalar loop.
auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
                                        : ICmpInst::ICMP_ULT;

// If tail is to be folded, vector loop takes care of all iterations.
Value *CheckMinIters = Builder.getFalse();
if (!Cost->foldTailByMasking())
  CheckMinIters = Builder.CreateICmp(
      P, Count, ConstantInt::get(Count->getType(), VF * UF),
      "min.iters.check");

// Create new preheader for vector loop.
LoopVectorPreHeader =
    SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
               "vector.ph");

assert(DT->properlyDominates(DT->getNode(TCCheckBlock),((DT->properlyDominates(DT->getNode(TCCheckBlock), DT->
getNode(Bypass)->getIDom()) && "TC check is expected to dominate Bypass"
) ? static_cast<void> (0) : __assert_fail ("DT->properlyDominates(DT->getNode(TCCheckBlock), DT->getNode(Bypass)->getIDom()) && \"TC check is expected to dominate Bypass\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2706, __PRETTY_FUNCTION__))
                             DT->getNode(Bypass)->getIDom()) &&((DT->properlyDominates(DT->getNode(TCCheckBlock), DT->
getNode(Bypass)->getIDom()) && "TC check is expected to dominate Bypass"
) ? static_cast<void> (0) : __assert_fail ("DT->properlyDominates(DT->getNode(TCCheckBlock), DT->getNode(Bypass)->getIDom()) && \"TC check is expected to dominate Bypass\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2706, __PRETTY_FUNCTION__))
       "TC check is expected to dominate Bypass")((DT->properlyDominates(DT->getNode(TCCheckBlock), DT->
getNode(Bypass)->getIDom()) && "TC check is expected to dominate Bypass"
) ? static_cast<void> (0) : __assert_fail ("DT->properlyDominates(DT->getNode(TCCheckBlock), DT->getNode(Bypass)->getIDom()) && \"TC check is expected to dominate Bypass\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2706, __PRETTY_FUNCTION__));

// Update dominator for Bypass & LoopExit.
DT->changeImmediateDominator(Bypass, TCCheckBlock);
DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

ReplaceInstWithInst(
    TCCheckBlock->getTerminator(),
    BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
LoopBypassBlocks.push_back(TCCheckBlock);
2716}

2718void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
// Reuse existing vector loop preheader for SCEV checks.
// Note that new preheader block is generated for vector loop.
BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;

// Generate the code to check that the SCEV assumptions that we made.
// We want the new basic block to start at the first instruction in a
// sequence of instructions that form a check.
SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
                 "scev.check");
Value *SCEVCheck = Exp.expandCodeForPredicate(
    &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());

if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
  if (C->isZero())
    return;

assert(!SCEVCheckBlock->getParent()->hasOptSize() &&((!SCEVCheckBlock->getParent()->hasOptSize() &&
 "Cannot SCEV check stride or overflow when optimizing for size"
) ? static_cast<void> (0) : __assert_fail ("!SCEVCheckBlock->getParent()->hasOptSize() && \"Cannot SCEV check stride or overflow when optimizing for size\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2736, __PRETTY_FUNCTION__))
       "Cannot SCEV check stride or overflow when optimizing for size")((!SCEVCheckBlock->getParent()->hasOptSize() &&
 "Cannot SCEV check stride or overflow when optimizing for size"
) ? static_cast<void> (0) : __assert_fail ("!SCEVCheckBlock->getParent()->hasOptSize() && \"Cannot SCEV check stride or overflow when optimizing for size\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2736, __PRETTY_FUNCTION__));

SCEVCheckBlock->setName("vector.scevcheck");
// Create new preheader for vector loop.
LoopVectorPreHeader =
    SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
               nullptr, "vector.ph");

// Update dominator only if this is first RT check.
if (LoopBypassBlocks.empty()) {
  DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
  DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
}

ReplaceInstWithInst(
    SCEVCheckBlock->getTerminator(),
    BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
LoopBypassBlocks.push_back(SCEVCheckBlock);
AddedSafetyChecks = true;
2755}

2757void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
// VPlan-native path does not do any analysis for runtime checks currently.
if (EnableVPlanNativePath)
  return;

// Reuse existing vector loop preheader for runtime memory checks.
// Note that new preheader block is generated for vector loop.
BasicBlock *const MemCheckBlock = L->getLoopPreheader();

// Generate the code that checks in runtime if arrays overlap. We put the
// checks into a separate block to make the more common case of few elements
// faster.
Instruction *FirstCheckInst;
Instruction *MemRuntimeCheck;
std::tie(FirstCheckInst, MemRuntimeCheck) =
    Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
if (!MemRuntimeCheck)
  return;

if (MemCheckBlock->getParent()->hasOptSize()) {
  assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&((Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled
 && "Cannot emit memory checks when optimizing for size, unless forced "
 "to vectorize.") ? static_cast<void> (0) : __assert_fail
 ("Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && \"Cannot emit memory checks when optimizing for size, unless forced \" \"to vectorize.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2779, __PRETTY_FUNCTION__))
         "Cannot emit memory checks when optimizing for size, unless forced "((Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled
 && "Cannot emit memory checks when optimizing for size, unless forced "
 "to vectorize.") ? static_cast<void> (0) : __assert_fail
 ("Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && \"Cannot emit memory checks when optimizing for size, unless forced \" \"to vectorize.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2779, __PRETTY_FUNCTION__))
         "to vectorize.")((Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled
 && "Cannot emit memory checks when optimizing for size, unless forced "
 "to vectorize.") ? static_cast<void> (0) : __assert_fail
 ("Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && \"Cannot emit memory checks when optimizing for size, unless forced \" \"to vectorize.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2779, __PRETTY_FUNCTION__));
  ORE->emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE"loop-vectorize", "VectorizationCodeSize",
                                      L->getStartLoc(), L->getHeader())
           << "Code-size may be reduced by not forcing "
              "vectorization, or by source-code modifications "
              "eliminating the need for runtime checks "
              "(e.g., adding 'restrict').";
  });
}

MemCheckBlock->setName("vector.memcheck");
// Create new preheader for vector loop.
LoopVectorPreHeader =
    SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
               "vector.ph");

// Update dominator only if this is first RT check.
if (LoopBypassBlocks.empty()) {
  DT->changeImmediateDominator(Bypass, MemCheckBlock);
  DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
}

ReplaceInstWithInst(
    MemCheckBlock->getTerminator(),
    BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
LoopBypassBlocks.push_back(MemCheckBlock);
AddedSafetyChecks = true;

// We currently don't use LoopVersioning for the actual loop cloning but we
// still use it to add the noalias metadata.
LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
                                        PSE.getSE());
LVer->prepareNoAliasMetadata();
2813}

2815Value *InnerLoopVectorizer::emitTransformedIndex(
  IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
  const InductionDescriptor &ID) const {

SCEVExpander Exp(*SE, DL, "induction");
auto Step = ID.getStep();
auto StartValue = ID.getStartValue();
assert(Index->getType() == Step->getType() &&((Index->getType() == Step->getType() && "Index type does not match StepValue type"
) ? static_cast<void> (0) : __assert_fail ("Index->getType() == Step->getType() && \"Index type does not match StepValue type\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2823, __PRETTY_FUNCTION__))
       "Index type does not match StepValue type")((Index->getType() == Step->getType() && "Index type does not match StepValue type"
) ? static_cast<void> (0) : __assert_fail ("Index->getType() == Step->getType() && \"Index type does not match StepValue type\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2823, __PRETTY_FUNCTION__));

// Note: the IR at this point is broken. We cannot use SE to create any new
// SCEV and then expand it, hoping that SCEV's simplification will give us
// a more optimal code. Unfortunately, attempt of doing so on invalid IR may
// lead to various SCEV crashes. So all we can do is to use builder and rely
// on InstCombine for future simplifications. Here we handle some trivial
// cases only.
auto CreateAdd = [&B](Value *X, Value *Y) {
  assert(X->getType() == Y->getType() && "Types don't match!")((X->getType() == Y->getType() && "Types don't match!"
) ? static_cast<void> (0) : __assert_fail ("X->getType() == Y->getType() && \"Types don't match!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2832, __PRETTY_FUNCTION__));
  if (auto *CX = dyn_cast<ConstantInt>(X))
    if (CX->isZero())
      return Y;
  if (auto *CY = dyn_cast<ConstantInt>(Y))
    if (CY->isZero())
      return X;
  return B.CreateAdd(X, Y);
};

auto CreateMul = [&B](Value *X, Value *Y) {
  assert(X->getType() == Y->getType() && "Types don't match!")((X->getType() == Y->getType() && "Types don't match!"
) ? static_cast<void> (0) : __assert_fail ("X->getType() == Y->getType() && \"Types don't match!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2843, __PRETTY_FUNCTION__));
  if (auto *CX = dyn_cast<ConstantInt>(X))
    if (CX->isOne())
      return Y;
  if (auto *CY = dyn_cast<ConstantInt>(Y))
    if (CY->isOne())
      return X;
  return B.CreateMul(X, Y);
};

switch (ID.getKind()) {
case InductionDescriptor::IK_IntInduction: {
  assert(Index->getType() == StartValue->getType() &&((Index->getType() == StartValue->getType() && "Index type does not match StartValue type"
) ? static_cast<void> (0) : __assert_fail ("Index->getType() == StartValue->getType() && \"Index type does not match StartValue type\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2856, __PRETTY_FUNCTION__))
         "Index type does not match StartValue type")((Index->getType() == StartValue->getType() && "Index type does not match StartValue type"
) ? static_cast<void> (0) : __assert_fail ("Index->getType() == StartValue->getType() && \"Index type does not match StartValue type\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2856, __PRETTY_FUNCTION__));
  if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
    return B.CreateSub(StartValue, Index);
  auto *Offset = CreateMul(
      Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
  return CreateAdd(StartValue, Offset);
}
case InductionDescriptor::IK_PtrInduction: {
  assert(isa<SCEVConstant>(Step) &&((isa<SCEVConstant>(Step) && "Expected constant step for pointer induction"
) ? static_cast<void> (0) : __assert_fail ("isa<SCEVConstant>(Step) && \"Expected constant step for pointer induction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2865, __PRETTY_FUNCTION__))
         "Expected constant step for pointer induction")((isa<SCEVConstant>(Step) && "Expected constant step for pointer induction"
) ? static_cast<void> (0) : __assert_fail ("isa<SCEVConstant>(Step) && \"Expected constant step for pointer induction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2865, __PRETTY_FUNCTION__));
  return B.CreateGEP(
      StartValue->getType()->getPointerElementType(), StartValue,
      CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
                                         &*B.GetInsertPoint())));
}
case InductionDescriptor::IK_FpInduction: {
  assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value")((Step->getType()->isFloatingPointTy() && "Expected FP Step value"
) ? static_cast<void> (0) : __assert_fail ("Step->getType()->isFloatingPointTy() && \"Expected FP Step value\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2872, __PRETTY_FUNCTION__));
  auto InductionBinOp = ID.getInductionBinOp();
  assert(InductionBinOp &&((InductionBinOp && (InductionBinOp->getOpcode() ==
 Instruction::FAdd || InductionBinOp->getOpcode() == Instruction
::FSub) && "Original bin op should be defined for FP induction"
) ? static_cast<void> (0) : __assert_fail ("InductionBinOp && (InductionBinOp->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode() == Instruction::FSub) && \"Original bin op should be defined for FP induction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2877, __PRETTY_FUNCTION__))
         (InductionBinOp->getOpcode() == Instruction::FAdd ||((InductionBinOp && (InductionBinOp->getOpcode() ==
 Instruction::FAdd || InductionBinOp->getOpcode() == Instruction
::FSub) && "Original bin op should be defined for FP induction"
) ? static_cast<void> (0) : __assert_fail ("InductionBinOp && (InductionBinOp->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode() == Instruction::FSub) && \"Original bin op should be defined for FP induction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2877, __PRETTY_FUNCTION__))
          InductionBinOp->getOpcode() == Instruction::FSub) &&((InductionBinOp && (InductionBinOp->getOpcode() ==
 Instruction::FAdd || InductionBinOp->getOpcode() == Instruction
::FSub) && "Original bin op should be defined for FP induction"
) ? static_cast<void> (0) : __assert_fail ("InductionBinOp && (InductionBinOp->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode() == Instruction::FSub) && \"Original bin op should be defined for FP induction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2877, __PRETTY_FUNCTION__))
         "Original bin op should be defined for FP induction")((InductionBinOp && (InductionBinOp->getOpcode() ==
 Instruction::FAdd || InductionBinOp->getOpcode() == Instruction
::FSub) && "Original bin op should be defined for FP induction"
) ? static_cast<void> (0) : __assert_fail ("InductionBinOp && (InductionBinOp->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode() == Instruction::FSub) && \"Original bin op should be defined for FP induction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2877, __PRETTY_FUNCTION__));

  Value *StepValue = cast<SCEVUnknown>(Step)->getValue();

  // Floating point operations had to be 'fast' to enable the induction.
  FastMathFlags Flags;
  Flags.setFast();

  Value *MulExp = B.CreateFMul(StepValue, Index);
  if (isa<Instruction>(MulExp))
    // We have to check, the MulExp may be a constant.
    cast<Instruction>(MulExp)->setFastMathFlags(Flags);

  Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                             "induction");
  if (isa<Instruction>(BOp))
    cast<Instruction>(BOp)->setFastMathFlags(Flags);

  return BOp;
}
case InductionDescriptor::IK_NoInduction:
  return nullptr;
}
llvm_unreachable("invalid enum")::llvm::llvm_unreachable_internal("invalid enum", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2900);
2901}

/// Build the empty control-flow skeleton (vector preheader, vector body,
/// middle block, scalar preheader) around the original loop, emit the runtime
/// bypass checks, create the vector-loop induction variable, and rewire the
/// scalar loop's induction PHIs so it can resume after the vector loop.
///
/// \returns LoopVectorPreHeader on every path.
2903BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
/*
 In this function we generate a new loop. The new loop will contain
 the vectorized instructions while the old loop will continue to run the
 scalar remainder.

     [ ] <-- loop iteration number check.
  /   |
 /    v
|    [ ] <-- vector loop bypass (may consist of multiple blocks).
|  /  |
| /   v
||   [ ]     <-- vector pre header.
|/    |
|     v
|    [  ] \
|    [  ]_|   <-- vector loop.
|     |
|     v
|   -[ ]   <--- middle-block.
|  /  |
| /   v
-|- >[ ]     <--- new preheader.
 |    |
 |    v
 |   [ ] \
 |   [ ]_|   <-- old scalar loop to handle remainder.
  \   |
   \  v
    >[ ]     <-- exit block.
 ...
 */

// Remember the original loop ID now; it is consulted near the end of this
// function to derive follow-up metadata for the vector loop.
MDNode *OrigLoopID = OrigLoop->getLoopID();

// Some loops have a single integer induction variable, while other loops
// don't. One example is c++ iterators that often have multiple pointer
// induction variables. In the code below we also support a case where we
// don't have a single induction variable.
//
// We try to obtain an induction variable from the original loop as hard
// as possible. However if we don't find one that:
//   - is an integer
//   - counts from zero, stepping by one
//   - is the size of the widest induction variable type
// then we create a new one.
OldInduction = Legal->getPrimaryInduction();
Type *IdxTy = Legal->getWidestInductionType();

// Split the single block loop into the two loop structure described above.
LoopScalarBody = OrigLoop->getHeader();
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
LoopExitBlock = OrigLoop->getExitBlock();
assert(LoopExitBlock && "Must have an exit block")((LoopExitBlock && "Must have an exit block") ? static_cast
<void> (0) : __assert_fail ("LoopExitBlock && \"Must have an exit block\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2956, __PRETTY_FUNCTION__));
assert(LoopVectorPreHeader && "Invalid loop structure")((LoopVectorPreHeader && "Invalid loop structure") ? static_cast
<void> (0) : __assert_fail ("LoopVectorPreHeader && \"Invalid loop structure\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2957, __PRETTY_FUNCTION__));

// Three successive splits at the current terminator carve out, in order,
// "middle.block", "scalar.ph" and "vector.body".
LoopMiddleBlock =
    SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
               LI, nullptr, "middle.block");
LoopScalarPreHeader =
    SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
               nullptr, "scalar.ph");
// We intentionally don't let SplitBlock update LoopInfo here (the LoopInfo
// argument is nullptr), since LoopVectorBody should belong to a different
// loop than LoopVectorPreHeader. LoopVectorBody is explicitly added to the
// correct loop a few lines later.
LoopVectorBody =
    SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
               nullptr, nullptr, "vector.body");

// Update dominator for loop exit.
DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

// Create and register the new vector loop.
Loop *Lp = LI->AllocateLoop();
Loop *ParentLoop = OrigLoop->getParentLoop();

// Insert the new loop into the loop nest and register the new basic blocks
// before calling any utilities such as SCEV that require valid LoopInfo.
if (ParentLoop) {
  ParentLoop->addChildLoop(Lp);
} else {
  LI->addTopLevelLoop(Lp);
}
Lp->addBasicBlockToLoop(LoopVectorBody, *LI);

// Find the loop boundaries.
Value *Count = getOrCreateTripCount(Lp);

Value *StartIdx = ConstantInt::get(IdxTy, 0);

// Now, compare the new count to zero. If it is zero skip the vector loop and
// jump to the scalar loop. This check also covers the case where the
// backedge-taken count is uint##_max: adding one to it will overflow leading
// to an incorrect trip count of zero. In this (rare) case we will also jump
// to the scalar loop.
emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);

// Generate the code to check any assumptions that we've made for SCEV
// expressions.
emitSCEVChecks(Lp, LoopScalarPreHeader);

// Generate the code that checks in runtime if arrays overlap. We put the
// checks into a separate block to make the more common case of few elements
// faster.
emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

// Generate the induction variable.
// The loop step is equal to the vectorization factor (num of SIMD elements)
// times the unroll factor (num of SIMD instructions).
Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
Constant *Step = ConstantInt::get(IdxTy, VF * UF);
Induction =
    createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                            getDebugLocFromInstOrOperands(OldInduction));

// We are going to resume the execution of the scalar loop.
// Go over all of the induction variables that we found and fix the
// PHIs that are left in the scalar version of the loop.
// The starting values of PHI nodes depend on the counter of the last
// iteration in the vectorized loop.
// If we come from a bypass edge then we need to start from the original
// start value.

// For every induction recorded by Legal, create a "bc.resume.val" PHI in the
// scalar preheader that selects between the value reached at the end of the
// vector loop (recorded in IVEndValues) and the original start value on each
// bypass edge, then feed that PHI into the scalar loop's original PHI.
for (auto &InductionEntry : Legal->getInductionVars()) {
  PHINode *OrigPhi = InductionEntry.first;
  InductionDescriptor II = InductionEntry.second;

  // Create phi nodes to merge from the backedge-taken check block.
  // NOTE(review): the literal 3 is presumably the number of reserved
  // incoming-value slots -- confirm against PHINode::Create.
  PHINode *BCResumeVal =
      PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
                      LoopScalarPreHeader->getTerminator());
  // Copy original phi DL over to the new one.
  BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
  // EndValue is a reference into IVEndValues, so the assignments below also
  // record the end value in that map for this PHI.
  Value *&EndValue = IVEndValues[OrigPhi];
  if (OrigPhi == OldInduction) {
    // We know what the end value is.
    EndValue = CountRoundDown;
  } else {
    // Any other induction: cast the vector trip count to the step type and
    // transform it through the induction descriptor, emitting the code in
    // the vector loop's preheader.
    IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
    Type *StepType = II.getStep()->getType();
    Instruction::CastOps CastOp =
        CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
    Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
    const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
    EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
    EndValue->setName("ind.end");
  }

  // The new PHI merges the original incoming value, in case of a bypass,
  // or the value at the end of the vectorized loop.
  BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);

  // Fix the scalar body counter (PHI node).
  // The old induction's phi node in the scalar body needs the truncated
  // value.
  for (BasicBlock *BB : LoopBypassBlocks)
    BCResumeVal->addIncoming(II.getStartValue(), BB);
  OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
}

// We need the OrigLoop (scalar loop part) latch terminator to help
// produce correct debug info for the middle block BB instructions.
// The legality check stage guarantees that the loop will have a single
// latch.
assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&((isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator
()) && "Scalar loop latch terminator isn't a branch")
 ? static_cast<void> (0) : __assert_fail ("isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && \"Scalar loop latch terminator isn't a branch\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3071, __PRETTY_FUNCTION__))
       "Scalar loop latch terminator isn't a branch")((isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator
()) && "Scalar loop latch terminator isn't a branch")
 ? static_cast<void> (0) : __assert_fail ("isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) && \"Scalar loop latch terminator isn't a branch\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3071, __PRETTY_FUNCTION__));
BranchInst *ScalarLatchBr =
    cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());

// Add a check in the middle block to see if we have completed
// all of the iterations in the first vector loop.
// If (N - N%VF) == N, then we *don't* need to run the remainder.
// If tail is to be folded, we know we don't need to run the remainder.
Value *CmpN = Builder.getTrue();
if (!Cost->foldTailByMasking()) {
  CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
                         CountRoundDown, "cmp.n",
                         LoopMiddleBlock->getTerminator());

  // Here we use the same DebugLoc as the scalar loop latch branch instead
  // of the corresponding compare because they may have ended up with
  // different line numbers and we want to avoid awkward line stepping while
  // debugging. Eg. if the compare has got a line number inside the loop.
  cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
}

// Replace the middle block's terminator with a conditional branch: to the
// exit block when CmpN is true, otherwise to the scalar preheader.
BranchInst *BrInst =
    BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

// Get ready to start creating new instructions into the vectorized body.
assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&((LoopVectorPreHeader == Lp->getLoopPreheader() &&
 "Inconsistent vector loop preheader") ? static_cast<void>
 (0) : __assert_fail ("LoopVectorPreHeader == Lp->getLoopPreheader() && \"Inconsistent vector loop preheader\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3099, __PRETTY_FUNCTION__))
       "Inconsistent vector loop preheader")((LoopVectorPreHeader == Lp->getLoopPreheader() &&
 "Inconsistent vector loop preheader") ? static_cast<void>
 (0) : __assert_fail ("LoopVectorPreHeader == Lp->getLoopPreheader() && \"Inconsistent vector loop preheader\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3099, __PRETTY_FUNCTION__));
Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());

// If follow-up loop metadata was derived from the original loop ID, install
// it on the vector loop and return early.
Optional<MDNode *> VectorizedLoopID =
    makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                    LLVMLoopVectorizeFollowupVectorized});
if (VectorizedLoopID.hasValue()) {
  Lp->setLoopID(VectorizedLoopID.getValue());

  // Do not setAlreadyVectorized if loop attributes have been defined
  // explicitly.
  // Note that this early return also bypasses the EXPENSIVE_CHECKS
  // verification at the end of the function.
  return LoopVectorPreHeader;
}

// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
if (MDNode *LID = OrigLoop->getLoopID())
  Lp->setLoopID(LID);

LoopVectorizeHints Hints(Lp, true, *ORE);
Hints.setAlreadyVectorized();

3121#ifdef EXPENSIVE_CHECKS
assert(DT->verify(DominatorTree::VerificationLevel::Fast))((DT->verify(DominatorTree::VerificationLevel::Fast)) ? static_cast
<void> (0) : __assert_fail ("DT->verify(DominatorTree::VerificationLevel::Fast)"
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3122, __PRETTY_FUNCTION__));
LI->verify(*DT);
3124#endif

return LoopVectorPreHeader;
3127}

3129// Fix up external users of the induction variable. At this point, we are
3130// in LCSSA form, with all external PHIs that use the IV having one input value,
3131// coming from the remainder loop. We need those PHIs to also have a correct
3132// value for the IV when arriving directly from the middle block.
3133void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                     const InductionDescriptor &II,
                                     Value *CountRoundDown, Value *EndValue,
                                     BasicBlock *MiddleBlock) {
// There are two kinds of external IV usages - those that use the value
// computed in the last iteration (the PHI) and those that use the penultimate
// value (the value that feeds into the phi from the loop latch).
// We allow both, but they, obviously, have different values.

assert(OrigLoop->getExitBlock() && "Expected a single exit block")((OrigLoop->getExitBlock() && "Expected a single exit block"
) ? static_cast<void> (0) : __assert_fail ("OrigLoop->getExitBlock() && \"Expected a single exit block\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3142, __PRETTY_FUNCTION__));

DenseMap<Value *, Value *> MissingVals;

// An external user of the last iteration's value should see the value that
// the remainder loop uses to initialize its own IV.
Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
for (User *U : PostInc->users()) {
  Instruction *UI = cast<Instruction>(U);
  if (!OrigLoop->contains(UI)) {
    assert(isa<PHINode>(UI) && "Expected LCSSA form")((isa<PHINode>(UI) && "Expected LCSSA form") ? static_cast
<void> (0) : __assert_fail ("isa<PHINode>(UI) && \"Expected LCSSA form\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3152, __PRETTY_FUNCTION__));
    MissingVals[UI] = EndValue;
  }
}

// An external user of the penultimate value need to see EndValue - Step.
// The simplest way to get this is to recompute it from the constituent SCEVs,
// that is Start + (Step * (CRD - 1)).
for (User *U : OrigPhi->users()) {
  auto *UI = cast<Instruction>(U);
  if (!OrigLoop->contains(UI)) {
    const DataLayout &DL =
        OrigLoop->getHeader()->getModule()->getDataLayout();
    assert(isa<PHINode>(UI) && "Expected LCSSA form")((isa<PHINode>(UI) && "Expected LCSSA form") ? static_cast
<void> (0) : __assert_fail ("isa<PHINode>(UI) && \"Expected LCSSA form\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3165, __PRETTY_FUNCTION__));

    IRBuilder<> B(MiddleBlock->getTerminator());
    Value *CountMinusOne = B.CreateSub(
        CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
    Value *CMO =
        !II.getStep()->getType()->isIntegerTy()
            ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                           II.getStep()->getType())
            : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
    CMO->setName("cast.cmo");
    Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
    Escape->setName("ind.escape");
    MissingVals[UI] = Escape;
  }
}

for (auto &I : MissingVals) {
  PHINode *PHI = cast<PHINode>(I.first);
  // One corner case we have to handle is two IVs "chasing" each-other,
  // that is %IV2 = phi [...], [ %IV1, %latch ]
  // In this case, if IV1 has an external use, we need to avoid adding both
  // "last value of IV1" and "penultimate value of IV2". So, verify that we
  // don't already have an incoming value for the middle block.
  if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
    PHI->addIncoming(I.second, MiddleBlock);
}
3192}

3194namespace {

3196struct CSEDenseMapInfo {
static bool canHandle(const Instruction *I) {
  return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
         isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
}

static inline Instruction *getEmptyKey() {
  return DenseMapInfo<Instruction *>::getEmptyKey();
}

static inline Instruction *getTombstoneKey() {
  return DenseMapInfo<Instruction *>::getTombstoneKey();
}

static unsigned getHashValue(const Instruction *I) {
  assert(canHandle(I) && "Unknown instruction!")((canHandle(I) && "Unknown instruction!") ? static_cast
<void> (0) : __assert_fail ("canHandle(I) && \"Unknown instruction!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3211, __PRETTY_FUNCTION__));
  return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                         I->value_op_end()));
}

static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
  if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
      LHS == getTombstoneKey() || RHS == getTombstoneKey())
    return LHS == RHS;
  return LHS->isIdenticalTo(RHS);
}
3222};

3224} // end anonymous namespace

3226///Perform cse of induction variable instructions.
3227static void cse(BasicBlock *BB) {
// Perform simple cse.
SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
  Instruction *In = &*I++;

  if (!CSEDenseMapInfo::canHandle(In))
    continue;

  // Check if we can replace this instruction with any of the
  // visited instructions.
  if (Instruction *V = CSEMap.lookup(In)) {
    In->replaceAllUsesWith(V);
    In->eraseFromParent();
    continue;
  }

  CSEMap[In] = In;
}
3246}

3248unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
                                                     unsigned VF,
                                                     bool &NeedToScalarize) {
Function *F = CI->getCalledFunction();
Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;
for (auto &ArgOp : CI->arg_operands())
  ScalarTys.push_back(ArgOp->getType());

// Estimate cost of scalarized vector call. The source operands are assumed
// to be vectors, so we need to extract individual elements from there,
// execute VF scalar calls, and then gather the result into the vector return
// value.
unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
if (VF == 1)
  return ScalarCallCost;

// Compute corresponding vector type for return value and arguments.
Type *RetTy = ToVectorTy(ScalarRetTy, VF);
for (Type *ScalarTy : ScalarTys)
  Tys.push_back(ToVectorTy(ScalarTy, VF));

// Compute costs of unpacking argument values for the scalar calls and
// packing the return values to a vector.
unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);

unsigned Cost = ScalarCallCost * VF + ScalarizationCost;

// If we can't emit a vector call for this function, then the currently found
// cost is the cost we need to return.
NeedToScalarize = true;
VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/);
Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);

if (!TLI || CI->isNoBuiltin() || !VecFunc)
  return Cost;

// If the corresponding vector cost is cheaper, return its cost.
unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
if (VectorCallCost < Cost) {
  NeedToScalarize = false;
  return VectorCallCost;
}
return Cost;
3292}

3294unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                          unsigned VF) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
assert(ID && "Expected intrinsic call!")((ID && "Expected intrinsic call!") ? static_cast<
void> (0) : __assert_fail ("ID && \"Expected intrinsic call!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3297, __PRETTY_FUNCTION__));

FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
  FMF = FPMO->getFastMathFlags();

SmallVector<Value *, 4> Operands(CI->arg_operands());
return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3305}

3307static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
auto *I1 = cast<IntegerType>(T1->getVectorElementType());
auto *I2 = cast<IntegerType>(T2->getVectorElementType());
return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3311}
3312static Type *largestIntegerVectorType(Type *T1, Type *T2) {
auto *I1 = cast<IntegerType>(T1->getVectorElementType());
auto *I2 = cast<IntegerType>(T2->getVectorElementType());
return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3316}

3318void InnerLoopVectorizer::truncateToMinimalBitwidths() {
// For every instruction `I` in MinBWs, truncate the operands, create a
// truncated version of `I` and reextend its result. InstCombine runs
// later and will remove any ext/trunc pairs.
SmallPtrSet<Value *, 4> Erased;
for (const auto &KV : Cost->getMinimalBitwidths()) {
  // If the value wasn't vectorized, we must maintain the original scalar
  // type. The absence of the value from VectorLoopValueMap indicates that it
  // wasn't vectorized.
  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
    continue;
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *I = getOrCreateVectorValue(KV.first, Part);
    if (Erased.find(I) != Erased.end() || I->use_empty() ||
        !isa<Instruction>(I))
      continue;
    Type *OriginalTy = I->getType();
    Type *ScalarTruncatedTy =
        IntegerType::get(OriginalTy->getContext(), KV.second);
    Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
                                        OriginalTy->getVectorNumElements());
    if (TruncatedTy == OriginalTy)
      continue;

    IRBuilder<> B(cast<Instruction>(I));
    auto ShrinkOperand = [&](Value *V) -> Value * {
      if (auto *ZI = dyn_cast<ZExtInst>(V))
        if (ZI->getSrcTy() == TruncatedTy)
          return ZI->getOperand(0);
      return B.CreateZExtOrTrunc(V, TruncatedTy);
    };

    // The actual instruction modification depends on the instruction type,
    // unfortunately.
    Value *NewI = nullptr;
    if (auto *BO = dyn_cast<BinaryOperator>(I)) {
      NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                           ShrinkOperand(BO->getOperand(1)));

      // Any wrapping introduced by shrinking this operation shouldn't be
      // considered undefined behavior. So, we can't unconditionally copy
      // arithmetic wrapping flags to NewI.
      cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
    } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
      NewI =
          B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                       ShrinkOperand(CI->getOperand(1)));
    } else if (auto *SI = dyn_cast<SelectInst>(I)) {
      NewI = B.CreateSelect(SI->getCondition(),
                            ShrinkOperand(SI->getTrueValue()),
                            ShrinkOperand(SI->getFalseValue()));
    } else if (auto *CI = dyn_cast<CastInst>(I)) {
      switch (CI->getOpcode()) {
      default:
        llvm_unreachable("Unhandled cast!")::llvm::llvm_unreachable_internal("Unhandled cast!", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3372);
      case Instruction::Trunc:
        NewI = ShrinkOperand(CI->getOperand(0));
        break;
      case Instruction::SExt:
        NewI = B.CreateSExtOrTrunc(
            CI->getOperand(0),
            smallestIntegerVectorType(OriginalTy, TruncatedTy));
        break;
      case Instruction::ZExt:
        NewI = B.CreateZExtOrTrunc(
            CI->getOperand(0),
            smallestIntegerVectorType(OriginalTy, TruncatedTy));
        break;
      }
    } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
      auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
      auto *O0 = B.CreateZExtOrTrunc(
          SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
      auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
      auto *O1 = B.CreateZExtOrTrunc(
          SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));

      NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
    } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
      // Don't do anything with the operands, just extend the result.
      continue;
    } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
      auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
      auto *O0 = B.CreateZExtOrTrunc(
          IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
      auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
      NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
    } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
      auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
      auto *O0 = B.CreateZExtOrTrunc(
          EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
      NewI = B.CreateExtractElement(O0, EE->getOperand(2));
    } else {
      // If we don't know what to do, be conservative and don't do anything.
      continue;
    }

    // Lastly, extend the result.
    NewI->takeName(cast<Instruction>(I));
    Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
    I->replaceAllUsesWith(Res);
    cast<Instruction>(I)->eraseFromParent();
    Erased.insert(I);
    VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
  }
}

// We'll have created a bunch of ZExts that are now parentless. Clean up.
for (const auto &KV : Cost->getMinimalBitwidths()) {
  // If the value wasn't vectorized, we must maintain the original scalar
  // type. The absence of the value from VectorLoopValueMap indicates that it
  // wasn't vectorized.
  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
    continue;
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *I = getOrCreateVectorValue(KV.first, Part);
    ZExtInst *Inst = dyn_cast<ZExtInst>(I);
    if (Inst && Inst->use_empty()) {
      Value *NewI = Inst->getOperand(0);
      Inst->eraseFromParent();
      VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
    }
  }
}
3442}

3444void InnerLoopVectorizer::fixVectorizedLoop() {
// Insert truncates and extends for any truncated instructions as hints to
// InstCombine.
if (VF > 1)
  truncateToMinimalBitwidths();

// Fix widened non-induction PHIs by setting up the PHI operands.
if (OrigPHIsToFix.size()) {
  assert(EnableVPlanNativePath &&((EnableVPlanNativePath && "Unexpected non-induction PHIs for fixup in non VPlan-native path"
) ? static_cast<void> (0) : __assert_fail ("EnableVPlanNativePath && \"Unexpected non-induction PHIs for fixup in non VPlan-native path\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3453, __PRETTY_FUNCTION__))
         "Unexpected non-induction PHIs for fixup in non VPlan-native path")((EnableVPlanNativePath && "Unexpected non-induction PHIs for fixup in non VPlan-native path"
) ? static_cast<void> (0) : __assert_fail ("EnableVPlanNativePath && \"Unexpected non-induction PHIs for fixup in non VPlan-native path\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3453, __PRETTY_FUNCTION__));
  fixNonInductionPHIs();
}

// At this point every instruction in the original loop is widened to a
// vector form. Now we need to fix the recurrences in the loop. These PHI
// nodes are currently empty because we did not want to introduce cycles.
// This is the second stage of vectorizing recurrences.
fixCrossIterationPHIs();

// Forget the original basic block.
PSE.getSE()->forgetLoop(OrigLoop);

// Fix-up external users of the induction variables.
for (auto &Entry : Legal->getInductionVars())
  fixupIVUsers(Entry.first, Entry.second,
               getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
               IVEndValues[Entry.first], LoopMiddleBlock);

fixLCSSAPHIs();
for (Instruction *PI : PredicatedInstructions)
  sinkScalarOperands(&*PI);

// Remove redundant induction instructions.
cse(LoopVectorBody);

// Set/update profile weights for the vector and remainder loops as original
// loop iterations are now distributed among them. Note that original loop
// represented by LoopScalarBody becomes remainder loop after vectorization.
//
// For cases like foldTailByMasking() and requiresScalarEpiloque() we may
// end up getting slightly roughened result but that should be OK since
// profile is not inherently precise anyway. Note also possible bypass of
// vector code caused by legality checks is ignored, assigning all the weight
// to the vector loop, optimistically.
setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
                             LI->getLoopFor(LoopVectorBody),
                             LI->getLoopFor(LoopScalarBody), VF * UF);
3491}

3493void InnerLoopVectorizer::fixCrossIterationPHIs() {
// In order to support recurrences we need to be able to vectorize Phi nodes.
// Phi nodes have cycles, so we need to vectorize them in two stages. This is
// stage #2: We now need to fix the recurrences by adding incoming edges to
// the currently empty PHI nodes. At this point every instruction in the
// original loop is widened to a vector form so we can use them to construct
// the incoming edges.
for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
  // Handle first-order recurrences and reductions that need to be fixed.
  if (Legal->isFirstOrderRecurrence(&Phi))
    fixFirstOrderRecurrence(&Phi);
  else if (Legal->isReductionVariable(&Phi))
    fixReduction(&Phi);
}
3507}

3509void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// This is the second phase of vectorizing first-order recurrences. An
// overview of the transformation is described below. Suppose we have the
// following loop.
//
//   for (int i = 0; i < n; ++i)
//     b[i] = a[i] - a[i - 1];
//
// There is a first-order recurrence on "a". For this loop, the shorthand
// scalar IR looks like:
//
//   scalar.ph:
//     s_init = a[-1]
//     br scalar.body
//
//   scalar.body:
//     i = phi [0, scalar.ph], [i+1, scalar.body]
//     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
//     s2 = a[i]
//     b[i] = s2 - s1
//     br cond, scalar.body, ...
//
// In this example, s1 is a recurrence because it's value depends on the
// previous iteration. In the first phase of vectorization, we created a
// temporary value for s1. We now complete the vectorization and produce the
// shorthand vector IR shown below (for VF = 4, UF = 1).
//
//   vector.ph:
//     v_init = vector(..., ..., ..., a[-1])
//     br vector.body
//
//   vector.body
//     i = phi [0, vector.ph], [i+4, vector.body]
//     v1 = phi [v_init, vector.ph], [v2, vector.body]
//     v2 = a[i, i+1, i+2, i+3];
//     v3 = vector(v1(3), v2(0, 1, 2))
//     b[i, i+1, i+2, i+3] = v2 - v3
//     br cond, vector.body, middle.block
//
//   middle.block:
//     x = v2(3)
//     br scalar.ph
//
//   scalar.ph:
//     s_init = phi [x, middle.block], [a[-1], otherwise]
//     br scalar.body
//
// After execution completes the vector loop, we extract the next value of
// the recurrence (x) to use as the initial value in the scalar loop.

// Get the original loop preheader and single loop latch.
auto *Preheader = OrigLoop->getLoopPreheader();
auto *Latch = OrigLoop->getLoopLatch();

// Get the initial and previous values of the scalar recurrence.
auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
auto *Previous = Phi->getIncomingValueForBlock(Latch);

// Create a vector from the initial value.
auto *VectorInit = ScalarInit;
if (VF > 1) {
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  VectorInit = Builder.CreateInsertElement(
      UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
      Builder.getInt32(VF - 1), "vector.recur.init");
}

// We constructed a temporary phi node in the first phase of vectorization.
// This phi node will eventually be deleted.
Builder.SetInsertPoint(
    cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));

// Create a phi node for the new recurrence. The current value will either be
// the initial value inserted into a vector or loop-varying vector value.
auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);

// Get the vectorized previous value of the last part UF - 1. It appears last
// among all unrolled iterations, due to the order of their construction.
Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);

// Find and set the insertion point after the previous value if it is an
// instruction.
BasicBlock::iterator InsertPt;
// Note that the previous value may have been constant-folded so it is not
// guaranteed to be an instruction in the vector loop.
// FIXME: Loop invariant values do not form recurrences. We should deal with
//        them earlier.
if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
  InsertPt = LoopVectorBody->getFirstInsertionPt();
else {
  Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
  if (isa<PHINode>(PreviousLastPart))
    // If the previous value is a phi node, we should insert after all the phi
    // nodes in the block containing the PHI to avoid breaking basic block
    // verification. Note that the basic block may be different to
    // LoopVectorBody, in case we predicate the loop.
    InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
  else
    InsertPt = ++PreviousInst->getIterator();
}
Builder.SetInsertPoint(&*InsertPt);

// We will construct a vector for the recurrence by combining the values for
// the current and previous iterations. This is the required shuffle mask.
SmallVector<Constant *, 8> ShuffleMask(VF);
ShuffleMask[0] = Builder.getInt32(VF - 1);
for (unsigned I = 1; I < VF; ++I)
  ShuffleMask[I] = Builder.getInt32(I + VF - 1);

// The vector from which to take the initial value for the current iteration
// (actual or unrolled). Initially, this is the vector phi node.
Value *Incoming = VecPhi;

// Shuffle the current and previous vector and update the vector parts.
for (unsigned Part = 0; Part < UF; ++Part) {
  Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
  Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
  auto *Shuffle =
      VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
                                           ConstantVector::get(ShuffleMask))
             : Incoming;
  PhiPart->replaceAllUsesWith(Shuffle);
  cast<Instruction>(PhiPart)->eraseFromParent();
  VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
  Incoming = PreviousPart;
}

// Fix the latch value of the new recurrence in the vector loop.
VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());

// Extract the last vector element in the middle block. This will be the
// initial value for the recurrence when jumping to the scalar loop.
auto *ExtractForScalar = Incoming;
if (VF > 1) {
  Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
  ExtractForScalar = Builder.CreateExtractElement(
      ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
}
// Extract the second last element in the middle block if the
// Phi is used outside the loop. We need to extract the phi itself
// and not the last element (the phi update in the current iteration). This
// will be the value when jumping to the exit block from the LoopMiddleBlock,
// when the scalar loop is not run at all.
Value *ExtractForPhiUsedOutsideLoop = nullptr;
if (VF > 1)
  ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
      Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
// When loop is unrolled without vectorizing, initialize
// ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
// `Incoming`. This is analogous to the vectorized case above: extracting the
// second last element when VF > 1.
else if (UF > 1)
  ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);

// Fix the initial value of the original recurrence in the scalar loop.
Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
for (auto *BB : predecessors(LoopScalarPreHeader)) {
  auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
  Start->addIncoming(Incoming, BB);
}

Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
Phi->setName("scalar.recur");

// Finally, fix users of the recurrence outside the loop. The users will need
// either the last value of the scalar recurrence or the last value of the
// vector recurrence we extracted in the middle block. Since the loop is in
// LCSSA form, we just need to find all the phi nodes for the original scalar
// recurrence in the exit block, and then add an edge for the middle block.
for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
  if (LCSSAPhi.getIncomingValue(0) == Phi) {
    LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
  }
}
3685}

3687void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
Constant *Zero = Builder.getInt32(0);

// Get it's reduction variable descriptor.
assert(Legal->isReductionVariable(Phi) &&((Legal->isReductionVariable(Phi) && "Unable to find the reduction variable"
) ? static_cast<void> (0) : __assert_fail ("Legal->isReductionVariable(Phi) && \"Unable to find the reduction variable\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3692, __PRETTY_FUNCTION__))
       "Unable to find the reduction variable")((Legal->isReductionVariable(Phi) && "Unable to find the reduction variable"
) ? static_cast<void> (0) : __assert_fail ("Legal->isReductionVariable(Phi) && \"Unable to find the reduction variable\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3692, __PRETTY_FUNCTION__));
RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];

RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
  RdxDesc.getMinMaxRecurrenceKind();
setDebugLocFromInst(Builder, ReductionStartValue);

// We need to generate a reduction vector from the incoming scalar.
// To do so, we need to generate the 'identity' vector and override
// one of the elements with the incoming scalar reduction. We need
// to do it in the vector-loop preheader.
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

// This is the vector-clone of the value that leaves the loop.
Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();

// Find the reduction identity variable. Zero for addition, or, xor,
// one for multiplication, -1 for And.
Value *Identity;
Value *VectorStart;
if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
    RK == RecurrenceDescriptor::RK_FloatMinMax) {
  // MinMax reduction have the start value as their identify.
  if (VF == 1) {
    VectorStart = Identity = ReductionStartValue;
  } else {
    VectorStart = Identity =
      Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
  }
} else {
  // Handle other reduction kinds:
  Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
      RK, VecTy->getScalarType());
  if (VF == 1) {
    Identity = Iden;
    // This vector is the Identity vector where the first element is the
    // incoming scalar reduction.
    VectorStart = ReductionStartValue;
  } else {
    Identity = ConstantVector::getSplat(VF, Iden);

    // This vector is the Identity vector where the first element is the
    // incoming scalar reduction.
    VectorStart =
      Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
  }
}

// Wrap flags are in general invalid after vectorization, clear them.
clearReductionWrapFlags(RdxDesc);

// Fix the vector-loop phi.

// Reductions do not have to start at zero. They can start with
// any loop invariant values.
BasicBlock *Latch = OrigLoop->getLoopLatch();
Value *LoopVal = Phi->getIncomingValueForBlock(Latch);

for (unsigned Part = 0; Part < UF; ++Part) {
  Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
  Value *Val = getOrCreateVectorValue(LoopVal, Part);
  // Make sure to add the reduction start value only to the
  // first unroll part.
  Value *StartVal = (Part == 0) ? VectorStart : Identity;
  cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
  cast<PHINode>(VecRdxPhi)
    ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
}

// Before each round, move the insertion point right between
// the PHIs and the values we are going to write.
// This allows us to write both PHINodes and the extractelement
// instructions.
Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

setDebugLocFromInst(Builder, LoopExitInst);

// If tail is folded by masking, the vector value to leave the loop should be
// a Select choosing between the vectorized LoopExitInst and vectorized Phi,
// instead of the former.
if (Cost->foldTailByMasking()) {
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *VecLoopExitInst =
        VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
    Value *Sel = nullptr;
    for (User *U : VecLoopExitInst->users()) {
      if (isa<SelectInst>(U)) {
        assert(!Sel && "Reduction exit feeding two selects")((!Sel && "Reduction exit feeding two selects") ? static_cast
<void> (0) : __assert_fail ("!Sel && \"Reduction exit feeding two selects\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3782, __PRETTY_FUNCTION__));
        Sel = U;
      } else
        assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select")((isa<PHINode>(U) && "Reduction exit must feed Phi's or select"
) ? static_cast<void> (0) : __assert_fail ("isa<PHINode>(U) && \"Reduction exit must feed Phi's or select\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3785, __PRETTY_FUNCTION__));
    }
    assert(Sel && "Reduction exit feeds no select")((Sel && "Reduction exit feeds no select") ? static_cast
<void> (0) : __assert_fail ("Sel && \"Reduction exit feeds no select\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3787, __PRETTY_FUNCTION__));
    VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
  }
}

// If the vector reduction can be performed in a smaller type, we truncate
// then extend the loop exit value to enable InstCombine to evaluate the
// entire expression in the smaller type.
if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
  Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
  Builder.SetInsertPoint(
      LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
  VectorParts RdxParts(UF);
  for (unsigned Part = 0; Part < UF; ++Part) {
    RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
    Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
                                      : Builder.CreateZExt(Trunc, VecTy);
    for (Value::user_iterator UI = RdxParts[Part]->user_begin();
         UI != RdxParts[Part]->user_end();)
      if (*UI != Trunc) {
        (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
        RdxParts[Part] = Extnd;
      } else {
        ++UI;
      }
  }
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
  for (unsigned Part = 0; Part < UF; ++Part) {
    RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
  }
}

// Reduce all of the unrolled parts into a single vector.
Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);

// The middle block terminator has already been assigned a DebugLoc here (the
// OrigLoop's single latch terminator). We want the whole middle block to
// appear to execute on this line because: (a) it is all compiler generated,
// (b) these instructions are always executed after evaluating the latch
// conditional branch, and (c) other passes may add new predecessors which
// terminate on this line. This is the easiest way to ensure we don't
// accidentally cause an extra step back into the loop while debugging.
setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
for (unsigned Part = 1; Part < UF; ++Part) {
  Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
  if (Op != Instruction::ICmp && Op != Instruction::FCmp)
    // Floating point operations had to be 'fast' to enable the reduction.
    ReducedPartRdx = addFastMathFlag(
        Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
                            ReducedPartRdx, "bin.rdx"),
        RdxDesc.getFastMathFlags());
  else
    ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
                                    RdxPart);
}

if (VF > 1) {
  bool NoNaN = Legal->hasFunNoNaNAttr();
  ReducedPartRdx =
      createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
  // If the reduction can be performed in a smaller type, we need to extend
  // the reduction to the wider type before we branch to the original loop.
  if (Phi->getType() != RdxDesc.getRecurrenceType())
    ReducedPartRdx =
      RdxDesc.isSigned()
      ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
      : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
}

// Create a phi node that merges control-flow from the backedge-taken check
// block and the middle block.
PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
                                      LoopScalarPreHeader->getTerminator());
for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
  BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

// Now, we need to fix the users of the reduction variable
// inside and outside of the scalar remainder loop.
// We know that the loop is in LCSSA form. We need to update the
// PHI nodes in the exit blocks.
for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
  // All PHINodes need to have a single entry edge, or two if
  // we already fixed them.
  assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI")((LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI"
) ? static_cast<void> (0) : __assert_fail ("LCSSAPhi.getNumIncomingValues() < 3 && \"Invalid LCSSA PHI\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3874, __PRETTY_FUNCTION__));

  // We found a reduction value exit-PHI. Update it with the
  // incoming bypass edge.
  if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
    LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
} // end of the LCSSA phi scan.

  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
int IncomingEdgeBlockIdx =
  Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index")((IncomingEdgeBlockIdx >= 0 && "Invalid block index"
) ? static_cast<void> (0) : __assert_fail ("IncomingEdgeBlockIdx >= 0 && \"Invalid block index\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3886, __PRETTY_FUNCTION__));
// Pick the other block.
int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3891}

3893void InnerLoopVectorizer::clearReductionWrapFlags(
  RecurrenceDescriptor &RdxDesc) {
RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
    RK != RecurrenceDescriptor::RK_IntegerMult)
  return;

Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
assert(LoopExitInstr && "null loop exit instruction")((LoopExitInstr && "null loop exit instruction") ? static_cast
<void> (0) : __assert_fail ("LoopExitInstr && \"null loop exit instruction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3901, __PRETTY_FUNCTION__));
SmallVector<Instruction *, 8> Worklist;
SmallPtrSet<Instruction *, 8> Visited;
Worklist.push_back(LoopExitInstr);
Visited.insert(LoopExitInstr);

while (!Worklist.empty()) {
  Instruction *Cur = Worklist.pop_back_val();
  if (isa<OverflowingBinaryOperator>(Cur))
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *V = getOrCreateVectorValue(Cur, Part);
      cast<Instruction>(V)->dropPoisonGeneratingFlags();
    }

  for (User *U : Cur->users()) {
    Instruction *UI = cast<Instruction>(U);
    if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
        Visited.insert(UI).second)
      Worklist.push_back(UI);
  }
}
3922}

3924void InnerLoopVectorizer::fixLCSSAPHIs() {
for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
  if (LCSSAPhi.getNumIncomingValues() == 1) {
    auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
    // Non-instruction incoming values will have only one value.
    unsigned LastLane = 0;
    if (isa<Instruction>(IncomingValue)) 
        LastLane = Cost->isUniformAfterVectorization(
                       cast<Instruction>(IncomingValue), VF)
                       ? 0
                       : VF - 1;
    // Can be a loop invariant incoming value or the last scalar value to be
    // extracted from the vectorized loop.
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    Value *lastIncomingValue =
        getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
    LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
  }
}
3943}

3945void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
// The basic block and loop containing the predicated instruction.
auto *PredBB = PredInst->getParent();
auto *VectorLoop = LI->getLoopFor(PredBB);

// Initialize a worklist with the operands of the predicated instruction.
SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

// Holds instructions that we need to analyze again. An instruction may be
// reanalyzed if we don't yet know if we can sink it or not.
SmallVector<Instruction *, 8> InstsToReanalyze;

// Returns true if a given use occurs in the predicated block. Phi nodes use
// their operands in their corresponding predecessor blocks.
auto isBlockOfUsePredicated = [&](Use &U) -> bool {
  auto *I = cast<Instruction>(U.getUser());
  BasicBlock *BB = I->getParent();
  if (auto *Phi = dyn_cast<PHINode>(I))
    BB = Phi->getIncomingBlock(
        PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
  return BB == PredBB;
};

// Iteratively sink the scalarized operands of the predicated instruction
// into the block we created for it. When an instruction is sunk, it's
// operands are then added to the worklist. The algorithm ends after one pass
// through the worklist doesn't sink a single instruction.
bool Changed;
do {
  // Add the instructions that need to be reanalyzed to the worklist, and
  // reset the changed indicator.
  Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
  InstsToReanalyze.clear();
  Changed = false;

  while (!Worklist.empty()) {
    auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

    // We can't sink an instruction if it is a phi node, is already in the
    // predicated block, is not in the loop, or may have side effects.
    if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
        !VectorLoop->contains(I) || I->mayHaveSideEffects())
      continue;

    // It's legal to sink the instruction if all its uses occur in the
    // predicated block. Otherwise, there's nothing to do yet, and we may
    // need to reanalyze the instruction.
    if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
      InstsToReanalyze.push_back(I);
      continue;
    }

    // Move the instruction to the beginning of the predicated block, and add
    // it's operands to the worklist.
    I->moveBefore(&*PredBB->getFirstInsertionPt());
    Worklist.insert(I->op_begin(), I->op_end());

    // The sinking may have enabled other instructions to be sunk, so we will
    // need to iterate.
    Changed = true;
  }
} while (Changed);
4007}

4009void InnerLoopVectorizer::fixNonInductionPHIs() {
for (PHINode *OrigPhi : OrigPHIsToFix) {
  PHINode *NewPhi =
      cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
  unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();

  SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
      predecessors(OrigPhi->getParent()));
  SmallVector<BasicBlock *, 2> VectorBBPredecessors(
      predecessors(NewPhi->getParent()));
  assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&((ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
 "Scalar and Vector BB should have the same number of predecessors"
) ? static_cast<void> (0) : __assert_fail ("ScalarBBPredecessors.size() == VectorBBPredecessors.size() && \"Scalar and Vector BB should have the same number of predecessors\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4020, __PRETTY_FUNCTION__))
         "Scalar and Vector BB should have the same number of predecessors")((ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
 "Scalar and Vector BB should have the same number of predecessors"
) ? static_cast<void> (0) : __assert_fail ("ScalarBBPredecessors.size() == VectorBBPredecessors.size() && \"Scalar and Vector BB should have the same number of predecessors\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4020, __PRETTY_FUNCTION__));

  // The insertion point in Builder may be invalidated by the time we get
  // here. Force the Builder insertion point to something valid so that we do
  // not run into issues during insertion point restore in
  // getOrCreateVectorValue calls below.
  Builder.SetInsertPoint(NewPhi);

  // The predecessor order is preserved and we can rely on mapping between
  // scalar and vector block predecessors.
  for (unsigned i = 0; i < NumIncomingValues; ++i) {
    BasicBlock *NewPredBB = VectorBBPredecessors[i];

    // When looking up the new scalar/vector values to fix up, use incoming
    // values from original phi.
    Value *ScIncV =
        OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);

    // Scalar incoming value may need a broadcast
    Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
    NewPhi->addIncoming(NewIncV, NewPredBB);
  }
}
4043}

4045void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
                                 unsigned VF, bool IsPtrLoopInvariant,
                                 SmallBitVector &IsIndexLoopInvariant) {
// Construct a vector GEP by widening the operands of the scalar GEP as
// necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
// results in a vector of pointers when at least one operand of the GEP
// is vector-typed. Thus, to keep the representation compact, we only use
// vector-typed operands for loop-varying values.

if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
  // If we are vectorizing, but the GEP has only loop-invariant operands,
  // the GEP we build (by only using vector-typed operands for
  // loop-varying values) would be a scalar pointer. Thus, to ensure we
  // produce a vector of pointers, we need to either arbitrarily pick an
  // operand to broadcast, or broadcast a clone of the original GEP.
  // Here, we broadcast a clone of the original.
  //
  // TODO: If at some point we decide to scalarize instructions having
  //       loop-invariant operands, this special case will no longer be
  //       required. We would add the scalarization decision to
  //       collectLoopScalars() and teach getVectorValue() to broadcast
  //       the lane-zero scalar value.
  auto *Clone = Builder.Insert(GEP->clone());
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
    VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
    addMetadata(EntryPart, GEP);
  }
} else {
  // If the GEP has at least one loop-varying operand, we are sure to
  // produce a vector of pointers. But if we are only unrolling, we want
  // to produce a scalar GEP for each unroll part. Thus, the GEP we
  // produce with the code below will be scalar (if VF == 1) or vector
  // (otherwise). Note that for the unroll-only case, we still maintain
  // values in the vector mapping with initVector, as we do for other
  // instructions.
  for (unsigned Part = 0; Part < UF; ++Part) {
    // The pointer operand of the new GEP. If it's loop-invariant, we
    // won't broadcast it.
    auto *Ptr = IsPtrLoopInvariant
                    ? GEP->getPointerOperand()
                    : getOrCreateVectorValue(GEP->getPointerOperand(), Part);

    // Collect all the indices for the new GEP. If any index is
    // loop-invariant, we won't broadcast it.
    SmallVector<Value *, 4> Indices;
    for (auto Index : enumerate(GEP->indices())) {
      Value *User = Index.value().get();
      if (IsIndexLoopInvariant[Index.index()])
        Indices.push_back(User);
      else
        Indices.push_back(getOrCreateVectorValue(User, Part));
    }

    // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
    // but it should be a vector, otherwise.
    auto *NewGEP =
        GEP->isInBounds()
            ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
                                        Indices)
            : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
    assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&(((VF == 1 || NewGEP->getType()->isVectorTy()) &&
 "NewGEP is not a pointer vector") ? static_cast<void> (
0) : __assert_fail ("(VF == 1 || NewGEP->getType()->isVectorTy()) && \"NewGEP is not a pointer vector\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4107, __PRETTY_FUNCTION__))
           "NewGEP is not a pointer vector")(((VF == 1 || NewGEP->getType()->isVectorTy()) &&
 "NewGEP is not a pointer vector") ? static_cast<void> (
0) : __assert_fail ("(VF == 1 || NewGEP->getType()->isVectorTy()) && \"NewGEP is not a pointer vector\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4107, __PRETTY_FUNCTION__));
    VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
    addMetadata(NewGEP, GEP);
  }
}
4112}

4114void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
                                            unsigned VF) {
PHINode *P = cast<PHINode>(PN);
if (EnableVPlanNativePath) {
  // Currently we enter here in the VPlan-native path for non-induction
  // PHIs where all control flow is uniform. We simply widen these PHIs.
  // Create a vector phi with no operands - the vector phi operands will be
  // set at the end of vector code generation.
  Type *VecTy =
      (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
  Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
  VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
  OrigPHIsToFix.push_back(P);

  return;
}

assert(PN->getParent() == OrigLoop->getHeader() &&((PN->getParent() == OrigLoop->getHeader() && "Non-header phis should have been handled elsewhere"
) ? static_cast<void> (0) : __assert_fail ("PN->getParent() == OrigLoop->getHeader() && \"Non-header phis should have been handled elsewhere\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4132, __PRETTY_FUNCTION__))
       "Non-header phis should have been handled elsewhere")((PN->getParent() == OrigLoop->getHeader() && "Non-header phis should have been handled elsewhere"
) ? static_cast<void> (0) : __assert_fail ("PN->getParent() == OrigLoop->getHeader() && \"Non-header phis should have been handled elsewhere\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4132, __PRETTY_FUNCTION__));

// In order to support recurrences we need to be able to vectorize Phi nodes.
// Phi nodes have cycles, so we need to vectorize them in two stages. This is
// stage #1: We create a new vector PHI node with no incoming edges. We'll use
// this value when we vectorize all of the instructions that use the PHI.
if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
  for (unsigned Part = 0; Part < UF; ++Part) {
    // This is phase one of vectorizing PHIs.
    Type *VecTy =
        (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
    Value *EntryPart = PHINode::Create(
        VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
    VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
  }
  return;
}

setDebugLocFromInst(Builder, P);

// This PHINode must be an induction variable.
// Make sure that we know about it.
assert(Legal->getInductionVars().count(P) && "Not an induction variable")((Legal->getInductionVars().count(P) && "Not an induction variable"
) ? static_cast<void> (0) : __assert_fail ("Legal->getInductionVars().count(P) && \"Not an induction variable\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4154, __PRETTY_FUNCTION__));

InductionDescriptor II = Legal->getInductionVars().lookup(P);
const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

// FIXME: The newly created binary instructions should contain nsw/nuw flags,
// which can be found from the original scalar operations.
switch (II.getKind()) {
case InductionDescriptor::IK_NoInduction:
  llvm_unreachable("Unknown induction")::llvm::llvm_unreachable_internal("Unknown induction", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4163);
case InductionDescriptor::IK_IntInduction:
case InductionDescriptor::IK_FpInduction:
  llvm_unreachable("Integer/fp induction is handled elsewhere.")::llvm::llvm_unreachable_internal("Integer/fp induction is handled elsewhere."
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4166);
case InductionDescriptor::IK_PtrInduction: {
  // Handle the pointer induction variable case.
  assert(P->getType()->isPointerTy() && "Unexpected type.")((P->getType()->isPointerTy() && "Unexpected type."
) ? static_cast<void> (0) : __assert_fail ("P->getType()->isPointerTy() && \"Unexpected type.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4169, __PRETTY_FUNCTION__));
  // This is the normalized GEP that starts counting at zero.
  Value *PtrInd = Induction;
  PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
  // Determine the number of scalars we need to generate for each unroll
  // iteration. If the instruction is uniform, we only need to generate the
  // first lane. Otherwise, we generate all VF values.
  unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
  // These are the scalar results. Notice that we don't generate vector GEPs
  // because scalar GEPs result in better code.
  for (unsigned Part = 0; Part < UF; ++Part) {
    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
      Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
      Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
      Value *SclrGep =
          emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
      SclrGep->setName("next.gep");
      VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
    }
  }
  return;
}
}
4192}

4194/// A helper function for checking whether an integer division-related
4195/// instruction may divide by zero (in which case it must be predicated if
4196/// executed conditionally in the scalar code).
4197/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4198/// Non-zero divisors that are non compile-time constants will not be
4199/// converted into multiplication, so we will still end up scalarizing
4200/// the division, but can do so w/o predication.
4201static bool mayDivideByZero(Instruction &I) {
assert((I.getOpcode() == Instruction::UDiv ||(((I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction
::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode()
 == Instruction::SRem) && "Unexpected instruction") ?
 static_cast<void> (0) : __assert_fail ("(I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode() == Instruction::SRem) && \"Unexpected instruction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4206, __PRETTY_FUNCTION__))
        I.getOpcode() == Instruction::SDiv ||(((I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction
::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode()
 == Instruction::SRem) && "Unexpected instruction") ?
 static_cast<void> (0) : __assert_fail ("(I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode() == Instruction::SRem) && \"Unexpected instruction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4206, __PRETTY_FUNCTION__))
        I.getOpcode() == Instruction::URem ||(((I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction
::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode()
 == Instruction::SRem) && "Unexpected instruction") ?
 static_cast<void> (0) : __assert_fail ("(I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode() == Instruction::SRem) && \"Unexpected instruction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4206, __PRETTY_FUNCTION__))
        I.getOpcode() == Instruction::SRem) &&(((I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction
::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode()
 == Instruction::SRem) && "Unexpected instruction") ?
 static_cast<void> (0) : __assert_fail ("(I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode() == Instruction::SRem) && \"Unexpected instruction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4206, __PRETTY_FUNCTION__))
       "Unexpected instruction")(((I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction
::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode()
 == Instruction::SRem) && "Unexpected instruction") ?
 static_cast<void> (0) : __assert_fail ("(I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode() == Instruction::SRem) && \"Unexpected instruction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4206, __PRETTY_FUNCTION__));
Value *Divisor = I.getOperand(1);
auto *CInt = dyn_cast<ConstantInt>(Divisor);
return !CInt || CInt->isZero();
4210}

4212void InnerLoopVectorizer::widenInstruction(Instruction &I) {
switch (I.getOpcode()) {
case Instruction::Br:
case Instruction::PHI:
case Instruction::GetElementPtr:
  llvm_unreachable("This instruction is handled by a different recipe.")::llvm::llvm_unreachable_internal("This instruction is handled by a different recipe."
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4217);
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::SRem:
case Instruction::URem:
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::FNeg:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::FDiv:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
  // Just widen unops and binops.
  setDebugLocFromInst(Builder, &I);

  for (unsigned Part = 0; Part < UF; ++Part) {
    SmallVector<Value *, 2> Ops;
    for (Value *Op : I.operands())
      Ops.push_back(getOrCreateVectorValue(Op, Part));

    Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);

    if (auto *VecOp = dyn_cast<Instruction>(V))
      VecOp->copyIRFlags(&I);

    // Use this vector value for all users of the original instruction.
    VectorLoopValueMap.setVectorValue(&I, Part, V);
    addMetadata(V, &I);
  }

  break;
}
case Instruction::Select: {
  // Widen selects.
  // If the selector is loop invariant we can create a select
  // instruction with a scalar condition. Otherwise, use vector-select.
  auto *SE = PSE.getSE();
  bool InvariantCond =
      SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
  setDebugLocFromInst(Builder, &I);

  // The condition can be loop invariant  but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.

  auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});

  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
    Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
    Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
    Value *Sel =
        Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
    VectorLoopValueMap.setVectorValue(&I, Part, Sel);
    addMetadata(Sel, &I);
  }

  break;
}

case Instruction::ICmp:
case Instruction::FCmp: {
  // Widen compares. Generate vector compares.
  bool FCmp = (I.getOpcode() == Instruction::FCmp);
  auto *Cmp = cast<CmpInst>(&I);
  setDebugLocFromInst(Builder, Cmp);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
    Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
    Value *C = nullptr;
    if (FCmp) {
      // Propagate fast math flags.
      IRBuilder<>::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(Cmp->getFastMathFlags());
      C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
    } else {
      C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
    }
    VectorLoopValueMap.setVectorValue(&I, Part, C);
    addMetadata(C, &I);
  }

  break;
}

case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
  auto *CI = cast<CastInst>(&I);
  setDebugLocFromInst(Builder, CI);

  /// Vectorize casts.
  Type *DestTy =
      (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);

  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
    Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
    VectorLoopValueMap.setVectorValue(&I, Part, Cast);
    addMetadata(Cast, &I);
  }
  break;
}

case Instruction::Call: {
  // Ignore dbg intrinsics.
  if (isa<DbgInfoIntrinsic>(I))
    break;
  setDebugLocFromInst(Builder, &I);

  Module *M = I.getParent()->getParent()->getParent();
  auto *CI = cast<CallInst>(&I);

  SmallVector<Type *, 4> Tys;
  for (Value *ArgOperand : CI->arg_operands())
    Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));

  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

  // The flag shows whether we use Intrinsic or a usual Call for vectorized
  // version of the instruction.
  // Is it beneficial to perform intrinsic call compared to lib call?
  bool NeedToScalarize = false;
  unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
  bool UseVectorIntrinsic =
      ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
  assert((UseVectorIntrinsic || !NeedToScalarize) &&(((UseVectorIntrinsic || !NeedToScalarize) && "Instruction should be scalarized elsewhere."
) ? static_cast<void> (0) : __assert_fail ("(UseVectorIntrinsic || !NeedToScalarize) && \"Instruction should be scalarized elsewhere.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4362, __PRETTY_FUNCTION__))
         "Instruction should be scalarized elsewhere.")(((UseVectorIntrinsic || !NeedToScalarize) && "Instruction should be scalarized elsewhere."
) ? static_cast<void> (0) : __assert_fail ("(UseVectorIntrinsic || !NeedToScalarize) && \"Instruction should be scalarized elsewhere.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4362, __PRETTY_FUNCTION__));

  for (unsigned Part = 0; Part < UF; ++Part) {
    SmallVector<Value *, 4> Args;
    for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
      Value *Arg = CI->getArgOperand(i);
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
        Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseVectorIntrinsic) {
      // Use vector version of the intrinsic.
      Type *TysForDecl[] = {CI->getType()};
      if (VF > 1)
        TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
      VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
    } else {
      // Use vector version of the function call.
      const VFShape Shape =
          VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
4386#ifndef NDEBUG
      const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI);
      assert(std::find_if(Infos.begin(), Infos.end(),((std::find_if(Infos.begin(), Infos.end(), [&Shape](const
 VFInfo &Info) { return Info.Shape == Shape; }) != Infos.
end() && "Vector function shape is missing from the database."
) ? static_cast<void> (0) : __assert_fail ("std::find_if(Infos.begin(), Infos.end(), [&Shape](const VFInfo &Info) { return Info.Shape == Shape; }) != Infos.end() && \"Vector function shape is missing from the database.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4392, __PRETTY_FUNCTION__))
                          [&Shape](const VFInfo &Info) {((std::find_if(Infos.begin(), Infos.end(), [&Shape](const
 VFInfo &Info) { return Info.Shape == Shape; }) != Infos.
end() && "Vector function shape is missing from the database."
) ? static_cast<void> (0) : __assert_fail ("std::find_if(Infos.begin(), Infos.end(), [&Shape](const VFInfo &Info) { return Info.Shape == Shape; }) != Infos.end() && \"Vector function shape is missing from the database.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4392, __PRETTY_FUNCTION__))
                            return Info.Shape == Shape;((std::find_if(Infos.begin(), Infos.end(), [&Shape](const
 VFInfo &Info) { return Info.Shape == Shape; }) != Infos.
end() && "Vector function shape is missing from the database."
) ? static_cast<void> (0) : __assert_fail ("std::find_if(Infos.begin(), Infos.end(), [&Shape](const VFInfo &Info) { return Info.Shape == Shape; }) != Infos.end() && \"Vector function shape is missing from the database.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4392, __PRETTY_FUNCTION__))
                          }) != Infos.end() &&((std::find_if(Infos.begin(), Infos.end(), [&Shape](const
 VFInfo &Info) { return Info.Shape == Shape; }) != Infos.
end() && "Vector function shape is missing from the database."
) ? static_cast<void> (0) : __assert_fail ("std::find_if(Infos.begin(), Infos.end(), [&Shape](const VFInfo &Info) { return Info.Shape == Shape; }) != Infos.end() && \"Vector function shape is missing from the database.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4392, __PRETTY_FUNCTION__))
             "Vector function shape is missing from the database.")((std::find_if(Infos.begin(), Infos.end(), [&Shape](const
 VFInfo &Info) { return Info.Shape == Shape; }) != Infos.
end() && "Vector function shape is missing from the database."
) ? static_cast<void> (0) : __assert_fail ("std::find_if(Infos.begin(), Infos.end(), [&Shape](const VFInfo &Info) { return Info.Shape == Shape; }) != Infos.end() && \"Vector function shape is missing from the database.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4392, __PRETTY_FUNCTION__));
4393#endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    assert(VectorF && "Can't create vector function.")((VectorF && "Can't create vector function.") ? static_cast
<void> (0) : __assert_fail ("VectorF && \"Can't create vector function.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4396, __PRETTY_FUNCTION__));

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    VectorLoopValueMap.setVectorValue(&I, Part, V);
    addMetadata(V, &I);
  }

  break;
}

default:
  // This instruction is not vectorized by simple widening.
  LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found an unhandled instruction: "
 << I; } } while (false);
  llvm_unreachable("Unhandled instruction!")::llvm::llvm_unreachable_internal("Unhandled instruction!", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4415);
} // end of switch.
4417}

4419void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// We should not collect Scalars more than once per VF. Right now, this
// function is called from collectUniformsAndScalars(), which already does
// this check. Collecting Scalars for VF=1 does not make any sense.
assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&((VF >= 2 && Scalars.find(VF) == Scalars.end() &&
 "This function should not be visited twice for the same VF")
 ? static_cast<void> (0) : __assert_fail ("VF >= 2 && Scalars.find(VF) == Scalars.end() && \"This function should not be visited twice for the same VF\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4424, __PRETTY_FUNCTION__))
       "This function should not be visited twice for the same VF")((VF >= 2 && Scalars.find(VF) == Scalars.end() &&
 "This function should not be visited twice for the same VF")
 ? static_cast<void> (0) : __assert_fail ("VF >= 2 && Scalars.find(VF) == Scalars.end() && \"This function should not be visited twice for the same VF\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4424, __PRETTY_FUNCTION__));

SmallSetVector<Instruction *, 8> Worklist;

// These sets are used to seed the analysis with pointers used by memory
// accesses that will remain scalar.
SmallSetVector<Instruction *, 8> ScalarPtrs;
SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;

// A helper that returns true if the use of Ptr by MemAccess will be scalar.
// The pointer operands of loads and stores will be scalar as long as the
// memory access is not a gather or scatter operation. The value operand of a
// store will remain scalar if the store is scalarized.
auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
  InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
  assert(WideningDecision != CM_Unknown &&((WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"
) ? static_cast<void> (0) : __assert_fail ("WideningDecision != CM_Unknown && \"Widening decision should be ready at this moment\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4440, __PRETTY_FUNCTION__))
         "Widening decision should be ready at this moment")((WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"
) ? static_cast<void> (0) : __assert_fail ("WideningDecision != CM_Unknown && \"Widening decision should be ready at this moment\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4440, __PRETTY_FUNCTION__));
  if (auto *Store = dyn_cast<StoreInst>(MemAccess))
    if (Ptr == Store->getValueOperand())
      return WideningDecision == CM_Scalarize;
  assert(Ptr == getLoadStorePointerOperand(MemAccess) &&((Ptr == getLoadStorePointerOperand(MemAccess) && "Ptr is neither a value or pointer operand"
) ? static_cast<void> (0) : __assert_fail ("Ptr == getLoadStorePointerOperand(MemAccess) && \"Ptr is neither a value or pointer operand\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4445, __PRETTY_FUNCTION__))
         "Ptr is neither a value or pointer operand")((Ptr == getLoadStorePointerOperand(MemAccess) && "Ptr is neither a value or pointer operand"
) ? static_cast<void> (0) : __assert_fail ("Ptr == getLoadStorePointerOperand(MemAccess) && \"Ptr is neither a value or pointer operand\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4445, __PRETTY_FUNCTION__));
  return WideningDecision != CM_GatherScatter;
};

// A helper that returns true if the given value is a bitcast or
// getelementptr instruction contained in the loop.
auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
  return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
          isa<GetElementPtrInst>(V)) &&
         !TheLoop->isLoopInvariant(V);
};

// A helper that evaluates a memory access's use of a pointer. If the use
// will be a scalar use, and the pointer is only used by memory accesses, we
// place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
// PossibleNonScalarPtrs.
auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
  // We only care about bitcast and getelementptr instructions contained in
  // the loop.
  if (!isLoopVaryingBitCastOrGEP(Ptr))
    return;

  // If the pointer has already been identified as scalar (e.g., if it was
  // also identified as uniform), there's nothing to do.
  auto *I = cast<Instruction>(Ptr);
  if (Worklist.count(I))
    return;

  // If the use of the pointer will be a scalar use, and all users of the
  // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
  // place the pointer in PossibleNonScalarPtrs.
  if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
        return isa<LoadInst>(U) || isa<StoreInst>(U);
      }))
    ScalarPtrs.insert(I);
  else
    PossibleNonScalarPtrs.insert(I);
};

// We seed the scalars analysis with three classes of instructions: (1)
// instructions marked uniform-after-vectorization, (2) bitcast and
// getelementptr instructions used by memory accesses requiring a scalar use,
// and (3) pointer induction variables and their update instructions (we
// currently only scalarize these).
//
// (1) Add to the worklist all instructions that have been identified as
// uniform-after-vectorization.
Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

// (2) Add to the worklist all bitcast and getelementptr instructions used by
// memory accesses requiring a scalar use. The pointer operands of loads and
// stores will be scalar as long as the memory accesses is not a gather or
// scatter operation. The value operand of a store will remain scalar if the
// store is scalarized.
for (auto *BB : TheLoop->blocks())
  for (auto &I : *BB) {
    if (auto *Load = dyn_cast<LoadInst>(&I)) {
      evaluatePtrUse(Load, Load->getPointerOperand());
    } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
      evaluatePtrUse(Store, Store->getPointerOperand());
      evaluatePtrUse(Store, Store->getValueOperand());
    }
  }
for (auto *I : ScalarPtrs)
  if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
 << *I << "\n"; } } while (false);
    Worklist.insert(I);
  }

// (3) Add to the worklist all pointer induction variables and their update
// instructions.
//
// TODO: Once we are able to vectorize pointer induction variables we should
//       no longer insert them into the worklist here.
auto *Latch = TheLoop->getLoopLatch();
for (auto &Induction : Legal->getInductionVars()) {
  auto *Ind = Induction.first;
  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
  if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
    continue;
  Worklist.insert(Ind);
  Worklist.insert(IndUpdate);
  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
 << *Ind << "\n"; } } while (false);
  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdatedo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
 << *IndUpdate << "\n"; } } while (false)
                    << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
 << *IndUpdate << "\n"; } } while (false);
}

// Insert the forced scalars.
// FIXME: Currently widenPHIInstruction() often creates a dead vector
// induction variable when the PHI user is scalarized.
auto ForcedScalar = ForcedScalars.find(VF);
if (ForcedScalar != ForcedScalars.end())
  for (auto *I : ForcedScalar->second)
    Worklist.insert(I);

// Expand the worklist by looking through any bitcasts and getelementptr
// instructions we've already identified as scalar. This is similar to the
// expansion step in collectLoopUniforms(); however, here we're only
// expanding to include additional bitcasts and getelementptr instructions.
unsigned Idx = 0;
while (Idx != Worklist.size()) {
  Instruction *Dst = Worklist[Idx++];
  if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
    continue;
  auto *Src = cast<Instruction>(Dst->getOperand(0));
  if (llvm::all_of(Src->users(), [&](User *U) -> bool {
        auto *J = cast<Instruction>(U);
        return !TheLoop->contains(J) || Worklist.count(J) ||
               ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
                isScalarUse(J, Src));
      })) {
    Worklist.insert(Src);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
 << *Src << "\n"; } } while (false);
  }
}

// An induction variable will remain scalar if all users of the induction
// variable and induction variable update remain scalar.
for (auto &Induction : Legal->getInductionVars()) {
  auto *Ind = Induction.first;
  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

  // We already considered pointer induction variables, so there's no reason
  // to look at their users again.
  //
  // TODO: Once we are able to vectorize pointer induction variables we
  //       should no longer skip over them here.
  if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
    continue;

  // Determine if all users of the induction variable are scalar after
  // vectorization.
  auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
  });
  if (!ScalarInd)
    continue;

  // Determine if all users of the induction variable update instruction are
  // scalar after vectorization.
  auto ScalarIndUpdate =
      llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
        auto *I = cast<Instruction>(U);
        return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
      });
  if (!ScalarIndUpdate)
    continue;

  // The induction variable and its update instruction will remain scalar.
  Worklist.insert(Ind);
  Worklist.insert(IndUpdate);
  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
 << *Ind << "\n"; } } while (false);
  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdatedo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
 << *IndUpdate << "\n"; } } while (false)
                    << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
 << *IndUpdate << "\n"; } } while (false);
}

Scalars[VF].insert(Worklist.begin(), Worklist.end());
4603}

4605bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
if (!blockNeedsPredication(I->getParent()))
  return false;
switch(I->getOpcode()) {
default:
  break;
case Instruction::Load:
case Instruction::Store: {
  if (!Legal->isMaskRequired(I))
    return false;
  auto *Ptr = getLoadStorePointerOperand(I);
  auto *Ty = getMemInstValueType(I);
  // We have already decided how to vectorize this instruction, get that
  // result.
  if (VF > 1) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&((WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"
) ? static_cast<void> (0) : __assert_fail ("WideningDecision != CM_Unknown && \"Widening decision should be ready at this moment\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4622, __PRETTY_FUNCTION__))
           "Widening decision should be ready at this moment")((WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"
) ? static_cast<void> (0) : __assert_fail ("WideningDecision != CM_Unknown && \"Widening decision should be ready at this moment\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4622, __PRETTY_FUNCTION__));
    return WideningDecision == CM_Scalarize;
  }
  const MaybeAlign Alignment = getLoadStoreAlignment(I);
  return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
                              isLegalMaskedGather(Ty, Alignment))
                          : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
                              isLegalMaskedScatter(Ty, Alignment));
}
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::SRem:
case Instruction::URem:
  return mayDivideByZero(*I);
}
return false;
4638}

4640bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
                                                             unsigned VF) {
assert(isAccessInterleaved(I) && "Expecting interleaved access.")((isAccessInterleaved(I) && "Expecting interleaved access."
) ? static_cast<void> (0) : __assert_fail ("isAccessInterleaved(I) && \"Expecting interleaved access.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4642, __PRETTY_FUNCTION__));
assert(getWideningDecision(I, VF) == CM_Unknown &&((getWideningDecision(I, VF) == CM_Unknown && "Decision should not be set yet."
) ? static_cast<void> (0) : __assert_fail ("getWideningDecision(I, VF) == CM_Unknown && \"Decision should not be set yet.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4644, __PRETTY_FUNCTION__))
       "Decision should not be set yet.")((getWideningDecision(I, VF) == CM_Unknown && "Decision should not be set yet."
) ? static_cast<void> (0) : __assert_fail ("getWideningDecision(I, VF) == CM_Unknown && \"Decision should not be set yet.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4644, __PRETTY_FUNCTION__));
auto *Group = getInterleavedAccessGroup(I);
assert(Group && "Must have a group.")((Group && "Must have a group.") ? static_cast<void
> (0) : __assert_fail ("Group && \"Must have a group.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4646, __PRETTY_FUNCTION__));

// If the instruction's allocated size doesn't equal it's type size, it
// requires padding and will be scalarized.
auto &DL = I->getModule()->getDataLayout();
auto *ScalarTy = getMemInstValueType(I);
if (hasIrregularType(ScalarTy, DL, VF))
  return false;

// Check if masking is required.
// A Group may need masking for one of two reasons: it resides in a block that
// needs predication, or it was decided to use masking to deal with gaps.
bool PredicatedAccessRequiresMasking =
    Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
bool AccessWithGapsRequiresMasking =
    Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
  return true;

// If masked interleaving is required, we expect that the user/target had
// enabled it, because otherwise it either wouldn't have been created or
// it should have been invalidated by the CostModel.
assert(useMaskedInterleavedAccesses(TTI) &&((useMaskedInterleavedAccesses(TTI) && "Masked interleave-groups for predicated accesses are not enabled."
) ? static_cast<void> (0) : __assert_fail ("useMaskedInterleavedAccesses(TTI) && \"Masked interleave-groups for predicated accesses are not enabled.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4669, __PRETTY_FUNCTION__))
       "Masked interleave-groups for predicated accesses are not enabled.")((useMaskedInterleavedAccesses(TTI) && "Masked interleave-groups for predicated accesses are not enabled."
) ? static_cast<void> (0) : __assert_fail ("useMaskedInterleavedAccesses(TTI) && \"Masked interleave-groups for predicated accesses are not enabled.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4669, __PRETTY_FUNCTION__));

auto *Ty = getMemInstValueType(I);
const MaybeAlign Alignment = getLoadStoreAlignment(I);
return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
                        : TTI.isLegalMaskedStore(Ty, Alignment);
4675}

4677bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
                                                             unsigned VF) {
// Get and ensure we have a valid memory instruction.
LoadInst *LI = dyn_cast<LoadInst>(I);
StoreInst *SI = dyn_cast<StoreInst>(I);
assert((LI || SI) && "Invalid memory instruction")(((LI || SI) && "Invalid memory instruction") ? static_cast
<void> (0) : __assert_fail ("(LI || SI) && \"Invalid memory instruction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4682, __PRETTY_FUNCTION__));

auto *Ptr = getLoadStorePointerOperand(I);

// In order to be widened, the pointer should be consecutive, first of all.
if (!Legal->isConsecutivePtr(Ptr))
  return false;

// If the instruction is a store located in a predicated block, it will be
// scalarized.
if (isScalarWithPredication(I))
  return false;

// If the instruction's allocated size doesn't equal it's type size, it
// requires padding and will be scalarized.
auto &DL = I->getModule()->getDataLayout();
auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
if (hasIrregularType(ScalarTy, DL, VF))
  return false;

return true;
4703}

4705void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// We should not collect Uniforms more than once per VF. Right now,
// this function is called from collectUniformsAndScalars(), which
// already does this check. Collecting Uniforms for VF=1 does not make any
// sense.

assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&((VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
 "This function should not be visited twice for the same VF")
 ? static_cast<void> (0) : __assert_fail ("VF >= 2 && Uniforms.find(VF) == Uniforms.end() && \"This function should not be visited twice for the same VF\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4712, __PRETTY_FUNCTION__))
       "This function should not be visited twice for the same VF")((VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
 "This function should not be visited twice for the same VF")
 ? static_cast<void> (0) : __assert_fail ("VF >= 2 && Uniforms.find(VF) == Uniforms.end() && \"This function should not be visited twice for the same VF\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4712, __PRETTY_FUNCTION__));

// Visit the list of Uniforms. If we'll not find any uniform value, we'll
// not analyze again.  Uniforms.count(VF) will return 1.
Uniforms[VF].clear();

// We now know that the loop is vectorizable!
// Collect instructions inside the loop that will remain uniform after
// vectorization.

// Global values, params and instructions outside of current loop are out of
// scope.
auto isOutOfScope = [&](Value *V) -> bool {
  Instruction *I = dyn_cast<Instruction>(V);
  return (!I || !TheLoop->contains(I));
};

SetVector<Instruction *> Worklist;
BasicBlock *Latch = TheLoop->getLoopLatch();

// Instructions that are scalar with predication must not be considered
// uniform after vectorization, because that would create an erroneous
// replicating region where only a single instance out of VF should be formed.
// TODO: optimize such seldom cases if found important, see PR40816.
auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
  if (isScalarWithPredication(I, VF)) {
    LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found not uniform being ScalarWithPredication: "
 << *I << "\n"; } } while (false)
                      << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found not uniform being ScalarWithPredication: "
 << *I << "\n"; } } while (false);
    return;
  }
  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found uniform instruction: "
 << *I << "\n"; } } while (false);
  Worklist.insert(I);
};

// Start with the conditional branch. If the branch condition is an
// instruction contained in the loop that is only used by the branch, it is
// uniform.
auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
  addToWorklistIfAllowed(Cmp);

// Holds consecutive and consecutive-like pointers. Consecutive-like pointers
// are pointers that are treated like consecutive pointers during
// vectorization. The pointer operands of interleaved accesses are an
// example.
SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;

// Holds pointer operands of instructions that are possibly non-uniform.
SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;

auto isUniformDecision = [&](Instruction *I, unsigned VF) {
  InstWidening WideningDecision = getWideningDecision(I, VF);
  assert(WideningDecision != CM_Unknown &&((WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"
) ? static_cast<void> (0) : __assert_fail ("WideningDecision != CM_Unknown && \"Widening decision should be ready at this moment\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4765, __PRETTY_FUNCTION__))
         "Widening decision should be ready at this moment")((WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"
) ? static_cast<void> (0) : __assert_fail ("WideningDecision != CM_Unknown && \"Widening decision should be ready at this moment\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4765, __PRETTY_FUNCTION__));

  return (WideningDecision == CM_Widen ||
          WideningDecision == CM_Widen_Reverse ||
          WideningDecision == CM_Interleave);
};
// Iterate over the instructions in the loop, and collect all
// consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
// that a consecutive-like pointer operand will be scalarized, we collect it
// in PossibleNonUniformPtrs instead. We use two sets here because a single
// getelementptr instruction can be used by both vectorized and scalarized
// memory instructions. For example, if a loop loads and stores from the same
// location, but the store is conditional, the store will be scalarized, and
// the getelementptr won't remain uniform.
for (auto *BB : TheLoop->blocks())
  for (auto &I : *BB) {
    // If there's no pointer operand, there's nothing to do.
    auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
    if (!Ptr)
      continue;

    // True if all users of Ptr are memory accesses that have Ptr as their
    // pointer operand.
    auto UsersAreMemAccesses =
        llvm::all_of(Ptr->users(), [&](User *U) -> bool {
          return getLoadStorePointerOperand(U) == Ptr;
        });

    // Ensure the memory instruction will not be scalarized or used by
    // gather/scatter, making its pointer operand non-uniform. If the pointer
    // operand is used by any instruction other than a memory access, we
    // conservatively assume the pointer operand may be non-uniform.
    if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
      PossibleNonUniformPtrs.insert(Ptr);

    // If the memory instruction will be vectorized and its pointer operand
    // is consecutive-like, or interleaving - the pointer operand should
    // remain uniform.
    else
      ConsecutiveLikePtrs.insert(Ptr);
  }

// Add to the Worklist all consecutive and consecutive-like pointers that
// aren't also identified as possibly non-uniform.
for (auto *V : ConsecutiveLikePtrs)
  if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
    addToWorklistIfAllowed(V);

// Expand Worklist in topological order: whenever a new instruction
// is added , its users should be already inside Worklist.  It ensures
// a uniform instruction will only be used by uniform instructions.
unsigned idx = 0;
while (idx != Worklist.size()) {
  Instruction *I = Worklist[idx++];

  for (auto OV : I->operand_values()) {
    // isOutOfScope operands cannot be uniform instructions.
    if (isOutOfScope(OV))
      continue;
    // First order recurrence Phi's should typically be considered
    // non-uniform.
    auto *OP = dyn_cast<PHINode>(OV);
    if (OP && Legal->isFirstOrderRecurrence(OP))
      continue;
    // If all the users of the operand are uniform, then add the
    // operand into the uniform worklist.
    auto *OI = cast<Instruction>(OV);
    if (llvm::all_of(OI->users(), [&](User *U) -> bool {
          auto *J = cast<Instruction>(U);
          return Worklist.count(J) ||
                 (OI == getLoadStorePointerOperand(J) &&
                  isUniformDecision(J, VF));
        }))
      addToWorklistIfAllowed(OI);
  }
}

// Returns true if Ptr is the pointer operand of a memory access instruction
// I, and I is known to not require scalarization.
auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
  return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
};

// For an instruction to be added into Worklist above, all its users inside
// the loop should also be in Worklist. However, this condition cannot be
// true for phi nodes that form a cyclic dependence. We must process phi
// nodes separately. An induction variable will remain uniform if all users
// of the induction variable and induction variable update remain uniform.
// The code below handles both pointer and non-pointer induction variables.
for (auto &Induction : Legal->getInductionVars()) {
  auto *Ind = Induction.first;
  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

  // Determine if all users of the induction variable are uniform after
  // vectorization.
  auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
           isVectorizedMemAccessUse(I, Ind);
  });
  if (!UniformInd)
    continue;

  // Determine if all users of the induction variable update instruction are
  // uniform after vectorization.
  auto UniformIndUpdate =
      llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
        auto *I = cast<Instruction>(U);
        return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
               isVectorizedMemAccessUse(I, IndUpdate);
      });
  if (!UniformIndUpdate)
    continue;

  // The induction variable and its update instruction will remain uniform.
  addToWorklistIfAllowed(Ind);
  addToWorklistIfAllowed(IndUpdate);
}

Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4885}

4887bool LoopVectorizationCostModel::runtimeChecksRequired() {
LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Performing code size checks.\n"
; } } while (false);

if (Legal->getRuntimePointerChecking()->Need) {
  reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
      "runtime pointer checks needed. Enable vectorization of this "
      "loop with '#pragma clang loop vectorize(enable)' when "
      "compiling with -Os/-Oz",
      "CantVersionLoopWithOptForSize", ORE, TheLoop);
  return true;
}

if (!PSE.getUnionPredicate().getPredicates().empty()) {
  reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
      "runtime SCEV checks needed. Enable vectorization of this "
      "loop with '#pragma clang loop vectorize(enable)' when "
      "compiling with -Os/-Oz",
      "CantVersionLoopWithOptForSize", ORE, TheLoop);
  return true;
}

// FIXME: Avoid specializing for stride==1 instead of bailing out.
if (!Legal->getLAI()->getSymbolicStrides().empty()) {
  reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
      "runtime stride == 1 checks needed. Enable vectorization of "
      "this loop with '#pragma clang loop vectorize(enable)' when "
      "compiling with -Os/-Oz",
      "CantVersionLoopWithOptForSize", ORE, TheLoop);
  return true;
}

return false;
4919}

4921Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
  // TODO: It may by useful to do since it's still likely to be dynamically
  // uniform if the target can skip.
  reportVectorizationFailure(
      "Not inserting runtime ptr check for divergent target",
      "runtime pointer checks needed. Not enabled for divergent target",
      "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
  return None;
}

unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found trip count: "
 << TC << '\n'; } } while (false);
if (TC == 1) {
  reportVectorizationFailure("Single iteration (non) loop",
      "loop trip count is one, irrelevant for vectorization",
      "SingleIterationLoop", ORE, TheLoop);
  return None;
}

switch (ScalarEpilogueStatus) {
case CM_ScalarEpilogueAllowed:
  return computeFeasibleMaxVF(TC);
case CM_ScalarEpilogueNotNeededUsePredicate:
  LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: vector predicate hint/switch found.\n"
 << "LV: Not allowing scalar epilogue, creating predicated "
 << "vector loop.\n"; } } while (false)
      dbgs() << "LV: vector predicate hint/switch found.\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: vector predicate hint/switch found.\n"
 << "LV: Not allowing scalar epilogue, creating predicated "
 << "vector loop.\n"; } } while (false)
             << "LV: Not allowing scalar epilogue, creating predicated "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: vector predicate hint/switch found.\n"
 << "LV: Not allowing scalar epilogue, creating predicated "
 << "vector loop.\n"; } } while (false)
             << "vector loop.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: vector predicate hint/switch found.\n"
 << "LV: Not allowing scalar epilogue, creating predicated "
 << "vector loop.\n"; } } while (false);
  break;
case CM_ScalarEpilogueNotAllowedLowTripLoop:
  // fallthrough as a special case of OptForSize
case CM_ScalarEpilogueNotAllowedOptSize:
  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
    LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"
; } } while (false)
        dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"
; } } while (false);
  else
    LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not allowing scalar epilogue due to low trip "
 << "count.\n"; } } while (false)
                      << "count.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not allowing scalar epilogue due to low trip "
 << "count.\n"; } } while (false);

  // Bail if runtime checks are required, which are not good when optimising
  // for size.
  if (runtimeChecksRequired())
    return None;
  break;
}

// Now try the tail folding

// Invalidate interleave groups that require an epilogue if we can't mask
// the interleave-group.
if (!useMaskedInterleavedAccesses(TTI))
  InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();

unsigned MaxVF = computeFeasibleMaxVF(TC);
if (TC > 0 && TC % MaxVF == 0) {
  // Accept MaxVF if we do not have a tail.
  LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: No tail will remain for any chosen VF.\n"
; } } while (false);
  return MaxVF;
}

// If we don't know the precise trip count, or if the trip count that we
// found modulo the vectorization factor is not zero, try to fold the tail
// by masking.
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
if (Legal->prepareToFoldTailByMasking()) {
  FoldTailByMasking = true;
  return MaxVF;
}

if (TC == 0) {
  reportVectorizationFailure(
      "Unable to calculate the loop count due to complex control flow",
      "unable to calculate the loop count due to complex control flow",
      "UnknownLoopCountComplexCFG", ORE, TheLoop);
  return None;
}

reportVectorizationFailure(
    "Cannot optimize for size and vectorize at the same time.",
    "cannot optimize for size and vectorize at the same time. "
    "Enable vectorization of this loop with '#pragma clang loop "
    "vectorize(enable)' when compiling with -Os/-Oz",
    "NoTailLoopWithOptForSize", ORE, TheLoop);
return None;
5005}

5007unsigned
5008LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
unsigned WidestRegister = TTI.getRegisterBitWidth(true);

// Get the maximum safe dependence distance in bits computed by LAA.
// It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
// the memory accesses that is most restrictive (involved in the smallest
// dependence distance).
unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();

WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);

unsigned MaxVectorSize = WidestRegister / WidestType;

LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestTypedo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The Smallest and Widest types: "
 << SmallestType << " / " << WidestType <<
 " bits.\n"; } } while (false)
                  << " / " << WidestType << " bits.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The Smallest and Widest types: "
 << SmallestType << " / " << WidestType <<
 " bits.\n"; } } while (false);
LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The Widest register safe to use is: "
 << WidestRegister << " bits.\n"; } } while (false
)
                  << WidestRegister << " bits.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The Widest register safe to use is: "
 << WidestRegister << " bits.\n"; } } while (false
);

assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"((MaxVectorSize <= 256 && "Did not expect to pack so many elements"
 " into one vector!") ? static_cast<void> (0) : __assert_fail
 ("MaxVectorSize <= 256 && \"Did not expect to pack so many elements\" \" into one vector!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5030, __PRETTY_FUNCTION__))
                               " into one vector!")((MaxVectorSize <= 256 && "Did not expect to pack so many elements"
 " into one vector!") ? static_cast<void> (0) : __assert_fail
 ("MaxVectorSize <= 256 && \"Did not expect to pack so many elements\" \" into one vector!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5030, __PRETTY_FUNCTION__));
if (MaxVectorSize == 0) {
  LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The target has no vector registers.\n"
; } } while (false);
  MaxVectorSize = 1;
  return MaxVectorSize;
} else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
           isPowerOf2_32(ConstTripCount)) {
  // We need to clamp the VF to be the ConstTripCount. There is no point in
  // choosing a higher viable VF as done in the loop below.
  LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
 << ConstTripCount << "\n"; } } while (false)
                    << ConstTripCount << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
 << ConstTripCount << "\n"; } } while (false);
  MaxVectorSize = ConstTripCount;
  return MaxVectorSize;
}

unsigned MaxVF = MaxVectorSize;
if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
    (MaximizeBandwidth && isScalarEpilogueAllowed())) {
  // Collect all viable vectorization factors larger than the default MaxVF
  // (i.e. MaxVectorSize).
  SmallVector<unsigned, 8> VFs;
  unsigned NewMaxVectorSize = WidestRegister / SmallestType;
  for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
    VFs.push_back(VS);

  // For each VF calculate its register usage.
  auto RUs = calculateRegisterUsage(VFs);

  // Select the largest VF which doesn't require more registers than existing
  // ones.
  for (int i = RUs.size() - 1; i >= 0; --i) {
    bool Selected = true;
    for (auto& pair : RUs[i].MaxLocalUsers) {
      unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
      if (pair.second > TargetNumRegisters)
        Selected = false;
    }
    if (Selected) {
      MaxVF = VFs[i];
      break;
    }
  }
  if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
    if (MaxVF < MinVF) {
      LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVFdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Overriding calculated MaxVF("
 << MaxVF << ") with target's minimum: " <<
 MinVF << '\n'; } } while (false)
                        << ") with target's minimum: " << MinVF << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Overriding calculated MaxVF("
 << MaxVF << ") with target's minimum: " <<
 MinVF << '\n'; } } while (false);
      MaxVF = MinVF;
    }
  }
}
return MaxVF;
5081}

5083VectorizationFactor
5084LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
float Cost = expectedCost(1).first;
const float ScalarCost = Cost;
unsigned Width = 1;
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Scalar loop costs: "
 << (int)ScalarCost << ".\n"; } } while (false);

bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
if (ForceVectorization && MaxVF > 1) {
  // Ignore scalar width, because the user explicitly wants vectorization.
  // Initialize cost to max so that VF = 2 is, at least, chosen during cost
  // evaluation.
  Cost = std::numeric_limits<float>::max();
}

for (unsigned i = 2; i <= MaxVF; i *= 2) {
  // Notice that the vector loop needs to be executed less times, so
  // we need to divide the cost of the vector loops by the width of
  // the vector elements.
  VectorizationCostTy C = expectedCost(i);
  float VectorCost = C.first / (float)i;
  LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << ido { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Vector loop of width "
 << i << " costs: " << (int)VectorCost <<
 ".\n"; } } while (false)
                    << " costs: " << (int)VectorCost << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Vector loop of width "
 << i << " costs: " << (int)VectorCost <<
 ".\n"; } } while (false);
  if (!C.second && !ForceVectorization) {
    LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not considering vector loop of width "
 << i << " because it will not generate any vector instructions.\n"
; } } while (false)
        dbgs() << "LV: Not considering vector loop of width " << ido { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not considering vector loop of width "
 << i << " because it will not generate any vector instructions.\n"
; } } while (false)
               << " because it will not generate any vector instructions.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not considering vector loop of width "
 << i << " because it will not generate any vector instructions.\n"
; } } while (false);
    continue;
  }
  if (VectorCost < Cost) {
    Cost = VectorCost;
    Width = i;
  }
}

if (!EnableCondStoresVectorization && NumPredStores) {
  reportVectorizationFailure("There are conditional stores.",
      "store that is conditionally executed prevents vectorization",
      "ConditionalStore", ORE, TheLoop);
  Width = 1;
  Cost = ScalarCost;
}

LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { if (ForceVectorization && Width
 > 1 && Cost >= ScalarCost) dbgs() << "LV: Vectorization seems to be not beneficial, "
 << "but was forced by a user.\n"; } } while (false)
           << "LV: Vectorization seems to be not beneficial, "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { if (ForceVectorization && Width
 > 1 && Cost >= ScalarCost) dbgs() << "LV: Vectorization seems to be not beneficial, "
 << "but was forced by a user.\n"; } } while (false)
           << "but was forced by a user.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { if (ForceVectorization && Width
 > 1 && Cost >= ScalarCost) dbgs() << "LV: Vectorization seems to be not beneficial, "
 << "but was forced by a user.\n"; } } while (false);
LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Selecting VF: " <<
 Width << ".\n"; } } while (false);
VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
return Factor;
5132}

5134std::pair<unsigned, unsigned>
5135LoopVectorizationCostModel::getSmallestAndWidestTypes() {
unsigned MinWidth = -1U;
unsigned MaxWidth = 8;
const DataLayout &DL = TheFunction->getParent()->getDataLayout();

// For each block.
for (BasicBlock *BB : TheLoop->blocks()) {
  // For each instruction in the loop.
  for (Instruction &I : BB->instructionsWithoutDebug()) {
    Type *T = I.getType();

    // Skip ignored values.
    if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
      continue;

    // Only examine Loads, Stores and PHINodes.
    if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
      continue;

    // Examine PHI nodes that are reduction variables. Update the type to
    // account for the recurrence type.
    if (auto *PN = dyn_cast<PHINode>(&I)) {
      if (!Legal->isReductionVariable(PN))
        continue;
      RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
      T = RdxDesc.getRecurrenceType();
    }

    // Examine the stored values.
    if (auto *ST = dyn_cast<StoreInst>(&I))
      T = ST->getValueOperand()->getType();

    // Ignore loaded pointer types and stored pointer types that are not
    // vectorizable.
    //
    // FIXME: The check here attempts to predict whether a load or store will
    //        be vectorized. We only know this for certain after a VF has
    //        been selected. Here, we assume that if an access can be
    //        vectorized, it will be. We should also look at extending this
    //        optimization to non-pointer types.
    //
    if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
        !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
      continue;

    MinWidth = std::min(MinWidth,
                        (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
    MaxWidth = std::max(MaxWidth,
                        (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
  }
}

return {MinWidth, MaxWidth};
5188}

5190unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
                                                         unsigned LoopCost) {
// -- The interleave heuristics --
// We interleave the loop in order to expose ILP and reduce the loop overhead.
// There are many micro-architectural considerations that we can't predict
// at this level. For example, frontend pressure (on decode or fetch) due to
// code size, or the number and capabilities of the execution ports.
//
// We use the following heuristics to select the interleave count:
// 1. If the code has reductions, then we interleave to break the cross
// iteration dependency.
// 2. If the loop is really small, then we interleave to reduce the loop
// overhead.
// 3. We don't interleave if we think that we will spill registers to memory
// due to the increased register pressure.

if (!isScalarEpilogueAllowed())
  return 1;

// We used the distance for the interleave count.
if (Legal->getMaxSafeDepDistBytes() != -1U)
  return 1;

// Do not interleave loops with a relatively small known or estimated trip
// count.
auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
  return 1;

RegisterUsage R = calculateRegisterUsage({VF})[0];
// We divide by these constants so assume that we have at least one
// instruction that uses at least one register.
for (auto& pair : R.MaxLocalUsers) {
  pair.second = std::max(pair.second, 1U);
}

// We calculate the interleave count using the following formula.
// Subtract the number of loop invariants from the number of available
// registers. These registers are used by all of the interleaved instances.
// Next, divide the remaining registers by the number of registers that is
// required by the loop, in order to estimate how many parallel instances
// fit without causing spills. All of this is rounded down if necessary to be
// a power of two. We want power of two interleave count to simplify any
// addressing operations or alignment considerations.
// We also want power of two interleave counts to ensure that the induction
// variable of the vector loop wraps to zero, when tail is folded by masking;
// this currently happens when OptForSize, in which case IC is set to 1 above.
unsigned IC = UINT_MAX(2147483647 *2U +1U);

for (auto& pair : R.MaxLocalUsers) {
  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
  LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegistersdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The target has " <<
 TargetNumRegisters << " registers of " << TTI.getRegisterClassName
(pair.first) << " register class\n"; } } while (false)
                    << " registers of "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The target has " <<
 TargetNumRegisters << " registers of " << TTI.getRegisterClassName
(pair.first) << " register class\n"; } } while (false)
                    << TTI.getRegisterClassName(pair.first) << " register class\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The target has " <<
 TargetNumRegisters << " registers of " << TTI.getRegisterClassName
(pair.first) << " register class\n"; } } while (false);
  if (VF == 1) {
    if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
      TargetNumRegisters = ForceTargetNumScalarRegs;
  } else {
    if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
      TargetNumRegisters = ForceTargetNumVectorRegs;
  }
  unsigned MaxLocalUsers = pair.second;
  unsigned LoopInvariantRegs = 0;
  if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
    LoopInvariantRegs = R.LoopInvariantRegs[pair.first];

  unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
  // Don't count the induction variable as interleaved.
  if (EnableIndVarRegisterHeur) {
    TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                      std::max(1U, (MaxLocalUsers - 1)));
  }

  IC = std::min(IC, TmpIC);
}

// Clamp the interleave ranges to reasonable counts.
unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);

// Check if the user has overridden the max.
if (VF == 1) {
  if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
    MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
} else {
  if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
    MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
}

// If trip count is known or estimated compile time constant, limit the
// interleave count to be less than the trip count divided by VF.
if (BestKnownTC) {
  MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
}

// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0)
  LoopCost = expectedCost(VF).first;

assert(LoopCost && "Non-zero loop cost expected")((LoopCost && "Non-zero loop cost expected") ? static_cast
<void> (0) : __assert_fail ("LoopCost && \"Non-zero loop cost expected\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5290, __PRETTY_FUNCTION__));

// Clamp the calculated IC to be between the 1 and the max interleave count
// that the target and trip count allows.
if (IC > MaxInterleaveCount)
  IC = MaxInterleaveCount;
else if (IC < 1)
  IC = 1;

// Interleave if we vectorized this loop and there is a reduction that could
// benefit from interleaving.
if (VF > 1 && !Legal->getReductionVars().empty()) {
  LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleaving because of reductions.\n"
; } } while (false);
  return IC;
}

// Note that if we've already vectorized the loop we will have done the
// runtime check and so interleaving won't require further checks.
bool InterleavingRequiresRuntimePointerCheck =
    (VF == 1 && Legal->getRuntimePointerChecking()->Need);

// We want to interleave small loops in order to reduce the loop overhead and
// potentially expose ILP opportunities.
LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop cost is " <<
 LoopCost << '\n'; } } while (false);
if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
  // We assume that the cost overhead is 1 and we use the cost model
  // to estimate the cost of the loop and interleave until the cost of the
  // loop overhead is about 5% of the cost of the loop.
  unsigned SmallIC =
      std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

  // Interleave until store/load ports (estimated by max interleave count) are
  // saturated.
  unsigned NumStores = Legal->getNumStores();
  unsigned NumLoads = Legal->getNumLoads();
  unsigned StoresIC = IC / (NumStores ? NumStores : 1);
  unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

  // If we have a scalar reduction (vector reductions are already dealt with
  // by this point), we can increase the critical path length if the loop
  // we're interleaving is inside another loop. Limit, by default to 2, so the
  // critical path only gets increased by one reduction operation.
  if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
    unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
    SmallIC = std::min(SmallIC, F);
    StoresIC = std::min(StoresIC, F);
    LoadsIC = std::min(LoadsIC, F);
  }

  if (EnableLoadStoreRuntimeInterleave &&
      std::max(StoresIC, LoadsIC) > SmallIC) {
    LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleaving to saturate store or load ports.\n"
; } } while (false)
        dbgs() << "LV: Interleaving to saturate store or load ports.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleaving to saturate store or load ports.\n"
; } } while (false);
    return std::max(StoresIC, LoadsIC);
  }

  LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleaving to reduce branch cost.\n"
; } } while (false);
  return SmallIC;
}

// Interleave if this is a large loop (small loops are already dealt with by
// this point) that could benefit from interleaving.
bool HasReductions = !Legal->getReductionVars().empty();
if (TTI.enableAggressiveInterleaving(HasReductions)) {
  LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleaving to expose ILP.\n"
; } } while (false);
  return IC;
}

LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not Interleaving.\n"
; } } while (false);
return 1;
5360}

5362SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5363LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
// This function calculates the register usage by measuring the highest number
// of values that are alive at a single location. Obviously, this is a very
// rough estimation. We scan the loop in a topological order in order and
// assign a number to each instruction. We use RPO to ensure that defs are
// met before their users. We assume that each instruction that has in-loop
// users starts an interval. We record every time that an in-loop value is
// used, so we have a list of the first and last occurrences of each
// instruction. Next, we transpose this data structure into a multi map that
// holds the list of intervals that *end* at a specific location. This multi
// map allows us to perform a linear search. We scan the instructions linearly
// and record each time that a new interval starts, by placing it in a set.
// If we find this value in the multi-map then we remove it from the set.
// The max register usage is the maximum size of the set.
// We also search for instructions that are defined outside the loop, but are
// used inside the loop. We need this number separately from the max-interval
// usage number because when we unroll, loop-invariant values do not take
// more register.
LoopBlocksDFS DFS(TheLoop);
DFS.perform(LI);

RegisterUsage RU;

// Each 'key' in the map opens a new interval. The values
// of the map are the index of the 'last seen' usage of the
// instruction that is the key.
using IntervalMap = DenseMap<Instruction *, unsigned>;

// Maps instruction to its index.
SmallVector<Instruction *, 64> IdxToInstr;
// Marks the end of each interval.
IntervalMap EndPoint;
// Saves the list of instruction indices that are used in the loop.
SmallPtrSet<Instruction *, 8> Ends;
// Saves the list of values that are used in the loop but are
// defined outside the loop, such as arguments and constants.
SmallPtrSet<Value *, 8> LoopInvariants;

for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
  for (Instruction &I : BB->instructionsWithoutDebug()) {
    IdxToInstr.push_back(&I);

    // Save the end location of each USE.
    for (Value *U : I.operands()) {
      auto *Instr = dyn_cast<Instruction>(U);

      // Ignore non-instruction values such as arguments, constants, etc.
      if (!Instr)
        continue;

      // If this instruction is outside the loop then record it and continue.
      if (!TheLoop->contains(Instr)) {
        LoopInvariants.insert(Instr);
        continue;
      }

      // Overwrite previous end points.
      EndPoint[Instr] = IdxToInstr.size();
      Ends.insert(Instr);
    }
  }
}

// Saves the list of intervals that end with the index in 'key'.
using InstrList = SmallVector<Instruction *, 2>;
DenseMap<unsigned, InstrList> TransposeEnds;

// Transpose the EndPoints to a list of values that end at each index.
for (auto &Interval : EndPoint)
  TransposeEnds[Interval.second].push_back(Interval.first);

SmallPtrSet<Instruction *, 8> OpenIntervals;

// Get the size of the widest register.
unsigned MaxSafeDepDist = -1U;
if (Legal->getMaxSafeDepDistBytes() != -1U)
  MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
unsigned WidestRegister =
    std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
const DataLayout &DL = TheFunction->getParent()->getDataLayout();

SmallVector<RegisterUsage, 8> RUs(VFs.size());
SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());

LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV(REG): Calculating max register usage:\n"
; } } while (false);

// A lambda that gets the register usage for the given type and VF.
auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
  if (Ty->isTokenTy())
    return 0U;
  unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
  return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
};

for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
  Instruction *I = IdxToInstr[i];

  // Remove all of the instructions that end at this location.
  InstrList &List = TransposeEnds[i];
  for (Instruction *ToRemove : List)
    OpenIntervals.erase(ToRemove);

  // Ignore instructions that are never used within the loop.
  if (Ends.find(I) == Ends.end())
    continue;

  // Skip ignored values.
  if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
    continue;

  // For each VF find the maximum usage of registers.
  for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
    // Count the number of live intervals.
    SmallMapVector<unsigned, unsigned, 4> RegUsage;

    if (VFs[j] == 1) {
      for (auto Inst : OpenIntervals) {
        unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
        if (RegUsage.find(ClassID) == RegUsage.end())
          RegUsage[ClassID] = 1;
        else
          RegUsage[ClassID] += 1;
      }
    } else {
      collectUniformsAndScalars(VFs[j]);
      for (auto Inst : OpenIntervals) {
        // Skip ignored values for VF > 1.
        if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
          continue;
        if (isScalarAfterVectorization(Inst, VFs[j])) {
          unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
          if (RegUsage.find(ClassID) == RegUsage.end())
            RegUsage[ClassID] = 1;
          else
            RegUsage[ClassID] += 1;
        } else {
          unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
          if (RegUsage.find(ClassID) == RegUsage.end())
            RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
          else
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
        }
      }
    }
  
    for (auto& pair : RegUsage) {
      if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
        MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
      else
        MaxUsages[j][pair.first] = pair.second;
    }
  }

  LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV(REG): At #" <<
 i << " Interval # " << OpenIntervals.size() <<
 '\n'; } } while (false)
                    << OpenIntervals.size() << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV(REG): At #" <<
 i << " Interval # " << OpenIntervals.size() <<
 '\n'; } } while (false);

  // Add the current instruction to the list of open intervals.
  OpenIntervals.insert(I);
}

for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
  SmallMapVector<unsigned, unsigned, 4> Invariant;

  for (auto Inst : LoopInvariants) {
    unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
    unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
    if (Invariant.find(ClassID) == Invariant.end())
      Invariant[ClassID] = Usage;
    else
      Invariant[ClassID] += Usage;
  }

  LLVM_DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { { dbgs() << "LV(REG): VF = " <<
 VFs[i] << '\n'; dbgs() << "LV(REG): Found max usage: "
 << MaxUsages[i].size() << " item\n"; for (const auto
 &pair : MaxUsages[i]) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } dbgs() << "LV(REG): Found invariant usage: "
 << Invariant.size() << " item\n"; for (const auto
 &pair : Invariant) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } }; } } while (false)
    dbgs() << "LV(REG): VF = " << VFs[i] << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { { dbgs() << "LV(REG): VF = " <<
 VFs[i] << '\n'; dbgs() << "LV(REG): Found max usage: "
 << MaxUsages[i].size() << " item\n"; for (const auto
 &pair : MaxUsages[i]) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } dbgs() << "LV(REG): Found invariant usage: "
 << Invariant.size() << " item\n"; for (const auto
 &pair : Invariant) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } }; } } while (false)
    dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { { dbgs() << "LV(REG): VF = " <<
 VFs[i] << '\n'; dbgs() << "LV(REG): Found max usage: "
 << MaxUsages[i].size() << " item\n"; for (const auto
 &pair : MaxUsages[i]) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } dbgs() << "LV(REG): Found invariant usage: "
 << Invariant.size() << " item\n"; for (const auto
 &pair : Invariant) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } }; } } while (false)
           << " item\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { { dbgs() << "LV(REG): VF = " <<
 VFs[i] << '\n'; dbgs() << "LV(REG): Found max usage: "
 << MaxUsages[i].size() << " item\n"; for (const auto
 &pair : MaxUsages[i]) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } dbgs() << "LV(REG): Found invariant usage: "
 << Invariant.size() << " item\n"; for (const auto
 &pair : Invariant) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } }; } } while (false)
    for (const auto &pair : MaxUsages[i]) {do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { { dbgs() << "LV(REG): VF = " <<
 VFs[i] << '\n'; dbgs() << "LV(REG): Found max usage: "
 << MaxUsages[i].size() << " item\n"; for (const auto
 &pair : MaxUsages[i]) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } dbgs() << "LV(REG): Found invariant usage: "
 << Invariant.size() << " item\n"; for (const auto
 &pair : Invariant) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } }; } } while (false)
      dbgs() << "LV(REG): RegisterClass: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { { dbgs() << "LV(REG): VF = " <<
 VFs[i] << '\n'; dbgs() << "LV(REG): Found max usage: "
 << MaxUsages[i].size() << " item\n"; for (const auto
 &pair : MaxUsages[i]) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } dbgs() << "LV(REG): Found invariant usage: "
 << Invariant.size() << " item\n"; for (const auto
 &pair : Invariant) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } }; } } while (false)
             << TTI.getRegisterClassName(pair.first) << ", " << pair.seconddo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { { dbgs() << "LV(REG): VF = " <<
 VFs[i] << '\n'; dbgs() << "LV(REG): Found max usage: "
 << MaxUsages[i].size() << " item\n"; for (const auto
 &pair : MaxUsages[i]) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } dbgs() << "LV(REG): Found invariant usage: "
 << Invariant.size() << " item\n"; for (const auto
 &pair : Invariant) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } }; } } while (false)
             << " registers\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { { dbgs() << "LV(REG): VF = " <<
 VFs[i] << '\n'; dbgs() << "LV(REG): Found max usage: "
 << MaxUsages[i].size() << " item\n"; for (const auto
 &pair : MaxUsages[i]) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } dbgs() << "LV(REG): Found invariant usage: "
 << Invariant.size() << " item\n"; for (const auto
 &pair : Invariant) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } }; } } while (false)
    }do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { { dbgs() << "LV(REG): VF = " <<
 VFs[i] << '\n'; dbgs() << "LV(REG): Found max usage: "
 << MaxUsages[i].size() << " item\n"; for (const auto
 &pair : MaxUsages[i]) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } dbgs() << "LV(REG): Found invariant usage: "
 << Invariant.size() << " item\n"; for (const auto
 &pair : Invariant) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } }; } } while (false)
    dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { { dbgs() << "LV(REG): VF = " <<
 VFs[i] << '\n'; dbgs() << "LV(REG): Found max usage: "
 << MaxUsages[i].size() << " item\n"; for (const auto
 &pair : MaxUsages[i]) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } dbgs() << "LV(REG): Found invariant usage: "
 << Invariant.size() << " item\n"; for (const auto
 &pair : Invariant) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } }; } } while (false)
           << " item\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { { dbgs() << "LV(REG): VF = " <<
 VFs[i] << '\n'; dbgs() << "LV(REG): Found max usage: "
 << MaxUsages[i].size() << " item\n"; for (const auto
 &pair : MaxUsages[i]) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } dbgs() << "LV(REG): Found invariant usage: "
 << Invariant.size() << " item\n"; for (const auto
 &pair : Invariant) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } }; } } while (false)
    for (const auto &pair : Invariant) {do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { { dbgs() << "LV(REG): VF = " <<
 VFs[i] << '\n'; dbgs() << "LV(REG): Found max usage: "
 << MaxUsages[i].size() << " item\n"; for (const auto
 &pair : MaxUsages[i]) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } dbgs() << "LV(REG): Found invariant usage: "
 << Invariant.size() << " item\n"; for (const auto
 &pair : Invariant) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } }; } } while (false)
      dbgs() << "LV(REG): RegisterClass: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { { dbgs() << "LV(REG): VF = " <<
 VFs[i] << '\n'; dbgs() << "LV(REG): Found max usage: "
 << MaxUsages[i].size() << " item\n"; for (const auto
 &pair : MaxUsages[i]) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } dbgs() << "LV(REG): Found invariant usage: "
 << Invariant.size() << " item\n"; for (const auto
 &pair : Invariant) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } }; } } while (false)
             << TTI.getRegisterClassName(pair.first) << ", " << pair.seconddo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { { dbgs() << "LV(REG): VF = " <<
 VFs[i] << '\n'; dbgs() << "LV(REG): Found max usage: "
 << MaxUsages[i].size() << " item\n"; for (const auto
 &pair : MaxUsages[i]) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } dbgs() << "LV(REG): Found invariant usage: "
 << Invariant.size() << " item\n"; for (const auto
 &pair : Invariant) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } }; } } while (false)
             << " registers\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { { dbgs() << "LV(REG): VF = " <<
 VFs[i] << '\n'; dbgs() << "LV(REG): Found max usage: "
 << MaxUsages[i].size() << " item\n"; for (const auto
 &pair : MaxUsages[i]) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } dbgs() << "LV(REG): Found invariant usage: "
 << Invariant.size() << " item\n"; for (const auto
 &pair : Invariant) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } }; } } while (false)
    }do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { { dbgs() << "LV(REG): VF = " <<
 VFs[i] << '\n'; dbgs() << "LV(REG): Found max usage: "
 << MaxUsages[i].size() << " item\n"; for (const auto
 &pair : MaxUsages[i]) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } dbgs() << "LV(REG): Found invariant usage: "
 << Invariant.size() << " item\n"; for (const auto
 &pair : Invariant) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } }; } } while (false)
  })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { { dbgs() << "LV(REG): VF = " <<
 VFs[i] << '\n'; dbgs() << "LV(REG): Found max usage: "
 << MaxUsages[i].size() << " item\n"; for (const auto
 &pair : MaxUsages[i]) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } dbgs() << "LV(REG): Found invariant usage: "
 << Invariant.size() << " item\n"; for (const auto
 &pair : Invariant) { dbgs() << "LV(REG): RegisterClass: "
 << TTI.getRegisterClassName(pair.first) << ", " <<
 pair.second << " registers\n"; } }; } } while (false);

  RU.LoopInvariantRegs = Invariant;
  RU.MaxLocalUsers = MaxUsages[i];
  RUs[i] = RU;
}

return RUs;
5559}

5561bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
// TODO: Cost model for emulated masked load/store is completely
// broken. This hack guides the cost model to use an artificially
// high enough value to practically disable vectorization with such
// operations, except where previously deployed legality hack allowed
// using very low cost values. This is to avoid regressions coming simply
// from moving "masked load/store" check from legality to cost model.
// Masked Load/Gather emulation was previously never allowed.
// Limited number of Masked Store/Scatter emulation was allowed.
assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction")((isPredicatedInst(I) && "Expecting a scalar emulated instruction"
) ? static_cast<void> (0) : __assert_fail ("isPredicatedInst(I) && \"Expecting a scalar emulated instruction\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5570, __PRETTY_FUNCTION__));
return isa<LoadInst>(I) ||
       (isa<StoreInst>(I) &&
        NumPredStores > NumberOfStoresToPredicate);
5574}

5576void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
// If we aren't vectorizing the loop, or if we've already collected the
// instructions to scalarize, there's nothing to do. Collection may already
// have occurred if we have a user-selected VF and are now computing the
// expected cost for interleaving.
if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
  return;

// Initialize a mapping for VF in InstsToScalalarize. If we find that it's
// not profitable to scalarize any instructions, the presence of VF in the
// map will indicate that we've analyzed it already.
ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

// Find all the instructions that are scalar with predication in the loop and
// determine if it would be better to not if-convert the blocks they are in.
// If so, we also record the instructions to scalarize.
for (BasicBlock *BB : TheLoop->blocks()) {
  if (!blockNeedsPredication(BB))
    continue;
  for (Instruction &I : *BB)
    if (isScalarWithPredication(&I)) {
      ScalarCostsTy ScalarCosts;
      // Do not apply discount logic if hacked cost is needed
      // for emulated masked memrefs.
      if (!useEmulatedMaskMemRefHack(&I) &&
          computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
        ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
      // Remember that BB will remain after vectorization.
      PredicatedBBsAfterVectorization.insert(BB);
    }
}
5607}

5609int LoopVectorizationCostModel::computePredInstDiscount(
  Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
  unsigned VF) {
assert(!isUniformAfterVectorization(PredInst, VF) &&((!isUniformAfterVectorization(PredInst, VF) && "Instruction marked uniform-after-vectorization will be predicated"
) ? static_cast<void> (0) : __assert_fail ("!isUniformAfterVectorization(PredInst, VF) && \"Instruction marked uniform-after-vectorization will be predicated\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5613, __PRETTY_FUNCTION__))
       "Instruction marked uniform-after-vectorization will be predicated")((!isUniformAfterVectorization(PredInst, VF) && "Instruction marked uniform-after-vectorization will be predicated"
) ? static_cast<void> (0) : __assert_fail ("!isUniformAfterVectorization(PredInst, VF) && \"Instruction marked uniform-after-vectorization will be predicated\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5613, __PRETTY_FUNCTION__));

// Initialize the discount to zero, meaning that the scalar version and the
// vector version cost the same.
int Discount = 0;

// Holds instructions to analyze. The instructions we visit are mapped in
// ScalarCosts. Those instructions are the ones that would be scalarized if
// we find that the scalar version costs less.
SmallVector<Instruction *, 8> Worklist;

// Returns true if the given instruction can be scalarized.
auto canBeScalarized = [&](Instruction *I) -> bool {
  // We only attempt to scalarize instructions forming a single-use chain
  // from the original predicated block that would otherwise be vectorized.
  // Although not strictly necessary, we give up on instructions we know will
  // already be scalar to avoid traversing chains that are unlikely to be
  // beneficial.
  if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
      isScalarAfterVectorization(I, VF))
    return false;

  // If the instruction is scalar with predication, it will be analyzed
  // separately. We ignore it within the context of PredInst.
  if (isScalarWithPredication(I))
    return false;

  // If any of the instruction's operands are uniform after vectorization,
  // the instruction cannot be scalarized. This prevents, for example, a
  // masked load from being scalarized.
  //
  // We assume we will only emit a value for lane zero of an instruction
  // marked uniform after vectorization, rather than VF identical values.
  // Thus, if we scalarize an instruction that uses a uniform, we would
  // create uses of values corresponding to the lanes we aren't emitting code
  // for. This behavior can be changed by allowing getScalarValue to clone
  // the lane zero values for uniforms rather than asserting.
  for (Use &U : I->operands())
    if (auto *J = dyn_cast<Instruction>(U.get()))
      if (isUniformAfterVectorization(J, VF))
        return false;

  // Otherwise, we can scalarize the instruction.
  return true;
};

// Compute the expected cost discount from scalarizing the entire expression
// feeding the predicated instruction. We currently only consider expressions
// that are single-use instruction chains.
Worklist.push_back(PredInst);
while (!Worklist.empty()) {
  Instruction *I = Worklist.pop_back_val();

  // If we've already analyzed the instruction, there's nothing to do.
  if (ScalarCosts.find(I) != ScalarCosts.end())
    continue;

  // Compute the cost of the vector instruction. Note that this cost already
  // includes the scalarization overhead of the predicated instruction.
  unsigned VectorCost = getInstructionCost(I, VF).first;

  // Compute the cost of the scalarized instruction. This cost is the cost of
  // the instruction as if it wasn't if-converted and instead remained in the
  // predicated block. We will scale this cost by block probability after
  // computing the scalarization overhead.
  unsigned ScalarCost = VF * getInstructionCost(I, 1).first;

  // Compute the scalarization overhead of needed insertelement instructions
  // and phi nodes.
  if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
    ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
                                               true, false);
    ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
  }

  // Compute the scalarization overhead of needed extractelement
  // instructions. For each of the instruction's operands, if the operand can
  // be scalarized, add it to the worklist; otherwise, account for the
  // overhead.
  for (Use &U : I->operands())
    if (auto *J = dyn_cast<Instruction>(U.get())) {
      assert(VectorType::isValidElementType(J->getType()) &&((VectorType::isValidElementType(J->getType()) && "Instruction has non-scalar type"
) ? static_cast<void> (0) : __assert_fail ("VectorType::isValidElementType(J->getType()) && \"Instruction has non-scalar type\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5695, __PRETTY_FUNCTION__))
             "Instruction has non-scalar type")((VectorType::isValidElementType(J->getType()) && "Instruction has non-scalar type"
) ? static_cast<void> (0) : __assert_fail ("VectorType::isValidElementType(J->getType()) && \"Instruction has non-scalar type\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5695, __PRETTY_FUNCTION__));
      if (canBeScalarized(J))
        Worklist.push_back(J);
      else if (needsExtract(J, VF))
        ScalarCost += TTI.getScalarizationOverhead(
                            ToVectorTy(J->getType(),VF), false, true);
    }

  // Scale the total scalar cost by block probability.
  ScalarCost /= getReciprocalPredBlockProb();

  // Compute the discount. A non-negative discount means the vector version
  // of the instruction costs more, and scalarizing would be beneficial.
  Discount += VectorCost - ScalarCost;
  ScalarCosts[I] = ScalarCost;
}

return Discount;
5713}

5715LoopVectorizationCostModel::VectorizationCostTy
5716LoopVectorizationCostModel::expectedCost(unsigned VF) {
VectorizationCostTy Cost;

// For each block.
for (BasicBlock *BB : TheLoop->blocks()) {
  VectorizationCostTy BlockCost;

  // For each instruction in the old loop.
  for (Instruction &I : BB->instructionsWithoutDebug()) {
    // Skip ignored values.
    if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
        (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
      continue;

    VectorizationCostTy C = getInstructionCost(&I, VF);

    // Check if we should override the cost.
    if (ForceTargetInstructionCost.getNumOccurrences() > 0)
      C.first = ForceTargetInstructionCost;

    BlockCost.first += C.first;
    BlockCost.second |= C.second;
    LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.firstdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found an estimated cost of "
 << C.first << " for VF " << VF << " For instruction: "
 << I << '\n'; } } while (false)
                      << " for VF " << VF << " For instruction: " << Ido { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found an estimated cost of "
 << C.first << " for VF " << VF << " For instruction: "
 << I << '\n'; } } while (false)
                      << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found an estimated cost of "
 << C.first << " for VF " << VF << " For instruction: "
 << I << '\n'; } } while (false);
  }

  // If we are vectorizing a predicated block, it will have been
  // if-converted. This means that the block's instructions (aside from
  // stores and instructions that may divide by zero) will now be
  // unconditionally executed. For the scalar case, we may not always execute
  // the predicated block. Thus, scale the block's cost by the probability of
  // executing it.
  if (VF == 1 && blockNeedsPredication(BB))
    BlockCost.first /= getReciprocalPredBlockProb();

  Cost.first += BlockCost.first;
  Cost.second |= BlockCost.second;
}

return Cost;
5757}

5759/// Gets Address Access SCEV after verifying that the access pattern
5760/// is loop invariant except the induction variable dependence.
5761///
5762/// This SCEV can be sent to the Target in order to estimate the address
5763/// calculation cost.
5764static const SCEV *getAddressAccessSCEV(
            Value *Ptr,
            LoopVectorizationLegality *Legal,
            PredicatedScalarEvolution &PSE,
            const Loop *TheLoop) {

auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
if (!Gep)
  return nullptr;

// We are looking for a gep with all loop invariant indices except for one
// which should be an induction variable.
auto SE = PSE.getSE();
unsigned NumOperands = Gep->getNumOperands();
for (unsigned i = 1; i < NumOperands; ++i) {
  Value *Opd = Gep->getOperand(i);
  if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
      !Legal->isInductionVariable(Opd))
    return nullptr;
}

// Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
return PSE.getSCEV(Ptr);
5787}

5789static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
return Legal->hasStride(I->getOperand(0)) ||
       Legal->hasStride(I->getOperand(1));
5792}

5794unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                               unsigned VF) {
assert(VF > 1 && "Scalarization cost of instruction implies vectorization.")((VF > 1 && "Scalarization cost of instruction implies vectorization."
) ? static_cast<void> (0) : __assert_fail ("VF > 1 && \"Scalarization cost of instruction implies vectorization.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5796, __PRETTY_FUNCTION__));
Type *ValTy = getMemInstValueType(I);
auto SE = PSE.getSE();

unsigned AS = getLoadStoreAddressSpace(I);
Value *Ptr = getLoadStorePointerOperand(I);
Type *PtrTy = ToVectorTy(Ptr->getType(), VF);

// Figure out whether the access is strided and get the stride value
// if it's known in compile time
const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);

// Get the cost of the scalar memory instruction and address computation.
unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);

// Don't pass *I here, since it is scalar but will actually be part of a
// vectorized loop where the user of it is a vectorized instruction.
const MaybeAlign Alignment = getLoadStoreAlignment(I);
Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
                                 Alignment, AS);

// Get the overhead of the extractelement and insertelement instructions
// we might create due to scalarization.
Cost += getScalarizationOverhead(I, VF);

// If we have a predicated store, it may not be executed for each vector
// lane. Scale the cost by the probability of executing the predicated
// block.
if (isPredicatedInst(I)) {
  Cost /= getReciprocalPredBlockProb();

  if (useEmulatedMaskMemRefHack(I))
    // Artificially setting to a high enough value to practically disable
    // vectorization with such operations.
    Cost = 3000000;
}

return Cost;
5834}

5836unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
                                                           unsigned VF) {
Type *ValTy = getMemInstValueType(I);
Type *VectorTy = ToVectorTy(ValTy, VF);
Value *Ptr = getLoadStorePointerOperand(I);
unsigned AS = getLoadStoreAddressSpace(I);
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);

assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&(((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
 "Stride should be 1 or -1 for consecutive memory access") ? static_cast
<void> (0) : __assert_fail ("(ConsecutiveStride == 1 || ConsecutiveStride == -1) && \"Stride should be 1 or -1 for consecutive memory access\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5845, __PRETTY_FUNCTION__))
       "Stride should be 1 or -1 for consecutive memory access")(((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
 "Stride should be 1 or -1 for consecutive memory access") ? static_cast
<void> (0) : __assert_fail ("(ConsecutiveStride == 1 || ConsecutiveStride == -1) && \"Stride should be 1 or -1 for consecutive memory access\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5845, __PRETTY_FUNCTION__));
const MaybeAlign Alignment = getLoadStoreAlignment(I);
unsigned Cost = 0;
if (Legal->isMaskRequired(I))
  Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
                                    Alignment ? Alignment->value() : 0, AS);
else
  Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);

bool Reverse = ConsecutiveStride < 0;
if (Reverse)
  Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
return Cost;
5858}

5860unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                       unsigned VF) {
Type *ValTy = getMemInstValueType(I);
Type *VectorTy = ToVectorTy(ValTy, VF);
const MaybeAlign Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
if (isa<LoadInst>(I)) {
  return TTI.getAddressComputationCost(ValTy) +
         TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
         TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
}
StoreInst *SI = cast<StoreInst>(I);

bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
return TTI.getAddressComputationCost(ValTy) +
       TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
       (isLoopInvariantStoreValue
            ? 0
            : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
                                     VF - 1));
5880}

5882unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                        unsigned VF) {
Type *ValTy = getMemInstValueType(I);
Type *VectorTy = ToVectorTy(ValTy, VF);
const MaybeAlign Alignment = getLoadStoreAlignment(I);
Value *Ptr = getLoadStorePointerOperand(I);

return TTI.getAddressComputationCost(VectorTy) +
       TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
                                  Legal->isMaskRequired(I),
                                  Alignment ? Alignment->value() : 0);
5893}

5895unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                          unsigned VF) {
Type *ValTy = getMemInstValueType(I);
Type *VectorTy = ToVectorTy(ValTy, VF);
unsigned AS = getLoadStoreAddressSpace(I);

auto Group = getInterleavedAccessGroup(I);
assert(Group && "Fail to get an interleaved access group.")((Group && "Fail to get an interleaved access group."
) ? static_cast<void> (0) : __assert_fail ("Group && \"Fail to get an interleaved access group.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5902, __PRETTY_FUNCTION__));

unsigned InterleaveFactor = Group->getFactor();
Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

// Holds the indices of existing members in an interleaved load group.
// An interleaved store group doesn't need this as it doesn't allow gaps.
SmallVector<unsigned, 4> Indices;
if (isa<LoadInst>(I)) {
  for (unsigned i = 0; i < InterleaveFactor; i++)
    if (Group->getMember(i))
      Indices.push_back(i);
}

// Calculate the cost of the whole interleaved group.
bool UseMaskForGaps =
    Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
unsigned Cost = TTI.getInterleavedMemoryOpCost(
    I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
    Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);

if (Group->isReverse()) {
  // TODO: Add support for reversed masked interleaved access.
  assert(!Legal->isMaskRequired(I) &&((!Legal->isMaskRequired(I) && "Reverse masked interleaved access not supported."
) ? static_cast<void> (0) : __assert_fail ("!Legal->isMaskRequired(I) && \"Reverse masked interleaved access not supported.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5926, __PRETTY_FUNCTION__))
         "Reverse masked interleaved access not supported.")((!Legal->isMaskRequired(I) && "Reverse masked interleaved access not supported."
) ? static_cast<void> (0) : __assert_fail ("!Legal->isMaskRequired(I) && \"Reverse masked interleaved access not supported.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5926, __PRETTY_FUNCTION__));
  Cost += Group->getNumMembers() *
          TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
}
return Cost;
5931}

5933unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
                                                            unsigned VF) {
// Calculate scalar cost only. Vectorization cost should be ready at this
// moment.
if (VF == 1) {
  Type *ValTy = getMemInstValueType(I);
  const MaybeAlign Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);

  return TTI.getAddressComputationCost(ValTy) +
         TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
}
return getWideningCost(I, VF);
5946}

5948LoopVectorizationCostModel::VectorizationCostTy
5949LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
// If we know that this instruction will remain uniform, check the cost of
// the scalar version.
if (isUniformAfterVectorization(I, VF))
  VF = 1;

if (VF > 1 && isProfitableToScalarize(I, VF))
  return VectorizationCostTy(InstsToScalarize[VF][I], false);

// Forced scalars do not have any scalarization overhead.
auto ForcedScalar = ForcedScalars.find(VF);
if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
  auto InstSet = ForcedScalar->second;
  if (InstSet.find(I) != InstSet.end())
    return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
}

Type *VectorTy;
unsigned C = getInstructionCost(I, VF, VectorTy);

bool TypeNotScalarized =
    VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
return VectorizationCostTy(C, TypeNotScalarized);
5972}

5974unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
                                                            unsigned VF) {

if (VF == 1)
  return 0;

unsigned Cost = 0;
Type *RetTy = ToVectorTy(I->getType(), VF);
if (!RetTy->isVoidTy() &&
    (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
  Cost += TTI.getScalarizationOverhead(RetTy, true, false);

// Some targets keep addresses scalar.
if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
  return Cost;

// Some targets support efficient element stores.
if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
  return Cost;

// Collect operands to consider.
CallInst *CI = dyn_cast<CallInst>(I);
Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();

// Skip operands that do not require extraction/scalarization and do not incur
// any overhead.
return Cost + TTI.getOperandsScalarizationOverhead(
                  filterExtractingOperands(Ops, VF), VF);
6002}

6004void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
if (VF == 1)
  return;
NumPredStores = 0;
for (BasicBlock *BB : TheLoop->blocks()) {
  // For each instruction in the old loop.
  for (Instruction &I : *BB) {
    Value *Ptr =  getLoadStorePointerOperand(&I);
    if (!Ptr)
      continue;

    // TODO: We should generate better code and update the cost model for
    // predicated uniform stores. Today they are treated as any other
    // predicated store (see added test cases in
    // invariant-store-vectorization.ll).
    if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
      NumPredStores++;

    if (Legal->isUniform(Ptr) &&
        // Conditional loads and stores should be scalarized and predicated.
        // isScalarWithPredication cannot be used here since masked
        // gather/scatters are not considered scalar with predication.
        !Legal->blockNeedsPredication(I.getParent())) {
      // TODO: Avoid replicating loads and stores instead of
      // relying on instcombine to remove them.
      // Load: Scalar load + broadcast
      // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
      unsigned Cost = getUniformMemOpCost(&I, VF);
      setWideningDecision(&I, VF, CM_Scalarize, Cost);
      continue;
    }

    // We assume that widening is the best solution when possible.
    if (memoryInstructionCanBeWidened(&I, VF)) {
      unsigned Cost = getConsecutiveMemOpCost(&I, VF);
      int ConsecutiveStride =
             Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
      assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&(((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
 "Expected consecutive stride.") ? static_cast<void> (0
) : __assert_fail ("(ConsecutiveStride == 1 || ConsecutiveStride == -1) && \"Expected consecutive stride.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6042, __PRETTY_FUNCTION__))
             "Expected consecutive stride.")(((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
 "Expected consecutive stride.") ? static_cast<void> (0
) : __assert_fail ("(ConsecutiveStride == 1 || ConsecutiveStride == -1) && \"Expected consecutive stride.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6042, __PRETTY_FUNCTION__));
      InstWidening Decision =
          ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
      setWideningDecision(&I, VF, Decision, Cost);
      continue;
    }

    // Choose between Interleaving, Gather/Scatter or Scalarization.
    unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
    unsigned NumAccesses = 1;
    if (isAccessInterleaved(&I)) {
      auto Group = getInterleavedAccessGroup(&I);
      assert(Group && "Fail to get an interleaved access group.")((Group && "Fail to get an interleaved access group."
) ? static_cast<void> (0) : __assert_fail ("Group && \"Fail to get an interleaved access group.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6054, __PRETTY_FUNCTION__));

      // Make one decision for the whole group.
      if (getWideningDecision(&I, VF) != CM_Unknown)
        continue;

      NumAccesses = Group->getNumMembers();
      if (interleavedAccessCanBeWidened(&I, VF))
        InterleaveCost = getInterleaveGroupCost(&I, VF);
    }

    unsigned GatherScatterCost =
        isLegalGatherOrScatter(&I)
            ? getGatherScatterCost(&I, VF) * NumAccesses
            : std::numeric_limits<unsigned>::max();

    unsigned ScalarizationCost =
        getMemInstScalarizationCost(&I, VF) * NumAccesses;

    // Choose better solution for the current VF,
    // write down this decision and use it during vectorization.
    unsigned Cost;
    InstWidening Decision;
    if (InterleaveCost <= GatherScatterCost &&
        InterleaveCost < ScalarizationCost) {
      Decision = CM_Interleave;
      Cost = InterleaveCost;
    } else if (GatherScatterCost < ScalarizationCost) {
      Decision = CM_GatherScatter;
      Cost = GatherScatterCost;
    } else {
      Decision = CM_Scalarize;
      Cost = ScalarizationCost;
    }
    // If the instructions belongs to an interleave group, the whole group
    // receives the same decision. The whole group receives the cost, but
    // the cost will actually be assigned to one instruction.
    if (auto Group = getInterleavedAccessGroup(&I))
      setWideningDecision(Group, VF, Decision, Cost);
    else
      setWideningDecision(&I, VF, Decision, Cost);
  }
}

// Make sure that any load of address and any other address computation
// remains scalar unless there is gather/scatter support. This avoids
// inevitable extracts into address registers, and also has the benefit of
// activating LSR more, since that pass can't optimize vectorized
// addresses.
if (TTI.prefersVectorizedAddressing())
  return;

// Start with all scalar pointer uses.
SmallPtrSet<Instruction *, 8> AddrDefs;
for (BasicBlock *BB : TheLoop->blocks())
  for (Instruction &I : *BB) {
    Instruction *PtrDef =
      dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
    if (PtrDef && TheLoop->contains(PtrDef) &&
        getWideningDecision(&I, VF) != CM_GatherScatter)
      AddrDefs.insert(PtrDef);
  }

// Add all instructions used to generate the addresses.
SmallVector<Instruction *, 4> Worklist;
for (auto *I : AddrDefs)
  Worklist.push_back(I);
while (!Worklist.empty()) {
  Instruction *I = Worklist.pop_back_val();
  for (auto &Op : I->operands())
    if (auto *InstOp = dyn_cast<Instruction>(Op))
      if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
          AddrDefs.insert(InstOp).second)
        Worklist.push_back(InstOp);
}

for (auto *I : AddrDefs) {
  if (isa<LoadInst>(I)) {
    // Setting the desired widening decision should ideally be handled in
    // by cost functions, but since this involves the task of finding out
    // if the loaded register is involved in an address computation, it is
    // instead changed here when we know this is the case.
    InstWidening Decision = getWideningDecision(I, VF);
    if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
      // Scalarize a widened load of address.
      setWideningDecision(I, VF, CM_Scalarize,
                          (VF * getMemoryInstructionCost(I, 1)));
    else if (auto Group = getInterleavedAccessGroup(I)) {
      // Scalarize an interleave group of address loads.
      for (unsigned I = 0; I < Group->getFactor(); ++I) {
        if (Instruction *Member = Group->getMember(I))
          setWideningDecision(Member, VF, CM_Scalarize,
                              (VF * getMemoryInstructionCost(Member, 1)));
      }
    }
  } else
    // Make sure I gets scalarized and a cost estimate without
    // scalarization overhead.
    ForcedScalars[VF].insert(I);
}
6154}

6156unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                      unsigned VF,
                                                      Type *&VectorTy) {
Type *RetTy = I->getType();
if (canTruncateToMinimalBitwidth(I, VF))
  RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
auto SE = PSE.getSE();

// TODO: We need to estimate the cost of intrinsic calls.
switch (I->getOpcode()) {
case Instruction::GetElementPtr:
  // We mark this instruction as zero-cost because the cost of GEPs in
  // vectorized code depends on whether the corresponding memory instruction
  // is scalarized or not. Therefore, we handle GEPs with the memory
  // instruction cost.
  return 0;
case Instruction::Br: {
  // In cases of scalarized and predicated instructions, there will be VF
  // predicated blocks in the vectorized loop. Each branch around these
  // blocks requires also an extract of its vector compare i1 element.
  bool ScalarPredicatedBB = false;
  BranchInst *BI = cast<BranchInst>(I);
  if (VF > 1 && BI->isConditional() &&
      (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
           PredicatedBBsAfterVectorization.end() ||
       PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
           PredicatedBBsAfterVectorization.end()))
    ScalarPredicatedBB = true;

  if (ScalarPredicatedBB) {
    // Return cost for branches around scalarized and predicated blocks.
    Type *Vec_i1Ty =
        VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
    return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
            (TTI.getCFInstrCost(Instruction::Br) * VF));
  } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
    // The back-edge branch will remain, as will all scalar branches.
    return TTI.getCFInstrCost(Instruction::Br);
  else
    // This branch will be eliminated by if-conversion.
    return 0;
  // Note: We currently assume zero cost for an unconditional branch inside
  // a predicated block since it will become a fall-through, although we
  // may decide in the future to call TTI for all branches.
}
case Instruction::PHI: {
  auto *Phi = cast<PHINode>(I);

  // First-order recurrences are replaced by vector shuffles inside the loop.
  // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
  if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
    return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
                              VectorTy, VF - 1, VectorType::get(RetTy, 1));

  // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
  // converted into select instructions. We require N - 1 selects per phi
  // node, where N is the number of incoming values.
  if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
    return (Phi->getNumIncomingValues() - 1) *
           TTI.getCmpSelInstrCost(
               Instruction::Select, ToVectorTy(Phi->getType(), VF),
               ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));

  return TTI.getCFInstrCost(Instruction::PHI);
}
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::URem:
case Instruction::SRem:
  // If we have a predicated instruction, it may not be executed for each
  // vector lane. Get the scalarization cost and scale this amount by the
  // probability of executing the predicated block. If the instruction is not
  // predicated, we fall through to the next case.
  if (VF > 1 && isScalarWithPredication(I)) {
    unsigned Cost = 0;

    // These instructions have a non-void type, so account for the phi nodes
    // that we will create. This cost is likely to be zero. The phi node
    // cost, if any, should be scaled by the block probability because it
    // models a copy at the end of each predicated block.
    Cost += VF * TTI.getCFInstrCost(Instruction::PHI);

    // The cost of the non-predicated instruction.
    Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);

    // The cost of insertelement and extractelement instructions needed for
    // scalarization.
    Cost += getScalarizationOverhead(I, VF);

    // Scale the cost by the probability of executing the predicated blocks.
    // This assumes the predicated block for each vector lane is equally
    // likely.
    return Cost / getReciprocalPredBlockProb();
  }
  LLVM_FALLTHROUGH[[gnu::fallthrough]];
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::FDiv:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
  // Since we will replace the stride by 1 the multiplication should go away.
  if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
    return 0;
  // Certain instructions can be cheaper to vectorize if they have a constant
  // second vector operand. One example of this are shifts on x86.
  Value *Op2 = I->getOperand(1);
  TargetTransformInfo::OperandValueProperties Op2VP;
  TargetTransformInfo::OperandValueKind Op2VK =
      TTI.getOperandInfo(Op2, Op2VP);
  if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
    Op2VK = TargetTransformInfo::OK_UniformValue;

  SmallVector<const Value *, 4> Operands(I->operand_values());
  unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
  return N * TTI.getArithmeticInstrCost(
                 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
                 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
}
case Instruction::FNeg: {
  unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
  return N * TTI.getArithmeticInstrCost(
                 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
                 TargetTransformInfo::OK_AnyValue,
                 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
                 I->getOperand(0), I);
}
case Instruction::Select: {
  SelectInst *SI = cast<SelectInst>(I);
  const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
  bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
  Type *CondTy = SI->getCondition()->getType();
  if (!ScalarCond)
    CondTy = VectorType::get(CondTy, VF);

  return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
}
case Instruction::ICmp:
case Instruction::FCmp: {
  Type *ValTy = I->getOperand(0)->getType();
  Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
  if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
    ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
  VectorTy = ToVectorTy(ValTy, VF);
  return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
}
case Instruction::Store:
case Instruction::Load: {
  unsigned Width = VF;
  if (Width > 1) {
    InstWidening Decision = getWideningDecision(I, Width);
    assert(Decision != CM_Unknown &&((Decision != CM_Unknown && "CM decision should be taken at this point"
) ? static_cast<void> (0) : __assert_fail ("Decision != CM_Unknown && \"CM decision should be taken at this point\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6317, __PRETTY_FUNCTION__))
           "CM decision should be taken at this point")((Decision != CM_Unknown && "CM decision should be taken at this point"
) ? static_cast<void> (0) : __assert_fail ("Decision != CM_Unknown && \"CM decision should be taken at this point\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6317, __PRETTY_FUNCTION__));
    if (Decision == CM_Scalarize)
      Width = 1;
  }
  VectorTy = ToVectorTy(getMemInstValueType(I), Width);
  return getMemoryInstructionCost(I, VF);
}
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
  // We optimize the truncation of induction variables having constant
  // integer steps. The cost of these truncations is the same as the scalar
  // operation.
  if (isOptimizableIVTruncate(I, VF)) {
    auto *Trunc = cast<TruncInst>(I);
    return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
                                Trunc->getSrcTy(), Trunc);
  }

  Type *SrcScalarTy = I->getOperand(0)->getType();
  Type *SrcVecTy =
      VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
  if (canTruncateToMinimalBitwidth(I, VF)) {
    // This cast is going to be shrunk. This may remove the cast or it might
    // turn it into slightly different cast. For example, if MinBW == 16,
    // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
    //
    // Calculate the modified src and dest types.
    Type *MinVecTy = VectorTy;
    if (I->getOpcode() == Instruction::Trunc) {
      SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
      VectorTy =
          largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
    } else if (I->getOpcode() == Instruction::ZExt ||
               I->getOpcode() == Instruction::SExt) {
      SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
      VectorTy =
          smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
    }
  }

  unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
  return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
}
case Instruction::Call: {
  bool NeedToScalarize;
  CallInst *CI = cast<CallInst>(I);
  unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
  if (getVectorIntrinsicIDForCall(CI, TLI))
    return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
  return CallCost;
}
default:
  // The cost of executing VF copies of the scalar instruction. This opcode
  // is unknown. Assume that it is the same as 'mul'.
  return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
         getScalarizationOverhead(I, VF);
} // end of switch.
6384}

// Pass identification. The address of this static member is what gets passed
// to the PassInfo constructor in the registration code below.
6386char LoopVectorize::ID = 0;

// Human-readable pass name handed to the INITIALIZE_PASS_* macros below.
6388static const char lv_name[] = "Loop Vectorization";

// Pass registration. Every INITIALIZE_PASS_DEPENDENCY line calls the
// initializer of one pass before LoopVectorize itself is registered under the
// command-line name "loop-vectorize". The text that follows each macro
// invocation on the same line is the report's rendering of the macro
// expansion, not additional source code.
6390INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)static void *initializeLoopVectorizePassOnce(PassRegistry &
Registry) {
6391INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)initializeTargetTransformInfoWrapperPassPass(Registry);
6392INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)initializeBasicAAWrapperPassPass(Registry);
6393INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)initializeAAResultsWrapperPassPass(Registry);
6394INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)initializeGlobalsAAWrapperPassPass(Registry);
6395INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)initializeAssumptionCacheTrackerPass(Registry);
6396INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)initializeBlockFrequencyInfoWrapperPassPass(Registry);
6397INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)initializeDominatorTreeWrapperPassPass(Registry);
6398INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)initializeScalarEvolutionWrapperPassPass(Registry);
6399INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)initializeLoopInfoWrapperPassPass(Registry);
6400INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)initializeLoopAccessLegacyAnalysisPass(Registry);
6401INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)initializeDemandedBitsWrapperPassPass(Registry);
6402INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)initializeOptimizationRemarkEmitterWrapperPassPass(Registry);
6403INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)initializeProfileSummaryInfoWrapperPassPass(Registry);
6404INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)initializeInjectTLIMappingsLegacyPass(Registry);
6405INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)PassInfo *PI = new PassInfo( lv_name, "loop-vectorize", &
LoopVectorize::ID, PassInfo::NormalCtor_t(callDefaultCtor<
LoopVectorize>), false, false); Registry.registerPass(*PI,
 true); return PI; } static llvm::once_flag InitializeLoopVectorizePassFlag
; void llvm::initializeLoopVectorizePass(PassRegistry &Registry
) { llvm::call_once(InitializeLoopVectorizePassFlag, initializeLoopVectorizePassOnce
, std::ref(Registry)); }

6407namespace llvm {

6409Pass *createLoopVectorizePass() { return new LoopVectorize(); }

6411Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
                            bool VectorizeOnlyWhenForced) {
return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6414}

6416} // end namespace llvm

6418bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
// Check if the pointer operand of a load or store instruction is
// consecutive.
if (auto *Ptr = getLoadStorePointerOperand(Inst))
  return Legal->isConsecutivePtr(Ptr);
return false;
6424}

6426void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore ephemeral values.
CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);

// Ignore type-promoting instructions we identified during reduction
// detection.
for (auto &Reduction : Legal->getReductionVars()) {
  RecurrenceDescriptor &RedDes = Reduction.second;
  SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
  VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
// Ignore type-casting instructions we identified during induction
// detection.
for (auto &Induction : Legal->getInductionVars()) {
  InductionDescriptor &IndDes = Induction.second;
  const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
  VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
6444}

6446// TODO: we could return a pair of values that specify the max VF and
6447// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6448// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6449// doesn't have a cost model that can choose which plan to execute if
6450// more than one is generated.
6451static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                               LoopVectorizationCostModel &CM) {
unsigned WidestType;
std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
return WidestVectorRegBits / WidestType;
6456}

6458VectorizationFactor
6459LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
unsigned VF = UserVF;
// Outer loop handling: They may require CFG and instruction level
// transformations before even evaluating whether vectorization is profitable.
// Since we cannot modify the incoming IR, we need to build VPlan upfront in
// the vectorization pipeline.
if (!OrigLoop->empty()) {
  // If the user doesn't provide a vectorization factor, determine a
  // reasonable one.
  if (!UserVF) {
    VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
    LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: VPlan computed VF "
 << VF << ".\n"; } } while (false);

    // Make sure we have a VF > 1 for stress testing.
    if (VPlanBuildStressTest && VF < 2) {
      LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: VPlan stress testing: "
 << "overriding computed VF.\n"; } } while (false)
                        << "overriding computed VF.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: VPlan stress testing: "
 << "overriding computed VF.\n"; } } while (false);
      VF = 4;
    }
  }
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.")((EnableVPlanNativePath && "VPlan-native path is not enabled."
) ? static_cast<void> (0) : __assert_fail ("EnableVPlanNativePath && \"VPlan-native path is not enabled.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6479, __PRETTY_FUNCTION__));
  assert(isPowerOf2_32(VF) && "VF needs to be a power of two")((isPowerOf2_32(VF) && "VF needs to be a power of two"
) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(VF) && \"VF needs to be a power of two\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6480, __PRETTY_FUNCTION__));
  LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VFdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Using " << (
UserVF ? "user " : "") << "VF " << VF << " to build VPlans.\n"
; } } while (false)
                    << " to build VPlans.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Using " << (
UserVF ? "user " : "") << "VF " << VF << " to build VPlans.\n"
; } } while (false);
  buildVPlans(VF, VF);

  // For VPlan build stress testing, we bail out after VPlan construction.
  if (VPlanBuildStressTest)
    return VectorizationFactor::Disabled();

  return {VF, 0};
}

LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
 "VPlan-native path.\n"; } } while (false)
    dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
 "VPlan-native path.\n"; } } while (false)
              "VPlan-native path.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
 "VPlan-native path.\n"; } } while (false);
return VectorizationFactor::Disabled();
6496}

6498Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
assert(OrigLoop->empty() && "Inner loop expected.")((OrigLoop->empty() && "Inner loop expected.") ? static_cast
<void> (0) : __assert_fail ("OrigLoop->empty() && \"Inner loop expected.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6499, __PRETTY_FUNCTION__));
Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
  return None;

// Invalidate interleave groups if all blocks of loop will be predicated.
if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
    !useMaskedInterleavedAccesses(*TTI)) {
  LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Invalidate all interleaved groups due to fold-tail by masking "
 "which requires masked-interleaved support.\n"; } } while (false
)
      dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Invalidate all interleaved groups due to fold-tail by masking "
 "which requires masked-interleaved support.\n"; } } while (false
)
      << "LV: Invalidate all interleaved groups due to fold-tail by masking "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Invalidate all interleaved groups due to fold-tail by masking "
 "which requires masked-interleaved support.\n"; } } while (false
)
         "which requires masked-interleaved support.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Invalidate all interleaved groups due to fold-tail by masking "
 "which requires masked-interleaved support.\n"; } } while (false
);
  CM.InterleaveInfo.reset();
}

if (UserVF) {
  LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Using user VF " <<
 UserVF << ".\n"; } } while (false);
  assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two")((isPowerOf2_32(UserVF) && "VF needs to be a power of two"
) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(UserVF) && \"VF needs to be a power of two\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6516, __PRETTY_FUNCTION__));
  // Collect the instructions (and their associated costs) that will be more
  // profitable to scalarize.
  CM.selectUserVectorizationFactor(UserVF);
  buildVPlansWithVPRecipes(UserVF, UserVF);
  LLVM_DEBUG(printPlans(dbgs()))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { printPlans(dbgs()); } } while (false);
  return {{UserVF, 0}};
}

unsigned MaxVF = MaybeMaxVF.getValue();
assert(MaxVF != 0 && "MaxVF is zero.")((MaxVF != 0 && "MaxVF is zero.") ? static_cast<void
> (0) : __assert_fail ("MaxVF != 0 && \"MaxVF is zero.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6526, __PRETTY_FUNCTION__));

for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
  // Collect Uniform and Scalar instructions after vectorization with VF.
  CM.collectUniformsAndScalars(VF);

  // Collect the instructions (and their associated costs) that will be more
  // profitable to scalarize.
  if (VF > 1)
    CM.collectInstsToScalarize(VF);
}

buildVPlansWithVPRecipes(1, MaxVF);
LLVM_DEBUG(printPlans(dbgs()))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { printPlans(dbgs()); } } while (false);
if (MaxVF == 1)
  return VectorizationFactor::Disabled();

// Select the optimal vectorization factor.
return CM.selectVectorizationFactor(MaxVF);
6545}

6547void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UFdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Setting best plan to VF="
 << VF << ", UF=" << UF << '\n'; } } while
 (false)
                  << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Setting best plan to VF="
 << VF << ", UF=" << UF << '\n'; } } while
 (false);
BestVF = VF;
BestUF = UF;

erase_if(VPlans, [VF](const VPlanPtr &Plan) {
  return !Plan->hasVF(VF);
});
assert(VPlans.size() == 1 && "Best VF has not a single VPlan.")((VPlans.size() == 1 && "Best VF has not a single VPlan."
) ? static_cast<void> (0) : __assert_fail ("VPlans.size() == 1 && \"Best VF has not a single VPlan.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6556, __PRETTY_FUNCTION__));
6557}

6559void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                                         DominatorTree *DT) {
// Perform the actual loop transformation.

// 1. Create a new empty loop. Unlink the old loop and connect the new one.
VPCallbackILV CallbackILV(ILV);

VPTransformState State{BestVF, BestUF,      LI,
                       DT,     ILV.Builder, ILV.VectorLoopValueMap,
                       &ILV,   CallbackILV};
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
State.TripCount = ILV.getOrCreateTripCount(nullptr);

//===------------------------------------------------===//
//
// Notice: any optimization or new instruction that go
// into the code below should also be implemented in
// the cost-model.
//
//===------------------------------------------------===//

// 2. Copy and widen instructions from the old loop into the new loop.
assert(VPlans.size() == 1 && "Not a single VPlan to execute.")((VPlans.size() == 1 && "Not a single VPlan to execute."
) ? static_cast<void> (0) : __assert_fail ("VPlans.size() == 1 && \"Not a single VPlan to execute.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6581, __PRETTY_FUNCTION__));
VPlans.front()->execute(&State);

// 3. Fix the vectorized code: take care of header phi's, live-outs,
//    predication, updating analyses.
ILV.fixVectorizedLoop();
6587}

6589void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
  SmallPtrSetImpl<Instruction *> &DeadInstructions) {
BasicBlock *Latch = OrigLoop->getLoopLatch();

// We create new control-flow for the vectorized loop, so the original
// condition will be dead after vectorization if it's only used by the
// branch.
auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
if (Cmp && Cmp->hasOneUse())
  DeadInstructions.insert(Cmp);

// We create new "steps" for induction variable updates to which the original
// induction variables map. An original update instruction will be dead if
// all its users except the induction variable are dead.
for (auto &Induction : Legal->getInductionVars()) {
  PHINode *Ind = Induction.first;
  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
  if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
        return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
                               DeadInstructions.end();
      }))
    DeadInstructions.insert(IndUpdate);

  // We record as "Dead" also the type-casting instructions we had identified
  // during induction analysis. We don't need any handling for them in the
  // vectorized loop because we have proven that, under a proper runtime
  // test guarding the vectorized loop, the value of the phi, and the casted
  // value of the phi, are the same. The last instruction in this casting chain
  // will get its scalar/vector/widened def from the scalar/vector/widened def
  // of the respective phi node. Any other casts in the induction def-use chain
  // have no other uses outside the phi update chain, and will be ignored.
  InductionDescriptor &IndDes = Induction.second;
  const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
  DeadInstructions.insert(Casts.begin(), Casts.end());
}
6624}

6626Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

6628Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

6630Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
                                      Instruction::BinaryOps BinOp) {
// When unrolling and the VF is 1, we only need to add a simple scalar.
Type *Ty = Val->getType();
assert(!Ty->isVectorTy() && "Val must be a scalar")((!Ty->isVectorTy() && "Val must be a scalar") ? static_cast
<void> (0) : __assert_fail ("!Ty->isVectorTy() && \"Val must be a scalar\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6634, __PRETTY_FUNCTION__));

if (Ty->isFloatingPointTy()) {
  Constant *C = ConstantFP::get(Ty, (double)StartIdx);

  // Floating point operations had to be 'fast' to enable the unrolling.
  Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
  return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
}
Constant *C = ConstantInt::get(Ty, StartIdx);
return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6645}

6647static void AddRuntimeUnrollDisableMetaData(Loop *L) {
SmallVector<Metadata *, 4> MDs;
// Reserve first location for self reference to the LoopID metadata node.
MDs.push_back(nullptr);
bool IsUnrollMetadata = false;
MDNode *LoopID = L->getLoopID();
if (LoopID) {
  // First find existing loop unrolling disable metadata.
  for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
    auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
    if (MD) {
      const auto *S = dyn_cast<MDString>(MD->getOperand(0));
      IsUnrollMetadata =
          S && S->getString().startswith("llvm.loop.unroll.disable");
    }
    MDs.push_back(LoopID->getOperand(i));
  }
}

if (!IsUnrollMetadata) {
  // Add runtime unroll disable metadata.
  LLVMContext &Context = L->getHeader()->getContext();
  SmallVector<Metadata *, 1> DisableOperands;
  DisableOperands.push_back(
      MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
  MDNode *DisableNode = MDNode::get(Context, DisableOperands);
  MDs.push_back(DisableNode);
  MDNode *NewLoopID = MDNode::get(Context, MDs);
  // Set operand 0 to refer to the loop id itself.
  NewLoopID->replaceOperandWith(0, NewLoopID);
  L->setLoopID(NewLoopID);
}
6679}

6681bool LoopVectorizationPlanner::getDecisionAndClampRange(
  const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
assert(Range.End > Range.Start && "Trying to test an empty VF range.")((Range.End > Range.Start && "Trying to test an empty VF range."
) ? static_cast<void> (0) : __assert_fail ("Range.End > Range.Start && \"Trying to test an empty VF range.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6683, __PRETTY_FUNCTION__));
bool PredicateAtRangeStart = Predicate(Range.Start);

for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
  if (Predicate(TmpVF) != PredicateAtRangeStart) {
    Range.End = TmpVF;
    break;
  }

return PredicateAtRangeStart;
6693}

6695/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6696/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6697/// of VF's starting at a given VF and extending it as much as possible. Each
6698/// vectorization decision can potentially shorten this sub-range during
6699/// buildVPlan().
6700void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
for (unsigned VF = MinVF; VF < MaxVF + 1;) {
  VFRange SubRange = {VF, MaxVF + 1};
  VPlans.push_back(buildVPlan(SubRange));
  VF = SubRange.End;
}
6706}

6708VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
                                       VPlanPtr &Plan) {
assert(is_contained(predecessors(Dst), Src) && "Invalid edge")((is_contained(predecessors(Dst), Src) && "Invalid edge"
) ? static_cast<void> (0) : __assert_fail ("is_contained(predecessors(Dst), Src) && \"Invalid edge\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6710, __PRETTY_FUNCTION__));

// Look for cached value.
std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
if (ECEntryIt != EdgeMaskCache.end())
  return ECEntryIt->second;

VPValue *SrcMask = createBlockInMask(Src, Plan);

// The terminator has to be a branch inst!
BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
assert(BI && "Unexpected terminator found")((BI && "Unexpected terminator found") ? static_cast<
void> (0) : __assert_fail ("BI && \"Unexpected terminator found\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6722, __PRETTY_FUNCTION__));

if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
  return EdgeMaskCache[Edge] = SrcMask;

VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
assert(EdgeMask && "No Edge Mask found for condition")((EdgeMask && "No Edge Mask found for condition") ? static_cast
<void> (0) : __assert_fail ("EdgeMask && \"No Edge Mask found for condition\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6728, __PRETTY_FUNCTION__));

if (BI->getSuccessor(0) != Dst)
  EdgeMask = Builder.createNot(EdgeMask);

if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
  EdgeMask = Builder.createAnd(EdgeMask, SrcMask);

return EdgeMaskCache[Edge] = EdgeMask;
6737}

/// Returns the mask under which \p BB executes, with nullptr standing for an
/// all-one mask. The result is stored in BlockMaskCache and reused on later
/// calls for the same block.
///
/// For the loop header the mask is all-one unless the cost model says the
/// block needs predication, in which case it is the compare IV <= BTC. For any
/// other block the mask is the OR of the masks of all incoming edges, and it
/// becomes all-one as soon as one incoming edge mask is all-one.
///
/// NOTE(review): the bare numbers, arrows and "Assuming ..." / "Taking ..."
/// lines interleaved below are the static analyzer's path notes, not code.
/// The analyzer reports a potential leak of 'BlockMask' on the path where an
/// OR was already created for earlier predecessors and a later predecessor
/// then yields a null edge mask. Whether this is a real leak depends on who
/// owns values created by Builder.createOr, which is not visible here —
/// confirm before changing the code.
6739VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
assert(OrigLoop->contains(BB) && "Block is not a part of a loop")((OrigLoop->contains(BB) && "Block is not a part of a loop"
) ? static_cast<void> (0) : __assert_fail ("OrigLoop->contains(BB) && \"Block is not a part of a loop\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6740, __PRETTY_FUNCTION__));
6
←
Assuming the condition is true→
7
←
'?' condition is true→

// Look for cached value.
BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
if (BCEntryIt != BlockMaskCache.end())
8
←
Assuming the condition is false→
9
←
Taking false branch→
  return BCEntryIt->second;

// All-one mask is modelled as no-mask following the convention for masked
// load/store/gather/scatter. Initialize BlockMask to no-mask.
VPValue *BlockMask = nullptr;

if (OrigLoop->getHeader() == BB) {
10
←
Assuming the condition is false→
11
←
Taking false branch→
  if (!CM.blockNeedsPredication(BB))
    return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.

  // Introduce the early-exit compare IV <= BTC to form header block mask.
  // This is used instead of IV < TC because TC may wrap, unlike BTC.
  VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
  VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
  BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
  return BlockMaskCache[BB] = BlockMask;
}

// This is the block mask. We OR all incoming edges.
for (auto *Predecessor : predecessors(BB)) {
  VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
  if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
12
←
Assuming 'EdgeMask' is non-null→
13
←
Taking false branch→
16
←
Assuming 'EdgeMask' is non-null→
17
←
Taking false branch→
28
←
Assuming 'EdgeMask' is null→
29
←
Taking true branch→
    return BlockMaskCache[BB] = EdgeMask;
30
←
Potential leak of memory pointed to by 'BlockMask'

  if (!BlockMask13.1
'BlockMask' is null
1
'BlockMask' is non-null
1
'BlockMask' is null
1
'BlockMask' is non-null
) { // BlockMask has its initialized nullptr value.
14
←
Taking true branch→
18
←
Taking false branch→
    BlockMask = EdgeMask;
    continue;
15
←
 Execution continues on line 6764→
  }

  BlockMask = Builder.createOr(BlockMask, EdgeMask);
19
←
Calling 'VPBuilder::createOr'→
27
←
Returned allocated memory→
}

return BlockMaskCache[BB] = BlockMask;
6778}

/// Builds a VPWidenMemoryInstructionRecipe for \p I.
///
/// Returns nullptr when \p I is neither a load nor a store, or when the
/// widening decision evaluated at the start of \p Range is not to widen.
/// \p Range may be clamped by getDecisionAndClampRange so that the decision
/// holds over the whole remaining range. A block-in mask for the parent block
/// of \p I is created only when Legal->isMaskRequired(I); otherwise the recipe
/// gets a null mask.
///
/// NOTE(review): the bare numbers, arrows and "Assuming ..." / "Taking ..." /
/// "Calling ..." lines interleaved below are the static analyzer's path notes,
/// not code.
6780VPWidenMemoryInstructionRecipe *
6781VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
                                VPlanPtr &Plan) {
if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
1
Assuming 'I' is a 'LoadInst'→
  return nullptr;

// Decides, for a given VF, whether this memory instruction is widened: never
// for VF == 1, always for an interleave decision, and otherwise only if the
// instruction is not scalarized.
auto willWiden = [&](unsigned VF) -> bool {
  if (VF == 1)
    return false;
  LoopVectorizationCostModel::InstWidening Decision =
      CM.getWideningDecision(I, VF);
  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&((Decision != LoopVectorizationCostModel::CM_Unknown &&
 "CM decision should be taken at this point.") ? static_cast<
void> (0) : __assert_fail ("Decision != LoopVectorizationCostModel::CM_Unknown && \"CM decision should be taken at this point.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6792, __PRETTY_FUNCTION__))
         "CM decision should be taken at this point.")((Decision != LoopVectorizationCostModel::CM_Unknown &&
 "CM decision should be taken at this point.") ? static_cast<
void> (0) : __assert_fail ("Decision != LoopVectorizationCostModel::CM_Unknown && \"CM decision should be taken at this point.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6792, __PRETTY_FUNCTION__));
  if (Decision == LoopVectorizationCostModel::CM_Interleave)
    return true;
  if (CM.isScalarAfterVectorization(I, VF) ||
      CM.isProfitableToScalarize(I, VF))
    return false;
  return Decision != LoopVectorizationCostModel::CM_Scalarize;
};

if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
2
←
Assuming the condition is false→
3
←
Taking false branch→
  return nullptr;

// A null mask is passed to the recipe unless legality requires one.
VPValue *Mask = nullptr;
if (Legal->isMaskRequired(I))
4
←
Taking true branch→
  Mask = createBlockInMask(I->getParent(), Plan);
5
←
Calling 'VPRecipeBuilder::createBlockInMask'→

VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask);
6810}

6812VPWidenIntOrFpInductionRecipe *
6813VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
if (PHINode *Phi = dyn_cast<PHINode>(I)) {
  // Check if this is an integer or fp induction. If so, build the recipe that
  // produces its scalar and vector values.
  InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
  if (II.getKind() == InductionDescriptor::IK_IntInduction ||
      II.getKind() == InductionDescriptor::IK_FpInduction)
    return new VPWidenIntOrFpInductionRecipe(Phi);

  return nullptr;
}

// Optimize the special case where the source is a constant integer
// induction variable. Notice that we can only optimize the 'trunc' case
// because (a) FP conversions lose precision, (b) sext/zext may wrap, and
// (c) other casts depend on pointer size.

// Determine whether \p K is a truncation based on an induction variable that
// can be optimized.
auto isOptimizableIVTruncate =
    [&](Instruction *K) -> std::function<bool(unsigned)> {
  return
      [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
};

if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
                             isOptimizableIVTruncate(I), Range))
  return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
                                           cast<TruncInst>(I));
return nullptr;
6843}

6845VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
PHINode *Phi = dyn_cast<PHINode>(I);
if (!Phi || Phi->getParent() == OrigLoop->getHeader())
  return nullptr;

// We know that all PHIs in non-header blocks are converted into selects, so
// we don't have to worry about the insertion order and we can just use the
// builder. At this point we generate the predication tree. There may be
// duplications since this is a simple recursive scan, but future
// optimizations will clean it up.

SmallVector<VPValue *, 2> Masks;
unsigned NumIncoming = Phi->getNumIncomingValues();
for (unsigned In = 0; In < NumIncoming; In++) {
  VPValue *EdgeMask =
    createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
  assert((EdgeMask || NumIncoming == 1) &&(((EdgeMask || NumIncoming == 1) && "Multiple predecessors with one having a full mask"
) ? static_cast<void> (0) : __assert_fail ("(EdgeMask || NumIncoming == 1) && \"Multiple predecessors with one having a full mask\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6862, __PRETTY_FUNCTION__))
         "Multiple predecessors with one having a full mask")(((EdgeMask || NumIncoming == 1) && "Multiple predecessors with one having a full mask"
) ? static_cast<void> (0) : __assert_fail ("(EdgeMask || NumIncoming == 1) && \"Multiple predecessors with one having a full mask\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6862, __PRETTY_FUNCTION__));
  if (EdgeMask)
    Masks.push_back(EdgeMask);
}
return new VPBlendRecipe(Phi, Masks);
6867}

6869bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
                               VFRange &Range) {

bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
    [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);

if (IsPredicated)
  return false;

auto IsVectorizableOpcode = [](unsigned Opcode) {
  switch (Opcode) {
  case Instruction::Add:
  case Instruction::And:
  case Instruction::AShr:
  case Instruction::BitCast:
  case Instruction::Br:
  case Instruction::Call:
  case Instruction::FAdd:
  case Instruction::FCmp:
  case Instruction::FDiv:
  case Instruction::FMul:
  case Instruction::FNeg:
  case Instruction::FPExt:
  case Instruction::FPToSI:
  case Instruction::FPToUI:
  case Instruction::FPTrunc:
  case Instruction::FRem:
  case Instruction::FSub:
  case Instruction::ICmp:
  case Instruction::IntToPtr:
  case Instruction::Load:
  case Instruction::LShr:
  case Instruction::Mul:
  case Instruction::Or:
  case Instruction::PHI:
  case Instruction::PtrToInt:
  case Instruction::SDiv:
  case Instruction::Select:
  case Instruction::SExt:
  case Instruction::Shl:
  case Instruction::SIToFP:
  case Instruction::SRem:
  case Instruction::Store:
  case Instruction::Sub:
  case Instruction::Trunc:
  case Instruction::UDiv:
  case Instruction::UIToFP:
  case Instruction::URem:
  case Instruction::Xor:
  case Instruction::ZExt:
    return true;
  }
  return false;
};

if (!IsVectorizableOpcode(I->getOpcode()))
  return false;

if (CallInst *CI = dyn_cast<CallInst>(I)) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
    return false;
}

auto willWiden = [&](unsigned VF) -> bool {
  if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
                           CM.isProfitableToScalarize(I, VF)))
    return false;
  if (CallInst *CI = dyn_cast<CallInst>(I)) {
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag shows whether we use Intrinsic or a usual Call for vectorized
    // version of the instruction.
    // Is it beneficial to perform intrinsic call compared to lib call?
    bool NeedToScalarize;
    unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
    bool UseVectorIntrinsic =
        ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
    return UseVectorIntrinsic || !NeedToScalarize;
  }
  if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
    assert(CM.getWideningDecision(I, VF) ==((CM.getWideningDecision(I, VF) == LoopVectorizationCostModel
::CM_Scalarize && "Memory widening decisions should have been taken care by now"
) ? static_cast<void> (0) : __assert_fail ("CM.getWideningDecision(I, VF) == LoopVectorizationCostModel::CM_Scalarize && \"Memory widening decisions should have been taken care by now\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6953, __PRETTY_FUNCTION__))
               LoopVectorizationCostModel::CM_Scalarize &&((CM.getWideningDecision(I, VF) == LoopVectorizationCostModel
::CM_Scalarize && "Memory widening decisions should have been taken care by now"
) ? static_cast<void> (0) : __assert_fail ("CM.getWideningDecision(I, VF) == LoopVectorizationCostModel::CM_Scalarize && \"Memory widening decisions should have been taken care by now\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6953, __PRETTY_FUNCTION__))
           "Memory widening decisions should have been taken care by now")((CM.getWideningDecision(I, VF) == LoopVectorizationCostModel
::CM_Scalarize && "Memory widening decisions should have been taken care by now"
) ? static_cast<void> (0) : __assert_fail ("CM.getWideningDecision(I, VF) == LoopVectorizationCostModel::CM_Scalarize && \"Memory widening decisions should have been taken care by now\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6953, __PRETTY_FUNCTION__));
    return false;
  }
  return true;
};

if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
  return false;
// If this ingredient's recipe is to be recorded, keep its recipe a singleton
// to avoid having to split recipes later.
bool IsSingleton = Ingredient2Recipe.count(I);

// Success: widen this instruction.

// Use the default widening recipe. We optimize the common case where
// consecutive instructions can be represented by a single recipe.
if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() &&
    LastExtensibleRecipe->appendInstruction(I))
  return true;

VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I);
if (!IsSingleton)
  LastExtensibleRecipe = WidenRecipe;
setRecipe(I, WidenRecipe);
VPBB->appendRecipe(WidenRecipe);
return true;
6979}

6981VPBasicBlock *VPRecipeBuilder::handleReplication(
  Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
  DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
  VPlanPtr &Plan) {
bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
    [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
    Range);

bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
    [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);

auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
setRecipe(I, Recipe);

// Find if I uses a predicated instruction. If so, it will use its scalar
// value. Avoid hoisting the insert-element which packs the scalar value into
// a vector value, as that happens iff all users use the vector value.
for (auto &Op : I->operands())
  if (auto *PredInst = dyn_cast<Instruction>(Op))
    if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
      PredInst2Recipe[PredInst]->setAlsoPack(false);

// Finalize the recipe for Instr, first if it is not predicated.
if (!IsPredicated) {
  LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Scalarizing:" <<
 *I << "\n"; } } while (false);
  VPBB->appendRecipe(Recipe);
  return VPBB;
}
LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Scalarizing and predicating:"
 << *I << "\n"; } } while (false);
assert(VPBB->getSuccessors().empty() &&((VPBB->getSuccessors().empty() && "VPBB has successors when handling predicated replication."
) ? static_cast<void> (0) : __assert_fail ("VPBB->getSuccessors().empty() && \"VPBB has successors when handling predicated replication.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7011, __PRETTY_FUNCTION__))
       "VPBB has successors when handling predicated replication.")((VPBB->getSuccessors().empty() && "VPBB has successors when handling predicated replication."
) ? static_cast<void> (0) : __assert_fail ("VPBB->getSuccessors().empty() && \"VPBB has successors when handling predicated replication.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7011, __PRETTY_FUNCTION__));
// Record predicated instructions for above packing optimizations.
PredInst2Recipe[I] = Recipe;
VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
VPBlockUtils::insertBlockAfter(Region, VPBB);
auto *RegSucc = new VPBasicBlock();
VPBlockUtils::insertBlockAfter(RegSucc, Region);
return RegSucc;
7019}

7021VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                    VPRecipeBase *PredRecipe,
                                                    VPlanPtr &Plan) {
// Instructions marked for predication are replicated and placed under an
// if-then construct to prevent side-effects.

// Generate recipes to compute the block mask for this region.
VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

// Build the triangular if-then region.
std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
assert(Instr->getParent() && "Predicated instruction not in any basic block")((Instr->getParent() && "Predicated instruction not in any basic block"
) ? static_cast<void> (0) : __assert_fail ("Instr->getParent() && \"Predicated instruction not in any basic block\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7032, __PRETTY_FUNCTION__));
auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
auto *PHIRecipe =
    Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

// Note: first set Entry as region entry and then connect successors starting
// from it in order, to propagate the "parent" of each VPBasicBlock.
VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
VPBlockUtils::connectBlocks(Pred, Exit);

return Region;
7047}

7049bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
                                      VPlanPtr &Plan, VPBasicBlock *VPBB) {
VPRecipeBase *Recipe = nullptr;

// First, check for specific widening recipes that deal with memory
// operations, inductions and Phi nodes.
if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
    (Recipe = tryToOptimizeInduction(Instr, Range)) ||
    (Recipe = tryToBlend(Instr, Plan)) ||
    (isa<PHINode>(Instr) &&
     (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
  setRecipe(Instr, Recipe);
  VPBB->appendRecipe(Recipe);
  return true;
}

// Handle GEP widening.
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
  auto Scalarize = [&](unsigned VF) {
    return CM.isScalarWithPredication(Instr, VF) ||
           CM.isScalarAfterVectorization(Instr, VF) ||
           CM.isProfitableToScalarize(Instr, VF);
  };
  if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
    return false;
  VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
  setRecipe(Instr, Recipe);
  VPBB->appendRecipe(Recipe);
  return true;
}

// Check if Instr is to be widened by a general VPWidenRecipe, after
// having first checked for specific widening recipes.
if (tryToWiden(Instr, VPBB, Range))
  return true;

return false;
7086}

7088void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
                                                      unsigned MaxVF) {
assert(OrigLoop->empty() && "Inner loop expected.")((OrigLoop->empty() && "Inner loop expected.") ? static_cast
<void> (0) : __assert_fail ("OrigLoop->empty() && \"Inner loop expected.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7090, __PRETTY_FUNCTION__));

// Collect conditions feeding internal conditional branches; they need to be
// represented in VPlan for it to model masking.
SmallPtrSet<Value *, 1> NeedDef;

auto *Latch = OrigLoop->getLoopLatch();
for (BasicBlock *BB : OrigLoop->blocks()) {
  if (BB == Latch)
    continue;
  BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
  if (Branch && Branch->isConditional())
    NeedDef.insert(Branch->getCondition());
}

// If the tail is to be folded by masking, the primary induction variable
// needs to be represented in VPlan for it to model early-exit masking.
// Also, both the Phi and the live-out instruction of each reduction are
// required in order to introduce a select between them in VPlan.
if (CM.foldTailByMasking()) {
  NeedDef.insert(Legal->getPrimaryInduction());
  for (auto &Reduction : Legal->getReductionVars()) {
    NeedDef.insert(Reduction.first);
    NeedDef.insert(Reduction.second.getLoopExitInstr());
  }
}

// Collect instructions from the original loop that will become trivially dead
// in the vectorized loop. We don't need to vectorize these instructions. For
// example, original induction update instructions can become dead because we
// separately emit induction "steps" when generating code for the new loop.
// Similarly, we create a new latch condition when setting up the structure
// of the new loop, so the old one can become dead.
SmallPtrSet<Instruction *, 4> DeadInstructions;
collectTriviallyDeadInstructions(DeadInstructions);

// Add assume instructions we need to drop to DeadInstructions, to prevent
// them from being added to the VPlan.
// TODO: We only need to drop assumes in blocks that get flattend. If the
// control flow is preserved, we should keep them.
auto &ConditionalAssumes = Legal->getConditionalAssumes();
DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
// Dead instructions do not need sinking. Remove them from SinkAfter.
for (Instruction *I : DeadInstructions)
  SinkAfter.erase(I);

for (unsigned VF = MinVF; VF < MaxVF + 1;) {
  VFRange SubRange = {VF, MaxVF + 1};
  VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
                                           DeadInstructions, SinkAfter));
  VF = SubRange.End;
}
7144}

7146VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
  VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
  SmallPtrSetImpl<Instruction *> &DeadInstructions,
  const DenseMap<Instruction *, Instruction *> &SinkAfter) {

// Hold a mapping from predicated instructions to their recipes, in order to
// fix their AlsoPack behavior if a user is determined to replicate and use a
// scalar instead of vector value.
DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);

// ---------------------------------------------------------------------------
// Pre-construction: record ingredients whose recipes we'll need to further
// process after constructing the initial VPlan.
// ---------------------------------------------------------------------------

// Mark instructions we'll need to sink later and their targets as
// ingredients whose recipe we'll need to record.
for (auto &Entry : SinkAfter) {
  RecipeBuilder.recordRecipeOf(Entry.first);
  RecipeBuilder.recordRecipeOf(Entry.second);
}

// For each interleave group which is relevant for this (possibly trimmed)
// Range, add it to the set of groups to be later applied to the VPlan and add
// placeholders for its members' Recipes which we'll be replacing with a
// single VPInterleaveRecipe.
for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
  auto applyIG = [IG, this](unsigned VF) -> bool {
    return (VF >= 2 && // Query is illegal for VF == 1
            CM.getWideningDecision(IG->getInsertPos(), VF) ==
                LoopVectorizationCostModel::CM_Interleave);
  };
  if (!getDecisionAndClampRange(applyIG, Range))
    continue;
  InterleaveGroups.insert(IG);
  for (unsigned i = 0; i < IG->getFactor(); i++)
    if (Instruction *Member = IG->getMember(i))
      RecipeBuilder.recordRecipeOf(Member);
};

// ---------------------------------------------------------------------------
// Build initial VPlan: Scan the body of the loop in a topological order to
// visit each basic block after having visited its predecessor basic blocks.
// ---------------------------------------------------------------------------

// Create a dummy pre-entry VPBasicBlock to start building the VPlan.
auto Plan = std::make_unique<VPlan>();
VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
Plan->setEntry(VPBB);

// Represent values that will have defs inside VPlan.
for (Value *V : NeedDef)
  Plan->addVPValue(V);

// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
LoopBlocksDFS DFS(OrigLoop);
DFS.perform(LI);

for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
  // Relevant instructions from basic block BB will be grouped into VPRecipe
  // ingredients and fill a new VPBasicBlock.
  unsigned VPBBsForBB = 0;
  auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
  VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
  VPBB = FirstVPBBForBB;
  Builder.setInsertPoint(VPBB);

  // Introduce each ingredient into VPlan.
  for (Instruction &I : BB->instructionsWithoutDebug()) {
    Instruction *Instr = &I;

    // First filter out irrelevant instructions, to ensure no recipes are
    // built for them.
    if (isa<BranchInst>(Instr) ||
        DeadInstructions.find(Instr) != DeadInstructions.end())
      continue;

    if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
      continue;

    // Otherwise, if all widening options failed, Instruction is to be
    // replicated. This may create a successor for VPBB.
    VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
        Instr, Range, VPBB, PredInst2Recipe, Plan);
    if (NextVPBB != VPBB) {
      VPBB = NextVPBB;
      VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                  : "");
    }
  }
}

// Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
// may also be empty, such as the last one VPBB, reflecting original
// basic-blocks with no recipes.
VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
assert(PreEntry->empty() && "Expecting empty pre-entry block.")((PreEntry->empty() && "Expecting empty pre-entry block."
) ? static_cast<void> (0) : __assert_fail ("PreEntry->empty() && \"Expecting empty pre-entry block.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7247, __PRETTY_FUNCTION__));
VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
VPBlockUtils::disconnectBlocks(PreEntry, Entry);
delete PreEntry;

// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
// bring the VPlan to its final state.
// ---------------------------------------------------------------------------

// Apply Sink-After legal constraints.
for (auto &Entry : SinkAfter) {
  VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
  VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
  Sink->moveAfter(Target);
}

// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
for (auto IG : InterleaveGroups) {
  auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
      RecipeBuilder.getRecipe(IG->getInsertPos()));
  (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
      ->insertBefore(Recipe);

  for (unsigned i = 0; i < IG->getFactor(); ++i)
    if (Instruction *Member = IG->getMember(i)) {
      RecipeBuilder.getRecipe(Member)->eraseFromParent();
    }
}

// Finally, if tail is folded by masking, introduce selects between the phi
// and the live-out instruction of each reduction, at the end of the latch.
if (CM.foldTailByMasking()) {
  Builder.setInsertPoint(VPBB);
  auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
  for (auto &Reduction : Legal->getReductionVars()) {
    VPValue *Phi = Plan->getVPValue(Reduction.first);
    VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
    Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
  }
}

std::string PlanName;
raw_string_ostream RSO(PlanName);
unsigned VF = Range.Start;
Plan->addVF(VF);
RSO << "Initial VPlan for VF={" << VF;
for (VF *= 2; VF < Range.End; VF *= 2) {
  Plan->addVF(VF);
  RSO << "," << VF;
}
RSO << "},UF>=1";
RSO.flush();
Plan->setName(PlanName);

return Plan;
7305}

7307VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
// Outer loop handling: They may require CFG and instruction level
// transformations before even evaluating whether vectorization is profitable.
// Since we cannot modify the incoming IR, we need to build VPlan upfront in
// the vectorization pipeline.
assert(!OrigLoop->empty())((!OrigLoop->empty()) ? static_cast<void> (0) : __assert_fail
 ("!OrigLoop->empty()", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7312, __PRETTY_FUNCTION__));
assert(EnableVPlanNativePath && "VPlan-native path is not enabled.")((EnableVPlanNativePath && "VPlan-native path is not enabled."
) ? static_cast<void> (0) : __assert_fail ("EnableVPlanNativePath && \"VPlan-native path is not enabled.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7313, __PRETTY_FUNCTION__));

// Create new empty VPlan
auto Plan = std::make_unique<VPlan>();

// Build hierarchical CFG
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
HCFGBuilder.buildHierarchicalCFG();

for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
  Plan->addVF(VF);

if (EnableVPlanPredication) {
  VPlanPredicator VPP(*Plan);
  VPP.predicate();

  // Avoid running transformation to recipes until masked code generation in
  // VPlan-native path is in place.
  return Plan;
}

SmallPtrSet<Instruction *, 1> DeadInstructions;
VPlanTransforms::VPInstructionsToVPRecipes(
    OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
return Plan;
7338}

7340Value* LoopVectorizationPlanner::VPCallbackILV::
7341getOrCreateVectorValues(Value *V, unsigned Part) {
    return ILV.getOrCreateVectorValue(V, Part);
7343}

7345Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
  Value *V, const VPIteration &Instance) {
return ILV.getOrCreateScalarValue(V, Instance);
7348}

7350void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
O << " +\n"
  << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
IG->getInsertPos()->printAsOperand(O, false);
O << ", ";
getAddr()->printAsOperand(O, SlotTracker);
VPValue *Mask = getMask();
if (Mask) {
  O << ", ";
  Mask->printAsOperand(O, SlotTracker);
}
O << "\\l\"";
for (unsigned i = 0; i < IG->getFactor(); ++i)
  if (Instruction *I = IG->getMember(i))
    O << " +\n"
      << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7367}

7369void VPWidenRecipe::execute(VPTransformState &State) {
for (auto &Instr : make_range(Begin, End))
  State.ILV->widenInstruction(Instr);
7372}

7374void VPWidenGEPRecipe::execute(VPTransformState &State) {
State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
                    IsIndexLoopInvariant);
7377}

7379void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Int or FP induction being replicated.")((!State.Instance && "Int or FP induction being replicated."
) ? static_cast<void> (0) : __assert_fail ("!State.Instance && \"Int or FP induction being replicated.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7380, __PRETTY_FUNCTION__));
State.ILV->widenIntOrFpInduction(IV, Trunc);
7382}

7384void VPWidenPHIRecipe::execute(VPTransformState &State) {
State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7386}

7388void VPBlendRecipe::execute(VPTransformState &State) {
State.ILV->setDebugLocFromInst(State.Builder, Phi);
// We know that all PHIs in non-header blocks are converted into
// selects, so we don't have to worry about the insertion order and we
// can just use the builder.
// At this point we generate the predication tree. There may be
// duplications since this is a simple recursive scan, but future
// optimizations will clean it up.

unsigned NumIncoming = Phi->getNumIncomingValues();

assert((User || NumIncoming == 1) &&(((User || NumIncoming == 1) && "Multiple predecessors with predecessors having a full mask"
) ? static_cast<void> (0) : __assert_fail ("(User || NumIncoming == 1) && \"Multiple predecessors with predecessors having a full mask\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7400, __PRETTY_FUNCTION__))
       "Multiple predecessors with predecessors having a full mask")(((User || NumIncoming == 1) && "Multiple predecessors with predecessors having a full mask"
) ? static_cast<void> (0) : __assert_fail ("(User || NumIncoming == 1) && \"Multiple predecessors with predecessors having a full mask\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7400, __PRETTY_FUNCTION__));
// Generate a sequence of selects of the form:
// SELECT(Mask3, In3,
//      SELECT(Mask2, In2,
//                   ( ...)))
InnerLoopVectorizer::VectorParts Entry(State.UF);
for (unsigned In = 0; In < NumIncoming; ++In) {
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    // We might have single edge PHIs (blocks) - use an identity
    // 'select' for the first PHI operand.
    Value *In0 =
        State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
    if (In == 0)
      Entry[Part] = In0; // Initialize with the first incoming value.
    else {
      // Select between the current value and the previous incoming edge
      // based on the incoming mask.
      Value *Cond = State.get(User->getOperand(In), Part);
      Entry[Part] =
          State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
    }
  }
}
for (unsigned Part = 0; Part < State.UF; ++Part)
  State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7425}

7427void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Interleave group being replicated.")((!State.Instance && "Interleave group being replicated."
) ? static_cast<void> (0) : __assert_fail ("!State.Instance && \"Interleave group being replicated.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7428, __PRETTY_FUNCTION__));
State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
                                    getMask());
7431}

7433void VPReplicateRecipe::execute(VPTransformState &State) {
if (State.Instance) { // Generate a single instance.
  State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
  // Insert scalar instance packing it into a vector.
  if (AlsoPack && State.VF > 1) {
    // If we're constructing lane 0, initialize to start from undef.
    if (State.Instance->Lane == 0) {
      Value *Undef =
          UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
      State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
    }
    State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
  }
  return;
}

// Generate scalar instances for all VF lanes of all UF parts, unless the
// instruction is uniform inwhich case generate only the first lane for each
// of the UF parts.
unsigned EndLane = IsUniform ? 1 : State.VF;
for (unsigned Part = 0; Part < State.UF; ++Part)
  for (unsigned Lane = 0; Lane < EndLane; ++Lane)
    State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7456}

7458void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
assert(State.Instance && "Branch on Mask works only on single instance.")((State.Instance && "Branch on Mask works only on single instance."
) ? static_cast<void> (0) : __assert_fail ("State.Instance && \"Branch on Mask works only on single instance.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7459, __PRETTY_FUNCTION__));

unsigned Part = State.Instance->Part;
unsigned Lane = State.Instance->Lane;

Value *ConditionBit = nullptr;
if (!User) // Block in mask is all-one.
  ConditionBit = State.Builder.getTrue();
else {
  VPValue *BlockInMask = User->getOperand(0);
  ConditionBit = State.get(BlockInMask, Part);
  if (ConditionBit->getType()->isVectorTy())
    ConditionBit = State.Builder.CreateExtractElement(
        ConditionBit, State.Builder.getInt32(Lane));
}

// Replace the temporary unreachable terminator with a new conditional branch,
// whose two destinations will be set later when they are created.
auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
assert(isa<UnreachableInst>(CurrentTerminator) &&((isa<UnreachableInst>(CurrentTerminator) && "Expected to replace unreachable terminator with conditional branch."
) ? static_cast<void> (0) : __assert_fail ("isa<UnreachableInst>(CurrentTerminator) && \"Expected to replace unreachable terminator with conditional branch.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7479, __PRETTY_FUNCTION__))
       "Expected to replace unreachable terminator with conditional branch.")((isa<UnreachableInst>(CurrentTerminator) && "Expected to replace unreachable terminator with conditional branch."
) ? static_cast<void> (0) : __assert_fail ("isa<UnreachableInst>(CurrentTerminator) && \"Expected to replace unreachable terminator with conditional branch.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7479, __PRETTY_FUNCTION__));
auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
CondBr->setSuccessor(0, nullptr);
ReplaceInstWithInst(CurrentTerminator, CondBr);
7483}

// Creates the PHI node that merges the result of a predicated instruction for
// the instance in State.Instance: a vector PHI when a vector value is already
// recorded for PredInst in this part, otherwise a scalar PHI. The value map in
// State is updated to point at the new PHI.
7485void VPPredInstPHIRecipe::execute(VPTransformState &State) {
assert(State.Instance && "Predicated instruction PHI works per instance.")((State.Instance && "Predicated instruction PHI works per instance."
) ? static_cast<void> (0) : __assert_fail ("State.Instance && \"Predicated instruction PHI works per instance.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7486, __PRETTY_FUNCTION__));
// The scalar clone of PredInst lives in the predicated block; that block and
// its single predecessor supply the two incoming edges of the PHI.
Instruction *ScalarPredInst = cast<Instruction>(
    State.ValueMap.getScalarValue(PredInst, *State.Instance));
BasicBlock *PredicatedBB = ScalarPredInst->getParent();
BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
assert(PredicatingBB && "Predicated block has no single predecessor.")((PredicatingBB && "Predicated block has no single predecessor."
) ? static_cast<void> (0) : __assert_fail ("PredicatingBB && \"Predicated block has no single predecessor.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7491, __PRETTY_FUNCTION__));

// By current pack/unpack logic we need to generate only a single phi node: if
// a vector value for the predicated instruction exists at this point it means
// the instruction has vector users only, and a phi for the vector value is
// needed. In this case the recipe of the predicated instruction is marked to
// also do that packing, thereby "hoisting" the insert-element sequence.
// Otherwise, a phi node for the scalar value is needed.
unsigned Part = State.Instance->Part;
if (State.ValueMap.hasVectorValue(PredInst, Part)) {
  Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
  InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
  PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
  VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
  VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
  State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
} else {
  Type *PredInstType = PredInst->getType();
  PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
  // The value arriving from PredicatingBB is undef; the real value arrives
  // from PredicatedBB.
  Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
  Phi->addIncoming(ScalarPredInst, PredicatedBB);
  State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
}
7514}

// Forwards to State.ILV->vectorizeMemoryInstruction, passing &Instr, State and
// this recipe's getAddr() and getMask() results.
7516void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask());
7518}

7520// Determine how to lower the scalar epilogue, which depends on 1) optimising
7521// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
7522// predication, and 4) a TTI hook that analyses whether the loop is suitable
7523// for predication.
// Returns one of CM_ScalarEpilogueNotAllowedOptSize, CM_ScalarEpilogueAllowed
// or CM_ScalarEpilogueNotNeededUsePredicate; the checks are applied in the
// order listed above and the first one that matches decides the result.
7524static ScalarEpilogueLowering getScalarEpilogueLowering(
  Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
  BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
  AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
  LoopVectorizationLegality &LVL) {
bool OptSize =
    F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                                   PGSOQueryType::IRPass);
// 1) OptSize takes precedence over all other options, i.e. if this is set,
// don't look at hints or options, and don't request a scalar epilogue.
// The one exception is a loop whose vectorization is explicitly forced.
if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
  return CM_ScalarEpilogueNotAllowedOptSize;

// True only when the option was given on the command line and set to false.
bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
                            !PreferPredicateOverEpilog;

// 2) Next, if disabling predication is requested on the command line, honour
// this and request a scalar epilogue. Also do this if we don't have a
// primary induction variable, which is required for predication.
if (PredicateOptDisabled || !LVL.getPrimaryInduction())
  return CM_ScalarEpilogueAllowed;

// 3) and 4) look if enabling predication is requested on the command line,
// with a loop hint, or if the TTI hook indicates this is profitable, request
// predication.
if (PreferPredicateOverEpilog ||
    Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
    (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                      LVL.getLAI()) &&
     Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
  return CM_ScalarEpilogueNotNeededUsePredicate;

return CM_ScalarEpilogueAllowed;
7557}

7559// Process the loop in the VPlan-native vectorization path. This path builds
7560// VPlan upfront in the vectorization pipeline, which allows to apply
7561// VPlan-to-VPlan transformations from the very beginning without modifying the
7562// input LLVM IR.
// Returns true if the plan was executed for the loop, false if the function
// bailed out before generating any code.
7563static bool processLoopInVPlanNativePath(
  Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
  LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
  TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
  OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
  ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {

assert(EnableVPlanNativePath && "VPlan-native path is disabled.")((EnableVPlanNativePath && "VPlan-native path is disabled."
) ? static_cast<void> (0) : __assert_fail ("EnableVPlanNativePath && \"VPlan-native path is disabled.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7570, __PRETTY_FUNCTION__));
Function *F = L->getHeader()->getParent();
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
    F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                              &Hints, IAI);
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);

// Get user vectorization factor.
const unsigned UserVF = Hints.getWidth();

// Plan how to best vectorize, return the best VF and its cost.
const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

// If we are stress testing VPlan builds, do not attempt to generate vector
// code. Masked vector code generation support will follow soon.
// Also, do not attempt to vectorize if no vector code will be produced.
if (VPlanBuildStressTest || EnableVPlanPredication ||
    VectorizationFactor::Disabled() == VF)
  return false;

// The interleave count is fixed to 1 on this path (here and for LB below).
LVP.setBestPlan(VF.Width, 1);

InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                       &CM);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Vectorizing outer loop in \""
 << L->getHeader()->getParent()->getName() <<
 "\"\n"; } } while (false)
                  << L->getHeader()->getParent()->getName() << "\"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Vectorizing outer loop in \""
 << L->getHeader()->getParent()->getName() <<
 "\"\n"; } } while (false);
LVP.executePlan(LB, DT);

// Mark the loop as already vectorized to avoid vectorizing again.
Hints.setAlreadyVectorized();

LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { verifyFunction(*L->getHeader()->getParent
()); } } while (false);
return true;
7610}

// Tries to vectorize and/or interleave loop L. Non-empty (outer) loops are
// handed to processLoopInVPlanNativePath once legality has been established;
// inner loops go through the cost model and planner below. Returns false on
// every bail-out taken before a plan is executed, and true once a plan has
// been executed for the loop.
7612bool LoopVectorizePass::processLoop(Loop *L) {
assert((EnableVPlanNativePath || L->empty()) &&(((EnableVPlanNativePath || L->empty()) && "VPlan-native path is not enabled. Only process inner loops."
) ? static_cast<void> (0) : __assert_fail ("(EnableVPlanNativePath || L->empty()) && \"VPlan-native path is not enabled. Only process inner loops.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7614, __PRETTY_FUNCTION__))
       "VPlan-native path is not enabled. Only process inner loops.")(((EnableVPlanNativePath || L->empty()) && "VPlan-native path is not enabled. Only process inner loops."
) ? static_cast<void> (0) : __assert_fail ("(EnableVPlanNativePath || L->empty()) && \"VPlan-native path is not enabled. Only process inner loops.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7614, __PRETTY_FUNCTION__));

// DebugLocStr is only referenced from LLVM_DEBUG statements in this function.
7616#ifndef NDEBUG
const std::string DebugLocStr = getDebugLocString(L);
7618#endif /* NDEBUG */

LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "\nLV: Checking a loop in \""
 << L->getHeader()->getParent()->getName() <<
 "\" from " << DebugLocStr << "\n"; } } while (false
)
                  << L->getHeader()->getParent()->getName() << "\" from "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "\nLV: Checking a loop in \""
 << L->getHeader()->getParent()->getName() <<
 "\" from " << DebugLocStr << "\n"; } } while (false
)
                  << DebugLocStr << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "\nLV: Checking a loop in \""
 << L->getHeader()->getParent()->getName() <<
 "\" from " << DebugLocStr << "\n"; } } while (false
);

LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints:" <<
 " force=" << (Hints.getForce() == LoopVectorizeHints::
FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints
::FK_Enabled ? "enabled" : "?")) << " width=" << Hints
.getWidth() << " unroll=" << Hints.getInterleave(
) << "\n"; } } while (false)
    dbgs() << "LV: Loop hints:"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints:" <<
 " force=" << (Hints.getForce() == LoopVectorizeHints::
FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints
::FK_Enabled ? "enabled" : "?")) << " width=" << Hints
.getWidth() << " unroll=" << Hints.getInterleave(
) << "\n"; } } while (false)
           << " force="do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints:" <<
 " force=" << (Hints.getForce() == LoopVectorizeHints::
FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints
::FK_Enabled ? "enabled" : "?")) << " width=" << Hints
.getWidth() << " unroll=" << Hints.getInterleave(
) << "\n"; } } while (false)
           << (Hints.getForce() == LoopVectorizeHints::FK_Disableddo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints:" <<
 " force=" << (Hints.getForce() == LoopVectorizeHints::
FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints
::FK_Enabled ? "enabled" : "?")) << " width=" << Hints
.getWidth() << " unroll=" << Hints.getInterleave(
) << "\n"; } } while (false)
                   ? "disabled"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints:" <<
 " force=" << (Hints.getForce() == LoopVectorizeHints::
FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints
::FK_Enabled ? "enabled" : "?")) << " width=" << Hints
.getWidth() << " unroll=" << Hints.getInterleave(
) << "\n"; } } while (false)
                   : (Hints.getForce() == LoopVectorizeHints::FK_Enableddo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints:" <<
 " force=" << (Hints.getForce() == LoopVectorizeHints::
FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints
::FK_Enabled ? "enabled" : "?")) << " width=" << Hints
.getWidth() << " unroll=" << Hints.getInterleave(
) << "\n"; } } while (false)
                          ? "enabled"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints:" <<
 " force=" << (Hints.getForce() == LoopVectorizeHints::
FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints
::FK_Enabled ? "enabled" : "?")) << " width=" << Hints
.getWidth() << " unroll=" << Hints.getInterleave(
) << "\n"; } } while (false)
                          : "?"))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints:" <<
 " force=" << (Hints.getForce() == LoopVectorizeHints::
FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints
::FK_Enabled ? "enabled" : "?")) << " width=" << Hints
.getWidth() << " unroll=" << Hints.getInterleave(
) << "\n"; } } while (false)
           << " width=" << Hints.getWidth()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints:" <<
 " force=" << (Hints.getForce() == LoopVectorizeHints::
FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints
::FK_Enabled ? "enabled" : "?")) << " width=" << Hints
.getWidth() << " unroll=" << Hints.getInterleave(
) << "\n"; } } while (false)
           << " unroll=" << Hints.getInterleave() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints:" <<
 " force=" << (Hints.getForce() == LoopVectorizeHints::
FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints
::FK_Enabled ? "enabled" : "?")) << " width=" << Hints
.getWidth() << " unroll=" << Hints.getInterleave(
) << "\n"; } } while (false);

// Function containing loop
Function *F = L->getHeader()->getParent();

// Looking at the diagnostic output is the only way to determine if a loop
// was vectorized (other than looking at the IR or machine code), so it
// is important to generate an optimization remark for each loop. Most of
// these messages are generated as OptimizationRemarkAnalysis. Remarks
// generated as OptimizationRemark and OptimizationRemarkMissed are
// less verbose reporting vectorized loops and unvectorized loops that may
// benefit from vectorization, respectively.

if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
  LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints prevent vectorization.\n"
; } } while (false);
  return false;
}

PredicatedScalarEvolution PSE(*SE, *L);

// Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements(*ORE);
LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                              &Requirements, &Hints, DB, AC);
if (!LVL.canVectorize(EnableVPlanNativePath)) {
  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"
; } } while (false);
  Hints.emitRemarkWithHints();
  return false;
}

// Check the function attributes and profiles to find out if this function
// should be optimized for size.
ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
    F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

// Entrance to the VPlan-native vectorization path. Outer loops are processed
// here. They may require CFG and instruction level transformations before
// even evaluating whether vectorization is profitable. Since we cannot modify
// the incoming IR, we need to build VPlan upfront in the vectorization
// pipeline.
if (!L->empty())
  return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                      ORE, BFI, PSI, Hints);

assert(L->empty() && "Inner loop expected.")((L->empty() && "Inner loop expected.") ? static_cast
<void> (0) : __assert_fail ("L->empty() && \"Inner loop expected.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7679, __PRETTY_FUNCTION__));

// Check the loop for a trip count threshold: vectorize loops with a tiny trip
// count by optimizing for size, to minimize overheads.
auto ExpectedTC = getSmallBestKnownTC(*SE, L);
if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
  LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found a loop with a very small trip count. "
 << "This loop is worth vectorizing only if no scalar "
 << "iteration overheads are incurred."; } } while (false
)
                    << "This loop is worth vectorizing only if no scalar "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found a loop with a very small trip count. "
 << "This loop is worth vectorizing only if no scalar "
 << "iteration overheads are incurred."; } } while (false
)
                    << "iteration overheads are incurred.")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found a loop with a very small trip count. "
 << "This loop is worth vectorizing only if no scalar "
 << "iteration overheads are incurred."; } } while (false
);
  if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
    LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << " But vectorizing was explicitly forced.\n"
; } } while (false);
  else {
    LLVM_DEBUG(dbgs() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "\n"; } } while (false);
    SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
  }
}

// Check the function attributes to see if implicit floats are allowed.
// FIXME: This check doesn't seem possibly correct -- what if the loop is
// an integer loop and the vector instructions selected are purely integer
// vector instructions?
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
  reportVectorizationFailure(
      "Can't vectorize when the NoImplicitFloat attribute is used",
      "loop not vectorized due to NoImplicitFloat attribute",
      "NoImplicitFloat", ORE, L);
  Hints.emitRemarkWithHints();
  return false;
}

// Check if the target supports potentially unsafe FP vectorization.
// FIXME: Add a check for the type of safety issue (denormal, signaling)
// for the target we're vectorizing for, to make sure none of the
// additional fp-math flags can help.
if (Hints.isPotentiallyUnsafe() &&
    TTI->isFPVectorizationPotentiallyUnsafe()) {
  reportVectorizationFailure(
      "Potentially unsafe FP op prevents vectorization",
      "loop not vectorized due to unsafe FP support.",
      "UnsafeFP", ORE, L);
  Hints.emitRemarkWithHints();
  return false;
}

bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

// If an override option has been passed in for interleaved accesses, use it.
if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
  UseInterleaved = EnableInterleavedMemAccesses;

// Analyze interleaved memory accesses.
if (UseInterleaved) {
  IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
}

// Use the cost model.
LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                              F, &Hints, IAI);
CM.collectValuesToIgnore();

// Use the planner for vectorization.
LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);

// Get user vectorization factor.
unsigned UserVF = Hints.getWidth();

// Plan how to best vectorize, return the best VF and its cost.
Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);

// When the planner returns no factor, VF stays Disabled() and IC stays 1.
VectorizationFactor VF = VectorizationFactor::Disabled();
unsigned IC = 1;
unsigned UserIC = Hints.getInterleave();

if (MaybeVF) {
  VF = *MaybeVF;
  // Select the interleave count.
  IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
}

// Identify the diagnostic messages that should be produced.
std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
bool VectorizeLoop = true, InterleaveLoop = true;
if (Requirements.doesNotMeet(F, L, Hints)) {
  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
 "requirements.\n"; } } while (false)
                       "requirements.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
 "requirements.\n"; } } while (false);
  Hints.emitRemarkWithHints();
  return false;
}

if (VF.Width == 1) {
  LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Vectorization is possible but not beneficial.\n"
; } } while (false);
  VecDiagMsg = std::make_pair(
      "VectorizationNotBeneficial",
      "the cost-model indicates that vectorization is not beneficial");
  VectorizeLoop = false;
}

if (!MaybeVF && UserIC > 1) {
  // Tell the user interleaving was avoided up-front, despite being explicitly
  // requested.
  LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Ignoring UserIC, because vectorization and "
 "interleaving should be avoided up front\n"; } } while (false
)
                       "interleaving should be avoided up front\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Ignoring UserIC, because vectorization and "
 "interleaving should be avoided up front\n"; } } while (false
);
  IntDiagMsg = std::make_pair(
      "InterleavingAvoided",
      "Ignoring UserIC, because interleaving was avoided up front");
  InterleaveLoop = false;
} else if (IC == 1 && UserIC <= 1) {
  // Tell the user interleaving is not beneficial.
  LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleaving is not beneficial.\n"
; } } while (false);
  IntDiagMsg = std::make_pair(
      "InterleavingNotBeneficial",
      "the cost-model indicates that interleaving is not beneficial");
  InterleaveLoop = false;
  if (UserIC == 1) {
    IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
    IntDiagMsg.second +=
        " and is explicitly disabled or interleave count is set to 1";
  }
} else if (IC > 1 && UserIC == 1) {
  // Tell the user interleaving is beneficial, but it is explicitly disabled.
  LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."
; } } while (false)
      dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."
; } } while (false);
  IntDiagMsg = std::make_pair(
      "InterleavingBeneficialButDisabled",
      "the cost-model indicates that interleaving is beneficial "
      "but is explicitly disabled or interleave count is set to 1");
  InterleaveLoop = false;
}

// Override IC if user provided an interleave count.
IC = UserIC > 0 ? UserIC : IC;

// Emit diagnostic messages, if any.
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
  // Do not vectorize or interleave the loop.
  ORE->emit([&]() {
    return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                    L->getStartLoc(), L->getHeader())
           << VecDiagMsg.second;
  });
  ORE->emit([&]() {
    return OptimizationRemarkMissed(LV_NAME"loop-vectorize", IntDiagMsg.first,
                                    L->getStartLoc(), L->getHeader())
           << IntDiagMsg.second;
  });
  return false;
} else if (!VectorizeLoop && InterleaveLoop) {
  LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleave Count is "
 << IC << '\n'; } } while (false);
  ORE->emit([&]() {
    return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
           << VecDiagMsg.second;
  });
} else if (VectorizeLoop && !InterleaveLoop) {
  LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Widthdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found a vectorizable loop ("
 << VF.Width << ") in " << DebugLocStr <<
 '\n'; } } while (false)
                    << ") in " << DebugLocStr << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found a vectorizable loop ("
 << VF.Width << ") in " << DebugLocStr <<
 '\n'; } } while (false);
  ORE->emit([&]() {
    return OptimizationRemarkAnalysis(LV_NAME"loop-vectorize", IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
           << IntDiagMsg.second;
  });
} else if (VectorizeLoop && InterleaveLoop) {
  LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Widthdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found a vectorizable loop ("
 << VF.Width << ") in " << DebugLocStr <<
 '\n'; } } while (false)
                    << ") in " << DebugLocStr << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found a vectorizable loop ("
 << VF.Width << ") in " << DebugLocStr <<
 '\n'; } } while (false);
  LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleave Count is "
 << IC << '\n'; } } while (false);
}

// No bail-out remains past this point: every path below executes a plan and
// the function returns true.
LVP.setBestPlan(VF.Width, IC);

using namespace ore;
bool DisableRuntimeUnroll = false;
MDNode *OrigLoopID = L->getLoopID();

if (!VectorizeLoop) {
  assert(IC > 1 && "interleave count should not be 1 or 0")((IC > 1 && "interleave count should not be 1 or 0"
) ? static_cast<void> (0) : __assert_fail ("IC > 1 && \"interleave count should not be 1 or 0\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7855, __PRETTY_FUNCTION__));
  // If we decided that it is not beneficial to vectorize the loop, then
  // interleave it.
  InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                             &CM);
  LVP.executePlan(Unroller, DT);

  ORE->emit([&]() {
    return OptimizationRemark(LV_NAME"loop-vectorize", "Interleaved", L->getStartLoc(),
                              L->getHeader())
           << "interleaved loop (interleaved count: "
           << NV("InterleaveCount", IC) << ")";
  });
} else {
  // If we decided that it is *legal* to vectorize the loop, then do it.
  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                         &LVL, &CM);
  LVP.executePlan(LB, DT);
  ++LoopsVectorized;

  // Add metadata to disable runtime unrolling a scalar loop when there are
  // no runtime checks about strides and memory. A scalar loop that is
  // rarely used is not worth unrolling.
  if (!LB.areSafetyChecksAdded())
    DisableRuntimeUnroll = true;

  // Report the vectorization decision.
  ORE->emit([&]() {
    return OptimizationRemark(LV_NAME"loop-vectorize", "Vectorized", L->getStartLoc(),
                              L->getHeader())
           << "vectorized loop (vectorization width: "
           << NV("VectorizationFactor", VF.Width)
           << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
  });
}

// If follow-up loop metadata is produced for the original loop ID, it replaces
// the loop ID; otherwise the loop is tagged as already vectorized.
Optional<MDNode *> RemainderLoopID =
    makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                    LLVMLoopVectorizeFollowupEpilogue});
if (RemainderLoopID.hasValue()) {
  L->setLoopID(RemainderLoopID.getValue());
} else {
  if (DisableRuntimeUnroll)
    AddRuntimeUnrollDisableMetaData(L);

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
}

LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { verifyFunction(*L->getHeader()->getParent
()); } } while (false);
return true;
7906}

// Stores the supplied analyses in the pass's members, simplifies the loops
// obtained by iterating LI, collects the supported loops into a worklist and
// calls processLoop on each of them. Returns true if loop simplification,
// LCSSA formation or processLoop reported a change.
7908bool LoopVectorizePass::runImpl(
  Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
  DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
  DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
  std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
  OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
SE = &SE_;
LI = &LI_;
TTI = &TTI_;
DT = &DT_;
BFI = &BFI_;
TLI = TLI_;
AA = &AA_;
AC = &AC_;
GetLAA = &GetLAA_;
DB = &DB_;
ORE = &ORE_;
PSI = PSI_;

// Don't attempt if
// 1. the target claims to have no vector registers, and
// 2. interleaving won't help ILP.
//
// The second condition is necessary because, even if the target has no
// vector registers, loop vectorization may still enable scalar
// interleaving.
if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
    TTI->getMaxInterleaveFactor(1) < 2)
  return false;

bool Changed = false;

// The vectorizer requires loops to be in simplified form.
// Since simplification may add new inner loops, it has to run before the
// legality and profitability checks. This means running the loop vectorizer
// will simplify all loops, regardless of whether anything ends up being
// vectorized.
for (auto &L : *LI)
  Changed |=
      simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

// Build up a worklist of inner-loops to vectorize. This is necessary as
// the act of vectorizing or partially unrolling a loop creates new loops
// and can invalidate iterators across the loops.
SmallVector<Loop *, 8> Worklist;

for (Loop *L : *LI)
  collectSupportedLoops(*L, LI, ORE, Worklist);

LoopsAnalyzed += Worklist.size();

// Now walk the identified inner loops.
while (!Worklist.empty()) {
  Loop *L = Worklist.pop_back_val();

  // For the inner loops we actually process, form LCSSA to simplify the
  // transform.
  Changed |= formLCSSARecursively(*L, *DT, LI, SE);

  Changed |= processLoop(L);
}

// Process each loop nest in the function.
return Changed;
7972}

// Fetches the analyses runImpl needs from AM, invokes runImpl and returns the
// set of preserved analyses: all of them when nothing changed; otherwise
// BasicAA and GlobalsAA, plus LoopAnalysis and DominatorTreeAnalysis when
// EnableVPlanNativePath is off.
7974PreservedAnalyses LoopVectorizePass::run(Function &F,
                                       FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  // MemorySSA is requested only when EnableMSSALoopDependency is set;
  // otherwise a null pointer is passed on.
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  // GetLAA looks up the LoopAccessAnalysis result for a loop through the loop
  // analysis manager; it captures the analyses above by reference.
  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  // Only a cached ProfileSummaryAnalysis result is used here (getCachedResult
  // rather than getResult).
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
8017}

←

/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

1//===- LoopVectorizationPlanner.h - Planner for LoopVectorization ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file provides a LoopVectorizationPlanner class.
11/// InnerLoopVectorizer vectorizes loops which contain only one basic
12/// LoopVectorizationPlanner - drives the vectorization process after having
13/// passed Legality checks.
14/// The planner builds and optimizes the Vectorization Plans which record the
15/// decisions how to vectorize the given loop. In particular, represent the
16/// control-flow of the vectorized version, the replication of instructions that
17/// are to be scalarized, and interleave access groups.
18///
19/// Also provides a VPlan-based builder utility analogous to IRBuilder.
20/// It provides an instruction-level API for generating VPInstructions while
21/// abstracting away the Recipe manipulation details.
22//===----------------------------------------------------------------------===//

24#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
25#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H

27#include "VPlan.h"
28#include "llvm/Analysis/LoopInfo.h"
29#include "llvm/Analysis/TargetLibraryInfo.h"
30#include "llvm/Analysis/TargetTransformInfo.h"

32namespace llvm {

34/// VPlan-based builder utility analogous to IRBuilder.
35class VPBuilder {
36private:
VPBasicBlock *BB = nullptr;
VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();

/// Allocates a new VPInstruction with the given opcode and operands. If an
/// insertion block is set, the instruction is inserted into it at InsertPt;
/// otherwise it is returned without being inserted into any block.
/// NOTE(review): in the no-block case nothing visible here takes ownership of
/// the allocation, so the caller presumably has to; confirm. The analyzer path
/// notes embedded in this body mark it as the allocation site of the reported
/// potential leak.
VPInstruction *createInstruction(unsigned Opcode,
                                 ArrayRef<VPValue *> Operands) {
  VPInstruction *Instr = new VPInstruction(Opcode, Operands);
22
←
Memory is allocated→
  if (BB)
23
←
Assuming field 'BB' is null→
24
←
Taking false branch→
    BB->insert(Instr, InsertPt);
  return Instr;
}

/// Convenience overload: wraps the initializer list in an ArrayRef and
/// forwards to the ArrayRef overload, returning its result unchanged.
VPInstruction *createInstruction(unsigned Opcode,
                                 std::initializer_list<VPValue *> Operands) {
  return createInstruction(Opcode, ArrayRef<VPValue *>(Operands));
21
←
Calling 'VPBuilder::createInstruction'→
25
←
Returned allocated memory→
}

53public:
/// Creates a builder with no insertion point: BB and InsertPt keep their
/// in-class default initializers (null block, default-constructed iterator).
VPBuilder() {}

/// Drop the current insertion point. Instructions created afterwards are not
/// inserted into any block.
void clearInsertionPoint() {
  InsertPt = VPBasicBlock::iterator();
  BB = nullptr;
}

/// \return the block new instructions are inserted into, or null if no
/// insertion point is set.
VPBasicBlock *getInsertBlock() const {
  return BB;
}
/// \return the position at which new instructions are inserted.
VPBasicBlock::iterator getInsertPoint() const {
  return InsertPt;
}

/// A saved insertion point: a block together with a position inside it.
class VPInsertPoint {
  VPBasicBlock *Block = nullptr;
  VPBasicBlock::iterator Point;

public:
  /// Builds an insertion point that is not set (it has no block).
  VPInsertPoint() = default;

  /// Builds an insertion point referring to \p InsertPoint in \p InsertBlock.
  VPInsertPoint(VPBasicBlock *InsertBlock, VPBasicBlock::iterator InsertPoint)
      : Block(InsertBlock), Point(InsertPoint) {}

  /// \return true when a block has been recorded for this insertion point.
  bool isSet() const { return nullptr != Block; }

  /// \return the recorded block (null when the insertion point is not set).
  VPBasicBlock *getBlock() const { return Block; }

  /// \return the recorded position.
  VPBasicBlock::iterator getPoint() const { return Point; }
};

/// Make \p IP the current insertion point. An insertion point that is not
/// set clears the builder's insertion point instead.
void restoreIP(VPInsertPoint IP) {
  if (!IP.isSet()) {
    clearInsertionPoint();
    return;
  }
  setInsertPoint(IP.getBlock(), IP.getPoint());
}

/// This specifies that created VPInstructions should be appended to the end
/// of the specified block.
/// \param TheBB the block to append to; must not be null (asserted).
void setInsertPoint(VPBasicBlock *TheBB) {
  assert(TheBB && "Attempting to set a null insert point");
  BB = TheBB;
  // Read end() from the stored block so that new instructions are appended.
  InsertPt = BB->end();
}

/// Set the insertion point to position \p IP within block \p TheBB; created
/// instructions are inserted at that position.
void setInsertPoint(VPBasicBlock *TheBB, VPBasicBlock::iterator IP) {
  InsertPt = IP;
  BB = TheBB;
}

/// Insert \p I at the current insertion point and return it.
/// An insert block must have been set beforehand (see setInsertPoint());
/// without one the insertion would dereference a null block pointer, so this
/// is checked with an assertion.
VPInstruction *insert(VPInstruction *I) const {
  assert(BB && "Attempting to insert without an insertion block");
  BB->insert(I, InsertPt);
  return I;
}

/// Build an N-ary operation from \p Opcode and \p Operands, record \p Inst
/// as the underlying Instruction of the new VPInstruction, and return it.
VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
                      Instruction *Inst = nullptr) {
  VPInstruction *const Created = createInstruction(Opcode, Operands);
  Created->setUnderlyingValue(Inst);
  return Created;
}
/// Overload of createNaryOp() taking the operands as an initializer list;
/// forwards to the ArrayRef overload.
VPValue *createNaryOp(unsigned Opcode,
                      std::initializer_list<VPValue *> Operands,
                      Instruction *Inst = nullptr) {
  ArrayRef<VPValue *> OperandsRef(Operands);
  return createNaryOp(Opcode, OperandsRef, Inst);
}

/// Create a VPInstruction with opcode VPInstruction::Not applied to
/// \p Operand.
VPValue *createNot(VPValue *Operand) {
  VPInstruction *NotInstr = createInstruction(VPInstruction::Not, {Operand});
  return NotInstr;
}

/// Create a VPInstruction with opcode Instruction::BinaryOps::And whose
/// operands are \p LHS and \p RHS.
VPValue *createAnd(VPValue *LHS, VPValue *RHS) {
  VPInstruction *AndInstr =
      createInstruction(Instruction::BinaryOps::And, {LHS, RHS});
  return AndInstr;
}

/// Create a VPInstruction with opcode Instruction::BinaryOps::Or whose
/// operands are \p LHS and \p RHS, via createInstruction().
VPValue *createOr(VPValue *LHS, VPValue *RHS) {
  return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
20
←
Calling 'VPBuilder::createInstruction'→
26
←
Returned allocated memory→
}

//===--------------------------------------------------------------------===//
// RAII helpers.
//===--------------------------------------------------------------------===//

/// RAII helper: records the builder's insertion point on construction and
/// puts it back when the guard is destroyed.
class InsertPointGuard {
  VPBuilder &Builder;
  VPBasicBlock *Block;
  VPBasicBlock::iterator Point;

public:
  InsertPointGuard(VPBuilder &B)
      : Builder(B), Block(B.getInsertBlock()), Point(B.getInsertPoint()) {}

  // Copying is disabled.
  InsertPointGuard(const InsertPointGuard &) = delete;
  InsertPointGuard &operator=(const InsertPointGuard &) = delete;

  ~InsertPointGuard() {
    // A saved null block means no insertion point was set when the guard was
    // created, so the builder's insertion point is cleared again.
    if (Block != nullptr)
      Builder.setInsertPoint(Block, Point);
    else
      Builder.clearInsertionPoint();
  }
};
161};

163/// TODO: The following VectorizationFactor was pulled out of
164/// LoopVectorizationCostModel class. LV also deals with
165/// VectorizerParams::VectorizationFactor and VectorizationCostTy.
166/// We need to streamline them.

/// Information about vectorization costs: a vector width paired with the
/// cost of the loop at that width.
struct VectorizationFactor {
  /// Vector width with best cost.
  unsigned Width;
  /// Cost of the loop with that width.
  unsigned Cost;

  /// \return the factor {Width = 1, Cost = 0}.
  /// Width 1 means no vectorization, cost 0 means uncomputed cost.
  static VectorizationFactor Disabled() { return {1, 0}; }

  /// Two factors are equal when both their widths and their costs match.
  bool operator==(const VectorizationFactor &rhs) const {
    return Width == rhs.Width && Cost == rhs.Cost;
  }
};

183/// Planner drives the vectorization process after having passed
184/// Legality checks.
185class LoopVectorizationPlanner {
/// The loop that we evaluate.
Loop *OrigLoop;

/// Loop Info analysis.
LoopInfo *LI;

/// Target Library Info.
const TargetLibraryInfo *TLI;

/// Target Transform Info.
const TargetTransformInfo *TTI;

/// The legality analysis.
LoopVectorizationLegality *Legal;

/// The profitability analysis.
LoopVectorizationCostModel &CM;

/// The interleaved access analysis.
InterleavedAccessInfo &IAI;

SmallVector<VPlanPtr, 4> VPlans;

/// This class is used to enable the VPlan to invoke a method of ILV. This is
/// needed until the method is refactored out of ILV and becomes reusable.
struct VPCallbackILV : public VPCallback {
  InnerLoopVectorizer &ILV;

  VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}

  Value *getOrCreateVectorValues(Value *V, unsigned Part) override;
  Value *getOrCreateScalarValue(Value *V,
                                const VPIteration &Instance) override;
};

/// A builder used to construct the current plan.
VPBuilder Builder;

unsigned BestVF = 0;
unsigned BestUF = 0;

227public:
LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
                         const TargetTransformInfo *TTI,
                         LoopVectorizationLegality *Legal,
                         LoopVectorizationCostModel &CM,
                         InterleavedAccessInfo &IAI)
    : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
      IAI(IAI) {}

/// Plan how to best vectorize, return the best VF and its cost, or None if
/// vectorization and interleaving should be avoided up front.
Optional<VectorizationFactor> plan(unsigned UserVF);

/// Use the VPlan-native path to plan how to best vectorize, return the best
/// VF and its cost.
VectorizationFactor planInVPlanNativePath(unsigned UserVF);

/// Finalize the best decision and dispose of all other VPlans.
void setBestPlan(unsigned VF, unsigned UF);

/// Generate the IR code for the body of the vectorized loop according to the
/// best selected VPlan.
void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);

void printPlans(raw_ostream &O) {
  for (const auto &Plan : VPlans)
    O << *Plan;
}

/// Test a \p Predicate on a \p Range of VF's. Return the value of applying
/// \p Predicate on Range.Start, possibly decreasing Range.End such that the
/// returned value holds for the entire \p Range.
static bool
getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
                         VFRange &Range);

263protected:
/// Collect the instructions from the original loop that would be trivially
/// dead in the vectorized loop if generated.
void collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions);

/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
/// legal to vectorize the loop.
void buildVPlans(unsigned MinVF, unsigned MaxVF);

274private:
/// Build a VPlan according to the information gathered by Legal. \return a
/// VPlan for vectorization factors \p Range.Start and up to \p Range.End
/// exclusive, possibly decreasing \p Range.End.
VPlanPtr buildVPlan(VFRange &Range);

/// Build a VPlan using VPRecipes according to the information gather by
/// Legal. This method is only used for the legacy inner loop vectorizer.
VPlanPtr buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
    SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const DenseMap<Instruction *, Instruction *> &SinkAfter);

/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
/// legal to vectorize the loop. This method creates VPlans using VPRecipes.
void buildVPlansWithVPRecipes(unsigned MinVF, unsigned MaxVF);
291};

293} // namespace llvm

295#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H