/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp

Bug Summary

File:	lib/Transforms/Vectorize/LoopVectorize.cpp
Warning:	line 6562, column 35 Potential leak of memory pointed to by 'BlockMask'

Annotated Source Code

Press '?' to see keyboard shortcuts

Show analyzer invocation

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name LoopVectorize.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-eagerly-assume -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -mrelocation-model pic -pic-level 2 -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -momit-leaf-frame-pointer -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-7/lib/clang/7.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-7~svn338205/build-llvm/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize -I /build/llvm-toolchain-snapshot-7~svn338205/build-llvm/include -I /build/llvm-toolchain-snapshot-7~svn338205/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/x86_64-linux-gnu/c++/8 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/x86_64-linux-gnu/c++/8 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8/backward -internal-isystem /usr/include/clang/7.0.0/include/ -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-7/lib/clang/7.0.0/include -internal-externc-isystem 
/usr/lib/gcc/x86_64-linux-gnu/8/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-comment -std=c++11 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-7~svn338205/build-llvm/lib/Transforms/Vectorize -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -o /tmp/scan-build-2018-07-29-043837-17923-1 -x c++ /build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp -faddrsig

/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp

→

1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
11// and generates target-independent LLVM-IR.
12// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
13// of instructions in order to estimate the profitability of vectorization.
14//
15// The loop vectorizer combines consecutive loop iterations into a single
16// 'wide' iteration. After this transformation the index is incremented
17// by the SIMD vector width, and not by one.
18//
19// This pass has four parts:
20// 1. The main loop pass that drives the different parts.
21// 2. LoopVectorizationLegality - A unit that checks for the legality
22//    of the vectorization.
23// 3. InnerLoopVectorizer - A unit that performs the actual
24//    widening of instructions.
25// 4. LoopVectorizationCostModel - A unit that checks for the profitability
26//    of vectorization. It decides on the optimal vector width, which
27//    can be one, if vectorization is not profitable.
28//
29// There is a development effort going on to migrate loop vectorizer to the
30// VPlan infrastructure and to introduce outer loop vectorization support (see
31// docs/Proposal/VectorizationPlan.rst and
32// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
33// purpose, we temporarily introduced the VPlan-native vectorization path: an
34// alternative vectorization path that is natively implemented on top of the
35// VPlan infrastructure. See EnableVPlanNativePath for enabling.
36//
37//===----------------------------------------------------------------------===//
38//
39// The reduction-variable vectorization is based on the paper:
40//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
41//
42// Variable uniformity checks are inspired by:
43//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
44//
45// The interleaved access vectorization is based on the paper:
46//  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
47//  Data for SIMD
48//
49// Other ideas/concepts are from:
50//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
51//
52//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
53//  Vectorizing Compilers.
54//
55//===----------------------------------------------------------------------===//

57#include "llvm/Transforms/Vectorize/LoopVectorize.h"
58#include "LoopVectorizationPlanner.h"
59#include "VPRecipeBuilder.h"
60#include "VPlanHCFGBuilder.h"
61#include "llvm/ADT/APInt.h"
62#include "llvm/ADT/ArrayRef.h"
63#include "llvm/ADT/DenseMap.h"
64#include "llvm/ADT/DenseMapInfo.h"
65#include "llvm/ADT/Hashing.h"
66#include "llvm/ADT/MapVector.h"
67#include "llvm/ADT/None.h"
68#include "llvm/ADT/Optional.h"
69#include "llvm/ADT/STLExtras.h"
70#include "llvm/ADT/SetVector.h"
71#include "llvm/ADT/SmallPtrSet.h"
72#include "llvm/ADT/SmallVector.h"
73#include "llvm/ADT/Statistic.h"
74#include "llvm/ADT/StringRef.h"
75#include "llvm/ADT/Twine.h"
76#include "llvm/ADT/iterator_range.h"
77#include "llvm/Analysis/AssumptionCache.h"
78#include "llvm/Analysis/BasicAliasAnalysis.h"
79#include "llvm/Analysis/BlockFrequencyInfo.h"
80#include "llvm/Analysis/CFG.h"
81#include "llvm/Analysis/CodeMetrics.h"
82#include "llvm/Analysis/DemandedBits.h"
83#include "llvm/Analysis/GlobalsModRef.h"
84#include "llvm/Analysis/LoopAccessAnalysis.h"
85#include "llvm/Analysis/LoopAnalysisManager.h"
86#include "llvm/Analysis/LoopInfo.h"
87#include "llvm/Analysis/LoopIterator.h"
88#include "llvm/Analysis/OptimizationRemarkEmitter.h"
89#include "llvm/Analysis/ScalarEvolution.h"
90#include "llvm/Analysis/ScalarEvolutionExpander.h"
91#include "llvm/Analysis/ScalarEvolutionExpressions.h"
92#include "llvm/Analysis/TargetLibraryInfo.h"
93#include "llvm/Analysis/TargetTransformInfo.h"
94#include "llvm/Analysis/VectorUtils.h"
95#include "llvm/IR/Attributes.h"
96#include "llvm/IR/BasicBlock.h"
97#include "llvm/IR/CFG.h"
98#include "llvm/IR/Constant.h"
99#include "llvm/IR/Constants.h"
100#include "llvm/IR/DataLayout.h"
101#include "llvm/IR/DebugInfoMetadata.h"
102#include "llvm/IR/DebugLoc.h"
103#include "llvm/IR/DerivedTypes.h"
104#include "llvm/IR/DiagnosticInfo.h"
105#include "llvm/IR/Dominators.h"
106#include "llvm/IR/Function.h"
107#include "llvm/IR/IRBuilder.h"
108#include "llvm/IR/InstrTypes.h"
109#include "llvm/IR/Instruction.h"
110#include "llvm/IR/Instructions.h"
111#include "llvm/IR/IntrinsicInst.h"
112#include "llvm/IR/Intrinsics.h"
113#include "llvm/IR/LLVMContext.h"
114#include "llvm/IR/Metadata.h"
115#include "llvm/IR/Module.h"
116#include "llvm/IR/Operator.h"
117#include "llvm/IR/Type.h"
118#include "llvm/IR/Use.h"
119#include "llvm/IR/User.h"
120#include "llvm/IR/Value.h"
121#include "llvm/IR/ValueHandle.h"
122#include "llvm/IR/Verifier.h"
123#include "llvm/Pass.h"
124#include "llvm/Support/Casting.h"
125#include "llvm/Support/CommandLine.h"
126#include "llvm/Support/Compiler.h"
127#include "llvm/Support/Debug.h"
128#include "llvm/Support/ErrorHandling.h"
129#include "llvm/Support/MathExtras.h"
130#include "llvm/Support/raw_ostream.h"
131#include "llvm/Transforms/Utils/BasicBlockUtils.h"
132#include "llvm/Transforms/Utils/LoopSimplify.h"
133#include "llvm/Transforms/Utils/LoopUtils.h"
134#include "llvm/Transforms/Utils/LoopVersioning.h"
135#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
136#include <algorithm>
137#include <cassert>
138#include <cstdint>
139#include <cstdlib>
140#include <functional>
141#include <iterator>
142#include <limits>
143#include <memory>
144#include <string>
145#include <tuple>
146#include <utility>
147#include <vector>

149using namespace llvm;

// String identifier for this file. DEBUG_TYPE aliases LV_NAME so both expand
// to the single literal "loop-vectorize" (not a concatenation of copies).
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

154STATISTIC(LoopsVectorized, "Number of loops vectorized")static llvm::Statistic LoopsVectorized = {"loop-vectorize", "LoopsVectorized"
, "Number of loops vectorized", {0}, {false}};
155STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization")static llvm::Statistic LoopsAnalyzed = {"loop-vectorize", "LoopsAnalyzed"
, "Number of loops analyzed for vectorization", {0}, {false}};

157/// Loops with a known constant trip count below this number are vectorized only
158/// if no scalar iteration overheads are incurred.
/// Defaults to 16.
159static cl::opt<unsigned> TinyTripCountVectorThreshold(
  "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
  cl::desc("Loops with a constant trip count that is smaller than this "
           "value are vectorized only if no scalar iteration overheads "
           "are incurred."));

/// Ask for bandwidth to be maximized when selecting the vectorization factor,
/// which is then determined by the smallest type in the loop. Off by default.
165static cl::opt<bool> MaximizeBandwidth(
  "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
  cl::desc("Maximize bandwidth when selecting vectorization factor which "
           "will be determined by the smallest type in loop."));

/// Enable vectorization of interleaved memory accesses in a loop. Off by
/// default.
170static cl::opt<bool> EnableInterleavedMemAccesses(
  "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
  cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

174/// Maximum factor for an interleaved memory access.
/// Defaults to 8.
175static cl::opt<unsigned> MaxInterleaveGroupFactor(
  "max-interleave-group-factor", cl::Hidden,
  cl::desc("Maximum factor for an interleaved access group (default = 8)"),
  cl::init(8));

180/// We don't interleave loops with a known constant trip count below this
181/// number.
182static const unsigned TinyTripCountInterleaveThreshold = 128;

/// Override for the target's number of scalar registers. Defaults to 0.
// NOTE(review): 0 presumably means "no override" — confirm at the use sites.
184static cl::opt<unsigned> ForceTargetNumScalarRegs(
  "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
  cl::desc("A flag that overrides the target's number of scalar registers."));

/// Override for the target's number of vector registers. Defaults to 0.
188static cl::opt<unsigned> ForceTargetNumVectorRegs(
  "force-target-num-vector-regs", cl::init(0), cl::Hidden,
  cl::desc("A flag that overrides the target's number of vector registers."));

/// Override for the target's maximum interleave factor for scalar loops.
/// Defaults to 0.
192static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
  "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
  cl::desc("A flag that overrides the target's max interleave factor for "
           "scalar loops."));

/// Override for the target's maximum interleave factor for vectorized loops.
/// Defaults to 0.
197static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
  "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
  cl::desc("A flag that overrides the target's max interleave factor for "
           "vectorized loops."));

/// Override that replaces the target's expected per-instruction cost with a
/// single constant value; described as mostly useful for consistent testing.
/// Defaults to 0.
202static cl::opt<unsigned> ForceTargetInstructionCost(
  "force-target-instruction-cost", cl::init(0), cl::Hidden,
  cl::desc("A flag that overrides the target's expected cost for "
           "an instruction to a single constant value. Mostly "
           "useful for getting consistent testing."));

/// Cost of a loop that the interleaver considers 'small'. Defaults to 20.
208static cl::opt<unsigned> SmallLoopCost(
  "small-loop-cost", cl::init(20), cl::Hidden,
  cl::desc(
      "The cost of a loop that is considered 'small' by the interleaver."));

/// Enable use of block frequency analysis for PGO-driven heuristics (less code
/// growth in cold regions, more aggressive in hot regions). On by default.
213static cl::opt<bool> LoopVectorizeWithBlockFrequency(
  "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
  cl::desc("Enable the use of the block frequency analysis to access PGO "
           "heuristics minimizing code growth in cold regions and being more "
           "aggressive in hot regions."));

219// Runtime interleave loops for load/store throughput.
// On by default.
220static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
  "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
  cl::desc(
      "Enable runtime interleaving until load/store ports are saturated"));

225/// The number of stores in a loop that are allowed to need predication.
/// Defaults to 1.
226static cl::opt<unsigned> NumberOfStoresToPredicate(
  "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
  cl::desc("Max number of stores to be predicated behind an if."));

/// Count the induction variable only once when interleaving. On by default.
230static cl::opt<bool> EnableIndVarRegisterHeur(
  "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
  cl::desc("Count the induction variable only once when interleaving"));

/// Enable if-predication of stores during vectorization. On by default.
234static cl::opt<bool> EnableCondStoresVectorization(
  "enable-cond-stores-vec", cl::init(true), cl::Hidden,
  cl::desc("Enable if predication of stores during vectorization."));

/// Maximum interleave count used when interleaving a scalar reduction in a
/// nested loop. Defaults to 2.
238static cl::opt<unsigned> MaxNestedScalarReductionIC(
  "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
  cl::desc("The maximum interleave count to use when interleaving a scalar "
           "reduction in a nested loop."));

/// Enable the VPlan-native vectorization path, which supports outer loop
/// vectorization. Off by default.
243static cl::opt<bool> EnableVPlanNativePath(
  "enable-vplan-native-path", cl::init(false), cl::Hidden,
  cl::desc("Enable VPlan-native vectorization path with "
           "support for outer loop vectorization."));

248// This flag enables the stress testing of the VPlan H-CFG construction in the
249// VPlan-native vectorization path. It must be used in conjunction with
250// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
251// verification of the H-CFGs built.
// Off by default.
252static cl::opt<bool> VPlanBuildStressTest(
  "vplan-build-stress-test", cl::init(false), cl::Hidden,
  cl::desc(
      "Build VPlan for every supported loop nest in the function and bail "
      "out right after the build (stress test the VPlan H-CFG construction "
      "in the VPlan-native vectorization path)."));

259/// A helper function for converting Scalar types to vector types.
260/// If the incoming type is void, we return void. If the VF is 1, we return
261/// the scalar type.
262static Type *ToVectorTy(Type *Scalar, unsigned VF) {
if (Scalar->isVoidTy() || VF == 1)
  return Scalar;
return VectorType::get(Scalar, VF);
266}

268// FIXME: The following helper functions have multiple implementations
269// in the project. They can be effectively organized in a common Load/Store
270// utilities unit.

272/// A helper function that returns the type of loaded or stored value.
273static Type *getMemInstValueType(Value *I) {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&(static_cast <bool> ((isa<LoadInst>(I) || isa<
StoreInst>(I)) && "Expected Load or Store instruction"
) ? void (0) : __assert_fail ("(isa<LoadInst>(I) || isa<StoreInst>(I)) && \"Expected Load or Store instruction\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 275, __extension__ __PRETTY_FUNCTION__))
       "Expected Load or Store instruction")(static_cast <bool> ((isa<LoadInst>(I) || isa<
StoreInst>(I)) && "Expected Load or Store instruction"
) ? void (0) : __assert_fail ("(isa<LoadInst>(I) || isa<StoreInst>(I)) && \"Expected Load or Store instruction\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 275, __extension__ __PRETTY_FUNCTION__));
if (auto *LI = dyn_cast<LoadInst>(I))
  return LI->getType();
return cast<StoreInst>(I)->getValueOperand()->getType();
279}

281/// A helper function that returns the alignment of load or store instruction.
282static unsigned getMemInstAlignment(Value *I) {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&(static_cast <bool> ((isa<LoadInst>(I) || isa<
StoreInst>(I)) && "Expected Load or Store instruction"
) ? void (0) : __assert_fail ("(isa<LoadInst>(I) || isa<StoreInst>(I)) && \"Expected Load or Store instruction\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 284, __extension__ __PRETTY_FUNCTION__))
       "Expected Load or Store instruction")(static_cast <bool> ((isa<LoadInst>(I) || isa<
StoreInst>(I)) && "Expected Load or Store instruction"
) ? void (0) : __assert_fail ("(isa<LoadInst>(I) || isa<StoreInst>(I)) && \"Expected Load or Store instruction\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 284, __extension__ __PRETTY_FUNCTION__));
if (auto *LI = dyn_cast<LoadInst>(I))
  return LI->getAlignment();
return cast<StoreInst>(I)->getAlignment();
288}

290/// A helper function that returns the address space of the pointer operand of
291/// load or store instruction.
292static unsigned getMemInstAddressSpace(Value *I) {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&(static_cast <bool> ((isa<LoadInst>(I) || isa<
StoreInst>(I)) && "Expected Load or Store instruction"
) ? void (0) : __assert_fail ("(isa<LoadInst>(I) || isa<StoreInst>(I)) && \"Expected Load or Store instruction\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 294, __extension__ __PRETTY_FUNCTION__))
       "Expected Load or Store instruction")(static_cast <bool> ((isa<LoadInst>(I) || isa<
StoreInst>(I)) && "Expected Load or Store instruction"
) ? void (0) : __assert_fail ("(isa<LoadInst>(I) || isa<StoreInst>(I)) && \"Expected Load or Store instruction\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 294, __extension__ __PRETTY_FUNCTION__));
if (auto *LI = dyn_cast<LoadInst>(I))
  return LI->getPointerAddressSpace();
return cast<StoreInst>(I)->getPointerAddressSpace();
298}

300/// A helper function that returns true if the given type is irregular. The
301/// type is irregular if its allocated size doesn't equal the store size of an
302/// element of the corresponding vector type at the given vectorization factor.
303static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
// Determine if an array of VF elements of type Ty is "bitcast compatible"
// with a <VF x Ty> vector.
if (VF > 1) {
  auto *VectorTy = VectorType::get(Ty, VF);
  return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
}

// If the vectorization factor is one, we just check if an array of type Ty
// requires padding between elements.
return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
314}

/// Returns the reciprocal of the probability that a predicated block executes.
/// A result of X means the predicated block is assumed to run once for every
/// X iterations of the loop header.
///
/// TODO: Use the actual block probability when it is available. For now every
///       predicated block is assumed to execute 50% of the time.
static unsigned getReciprocalPredBlockProb() {
  const unsigned AssumedReciprocal = 2; // Corresponds to the 50% assumption.
  return AssumedReciprocal;
}

324/// A helper function that adds a 'fast' flag to floating-point operations.
325static Value *addFastMathFlag(Value *V) {
if (isa<FPMathOperator>(V)) {
  FastMathFlags Flags;
  Flags.setFast();
  cast<Instruction>(V)->setFastMathFlags(Flags);
}
return V;
332}

334/// A helper function that returns an integer or floating-point constant with
335/// value C.
336static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                         : ConstantFP::get(Ty, C);
339}

341namespace llvm {

343/// InnerLoopVectorizer vectorizes loops which contain only one basic
344/// block to a specified vectorization factor (VF).
345/// This class performs the widening of scalars into vectors, or multiple
346/// scalars. This class also implements the following features:
347/// * It inserts an epilogue loop for handling loops that don't have iteration
348///   counts that are known to be a multiple of the vectorization factor.
349/// * It handles the code generation for reduction variables.
350/// * Scalarization (implementation using scalars) of un-vectorizable
351///   instructions.
352/// InnerLoopVectorizer does not perform any vectorization-legality
353/// checks, and relies on the caller to check for the different legality
354/// aspects. The InnerLoopVectorizer relies on the
355/// LoopVectorizationLegality class to provide information about the induction
356/// and reduction variables that were found for a given vectorization factor.
357class InnerLoopVectorizer {
358public:
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                    unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
    : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
      AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
      Builder(PSE.getSE()->getContext()),
      VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
virtual ~InnerLoopVectorizer() = default;

/// Create a new empty loop. Unlink the old loop and connect the new one.
/// Return the pre-header block of the new loop.
BasicBlock *createVectorizedLoopSkeleton();

/// Widen a single instruction within the innermost loop.
void widenInstruction(Instruction &I);

/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
void fixVectorizedLoop();

// Return true if any runtime check is added.
bool areSafetyChecksAdded() { return AddedSafetyChecks; }

/// A type for vectorized values in the new loop. Each value from the
/// original loop, when vectorized, is represented by UF vector values in the
/// new unrolled loop, where UF is the unroll factor.
using VectorParts = SmallVector<Value *, 2>;

/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
/// arbitrary length vectors.
void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for each lane between \p MinLane
/// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
/// inclusive.
void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                          bool IfPredicateInstr);

/// Widen an integer or floating-point induction variable \p IV. If \p Trunc
/// is provided, the integer induction variable will first be truncated to
/// the corresponding type.
void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

/// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
/// vector or scalar value on-demand if one is not yet available. When
/// vectorizing a loop, we visit the definition of an instruction before its
/// uses. When visiting the definition, we either vectorize or scalarize the
/// instruction, creating an entry for it in the corresponding map. (In some
/// cases, such as induction variables, we will create both vector and scalar
/// entries.) Then, as we encounter uses of the definition, we derive values
/// for each scalar or vector use unless such a value is already available.
/// For example, if we scalarize a definition and one of its uses is vector,
/// we build the required vector on-demand with an insertelement sequence
/// when visiting the use. Otherwise, if the use is scalar, we can use the
/// existing scalar definition.
///
/// Return a value in the new loop corresponding to \p V from the original
/// loop at unroll index \p Part. If the value has already been vectorized,
/// the corresponding vector entry in VectorLoopValueMap is returned. If,
/// however, the value has a scalar entry in VectorLoopValueMap, we construct
/// a new vector value on-demand by inserting the scalar values into a vector
/// with an insertelement sequence. If the value has been neither vectorized
/// nor scalarized, it must be loop invariant, so we simply broadcast the
/// value into a vector.
Value *getOrCreateVectorValue(Value *V, unsigned Part);

/// Return a value in the new loop corresponding to \p V from the original
/// loop at unroll and vector indices \p Instance. If the value has been
/// vectorized but not scalarized, the necessary extractelement instruction
/// will be generated.
Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

/// Construct the vector value of a scalarized value \p V one lane at a time.
void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

/// Try to vectorize the interleaved access group that \p Instr belongs to.
void vectorizeInterleaveGroup(Instruction *Instr);

/// Vectorize Load and Store instructions, optionally masking the vector
/// operations if \p BlockInMask is non-null.
void vectorizeMemoryInstruction(Instruction *Instr,
                                VectorParts *BlockInMask = nullptr);

/// Set the debug location in the builder using the debug location in
/// the instruction.
void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

451protected:
friend class LoopVectorizationPlanner;

/// A small list of PHINodes.
using PhiVector = SmallVector<PHINode *, 4>;

/// A type for scalarized values in the new loop. Each value from the
/// original loop, when scalarized, is represented by UF x VF scalar values
/// in the new unrolled loop, where UF is the unroll factor and VF is the
/// vectorization factor.
using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

/// Set up the values of the IVs correctly when exiting the vector loop.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                  Value *CountRoundDown, Value *EndValue,
                  BasicBlock *MiddleBlock);

/// Create a new induction variable inside L.
PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                 Value *Step, Instruction *DL);

/// Handle all cross-iteration phis in the header.
void fixCrossIterationPHIs();

/// Fix a first-order recurrence. This is the second phase of vectorizing
/// this phi node.
void fixFirstOrderRecurrence(PHINode *Phi);

/// Fix a reduction cross-iteration phi. This is the second phase of
/// vectorizing this phi node.
void fixReduction(PHINode *Phi);

/// The Loop exit block may have single value PHI nodes with some
/// incoming value. While vectorizing we only handled real values
/// that were defined inside the loop and we should have one value for
/// each predecessor of its parent basic block. See PR14725.
void fixLCSSAPHIs();

/// Iteratively sink the scalarized operands of a predicated instruction into
/// the block that was created for it.
void sinkScalarOperands(Instruction *PredInst);

/// Shrinks vector element sizes to the smallest bitwidth they can be legally
/// represented as.
void truncateToMinimalBitwidths();

/// Insert the new loop to the loop hierarchy and pass manager
/// and update the analysis passes.
void updateAnalysis();

/// Create a broadcast instruction. This method generates a broadcast
/// instruction (shuffle) for loop invariant values and for the induction
/// value. If this is the induction variable then we extend it to N, N+1, ...
/// this is needed because each iteration in the loop corresponds to a SIMD
/// element.
virtual Value *getBroadcastInstrs(Value *V);

/// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
/// to each vector element of Val. The sequence starts at StartIndex.
/// \p Opcode is relevant for FP induction variable.
virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                             Instruction::BinaryOps Opcode =
                             Instruction::BinaryOpsEnd);

/// Compute scalar induction steps. \p ScalarIV is the scalar induction
/// variable on which to base the steps, \p Step is the size of the step, and
/// \p EntryVal is the value from the original loop that maps to the steps.
/// Note that \p EntryVal doesn't have to be an induction variable - it
/// can also be a truncate instruction.
void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                      const InductionDescriptor &ID);

/// Create a vector induction phi node based on an existing scalar one. \p
/// EntryVal is the value from the original loop that maps to the vector phi
/// node, and \p Step is the loop-invariant step. If \p EntryVal is a
/// truncate instruction, instead of widening the original IV, we widen a
/// version of the IV truncated to \p EntryVal's type.
void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                     Value *Step, Instruction *EntryVal);

/// Returns true if an instruction \p I should be scalarized instead of
/// vectorized for the chosen vectorization factor.
bool shouldScalarizeInstruction(Instruction *I) const;

/// Returns true if we should generate a scalar version of \p IV.
bool needsScalarInduction(Instruction *IV) const;

/// If there is a cast involved in the induction variable \p ID, which should 
/// be ignored in the vectorized loop body, this function records the 
/// VectorLoopValue of the respective Phi also as the VectorLoopValue of the 
/// cast. We had already proved that the casted Phi is equal to the uncasted 
/// Phi in the vectorized loop (under a runtime guard), and therefore 
/// there is no need to vectorize the cast - the same value can be used in the 
/// vector loop for both the Phi and the cast. 
/// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
/// Otherwise, \p VectorLoopValue is a widened/vectorized value.
///
/// \p EntryVal is the value from the original loop that maps to the vector
/// phi node and is used to distinguish what is the IV currently being
/// processed - original one (if \p EntryVal is a phi corresponding to the
/// original IV) or the "newly-created" one based on the proof mentioned above
/// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
/// latter case \p EntryVal is a TruncInst and we must not record anything for
/// that IV, but it's error-prone to expect callers of this routine to care
/// about that, hence this explicit parameter.
void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                           const Instruction *EntryVal,
                                           Value *VectorLoopValue,
                                           unsigned Part,
                                           unsigned Lane = UINT_MAX(2147483647 *2U +1U));

/// Generate a shuffle sequence that will reverse the vector Vec.
virtual Value *reverseVector(Value *Vec);

/// Returns (and creates if needed) the original loop trip count.
Value *getOrCreateTripCount(Loop *NewLoop);

/// Returns (and creates if needed) the trip count of the widened loop.
Value *getOrCreateVectorTripCount(Loop *NewLoop);

/// Returns a bitcasted value to the requested vector type.
/// Also handles bitcasts of vector<float> <-> vector<pointer> types.
Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                              const DataLayout &DL);

/// Emit a bypass check to see if the vector trip count is zero, including if
/// it overflows.
void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

/// Emit a bypass check to see if all of the SCEV assumptions we've
/// had to make are correct.
void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

/// Emit bypass checks to check any memory assumptions we may have made.
void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

/// Add additional metadata to \p To that was not present on \p Orig.
///
/// Currently this is used to add the noalias annotations based on the
/// inserted memchecks.  Use this for instructions that are *cloned* into the
/// vector loop.
void addNewMetadata(Instruction *To, const Instruction *Orig);

/// Add metadata from one instruction to another.
///
/// This includes both the original MDs from \p From and additional ones (\see
/// addNewMetadata).  Use this for *newly created* instructions in the vector
/// loop.
void addMetadata(Instruction *To, Instruction *From);

/// Similar to the previous function but it adds the metadata to a
/// vector of instructions.
void addMetadata(ArrayRef<Value *> To, Instruction *From);

/// The original loop.
Loop *OrigLoop;

/// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
/// dynamic knowledge to simplify SCEV expressions and converts them to a
/// more usable form.
PredicatedScalarEvolution &PSE;

/// Loop Info.
LoopInfo *LI;

/// Dominator Tree.
DominatorTree *DT;

/// Alias Analysis.
AliasAnalysis *AA;

/// Target Library Info.
const TargetLibraryInfo *TLI;

/// Target Transform Info.
const TargetTransformInfo *TTI;

/// Assumption Cache.
AssumptionCache *AC;

/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;

/// LoopVersioning.  It's only set up (non-null) if memchecks were
/// used.
///
/// This is currently only used to add no-alias metadata based on the
/// memchecks.  The actual versioning is performed manually.
std::unique_ptr<LoopVersioning> LVer;

/// The vectorization SIMD factor to use. Each vector will have this many
/// vector elements.
unsigned VF;

/// The vectorization unroll factor to use. Each scalar is vectorized to this
/// many different vector instructions.
unsigned UF;

/// The builder that we use
IRBuilder<> Builder;

// --- Vectorization state ---

/// The vector-loop preheader.
BasicBlock *LoopVectorPreHeader;

/// The scalar-loop preheader.
BasicBlock *LoopScalarPreHeader;

/// Middle Block between the vector and the scalar.
BasicBlock *LoopMiddleBlock;

/// The ExitBlock of the scalar loop.
BasicBlock *LoopExitBlock;

/// The vector loop body.
BasicBlock *LoopVectorBody;

/// The scalar loop body.
BasicBlock *LoopScalarBody;

/// A list of all bypass blocks. The first block is the entry of the loop.
SmallVector<BasicBlock *, 4> LoopBypassBlocks;

/// The new Induction variable which was added to the new block.
PHINode *Induction = nullptr;

/// The induction variable of the old basic block.
PHINode *OldInduction = nullptr;

/// Maps values from the original loop to their corresponding values in the
/// vectorized loop. A key value can map to either vector values, scalar
/// values or both kinds of values, depending on whether the key was
/// vectorized and scalarized.
VectorizerValueMap VectorLoopValueMap;

/// Store instructions that were predicated.
SmallVector<Instruction *, 4> PredicatedInstructions;

/// Trip count of the original loop.
Value *TripCount = nullptr;

/// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
Value *VectorTripCount = nullptr;

/// The legality analysis.
LoopVectorizationLegality *Legal;

/// The profitability analysis.
LoopVectorizationCostModel *Cost;

// Record whether runtime checks are added.
bool AddedSafetyChecks = false;

// Holds the end values for each induction variable. We save the end values
// so we can later fix-up the external users of the induction variables.
DenseMap<PHINode *, Value *> IVEndValues;
708};

710class InnerLoopUnroller : public InnerLoopVectorizer {
711public:
InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                  LoopInfo *LI, DominatorTree *DT,
                  const TargetLibraryInfo *TLI,
                  const TargetTransformInfo *TTI, AssumptionCache *AC,
                  OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                  LoopVectorizationLegality *LVL,
                  LoopVectorizationCostModel *CM)
    : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                          UnrollFactor, LVL, CM) {}

722private:
Value *getBroadcastInstrs(Value *V) override;
Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                     Instruction::BinaryOps Opcode =
                     Instruction::BinaryOpsEnd) override;
Value *reverseVector(Value *Vec) override;
728};

730} // end namespace llvm

732/// Look for a meaningful debug location on the instruction or it's
733/// operands.
734static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
if (!I)
  return I;

DebugLoc Empty;
if (I->getDebugLoc() != Empty)
  return I;

for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
  if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
    if (OpInst->getDebugLoc() != Empty)
      return OpInst;
}

return I;
749}

751void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
  const DILocation *DIL = Inst->getDebugLoc();
  if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
      !isa<DbgInfoIntrinsic>(Inst))
    B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF));
  else
    B.SetCurrentDebugLocation(DIL);
} else
  B.SetCurrentDebugLocation(DebugLoc());
761}

763#ifndef NDEBUG
764/// \return string containing a file name and a line # for the given loop.
765static std::string getDebugLocString(const Loop *L) {
std::string Result;
if (L) {
  raw_string_ostream OS(Result);
  if (const DebugLoc LoopDbgLoc = L->getStartLoc())
    LoopDbgLoc.print(OS);
  else
    // Just print the module name.
    OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
  OS.flush();
}
return Result;
777}
778#endif

780void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                       const Instruction *Orig) {
// If the loop was versioned with memchecks, add the corresponding no-alias
// metadata.
if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
  LVer->annotateInstWithNoAlias(To, Orig);
786}

788void InnerLoopVectorizer::addMetadata(Instruction *To,
                                    Instruction *From) {
propagateMetadata(To, From);
addNewMetadata(To, From);
792}

794void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                    Instruction *From) {
for (Value *V : To) {
  if (Instruction *I = dyn_cast<Instruction>(V))
    addMetadata(I, From);
}
800}

802namespace llvm {

804/// The group of interleaved loads/stores sharing the same stride and
805/// close to each other.
806///
807/// Each member in this group has an index starting from 0, and the largest
808/// index should be less than interleaved factor, which is equal to the absolute
809/// value of the access's stride.
810///
811/// E.g. An interleaved load group of factor 4:
812///        for (unsigned i = 0; i < 1024; i+=4) {
813///          a = A[i];                           // Member of index 0
814///          b = A[i+1];                         // Member of index 1
815///          d = A[i+3];                         // Member of index 3
816///          ...
817///        }
818///
819///      An interleaved store group of factor 4:
820///        for (unsigned i = 0; i < 1024; i+=4) {
821///          ...
822///          A[i]   = a;                         // Member of index 0
823///          A[i+1] = b;                         // Member of index 1
824///          A[i+2] = c;                         // Member of index 2
825///          A[i+3] = d;                         // Member of index 3
826///        }
827///
828/// Note: the interleaved load group could have gaps (missing members), but
829/// the interleaved store group doesn't allow gaps.
830class InterleaveGroup {
831public:
InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
    : Align(Align), InsertPos(Instr) {
  assert(Align && "The alignment should be non-zero")(static_cast <bool> (Align && "The alignment should be non-zero"
) ? void (0) : __assert_fail ("Align && \"The alignment should be non-zero\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 834, __extension__ __PRETTY_FUNCTION__));

  Factor = std::abs(Stride);
  assert(Factor > 1 && "Invalid interleave factor")(static_cast <bool> (Factor > 1 && "Invalid interleave factor"
) ? void (0) : __assert_fail ("Factor > 1 && \"Invalid interleave factor\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 837, __extension__ __PRETTY_FUNCTION__));

  Reverse = Stride < 0;
  Members[0] = Instr;
}

bool isReverse() const { return Reverse; }
unsigned getFactor() const { return Factor; }
unsigned getAlignment() const { return Align; }
unsigned getNumMembers() const { return Members.size(); }

/// Try to insert a new member \p Instr with index \p Index and
/// alignment \p NewAlign. The index is related to the leader and it could be
/// negative if it is the new leader.
///
/// \returns false if the instruction doesn't belong to the group.
bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
  assert(NewAlign && "The new member's alignment should be non-zero")(static_cast <bool> (NewAlign && "The new member's alignment should be non-zero"
) ? void (0) : __assert_fail ("NewAlign && \"The new member's alignment should be non-zero\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 854, __extension__ __PRETTY_FUNCTION__));

  int Key = Index + SmallestKey;

  // Skip if there is already a member with the same index.
  if (Members.count(Key))
    return false;

  if (Key > LargestKey) {
    // The largest index is always less than the interleave factor.
    if (Index >= static_cast<int>(Factor))
      return false;

    LargestKey = Key;
  } else if (Key < SmallestKey) {
    // The largest index is always less than the interleave factor.
    if (LargestKey - Key >= static_cast<int>(Factor))
      return false;

    SmallestKey = Key;
  }

  // It's always safe to select the minimum alignment.
  Align = std::min(Align, NewAlign);
  Members[Key] = Instr;
  return true;
}

/// Get the member with the given index \p Index
///
/// \returns nullptr if contains no such member.
Instruction *getMember(unsigned Index) const {
  int Key = SmallestKey + Index;
  if (!Members.count(Key))
    return nullptr;

  return Members.find(Key)->second;
}

/// Get the index for the given member. Unlike the key in the member
/// map, the index starts from 0.
unsigned getIndex(Instruction *Instr) const {
  for (auto I : Members)
    if (I.second == Instr)
      return I.first - SmallestKey;

  llvm_unreachable("InterleaveGroup contains no such member")::llvm::llvm_unreachable_internal("InterleaveGroup contains no such member"
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 900);
}

Instruction *getInsertPos() const { return InsertPos; }
void setInsertPos(Instruction *Inst) { InsertPos = Inst; }

/// Add metadata (e.g. alias info) from the instructions in this group to \p
/// NewInst.
///
/// FIXME: this function currently does not add noalias metadata a'la
/// addNewMedata.  To do that we need to compute the intersection of the
/// noalias info from all members.
void addMetadata(Instruction *NewInst) const {
  SmallVector<Value *, 4> VL;
  std::transform(Members.begin(), Members.end(), std::back_inserter(VL),
                 [](std::pair<int, Instruction *> p) { return p.second; });
  propagateMetadata(NewInst, VL);
}

919private:
unsigned Factor; // Interleave Factor.
bool Reverse;
unsigned Align;
DenseMap<int, Instruction *> Members;
int SmallestKey = 0;
int LargestKey = 0;

// To avoid breaking dependences, vectorized instructions of an interleave
// group should be inserted at either the first load or the last store in
// program order.
//
// E.g. %even = load i32             // Insert Position
//      %add = add i32 %even         // Use of %even
//      %odd = load i32
//
//      store i32 %even
//      %odd = add i32               // Def of %odd
//      store i32 %odd               // Insert Position
Instruction *InsertPos;
939};
940} // end namespace llvm

942namespace {

944/// Drive the analysis of interleaved memory accesses in the loop.
945///
946/// Use this class to analyze interleaved accesses only when we can vectorize
947/// a loop. Otherwise it's meaningless to do analysis as the vectorization
948/// on interleaved accesses is unsafe.
949///
950/// The analysis collects interleave groups and records the relationships
951/// between the member and the group in a map.
952class InterleavedAccessInfo {
953public:
InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
                      DominatorTree *DT, LoopInfo *LI,
                      const LoopAccessInfo *LAI)
  : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}

~InterleavedAccessInfo() {
  SmallPtrSet<InterleaveGroup *, 4> DelSet;
  // Avoid releasing a pointer twice.
  for (auto &I : InterleaveGroupMap)
    DelSet.insert(I.second);
  for (auto *Ptr : DelSet)
    delete Ptr;
}

/// Analyze the interleaved accesses and collect them in interleave
/// groups. Substitute symbolic strides using \p Strides.
void analyzeInterleaving();

/// Check if \p Instr belongs to any interleave group.
bool isInterleaved(Instruction *Instr) const {
  return InterleaveGroupMap.count(Instr);
}

/// Get the interleave group that \p Instr belongs to.
///
/// \returns nullptr if doesn't have such group.
InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
  if (InterleaveGroupMap.count(Instr))
    return InterleaveGroupMap.find(Instr)->second;
  return nullptr;
}

/// Returns true if an interleaved group that may access memory
/// out-of-bounds requires a scalar epilogue iteration for correctness.
bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }

990private:
/// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
/// Simplifies SCEV expressions in the context of existing SCEV assumptions.
/// The interleaved access analysis can also add new predicates (for example
/// by versioning strides of pointers).
PredicatedScalarEvolution &PSE;

Loop *TheLoop;
DominatorTree *DT;
LoopInfo *LI;
const LoopAccessInfo *LAI;

/// True if the loop may contain non-reversed interleaved groups with
/// out-of-bounds accesses. We ensure we don't speculatively access memory
/// out-of-bounds by executing at least one scalar epilogue iteration.
bool RequiresScalarEpilogue = false;

/// Holds the relationships between the members and the interleave group.
DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;

/// Holds dependences among the memory accesses in the loop. It maps a source
/// access to a set of dependent sink accesses.
DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;

/// The descriptor for a strided memory access.
struct StrideDescriptor {
  StrideDescriptor() = default;
  StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
                   unsigned Align)
      : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}

  // The access's stride. It is negative for a reverse access.
  int64_t Stride = 0;

  // The scalar expression of this access.
  const SCEV *Scev = nullptr;

  // The size of the memory object.
  uint64_t Size = 0;

  // The alignment of this access.
  unsigned Align = 0;
};

/// A type for holding instructions and their stride descriptors.
using StrideEntry = std::pair<Instruction *, StrideDescriptor>;

/// Create a new interleave group with the given instruction \p Instr,
/// stride \p Stride and alignment \p Align.
///
/// \returns the newly created interleave group.
InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,
                                       unsigned Align) {
  assert(!InterleaveGroupMap.count(Instr) &&(static_cast <bool> (!InterleaveGroupMap.count(Instr) &&
 "Already in an interleaved access group") ? void (0) : __assert_fail
 ("!InterleaveGroupMap.count(Instr) && \"Already in an interleaved access group\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1044, __extension__ __PRETTY_FUNCTION__))
         "Already in an interleaved access group")(static_cast <bool> (!InterleaveGroupMap.count(Instr) &&
 "Already in an interleaved access group") ? void (0) : __assert_fail
 ("!InterleaveGroupMap.count(Instr) && \"Already in an interleaved access group\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1044, __extension__ __PRETTY_FUNCTION__));
  InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align);
  return InterleaveGroupMap[Instr];
}

/// Release the group and remove all the relationships.
void releaseGroup(InterleaveGroup *Group) {
  for (unsigned i = 0; i < Group->getFactor(); i++)
    if (Instruction *Member = Group->getMember(i))
      InterleaveGroupMap.erase(Member);

  delete Group;
}

/// Collect all the accesses with a constant stride in program order.
void collectConstStrideAccesses(
    MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
    const ValueToValueMap &Strides);

/// Returns true if \p Stride is allowed in an interleaved group.
static bool isStrided(int Stride) {
  unsigned Factor = std::abs(Stride);
  return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
}

/// Returns true if \p BB is a predicated block.
bool isPredicated(BasicBlock *BB) const {
  return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}

/// Returns true if LoopAccessInfo can be used for dependence queries.
bool areDependencesValid() const {
  return LAI && LAI->getDepChecker().getDependences();
}

/// Returns true if memory accesses \p A and \p B can be reordered, if
/// necessary, when constructing interleaved groups.
///
/// \p A must precede \p B in program order. We return false if reordering is
/// not necessary or is prevented because \p A and \p B may be dependent.
bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
                                               StrideEntry *B) const {
  // Code motion for interleaved accesses can potentially hoist strided loads
  // and sink strided stores. The code below checks the legality of the
  // following two conditions:
  //
  // 1. Potentially moving a strided load (B) before any store (A) that
  //    precedes B, or
  //
  // 2. Potentially moving a strided store (A) after any load or store (B)
  //    that A precedes.
  //
  // It's legal to reorder A and B if we know there isn't a dependence from A
  // to B. Note that this determination is conservative since some
  // dependences could potentially be reordered safely.

  // A is potentially the source of a dependence.
  auto *Src = A->first;
  auto SrcDes = A->second;

  // B is potentially the sink of a dependence.
  auto *Sink = B->first;
  auto SinkDes = B->second;

  // Code motion for interleaved accesses can't violate WAR dependences.
  // Thus, reordering is legal if the source isn't a write.
  if (!Src->mayWriteToMemory())
    return true;

  // At least one of the accesses must be strided.
  if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
    return true;

  // If dependence information is not available from LoopAccessInfo,
  // conservatively assume the instructions can't be reordered.
  if (!areDependencesValid())
    return false;

  // If we know there is a dependence from source to sink, assume the
  // instructions can't be reordered. Otherwise, reordering is legal.
  return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);
}

/// Collect the dependences from LoopAccessInfo.
///
/// We process the dependences once during the interleaved access analysis to
/// enable constant-time dependence queries.
void collectDependences() {
  if (!areDependencesValid())
    return;
  auto *Deps = LAI->getDepChecker().getDependences();
  for (auto Dep : *Deps)
    Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
}
1138};

1140} // end anonymous namespace

1142static void emitMissedWarning(Function *F, Loop *L,
                            const LoopVectorizeHints &LH,
                            OptimizationRemarkEmitter *ORE) {
LH.emitRemarkWithHints();

if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
  if (LH.getWidth() != 1)
    ORE->emit(DiagnosticInfoOptimizationFailure(
                  DEBUG_TYPE"loop-vectorize", "FailedRequestedVectorization",
                  L->getStartLoc(), L->getHeader())
              << "loop not vectorized: "
              << "failed explicitly specified loop vectorization");
  else if (LH.getInterleave() != 1)
    ORE->emit(DiagnosticInfoOptimizationFailure(
                  DEBUG_TYPE"loop-vectorize", "FailedRequestedInterleaving", L->getStartLoc(),
                  L->getHeader())
              << "loop not interleaved: "
              << "failed explicitly specified loop interleaving");
}
1161}

1163namespace llvm {

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
1172class LoopVectorizationCostModel {
1173public:
/// Construct the cost model. Every argument is stored in the member of the
/// corresponding name (\p L in TheLoop, \p F in TheFunction, \p IAI in
/// InterleaveInfo); the constructor body itself is empty.
LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
                           LoopInfo *LI, LoopVectorizationLegality *Legal,
                           const TargetTransformInfo &TTI,
                           const TargetLibraryInfo *TLI, DemandedBits *DB,
                           AssumptionCache *AC,
                           OptimizationRemarkEmitter *ORE, const Function *F,
                           const LoopVectorizeHints *Hints,
                           InterleavedAccessInfo &IAI)
    : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
  AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}

/// \return An upper bound for the vectorization factor, or None if
/// vectorization should be avoided up front.
Optional<unsigned> computeMaxVF(bool OptForSize);

/// \return The most profitable vectorization factor and the cost of that VF.
/// This method checks every power of two up to MaxVF. If UserVF is not ZERO
/// then this vectorization factor will be selected if vectorization is
/// possible.
VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

/// Set up cost-based decisions for user vectorization factor.
///
/// Runs the uniform/scalar collection and then the scalarization collection
/// for \p UserVF, in that order.
void selectUserVectorizationFactor(unsigned UserVF) {
  collectUniformsAndScalars(UserVF);
  collectInstsToScalarize(UserVF);
}

/// \return The size (in bits) of the smallest and widest types in the code
/// that needs to be vectorized. We ignore values that remain scalar such as
/// 64 bit loop indices.
std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

/// \return The desired interleave count.
/// If interleave count has been specified by metadata it will be returned.
/// Otherwise, the interleave count is computed and returned. VF and LoopCost
/// are the selected vectorization factor and the cost of the selected VF.
unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
                               unsigned LoopCost);

/// Memory access instruction may be vectorized in more than one way.
/// Form of instruction after vectorization depends on cost.
/// This function takes cost-based decisions for Load/Store instructions
/// and collects them in a map. This decisions map is used for building
/// the lists of loop-uniform and loop-scalar instructions.
/// The calculated cost is saved with widening decision in order to
/// avoid redundant calculations.
void setCostBasedWideningDecision(unsigned VF);

/// A struct that represents some properties of the register usage
/// of a loop.
///
/// Neither field has a default initializer, so a default-constructed
/// RegisterUsage holds indeterminate values until both are assigned.
struct RegisterUsage {
  /// Holds the number of loop invariant values that are used in the loop.
  unsigned LoopInvariantRegs;

  /// Holds the maximum number of concurrent live intervals in the loop.
  unsigned MaxLocalUsers;
};

/// \return Returns information about the register usages of the loop for the
/// given vectorization factors.
SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

/// Collect values we want to ignore in the cost model.
void collectValuesToIgnore();

/// \returns The smallest bitwidth each instruction can be represented with.
/// The vector equivalents of these instructions should be truncated to this
/// type.
///
/// The map is returned by const reference to the MinBWs member, so no copy is
/// made.
const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
  return MinBWs;
}

/// \returns True if it is more profitable to scalarize instruction \p I for
/// vectorization factor \p VF.
///
/// \p VF must be greater than 1 and must already have an entry in
/// InstsToScalarize; both preconditions are asserted.
bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
  assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
  auto Scalars = InstsToScalarize.find(VF);
  assert(Scalars != InstsToScalarize.end() &&
         "VF not yet analyzed for scalarization profitability");
  return Scalars->second.count(I);
}

/// Returns true if \p I is known to be uniform after vectorization.
///
/// Every instruction is reported as uniform for a \p VF of 1. For any other
/// \p VF the uniform set for that factor must already exist (asserted).
bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
  if (VF == 1)
    return true;
  // One find() serves both the precondition check and the query.
  auto UniformsPerVF = Uniforms.find(VF);
  assert(UniformsPerVF != Uniforms.end() &&
         "VF not yet analyzed for uniformity");
  return UniformsPerVF->second.count(I);
}

/// Returns true if \p I is known to be scalar after vectorization.
///
/// Every instruction is reported as scalar for a \p VF of 1. For any other
/// \p VF the scalar set for that factor must already exist (asserted).
bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
  if (VF == 1)
    return true;
  // One find() serves both the precondition check and the query.
  auto ScalarsPerVF = Scalars.find(VF);
  assert(ScalarsPerVF != Scalars.end() &&
         "Scalar values are not calculated for VF");
  return ScalarsPerVF->second.count(I);
}

/// \returns True if instruction \p I can be truncated to a smaller bitwidth
/// for vectorization factor \p VF.
///
/// That requires \p VF > 1, an entry for \p I in MinBWs, and \p I being
/// neither profitable to scalarize nor scalar after vectorization.
bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
  // Nothing to truncate without a real vector factor or a recorded bitwidth.
  if (VF <= 1 || !MinBWs.count(I))
    return false;

  // Instructions that end up scalar are not truncated.
  if (isProfitableToScalarize(I, VF))
    return false;
  return !isScalarAfterVectorization(I, VF);
}

/// Decision that was taken during cost calculation for memory instruction.
enum InstWidening {
  // No decision recorded; getWideningDecision() returns this when the
  // (instruction, VF) pair has no entry.
  CM_Unknown,
  CM_Widen,         // For consecutive accesses with stride +1.
  CM_Widen_Reverse, // For consecutive accesses with stride -1.
  // NOTE(review): the three values below presumably correspond to vectorizing
  // the access as part of an interleave group, as a gather/scatter, or by
  // scalarizing it -- confirm against setCostBasedWideningDecision().
  CM_Interleave,
  CM_GatherScatter,
  CM_Scalarize
};

/// Save vectorization decision \p W and \p Cost taken by the cost model for
/// instruction \p I and vector width \p VF.
///
/// \p VF must be at least 2 (asserted). Any decision previously stored for
/// the same (\p I, \p VF) pair is overwritten.
void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                         unsigned Cost) {
  assert(VF >= 2 && "Expected VF >=2");
  WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
}

/// Save vectorization decision \p W and \p Cost taken by the cost model for
/// interleaving group \p Grp and vector width \p VF.
///
/// \p VF must be at least 2 (asserted). Gaps in the group (indices without a
/// member) are skipped.
void setWideningDecision(const InterleaveGroup *Grp, unsigned VF,
                         InstWidening W, unsigned Cost) {
  assert(VF >= 2 && "Expected VF >=2");
  // Broadcast this decision to all instructions inside the group.
  // But the cost will be assigned to one instruction only: the member at the
  // group's insert position. Every other member records a cost of 0.
  for (unsigned i = 0; i < Grp->getFactor(); ++i) {
    if (auto *I = Grp->getMember(i)) {
      const unsigned MemberCost = Grp->getInsertPos() == I ? Cost : 0;
      WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, MemberCost);
    }
  }
}

/// Return the cost model decision for the given instruction \p I and vector
/// width \p VF. Return CM_Unknown if this instruction did not pass
/// through the cost modeling.
///
/// \p VF must be at least 2 (asserted). The lookup does not insert anything
/// into WideningDecisions.
InstWidening getWideningDecision(Instruction *I, unsigned VF) {
  assert(VF >= 2 && "Expected VF >=2");
  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
  auto Itr = WideningDecisions.find(InstOnVF);
  if (Itr == WideningDecisions.end())
    return CM_Unknown;
  return Itr->second.first;
}

/// Return the vectorization cost for the given instruction \p I and vector
/// width \p VF.
///
/// \p VF must be at least 2 and a decision must already have been recorded
/// for (\p I, \p VF); both preconditions are asserted.
unsigned getWideningCost(Instruction *I, unsigned VF) {
  assert(VF >= 2 && "Expected VF >=2");
  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
  assert(WideningDecisions.count(InstOnVF) && "The cost is not calculated");
  return WideningDecisions[InstOnVF].second;
}

/// Return True if instruction \p I is an optimizable truncate whose operand
/// is an induction variable. Such a truncate will be removed by adding a new
/// induction variable with the destination type.
bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
  // Only truncates are candidates.
  auto *Trunc = dyn_cast<TruncInst>(I);
  if (!Trunc)
    return false;

  // Widen the truncate's source and destination types to \p VF lanes.
  Type *SrcTy = ToVectorTy(Trunc->getSrcTy(), VF);
  Type *DestTy = ToVectorTy(Trunc->getDestTy(), VF);

  // A truncate that is free for these types is left alone: replacing it with
  // an induction variable would add an update instruction to every iteration.
  // The primary induction variable is exempt from this check because it needs
  // an update instruction regardless.
  Value *Op = Trunc->getOperand(0);
  const bool IsPrimaryIV = Op == Legal->getPrimaryInduction();
  if (!IsPrimaryIV && TTI.isTruncateFree(SrcTy, DestTy))
    return false;

  // Finally, the truncated value itself must be an induction variable.
  return Legal->isInductionPhi(Op);
}

/// Collects the instructions to scalarize for each predicated instruction in
/// the loop, for the given vector width \p VF.
/// Only declared here; the definition is not part of the class body.
void collectInstsToScalarize(unsigned VF);

/// Gather the Uniform and Scalar value sets for the given \p VF.
/// Both sets depend on the cost-model decision for Load/Store instructions,
/// which may be vectorized as interleave, gather-scatter or scalarized, so
/// that decision is computed first.
void collectUniformsAndScalars(unsigned VF) {
  // Skip the scalar case (VF == 1), and run the analysis only once per VF: an
  // existing Uniforms entry for VF is taken to mean it has already been done.
  if (VF != 1 && !Uniforms.count(VF)) {
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }
}

/// Tell whether the target machine supports a masked store of \p DataType
/// through \p Ptr. The access must be consecutive according to Legal before
/// the target is queried at all.
bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
  if (!Legal->isConsecutivePtr(Ptr))
    return false;
  return TTI.isLegalMaskedStore(DataType);
}

/// Tell whether the target machine supports a masked load of \p DataType
/// through \p Ptr. The access must be consecutive according to Legal before
/// the target is queried at all.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
  if (!Legal->isConsecutivePtr(Ptr))
    return false;
  return TTI.isLegalMaskedLoad(DataType);
}

/// Returns true if the target machine supports a masked scatter operation
/// for the given \p DataType.
/// Forwards directly to the target transform info; unlike isLegalMaskedStore,
/// no pointer argument is taken and no consecutiveness check is made here.
bool isLegalMaskedScatter(Type *DataType) {
  return TTI.isLegalMaskedScatter(DataType);
}

/// Returns true if the target machine supports a masked gather operation
/// for the given \p DataType.
/// Forwards directly to the target transform info; unlike isLegalMaskedLoad,
/// no pointer argument is taken and no consecutiveness check is made here.
bool isLegalMaskedGather(Type *DataType) {
  return TTI.isLegalMaskedGather(DataType);
}

/// Tell whether the target machine can represent \p V as a masked gather
/// (when \p V is a load) or a masked scatter (when \p V is a store). Any
/// other kind of value yields false.
bool isLegalGatherOrScatter(Value *V) {
  const bool IsLoad = isa<LoadInst>(V);
  const bool IsStore = isa<StoreInst>(V);
  // Only loads and stores can become gathers/scatters.
  if (!(IsLoad || IsStore))
    return false;
  // The legality query is made on the type of the value loaded or stored.
  auto *ValueTy = getMemInstValueType(V);
  if (IsLoad && isLegalMaskedGather(ValueTy))
    return true;
  return IsStore && isLegalMaskedScatter(ValueTy);
}

/// Returns true if \p I is an instruction that will be scalarized with
/// predication. Such instructions include conditional stores and
/// instructions that may divide by zero.
/// Only declared here; the definition is not part of the class body.
bool isScalarWithPredication(Instruction *I);

/// Returns true if \p I is a memory instruction with consecutive memory
/// access that can be widened. \p VF defaults to 1 when the caller omits it.
/// Only declared here; the definition is not part of the class body.
bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

/// Check if \p Instr belongs to any interleaved access group.
/// Forwards the query to the interleaved access information (InterleaveInfo).
bool isAccessInterleaved(Instruction *Instr) {
  return InterleaveInfo.isInterleaved(Instr);
}

/// Get the interleaved access group that \p Instr belongs to.
/// Forwards the query to the interleaved access information (InterleaveInfo).
/// NOTE(review): the result for an instruction that is in no group is
/// whatever InterleaveInfo.getInterleaveGroup returns (presumably null) --
/// confirm against its definition.
const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
  return InterleaveInfo.getInterleaveGroup(Instr);
}

/// Returns true if an interleaved group requires a scalar iteration
/// to handle accesses with gaps.
/// Forwards the query to the interleaved access information (InterleaveInfo).
bool requiresScalarEpilogue() const {
  return InterleaveInfo.requiresScalarEpilogue();
}

1439private:
unsigned NumPredStores = 0;

/// \return An upper bound for the vectorization factor, larger than zero.
/// One is returned if vectorization should best be avoided due to cost.
unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);

/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
/// operate on
/// vector values after type legalization in the backend. If this latter value
/// is
/// false, then all operations will be scalarized (i.e. no vectorization has
/// actually taken place).
using VectorizationCostTy = std::pair<unsigned, bool>;

/// Returns the expected execution cost. The unit of the cost does
/// not matter because we use the 'cost' units to compare different
/// vector widths. The cost that is returned is *not* normalized by
/// the factor width.
VectorizationCostTy expectedCost(unsigned VF);

/// Returns the execution time cost of an instruction for a given vector
/// width. Vector width of one means scalar.
VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

/// The cost-computation logic from getInstructionCost which provides
/// the vector type as an output parameter.
unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

/// Calculate vectorization cost of memory instruction \p I.
unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

/// The cost computation for scalarized memory instruction.
unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

/// The cost computation for interleaving group of memory instructions.
unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);

/// The cost computation for Gather/Scatter instruction.
unsigned getGatherScatterCost(Instruction *I, unsigned VF);

/// The cost computation for widening instruction \p I with consecutive
/// memory access.
unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);

/// The cost calculation for Load instruction \p I with uniform pointer -
/// scalar load + broadcast.
unsigned getUniformMemOpCost(Instruction *I, unsigned VF);

/// Returns whether the instruction is a load or store and will be a emitted
/// as a vector operation.
bool isConsecutiveLoadOrStore(Instruction *I);

/// Returns true if an artificially high cost for emulated masked memrefs
/// should be used.
bool useEmulatedMaskMemRefHack(Instruction *I);

/// Build an analysis remark that explains why vectorization failed.
///
/// \p RemarkName is the identifier for the remark. The remark is created for
/// TheLoop under the pass name supplied by the loop hints.
/// \return the remark object that can be streamed to.
OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
  auto PassName = Hints->vectorizeAnalysisPassName();
  return createLVMissedAnalysis(PassName, RemarkName, TheLoop);
}

/// Map of scalar integer values to the smallest bitwidth they can be legally
/// represented as. The vector equivalents of these values should be truncated
/// to this type.
MapVector<Instruction *, uint64_t> MinBWs;

/// A type representing the costs for instructions if they were to be
/// scalarized rather than vectorized. The entries are Instruction-Cost
/// pairs.
using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

/// A set containing all BasicBlocks that are known to be present after
/// vectorization as a predicated block.
SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
/// instruction will be scalarized when vectorizing with the associated
/// vectorization factor. The entries are VF-ScalarCostTy pairs.
DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;

/// Holds the instructions known to be uniform after vectorization.
/// The data is collected per VF. collectUniformsAndScalars also uses the
/// presence of a VF key here to decide whether that VF was already analyzed.
DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;

/// Holds the instructions known to be scalar after vectorization.
/// The data is collected per VF.
DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;

/// Holds the instructions (address computations) that are forced to be
/// scalarized. Keyed by VF, like Uniforms and Scalars.
DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;

/// Returns the expected difference in cost from scalarizing the expression
/// feeding a predicated instruction \p PredInst. The instructions to
/// scalarize and their scalar costs are collected in \p ScalarCosts. A
/// non-negative return value implies the expression will be scalarized.
/// Currently, only single-use chains are considered for scalarization.
int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                            unsigned VF);

/// Collect the instructions that are uniform after vectorization. An
/// instruction is uniform if we represent it with a single scalar value in
/// the vectorized loop corresponding to each vector iteration. Examples of
/// uniform instructions include pointer operands of consecutive or
/// interleaved memory accesses. Note that although uniformity implies an
/// instruction will be scalar, the reverse is not true. In general, a
/// scalarized instruction will be represented by VF scalar values in the
/// vectorized loop, each corresponding to an iteration of the original
/// scalar loop.
void collectLoopUniforms(unsigned VF);

/// Collect the instructions that are scalar after vectorization. An
/// instruction is scalar if it is known to be uniform or will be scalarized
/// during vectorization. Non-uniform scalarized instructions will be
/// represented by VF values in the vectorized loop, each corresponding to an
/// iteration of the original scalar loop.
void collectLoopScalars(unsigned VF);

/// Keeps cost model vectorization decision and cost for instructions.
/// Right now it is used for memory instructions only.
/// The key is an (instruction, VF) pair and the value is a
/// (widening decision, cost) pair.
using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
                              std::pair<InstWidening, unsigned>>;

/// The recorded decisions, read by getWideningDecision and getWideningCost.
DecisionList WideningDecisions;

1571public:
/// The loop that we evaluate.
Loop *TheLoop;

/// Predicated scalar evolution analysis.
PredicatedScalarEvolution &PSE;

/// Loop Info analysis.
LoopInfo *LI;

/// Vectorization legality.
LoopVectorizationLegality *Legal;

/// Vector target information.
const TargetTransformInfo &TTI;

/// Target Library Info.
const TargetLibraryInfo *TLI;

/// Demanded bits analysis.
DemandedBits *DB;

/// Assumption cache.
AssumptionCache *AC;

/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;

const Function *TheFunction;

/// Loop Vectorize Hint.
const LoopVectorizeHints *Hints;

/// The interleave access information contains groups of interleaved accesses
/// with the same stride and close to each other.
InterleavedAccessInfo &InterleaveInfo;

/// Values to ignore in the cost model.
SmallPtrSet<const Value *, 16> ValuesToIgnore;

/// Values to ignore in the cost model when VF > 1.
SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1613};

1615} // end namespace llvm

1617// Return true if \p OuterLp is an outer loop annotated with hints for explicit
1618// vectorization. The loop needs to be annotated with #pragma omp simd
1619// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1620// vector length information is not provided, vectorization is not considered
1621// explicit. Interleave hints are not allowed either. These limitations will be
1622// relaxed in the future.
1623// Please, note that we are currently forced to abuse the pragma 'clang
1624// vectorize' semantics. This pragma provides *auto-vectorization hints*
1625// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1626// provides *explicit vectorization hints* (LV can bypass legal checks and
1627// assume that vectorization is legal). However, both hints are implemented
1628// using the same metadata (llvm.loop.vectorize, processed by
1629// LoopVectorizeHints). This will be fixed in the future when the native IR
1630// representation for pragma 'omp simd' is introduced.
1631static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                 OptimizationRemarkEmitter *ORE) {
assert(!OuterLp->empty() && "This is not an outer loop")(static_cast <bool> (!OuterLp->empty() && "This is not an outer loop"
) ? void (0) : __assert_fail ("!OuterLp->empty() && \"This is not an outer loop\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1633, __extension__ __PRETTY_FUNCTION__));
LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

// Only outer loops with an explicit vectorization hint are supported.
// Unannotated outer loops are ignored.
if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
  return false;

Function *Fn = OuterLp->getHeader()->getParent();
if (!Hints.allowVectorization(Fn, OuterLp, false /*AlwaysVectorize*/)) {
  LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints prevent outer loop vectorization.\n"
; } } while (false);
  return false;
}

if (!Hints.getWidth()) {
  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing: No user vector width.\n"
; } } while (false);
  emitMissedWarning(Fn, OuterLp, Hints, ORE);
  return false;
}

if (Hints.getInterleave() > 1) {
  // TODO: Interleave support is future work.
  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing: Interleave is not supported for "
 "outer loops.\n"; } } while (false)
                       "outer loops.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing: Interleave is not supported for "
 "outer loops.\n"; } } while (false);
  emitMissedWarning(Fn, OuterLp, Hints, ORE);
  return false;
}

return true;
1662}

1664static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                OptimizationRemarkEmitter *ORE,
                                SmallVectorImpl<Loop *> &V) {
// Collect inner loops and outer loops without irreducible control flow. For
// now, only collect outer loops that have explicit vectorization hints. If we
// are stress testing the VPlan H-CFG construction, we collect the outermost
// loop of every loop nest.
if (L.empty() || VPlanBuildStressTest ||
    (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
  LoopBlocksRPO RPOT(&L);
  RPOT.perform(LI);
  if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
    V.push_back(&L);
    // TODO: Collect inner loops inside marked outer loops in case
    // vectorization fails for the outer loop. Do not invoke
    // 'containsIrreducibleCFG' again for inner loops when the outer loop is
    // already known to be reducible. We can use an inherited attribute for
    // that.
    return;
  }
}
for (Loop *InnerL : L)
  collectSupportedLoops(*InnerL, LI, ORE, V);
1687}

1689namespace {

1691/// The LoopVectorize Pass.
1692struct LoopVectorize : public FunctionPass {
/// Pass identification, replacement for typeid
static char ID;

LoopVectorizePass Impl;

explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
    : FunctionPass(ID) {
  Impl.DisableUnrolling = NoUnrolling;
  Impl.AlwaysVectorize = AlwaysVectorize;
  initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
}

bool runOnFunction(Function &F) override {
  if (skipFunction(F))
    return false;

  auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
  auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
  auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
  auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();

  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

  return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                      GetLAA, *ORE);
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
  AU.addRequired<AssumptionCacheTracker>();
  AU.addRequired<BlockFrequencyInfoWrapperPass>();
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<LoopInfoWrapperPass>();
  AU.addRequired<ScalarEvolutionWrapperPass>();
  AU.addRequired<TargetTransformInfoWrapperPass>();
  AU.addRequired<AAResultsWrapperPass>();
  AU.addRequired<LoopAccessLegacyAnalysis>();
  AU.addRequired<DemandedBitsWrapperPass>();
  AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
  AU.addPreserved<LoopInfoWrapperPass>();
  AU.addPreserved<DominatorTreeWrapperPass>();
  AU.addPreserved<BasicAAWrapperPass>();
  AU.addPreserved<GlobalsAAWrapperPass>();
}
1745};

1747} // end anonymous namespace

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer,
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//

1754Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
// We need to place the broadcast of invariant variables outside the loop,
// but only if it's proven safe to do so. Else, broadcast will be inside
// vector loop body.
Instruction *Instr = dyn_cast<Instruction>(V);
bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
                   (!Instr ||
                    DT->dominates(Instr->getParent(), LoopVectorPreHeader));
// Place the code for broadcasting invariant variables in the new preheader.
IRBuilder<>::InsertPointGuard Guard(Builder);
if (SafeToHoist)
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

// Broadcast the scalar into all locations in the vector.
Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

return Shuf;
1771}

1773void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
  const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&(static_cast <bool> ((isa<PHINode>(EntryVal) || isa
<TruncInst>(EntryVal)) && "Expected either an induction phi-node or a truncate of it!"
) ? void (0) : __assert_fail ("(isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && \"Expected either an induction phi-node or a truncate of it!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1776, __extension__ __PRETTY_FUNCTION__))
       "Expected either an induction phi-node or a truncate of it!")(static_cast <bool> ((isa<PHINode>(EntryVal) || isa
<TruncInst>(EntryVal)) && "Expected either an induction phi-node or a truncate of it!"
) ? void (0) : __assert_fail ("(isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && \"Expected either an induction phi-node or a truncate of it!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1776, __extension__ __PRETTY_FUNCTION__));
Value *Start = II.getStartValue();

// Construct the initial value of the vector IV in the vector loop preheader
auto CurrIP = Builder.saveIP();
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
if (isa<TruncInst>(EntryVal)) {
  assert(Start->getType()->isIntegerTy() &&(static_cast <bool> (Start->getType()->isIntegerTy
() && "Truncation requires an integer type") ? void (
0) : __assert_fail ("Start->getType()->isIntegerTy() && \"Truncation requires an integer type\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1784, __extension__ __PRETTY_FUNCTION__))
         "Truncation requires an integer type")(static_cast <bool> (Start->getType()->isIntegerTy
() && "Truncation requires an integer type") ? void (
0) : __assert_fail ("Start->getType()->isIntegerTy() && \"Truncation requires an integer type\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1784, __extension__ __PRETTY_FUNCTION__));
  auto *TruncType = cast<IntegerType>(EntryVal->getType());
  Step = Builder.CreateTrunc(Step, TruncType);
  Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
}
Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
Value *SteppedStart =
    getStepVector(SplatStart, 0, Step, II.getInductionOpcode());

// We create vector phi nodes for both integer and floating-point induction
// variables. Here, we determine the kind of arithmetic we will perform.
Instruction::BinaryOps AddOp;
Instruction::BinaryOps MulOp;
if (Step->getType()->isIntegerTy()) {
  AddOp = Instruction::Add;
  MulOp = Instruction::Mul;
} else {
  AddOp = II.getInductionOpcode();
  MulOp = Instruction::FMul;
}

// Multiply the vectorization factor by the step using integer or
// floating-point arithmetic as appropriate.
Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));

// Create a vector splat to use in the induction update.
//
// FIXME: If the step is non-constant, we create the vector splat with
//        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
//        handle a constant vector splat.
Value *SplatVF = isa<Constant>(Mul)
                     ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                     : Builder.CreateVectorSplat(VF, Mul);
Builder.restoreIP(CurrIP);

// We may need to add the step a number of times, depending on the unroll
// factor. The last of those goes into the PHI.
PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                  &*LoopVectorBody->getFirstInsertionPt());
VecInd->setDebugLoc(EntryVal->getDebugLoc());
Instruction *LastInduction = VecInd;
for (unsigned Part = 0; Part < UF; ++Part) {
  VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);

  if (isa<TruncInst>(EntryVal))
    addMetadata(LastInduction, EntryVal);
  recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);

  LastInduction = cast<Instruction>(addFastMathFlag(
      Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
  LastInduction->setDebugLoc(EntryVal->getDebugLoc());
}

// Move the last step to the end of the latch block. This ensures consistent
// placement of all induction updates.
auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
auto *ICmp = cast<Instruction>(Br->getCondition());
LastInduction->moveBefore(ICmp);
LastInduction->setName("vec.ind.next");

VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
VecInd->addIncoming(LastInduction, LoopVectorLatch);
1848}

1850bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
return Cost->isScalarAfterVectorization(I, VF) ||
       Cost->isProfitableToScalarize(I, VF);
1853}

1855bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
if (shouldScalarizeInstruction(IV))
  return true;
auto isScalarInst = [&](User *U) -> bool {
  auto *I = cast<Instruction>(U);
  return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
};
return llvm::any_of(IV->users(), isScalarInst);
1863}

1865void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
  const InductionDescriptor &ID, const Instruction *EntryVal,
  Value *VectorLoopVal, unsigned Part, unsigned Lane) {
assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&(static_cast <bool> ((isa<PHINode>(EntryVal) || isa
<TruncInst>(EntryVal)) && "Expected either an induction phi-node or a truncate of it!"
) ? void (0) : __assert_fail ("(isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && \"Expected either an induction phi-node or a truncate of it!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1869, __extension__ __PRETTY_FUNCTION__))
       "Expected either an induction phi-node or a truncate of it!")(static_cast <bool> ((isa<PHINode>(EntryVal) || isa
<TruncInst>(EntryVal)) && "Expected either an induction phi-node or a truncate of it!"
) ? void (0) : __assert_fail ("(isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && \"Expected either an induction phi-node or a truncate of it!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1869, __extension__ __PRETTY_FUNCTION__));

// This induction variable is not the phi from the original loop but the
// newly-created IV based on the proof that casted Phi is equal to the
// uncasted Phi in the vectorized loop (under a runtime guard possibly). It
// re-uses the same InductionDescriptor that original IV uses but we don't
// have to do any recording in this case - that is done when original IV is
// processed.
if (isa<TruncInst>(EntryVal))
  return;

const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
if (Casts.empty())
  return;
// Only the first Cast instruction in the Casts vector is of interest.
// The rest of the Casts (if exist) have no uses outside the
// induction update chain itself.
Instruction *CastInst = *Casts.begin();
if (Lane < UINT_MAX(2147483647 *2U +1U))
  VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
else
  VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1891}

1893void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&(static_cast <bool> ((IV->getType()->isIntegerTy(
) || IV != OldInduction) && "Primary induction variable must have an integer type"
) ? void (0) : __assert_fail ("(IV->getType()->isIntegerTy() || IV != OldInduction) && \"Primary induction variable must have an integer type\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1895, __extension__ __PRETTY_FUNCTION__))
       "Primary induction variable must have an integer type")(static_cast <bool> ((IV->getType()->isIntegerTy(
) || IV != OldInduction) && "Primary induction variable must have an integer type"
) ? void (0) : __assert_fail ("(IV->getType()->isIntegerTy() || IV != OldInduction) && \"Primary induction variable must have an integer type\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1895, __extension__ __PRETTY_FUNCTION__));

auto II = Legal->getInductionVars()->find(IV);
assert(II != Legal->getInductionVars()->end() && "IV is not an induction")(static_cast <bool> (II != Legal->getInductionVars()
->end() && "IV is not an induction") ? void (0) : __assert_fail
 ("II != Legal->getInductionVars()->end() && \"IV is not an induction\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1898, __extension__ __PRETTY_FUNCTION__));

auto ID = II->second;
assert(IV->getType() == ID.getStartValue()->getType() && "Types must match")(static_cast <bool> (IV->getType() == ID.getStartValue
()->getType() && "Types must match") ? void (0) : __assert_fail
 ("IV->getType() == ID.getStartValue()->getType() && \"Types must match\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1901, __extension__ __PRETTY_FUNCTION__));

// The scalar value to broadcast. This will be derived from the canonical
// induction variable.
Value *ScalarIV = nullptr;

// The value from the original loop to which we are mapping the new induction
// variable.
Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

// True if we have vectorized the induction variable.
auto VectorizedIV = false;

// Determine if we want a scalar version of the induction variable. This is
// true if the induction variable itself is not widened, or if it has at
// least one user in the loop that is not widened.
auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);

// Generate code for the induction step. Note that induction steps are
// required to be loop-invariant
assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&(static_cast <bool> (PSE.getSE()->isLoopInvariant(ID
.getStep(), OrigLoop) && "Induction step should be loop invariant"
) ? void (0) : __assert_fail ("PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && \"Induction step should be loop invariant\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1922, __extension__ __PRETTY_FUNCTION__))
       "Induction step should be loop invariant")(static_cast <bool> (PSE.getSE()->isLoopInvariant(ID
.getStep(), OrigLoop) && "Induction step should be loop invariant"
) ? void (0) : __assert_fail ("PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) && \"Induction step should be loop invariant\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1922, __extension__ __PRETTY_FUNCTION__));
auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
Value *Step = nullptr;
if (PSE.getSE()->isSCEVable(IV->getType())) {
  SCEVExpander Exp(*PSE.getSE(), DL, "induction");
  Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
                           LoopVectorPreHeader->getTerminator());
} else {
  Step = cast<SCEVUnknown>(ID.getStep())->getValue();
}

// Try to create a new independent vector induction variable. If we can't
// create the phi node, we will splat the scalar induction variable in each
// loop iteration.
if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
  createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
  VectorizedIV = true;
}

// If we haven't yet vectorized the induction variable, or if we will create
// a scalar one, we need to define the scalar induction variable and step
// values. If we were given a truncation type, truncate the canonical
// induction variable and step. Otherwise, derive these values from the
// induction descriptor.
if (!VectorizedIV || NeedsScalarIV) {
  ScalarIV = Induction;
  if (IV != OldInduction) {
    ScalarIV = IV->getType()->isIntegerTy()
                   ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                   : Builder.CreateCast(Instruction::SIToFP, Induction,
                                        IV->getType());
    ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
    ScalarIV->setName("offset.idx");
  }
  if (Trunc) {
    auto *TruncType = cast<IntegerType>(Trunc->getType());
    assert(Step->getType()->isIntegerTy() &&(static_cast <bool> (Step->getType()->isIntegerTy
() && "Truncation requires an integer step") ? void (
0) : __assert_fail ("Step->getType()->isIntegerTy() && \"Truncation requires an integer step\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1959, __extension__ __PRETTY_FUNCTION__))
           "Truncation requires an integer step")(static_cast <bool> (Step->getType()->isIntegerTy
() && "Truncation requires an integer step") ? void (
0) : __assert_fail ("Step->getType()->isIntegerTy() && \"Truncation requires an integer step\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1959, __extension__ __PRETTY_FUNCTION__));
    ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
    Step = Builder.CreateTrunc(Step, TruncType);
  }
}

// If we haven't yet vectorized the induction variable, splat the scalar
// induction variable, and build the necessary step vectors.
// TODO: Don't do it unless the vectorized IV is really required.
if (!VectorizedIV) {
  Value *Broadcasted = getBroadcastInstrs(ScalarIV);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *EntryPart =
        getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
    VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
    if (Trunc)
      addMetadata(EntryPart, Trunc);
    recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
  }
}

// If an induction variable is only used for counting loop iterations or
// calculating addresses, it doesn't need to be widened. Create scalar steps
// that can be used by instructions we will later scalarize. Note that the
// addition of the scalar steps will not increase the number of instructions
// in the loop in the common case prior to InstCombine. We will be trading
// one vector extract for each scalar step.
if (NeedsScalarIV)
  buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1988}

1990Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
                                        Instruction::BinaryOps BinOp) {
// Create and check the types.
assert(Val->getType()->isVectorTy() && "Must be a vector")(static_cast <bool> (Val->getType()->isVectorTy()
 && "Must be a vector") ? void (0) : __assert_fail ("Val->getType()->isVectorTy() && \"Must be a vector\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1993, __extension__ __PRETTY_FUNCTION__));
int VLen = Val->getType()->getVectorNumElements();

Type *STy = Val->getType()->getScalarType();
assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&(static_cast <bool> ((STy->isIntegerTy() || STy->
isFloatingPointTy()) && "Induction Step must be an integer or FP"
) ? void (0) : __assert_fail ("(STy->isIntegerTy() || STy->isFloatingPointTy()) && \"Induction Step must be an integer or FP\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1998, __extension__ __PRETTY_FUNCTION__))
       "Induction Step must be an integer or FP")(static_cast <bool> ((STy->isIntegerTy() || STy->
isFloatingPointTy()) && "Induction Step must be an integer or FP"
) ? void (0) : __assert_fail ("(STy->isIntegerTy() || STy->isFloatingPointTy()) && \"Induction Step must be an integer or FP\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1998, __extension__ __PRETTY_FUNCTION__));
assert(Step->getType() == STy && "Step has wrong type")(static_cast <bool> (Step->getType() == STy &&
 "Step has wrong type") ? void (0) : __assert_fail ("Step->getType() == STy && \"Step has wrong type\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1999, __extension__ __PRETTY_FUNCTION__));

SmallVector<Constant *, 8> Indices;

if (STy->isIntegerTy()) {
  // Create a vector of consecutive numbers from zero to VF.
  for (int i = 0; i < VLen; ++i)
    Indices.push_back(ConstantInt::get(STy, StartIdx + i));

  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);
  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec")(static_cast <bool> (Cv->getType() == Val->getType
() && "Invalid consecutive vec") ? void (0) : __assert_fail
 ("Cv->getType() == Val->getType() && \"Invalid consecutive vec\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2010, __extension__ __PRETTY_FUNCTION__));
  Step = Builder.CreateVectorSplat(VLen, Step);
  assert(Step->getType() == Val->getType() && "Invalid step vec")(static_cast <bool> (Step->getType() == Val->getType
() && "Invalid step vec") ? void (0) : __assert_fail (
"Step->getType() == Val->getType() && \"Invalid step vec\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2012, __extension__ __PRETTY_FUNCTION__));
  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
  // which can be found from the original scalar operations.
  Step = Builder.CreateMul(Cv, Step);
  return Builder.CreateAdd(Val, Step, "induction");
}

// Floating point induction.
assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&(static_cast <bool> ((BinOp == Instruction::FAdd || BinOp
 == Instruction::FSub) && "Binary Opcode should be specified for FP induction"
) ? void (0) : __assert_fail ("(BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && \"Binary Opcode should be specified for FP induction\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2021, __extension__ __PRETTY_FUNCTION__))
       "Binary Opcode should be specified for FP induction")(static_cast <bool> ((BinOp == Instruction::FAdd || BinOp
 == Instruction::FSub) && "Binary Opcode should be specified for FP induction"
) ? void (0) : __assert_fail ("(BinOp == Instruction::FAdd || BinOp == Instruction::FSub) && \"Binary Opcode should be specified for FP induction\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2021, __extension__ __PRETTY_FUNCTION__));
// Create a vector of consecutive numbers from zero to VF.
for (int i = 0; i < VLen; ++i)
  Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));

// Add the consecutive indices to the vector value.
Constant *Cv = ConstantVector::get(Indices);

Step = Builder.CreateVectorSplat(VLen, Step);

// Floating point operations had to be 'fast' to enable the induction.
FastMathFlags Flags;
Flags.setFast();

Value *MulOp = Builder.CreateFMul(Cv, Step);
if (isa<Instruction>(MulOp))
  // Have to check, MulOp may be a constant
  cast<Instruction>(MulOp)->setFastMathFlags(Flags);

Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
if (isa<Instruction>(BOp))
  cast<Instruction>(BOp)->setFastMathFlags(Flags);
return BOp;
2044}

2046void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
                                         Instruction *EntryVal,
                                         const InductionDescriptor &ID) {
// We shouldn't have to build scalar steps if we aren't vectorizing.
assert(VF > 1 && "VF should be greater than one")(static_cast <bool> (VF > 1 && "VF should be greater than one"
) ? void (0) : __assert_fail ("VF > 1 && \"VF should be greater than one\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2050, __extension__ __PRETTY_FUNCTION__));

// Get the value type and ensure it and the step have the same integer type.
Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
assert(ScalarIVTy == Step->getType() &&(static_cast <bool> (ScalarIVTy == Step->getType() &&
 "Val and Step should have the same type") ? void (0) : __assert_fail
 ("ScalarIVTy == Step->getType() && \"Val and Step should have the same type\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2055, __extension__ __PRETTY_FUNCTION__))
       "Val and Step should have the same type")(static_cast <bool> (ScalarIVTy == Step->getType() &&
 "Val and Step should have the same type") ? void (0) : __assert_fail
 ("ScalarIVTy == Step->getType() && \"Val and Step should have the same type\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2055, __extension__ __PRETTY_FUNCTION__));

// We build scalar steps for both integer and floating-point induction
// variables. Here, we determine the kind of arithmetic we will perform.
Instruction::BinaryOps AddOp;
Instruction::BinaryOps MulOp;
if (ScalarIVTy->isIntegerTy()) {
  AddOp = Instruction::Add;
  MulOp = Instruction::Mul;
} else {
  AddOp = ID.getInductionOpcode();
  MulOp = Instruction::FMul;
}

// Determine the number of scalars we need to generate for each unroll
// iteration. If EntryVal is uniform, we only need to generate the first
// lane. Otherwise, we generate all VF values.
unsigned Lanes =
    Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
                                                                       : VF;
// Compute the scalar steps and save the results in VectorLoopValueMap.
for (unsigned Part = 0; Part < UF; ++Part) {
  for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
    auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
    auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
    auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
    VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
    recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
  }
}
2085}

2087Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
assert(V != Induction && "The new induction variable should not be used.")(static_cast <bool> (V != Induction && "The new induction variable should not be used."
) ? void (0) : __assert_fail ("V != Induction && \"The new induction variable should not be used.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2088, __extension__ __PRETTY_FUNCTION__));
assert(!V->getType()->isVectorTy() && "Can't widen a vector")(static_cast <bool> (!V->getType()->isVectorTy() &&
 "Can't widen a vector") ? void (0) : __assert_fail ("!V->getType()->isVectorTy() && \"Can't widen a vector\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2089, __extension__ __PRETTY_FUNCTION__));
assert(!V->getType()->isVoidTy() && "Type does not produce a value")(static_cast <bool> (!V->getType()->isVoidTy() &&
 "Type does not produce a value") ? void (0) : __assert_fail (
"!V->getType()->isVoidTy() && \"Type does not produce a value\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2090, __extension__ __PRETTY_FUNCTION__));

// If we have a stride that is replaced by one, do it here.
if (Legal->hasStride(V))
  V = ConstantInt::get(V->getType(), 1);

// If we have a vector mapped to this value, return it.
if (VectorLoopValueMap.hasVectorValue(V, Part))
  return VectorLoopValueMap.getVectorValue(V, Part);

// If the value has not been vectorized, check if it has been scalarized
// instead. If it has been scalarized, and we actually need the value in
// vector form, we will construct the vector values on demand.
if (VectorLoopValueMap.hasAnyScalarValue(V)) {
  Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});

  // If we've scalarized a value, that value should be an instruction.
  auto *I = cast<Instruction>(V);

  // If we aren't vectorizing, we can just copy the scalar map values over to
  // the vector map.
  if (VF == 1) {
    VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
    return ScalarValue;
  }

  // Get the last scalar instruction we generated for V and Part. If the value
  // is known to be uniform after vectorization, this corresponds to lane zero
  // of the Part unroll iteration. Otherwise, the last instruction is the one
  // we created for the last vector lane of the Part unroll iteration.
  unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
  auto *LastInst = cast<Instruction>(
      VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));

  // Set the insert point after the last scalarized instruction. This ensures
  // the insertelement sequence will directly follow the scalar definitions.
  auto OldIP = Builder.saveIP();
  auto NewIP = std::next(BasicBlock::iterator(LastInst));
  Builder.SetInsertPoint(&*NewIP);

  // However, if we are vectorizing, we need to construct the vector values.
  // If the value is known to be uniform after vectorization, we can just
  // broadcast the scalar value corresponding to lane zero for each unroll
  // iteration. Otherwise, we construct the vector values using insertelement
  // instructions. Since the resulting vectors are stored in
  // VectorLoopValueMap, we will only generate the insertelements once.
  Value *VectorValue = nullptr;
  if (Cost->isUniformAfterVectorization(I, VF)) {
    VectorValue = getBroadcastInstrs(ScalarValue);
    VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
  } else {
    // Initialize packing with insertelements to start from undef.
    Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
    VectorLoopValueMap.setVectorValue(V, Part, Undef);
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      packScalarIntoVectorValue(V, {Part, Lane});
    VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
  }
  Builder.restoreIP(OldIP);
  return VectorValue;
}

// If this scalar is unknown, assume that it is a constant or that it is
// loop invariant. Broadcast V and save the value for future uses.
Value *B = getBroadcastInstrs(V);
VectorLoopValueMap.setVectorValue(V, Part, B);
return B;
2157}

2159Value *
2160InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
                                          const VPIteration &Instance) {
// If the value is not an instruction contained in the loop, it should
// already be scalar.
if (OrigLoop->isLoopInvariant(V))
  return V;

assert(Instance.Lane > 0(static_cast <bool> (Instance.Lane > 0 ? !Cost->isUniformAfterVectorization
(cast<Instruction>(V), VF) : true && "Uniform values only have lane zero"
) ? void (0) : __assert_fail ("Instance.Lane > 0 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) : true && \"Uniform values only have lane zero\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2169, __extension__ __PRETTY_FUNCTION__))
           ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)(static_cast <bool> (Instance.Lane > 0 ? !Cost->isUniformAfterVectorization
(cast<Instruction>(V), VF) : true && "Uniform values only have lane zero"
) ? void (0) : __assert_fail ("Instance.Lane > 0 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) : true && \"Uniform values only have lane zero\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2169, __extension__ __PRETTY_FUNCTION__))
           : true && "Uniform values only have lane zero")(static_cast <bool> (Instance.Lane > 0 ? !Cost->isUniformAfterVectorization
(cast<Instruction>(V), VF) : true && "Uniform values only have lane zero"
) ? void (0) : __assert_fail ("Instance.Lane > 0 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) : true && \"Uniform values only have lane zero\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2169, __extension__ __PRETTY_FUNCTION__));

// If the value from the original loop has not been vectorized, it is
// represented by UF x VF scalar values in the new loop. Return the requested
// scalar value.
if (VectorLoopValueMap.hasScalarValue(V, Instance))
  return VectorLoopValueMap.getScalarValue(V, Instance);

// If the value has not been scalarized, get its entry in VectorLoopValueMap
// for the given unroll part. If this entry is not a vector type (i.e., the
// vectorization factor is one), there is no need to generate an
// extractelement instruction.
auto *U = getOrCreateVectorValue(V, Instance.Part);
if (!U->getType()->isVectorTy()) {
  assert(VF == 1 && "Value not scalarized has non-vector type")(static_cast <bool> (VF == 1 && "Value not scalarized has non-vector type"
) ? void (0) : __assert_fail ("VF == 1 && \"Value not scalarized has non-vector type\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2183, __extension__ __PRETTY_FUNCTION__));
  return U;
}

// Otherwise, the value from the original loop has been vectorized and is
// represented by UF vector values. Extract and return the requested scalar
// value from the appropriate vector lane.
return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2191}

2193void InnerLoopVectorizer::packScalarIntoVectorValue(
  Value *V, const VPIteration &Instance) {
assert(V != Induction && "The new induction variable should not be used.")(static_cast <bool> (V != Induction && "The new induction variable should not be used."
) ? void (0) : __assert_fail ("V != Induction && \"The new induction variable should not be used.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2195, __extension__ __PRETTY_FUNCTION__));
assert(!V->getType()->isVectorTy() && "Can't pack a vector")(static_cast <bool> (!V->getType()->isVectorTy() &&
 "Can't pack a vector") ? void (0) : __assert_fail ("!V->getType()->isVectorTy() && \"Can't pack a vector\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2196, __extension__ __PRETTY_FUNCTION__));
assert(!V->getType()->isVoidTy() && "Type does not produce a value")(static_cast <bool> (!V->getType()->isVoidTy() &&
 "Type does not produce a value") ? void (0) : __assert_fail (
"!V->getType()->isVoidTy() && \"Type does not produce a value\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2197, __extension__ __PRETTY_FUNCTION__));

Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
                                          Builder.getInt32(Instance.Lane));
VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2204}

2206Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
assert(Vec->getType()->isVectorTy() && "Invalid type")(static_cast <bool> (Vec->getType()->isVectorTy()
 && "Invalid type") ? void (0) : __assert_fail ("Vec->getType()->isVectorTy() && \"Invalid type\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2207, __extension__ __PRETTY_FUNCTION__));
SmallVector<Constant *, 8> ShuffleMask;
for (unsigned i = 0; i < VF; ++i)
  ShuffleMask.push_back(Builder.getInt32(VF - i - 1));

return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
                                   ConstantVector::get(ShuffleMask),
                                   "reverse");
2215}

2217// Try to vectorize the interleave group that \p Instr belongs to.
2218//
2219// E.g. Translate following interleaved load group (factor = 3):
2220//   for (i = 0; i < N; i+=3) {
2221//     R = Pic[i];             // Member of index 0
2222//     G = Pic[i+1];           // Member of index 1
2223//     B = Pic[i+2];           // Member of index 2
2224//     ... // do something to R, G, B
2225//   }
2226// To:
2227//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2228//   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2229//   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2230//   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2231//
2232// Or translate following interleaved store group (factor = 3):
2233//   for (i = 0; i < N; i+=3) {
2234//     ... do something to R, G, B
2235//     Pic[i]   = R;           // Member of index 0
2236//     Pic[i+1] = G;           // Member of index 1
2237//     Pic[i+2] = B;           // Member of index 2
2238//   }
2239// To:
2240//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2241//   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2242//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2243//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2244//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2245void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr);
assert(Group && "Fail to get an interleaved access group.")(static_cast <bool> (Group && "Fail to get an interleaved access group."
) ? void (0) : __assert_fail ("Group && \"Fail to get an interleaved access group.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2247, __extension__ __PRETTY_FUNCTION__));

// Skip if current instruction is not the insert position.
if (Instr != Group->getInsertPos())
  return;

const DataLayout &DL = Instr->getModule()->getDataLayout();
Value *Ptr = getLoadStorePointerOperand(Instr);

// Prepare for the vector type of the interleaved load/store.
Type *ScalarTy = getMemInstValueType(Instr);
unsigned InterleaveFactor = Group->getFactor();
Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
Type *PtrTy = VecTy->getPointerTo(getMemInstAddressSpace(Instr));

// Prepare for the new pointers.
setDebugLocFromInst(Builder, Ptr);
SmallVector<Value *, 2> NewPtrs;
unsigned Index = Group->getIndex(Instr);

// If the group is reverse, adjust the index to refer to the last vector lane
// instead of the first. We adjust the index from the first vector lane,
// rather than directly getting the pointer for lane VF - 1, because the
// pointer operand of the interleaved access is supposed to be uniform. For
// uniform instructions, we're only required to generate a value for the
// first vector lane in each unroll iteration.
if (Group->isReverse())
  Index += (VF - 1) * Group->getFactor();

bool InBounds = false;
if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
  InBounds = gep->isInBounds();

for (unsigned Part = 0; Part < UF; Part++) {
  Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});

  // Notice current instruction could be any index. Need to adjust the address
  // to the member of index 0.
  //
  // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
  //       b = A[i];       // Member of index 0
  // Current pointer is pointed to A[i+1], adjust it to A[i].
  //
  // E.g.  A[i+1] = a;     // Member of index 1
  //       A[i]   = b;     // Member of index 0
  //       A[i+2] = c;     // Member of index 2 (Current instruction)
  // Current pointer is pointed to A[i+2], adjust it to A[i].
  NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
  if (InBounds)
    cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);

  // Cast to the vector pointer type.
  NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
}

setDebugLocFromInst(Builder, Instr);
Value *UndefVec = UndefValue::get(VecTy);

// Vectorize the interleaved load group.
if (isa<LoadInst>(Instr)) {
  // For each unroll part, create a wide load for the group.
  SmallVector<Value *, 2> NewLoads;
  for (unsigned Part = 0; Part < UF; Part++) {
    auto *NewLoad = Builder.CreateAlignedLoad(
        NewPtrs[Part], Group->getAlignment(), "wide.vec");
    Group->addMetadata(NewLoad);
    NewLoads.push_back(NewLoad);
  }

  // For each member in the group, shuffle out the appropriate data from the
  // wide loads.
  for (unsigned I = 0; I < InterleaveFactor; ++I) {
    Instruction *Member = Group->getMember(I);

    // Skip the gaps in the group.
    if (!Member)
      continue;

    Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
    for (unsigned Part = 0; Part < UF; Part++) {
      Value *StridedVec = Builder.CreateShuffleVector(
          NewLoads[Part], UndefVec, StrideMask, "strided.vec");

      // If this member has different type, cast the result type.
      if (Member->getType() != ScalarTy) {
        VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
        StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
      }

      if (Group->isReverse())
        StridedVec = reverseVector(StridedVec);

      VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
    }
  }
  return;
}

// The sub vector type for current instruction.
VectorType *SubVT = VectorType::get(ScalarTy, VF);

// Vectorize the interleaved store group.
for (unsigned Part = 0; Part < UF; Part++) {
  // Collect the stored vector from each member.
  SmallVector<Value *, 4> StoredVecs;
  for (unsigned i = 0; i < InterleaveFactor; i++) {
    // Interleaved store group doesn't allow a gap, so each index has a member
    Instruction *Member = Group->getMember(i);
    assert(Member && "Fail to get a member from an interleaved store group")(static_cast <bool> (Member && "Fail to get a member from an interleaved store group"
) ? void (0) : __assert_fail ("Member && \"Fail to get a member from an interleaved store group\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2355, __extension__ __PRETTY_FUNCTION__));

    Value *StoredVec = getOrCreateVectorValue(
        cast<StoreInst>(Member)->getValueOperand(), Part);
    if (Group->isReverse())
      StoredVec = reverseVector(StoredVec);

    // If this member has different type, cast it to a unified type.

    if (StoredVec->getType() != SubVT)
      StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

    StoredVecs.push_back(StoredVec);
  }

  // Concatenate all vectors into a wide vector.
  Value *WideVec = concatenateVectors(Builder, StoredVecs);

  // Interleave the elements in the wide vector.
  Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
  Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
                                            "interleaved.vec");

  Instruction *NewStoreInstr =
      Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());

  Group->addMetadata(NewStoreInstr);
}
2383}

2385void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
                                                   VectorParts *BlockInMask) {
// Attempt to issue a wide load.
LoadInst *LI = dyn_cast<LoadInst>(Instr);
StoreInst *SI = dyn_cast<StoreInst>(Instr);

assert((LI || SI) && "Invalid Load/Store instruction")(static_cast <bool> ((LI || SI) && "Invalid Load/Store instruction"
) ? void (0) : __assert_fail ("(LI || SI) && \"Invalid Load/Store instruction\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2391, __extension__ __PRETTY_FUNCTION__));

LoopVectorizationCostModel::InstWidening Decision =
    Cost->getWideningDecision(Instr, VF);
assert(Decision != LoopVectorizationCostModel::CM_Unknown &&(static_cast <bool> (Decision != LoopVectorizationCostModel
::CM_Unknown && "CM decision should be taken at this point"
) ? void (0) : __assert_fail ("Decision != LoopVectorizationCostModel::CM_Unknown && \"CM decision should be taken at this point\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2396, __extension__ __PRETTY_FUNCTION__))
       "CM decision should be taken at this point")(static_cast <bool> (Decision != LoopVectorizationCostModel
::CM_Unknown && "CM decision should be taken at this point"
) ? void (0) : __assert_fail ("Decision != LoopVectorizationCostModel::CM_Unknown && \"CM decision should be taken at this point\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2396, __extension__ __PRETTY_FUNCTION__));
if (Decision == LoopVectorizationCostModel::CM_Interleave)
  return vectorizeInterleaveGroup(Instr);

Type *ScalarDataTy = getMemInstValueType(Instr);
Type *DataTy = VectorType::get(ScalarDataTy, VF);
Value *Ptr = getLoadStorePointerOperand(Instr);
unsigned Alignment = getMemInstAlignment(Instr);
// An alignment of 0 means target abi alignment. We need to use the scalar's
// target abi alignment in such a case.
const DataLayout &DL = Instr->getModule()->getDataLayout();
if (!Alignment)
  Alignment = DL.getABITypeAlignment(ScalarDataTy);
unsigned AddressSpace = getMemInstAddressSpace(Instr);

// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.
bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
bool ConsecutiveStride =
    Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
bool CreateGatherScatter =
    (Decision == LoopVectorizationCostModel::CM_GatherScatter);

// Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
// gather/scatter. Otherwise Decision should have been to Scalarize.
assert((ConsecutiveStride || CreateGatherScatter) &&(static_cast <bool> ((ConsecutiveStride || CreateGatherScatter
) && "The instruction should be scalarized") ? void (
0) : __assert_fail ("(ConsecutiveStride || CreateGatherScatter) && \"The instruction should be scalarized\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2422, __extension__ __PRETTY_FUNCTION__))
       "The instruction should be scalarized")(static_cast <bool> ((ConsecutiveStride || CreateGatherScatter
) && "The instruction should be scalarized") ? void (
0) : __assert_fail ("(ConsecutiveStride || CreateGatherScatter) && \"The instruction should be scalarized\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2422, __extension__ __PRETTY_FUNCTION__));

// Handle consecutive loads/stores.
if (ConsecutiveStride)
  Ptr = getOrCreateScalarValue(Ptr, {0, 0});

VectorParts Mask;
bool isMaskRequired = BlockInMask;
if (isMaskRequired)
  Mask = *BlockInMask;

bool InBounds = false;
if (auto *gep = dyn_cast<GetElementPtrInst>(
        getLoadStorePointerOperand(Instr)->stripPointerCasts()))
  InBounds = gep->isInBounds();

const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
  // Calculate the pointer for the specific unroll-part.
  GetElementPtrInst *PartPtr = nullptr;

  if (Reverse) {
    // If the address is consecutive but reversed, then the
    // wide store needs to start at the last vector element.
    PartPtr = cast<GetElementPtrInst>(
        Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)));
    PartPtr->setIsInBounds(InBounds);
    PartPtr = cast<GetElementPtrInst>(
        Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)));
    PartPtr->setIsInBounds(InBounds);
    if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
      Mask[Part] = reverseVector(Mask[Part]);
  } else {
    PartPtr = cast<GetElementPtrInst>(
        Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)));
    PartPtr->setIsInBounds(InBounds);
  }

  return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
};

// Handle Stores:
if (SI) {
  setDebugLocFromInst(Builder, SI);

  for (unsigned Part = 0; Part < UF; ++Part) {
    Instruction *NewSI = nullptr;
    Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
      Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
      NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                          MaskPart);
    } else {
      if (Reverse) {
        // If we store to reverse consecutive memory locations, then we need
        // to reverse the order of elements in the stored value.
        StoredVal = reverseVector(StoredVal);
        // We don't want to update the value in the map as it might be used in
        // another expression. So don't call resetVectorValue(StoredVal).
      }
      auto *VecPtr = CreateVecPtr(Part, Ptr);
      if (isMaskRequired)
        NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                          Mask[Part]);
      else
        NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
    }
    addMetadata(NewSI, SI);
  }
  return;
}

// Handle loads.
assert(LI && "Must have a load instruction")(static_cast <bool> (LI && "Must have a load instruction"
) ? void (0) : __assert_fail ("LI && \"Must have a load instruction\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2495, __extension__ __PRETTY_FUNCTION__));
setDebugLocFromInst(Builder, LI);
for (unsigned Part = 0; Part < UF; ++Part) {
  Value *NewLI;
  if (CreateGatherScatter) {
    Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
    Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
    NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
                                       nullptr, "wide.masked.gather");
    addMetadata(NewLI, LI);
  } else {
    auto *VecPtr = CreateVecPtr(Part, Ptr);
    if (isMaskRequired)
      NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
                                       UndefValue::get(DataTy),
                                       "wide.masked.load");
    else
      NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");

    // Add metadata to the load, but setVectorValue to the reverse shuffle.
    addMetadata(NewLI, LI);
    if (Reverse)
      NewLI = reverseVector(NewLI);
  }
  VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
}
2521}

2523void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
                                             const VPIteration &Instance,
                                             bool IfPredicateInstr) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors")(static_cast <bool> (!Instr->getType()->isAggregateType
() && "Can't handle vectors") ? void (0) : __assert_fail
 ("!Instr->getType()->isAggregateType() && \"Can't handle vectors\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2526, __extension__ __PRETTY_FUNCTION__));

setDebugLocFromInst(Builder, Instr);

// Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy();

Instruction *Cloned = Instr->clone();
if (!IsVoidRetTy)
  Cloned->setName(Instr->getName() + ".cloned");

// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
  auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
  Cloned->setOperand(op, NewOp);
}
addNewMetadata(Cloned, Instr);

// Place the cloned scalar in the new loop.
Builder.Insert(Cloned);

// Add the cloned scalar to the scalar map entry.
VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);

// If we just cloned a new assumption, add it the assumption cache.
if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
  if (II->getIntrinsicID() == Intrinsic::assume)
    AC->registerAssumption(II);

// End if-block.
if (IfPredicateInstr)
  PredicatedInstructions.push_back(Cloned);
2559}

2561PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                    Value *End, Value *Step,
                                                    Instruction *DL) {
BasicBlock *Header = L->getHeader();
BasicBlock *Latch = L->getLoopLatch();
// As we're just creating this loop, it's possible no latch exists
// yet. If so, use the header as this will be a single block loop.
if (!Latch)
  Latch = Header;

IRBuilder<> Builder(&*Header->getFirstInsertionPt());
Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
setDebugLocFromInst(Builder, OldInst);
auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");

Builder.SetInsertPoint(Latch->getTerminator());
setDebugLocFromInst(Builder, OldInst);

// Create i+1 and fill the PHINode.
Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
Induction->addIncoming(Start, L->getLoopPreheader());
Induction->addIncoming(Next, Latch);
// Create the compare.
Value *ICmp = Builder.CreateICmpEQ(Next, End);
Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);

// Now we have two terminators. Remove the old one from the block.
Latch->getTerminator()->eraseFromParent();

return Induction;
2591}

2593Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
if (TripCount)
  return TripCount;

IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
// Find the loop boundaries.
ScalarEvolution *SE = PSE.getSE();
const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
assert(BackedgeTakenCount != SE->getCouldNotCompute() &&(static_cast <bool> (BackedgeTakenCount != SE->getCouldNotCompute
() && "Invalid loop count") ? void (0) : __assert_fail
 ("BackedgeTakenCount != SE->getCouldNotCompute() && \"Invalid loop count\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2602, __extension__ __PRETTY_FUNCTION__))
       "Invalid loop count")(static_cast <bool> (BackedgeTakenCount != SE->getCouldNotCompute
() && "Invalid loop count") ? void (0) : __assert_fail
 ("BackedgeTakenCount != SE->getCouldNotCompute() && \"Invalid loop count\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2602, __extension__ __PRETTY_FUNCTION__));

Type *IdxTy = Legal->getWidestInductionType();

// The exit count might have the type of i64 while the phi is i32. This can
// happen if we have an induction variable that is sign extended before the
// compare. The only way that we get a backedge taken count is that the
// induction variable was signed and as such will not overflow. In such a case
// truncation is legal.
if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
    IdxTy->getPrimitiveSizeInBits())
  BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

// Get the total trip count from the count by adding 1.
const SCEV *ExitCount = SE->getAddExpr(
    BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

// Expand the trip count and place the new instructions in the preheader.
// Notice that the pre-header does not change, only the loop body.
SCEVExpander Exp(*SE, DL, "induction");

// Count holds the overall loop count (N).
TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                              L->getLoopPreheader()->getTerminator());

if (TripCount->getType()->isPointerTy())
  TripCount =
      CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
                                  L->getLoopPreheader()->getTerminator());

return TripCount;
2636}

2638Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
if (VectorTripCount)
  return VectorTripCount;

Value *TC = getOrCreateTripCount(L);
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());

// Now we need to generate the expression for the part of the loop that the
// vectorized body will execute. This is equal to N - (N % Step) if scalar
// iterations are not required for correctness, or N - Step, otherwise. Step
// is equal to the vectorization factor (number of SIMD elements) times the
// unroll factor (number of SIMD instructions).
Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

// If there is a non-reversed interleaved group that may speculatively access
// memory out-of-bounds, we need to ensure that there will be at least one
// iteration of the scalar epilogue loop. Thus, if the step evenly divides
// the trip count, we set the remainder to be equal to the step. If the step
// does not evenly divide the trip count, no adjustment is necessary since
// there will already be scalar iterations. Note that the minimum iterations
// check ensures that N >= Step.
if (VF > 1 && Cost->requiresScalarEpilogue()) {
  auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
  R = Builder.CreateSelect(IsZero, Step, R);
}

VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

return VectorTripCount;
2668}

2670Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                                 const DataLayout &DL) {
// Verify that V is a vector type with same number of elements as DstVTy.
unsigned VF = DstVTy->getNumElements();
VectorType *SrcVecTy = cast<VectorType>(V->getType());
assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match")(static_cast <bool> ((VF == SrcVecTy->getNumElements
()) && "Vector dimensions do not match") ? void (0) :
 __assert_fail ("(VF == SrcVecTy->getNumElements()) && \"Vector dimensions do not match\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2675, __extension__ __PRETTY_FUNCTION__));
Type *SrcElemTy = SrcVecTy->getElementType();
Type *DstElemTy = DstVTy->getElementType();
assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&(static_cast <bool> ((DL.getTypeSizeInBits(SrcElemTy) ==
 DL.getTypeSizeInBits(DstElemTy)) && "Vector elements must have same size"
) ? void (0) : __assert_fail ("(DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && \"Vector elements must have same size\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2679, __extension__ __PRETTY_FUNCTION__))
       "Vector elements must have same size")(static_cast <bool> ((DL.getTypeSizeInBits(SrcElemTy) ==
 DL.getTypeSizeInBits(DstElemTy)) && "Vector elements must have same size"
) ? void (0) : __assert_fail ("(DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && \"Vector elements must have same size\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2679, __extension__ __PRETTY_FUNCTION__));

// Do a direct cast if element types are castable.
if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
  return Builder.CreateBitOrPointerCast(V, DstVTy);
}
// V cannot be directly casted to desired vector type.
// May happen when V is a floating point vector but DstVTy is a vector of
// pointers or vice-versa. Handle this using a two-step bitcast using an
// intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&(static_cast <bool> ((DstElemTy->isPointerTy() != SrcElemTy
->isPointerTy()) && "Only one type should be a pointer type"
) ? void (0) : __assert_fail ("(DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && \"Only one type should be a pointer type\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2690, __extension__ __PRETTY_FUNCTION__))
       "Only one type should be a pointer type")(static_cast <bool> ((DstElemTy->isPointerTy() != SrcElemTy
->isPointerTy()) && "Only one type should be a pointer type"
) ? void (0) : __assert_fail ("(DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && \"Only one type should be a pointer type\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2690, __extension__ __PRETTY_FUNCTION__));
assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&(static_cast <bool> ((DstElemTy->isFloatingPointTy()
 != SrcElemTy->isFloatingPointTy()) && "Only one type should be a floating point type"
) ? void (0) : __assert_fail ("(DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && \"Only one type should be a floating point type\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2692, __extension__ __PRETTY_FUNCTION__))
       "Only one type should be a floating point type")(static_cast <bool> ((DstElemTy->isFloatingPointTy()
 != SrcElemTy->isFloatingPointTy()) && "Only one type should be a floating point type"
) ? void (0) : __assert_fail ("(DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && \"Only one type should be a floating point type\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2692, __extension__ __PRETTY_FUNCTION__));
Type *IntTy =
    IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
VectorType *VecIntTy = VectorType::get(IntTy, VF);
Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2698}

2700void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                       BasicBlock *Bypass) {
Value *Count = getOrCreateTripCount(L);
BasicBlock *BB = L->getLoopPreheader();
IRBuilder<> Builder(BB->getTerminator());

// Generate code to check if the loop's trip count is less than VF * UF, or
// equal to it in case a scalar epilogue is required; this implies that the
// vector trip count is zero. This check also covers the case where adding one
// to the backedge-taken count overflowed leading to an incorrect trip count
// of zero. In this case we will also jump to the scalar loop.
auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
                                        : ICmpInst::ICMP_ULT;
Value *CheckMinIters = Builder.CreateICmp(
    P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");

BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
// Update dominator tree immediately if the generated block is a
// LoopBypassBlock because SCEV expansions to generate loop bypass
// checks may query it before the current function is finished.
DT->addNewBlock(NewBB, BB);
if (L->getParentLoop())
  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
ReplaceInstWithInst(BB->getTerminator(),
                    BranchInst::Create(Bypass, NewBB, CheckMinIters));
LoopBypassBlocks.push_back(BB);
2726}

2728void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
BasicBlock *BB = L->getLoopPreheader();

// Generate the code to check that the SCEV assumptions that we made.
// We want the new basic block to start at the first instruction in a
// sequence of instructions that form a check.
SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
                 "scev.check");
Value *SCEVCheck =
    Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());

if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
  if (C->isZero())
    return;

// Create a new block containing the stride check.
BB->setName("vector.scevcheck");
auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
// Update dominator tree immediately if the generated block is a
// LoopBypassBlock because SCEV expansions to generate loop bypass
// checks may query it before the current function is finished.
DT->addNewBlock(NewBB, BB);
if (L->getParentLoop())
  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
ReplaceInstWithInst(BB->getTerminator(),
                    BranchInst::Create(Bypass, NewBB, SCEVCheck));
LoopBypassBlocks.push_back(BB);
AddedSafetyChecks = true;
2756}

2758void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
BasicBlock *BB = L->getLoopPreheader();

// Generate the code that checks in runtime if arrays overlap. We put the
// checks into a separate block to make the more common case of few elements
// faster.
Instruction *FirstCheckInst;
Instruction *MemRuntimeCheck;
std::tie(FirstCheckInst, MemRuntimeCheck) =
    Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
if (!MemRuntimeCheck)
  return;

// Create a new block containing the memory check.
BB->setName("vector.memcheck");
auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
// Update dominator tree immediately if the generated block is a
// LoopBypassBlock because SCEV expansions to generate loop bypass
// checks may query it before the current function is finished.
DT->addNewBlock(NewBB, BB);
if (L->getParentLoop())
  L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
ReplaceInstWithInst(BB->getTerminator(),
                    BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
LoopBypassBlocks.push_back(BB);
AddedSafetyChecks = true;

// We currently don't use LoopVersioning for the actual loop cloning but we
// still use it to add the noalias metadata.
LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
                                         PSE.getSE());
LVer->prepareNoAliasMetadata();
2790}

2792BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
/*
 In this function we generate a new loop. The new loop will contain
 the vectorized instructions while the old loop will continue to run the
 scalar remainder.

     [ ] <-- loop iteration number check.
  /   |
 /    v
|    [ ] <-- vector loop bypass (may consist of multiple blocks).
|  /  |
| /   v
||   [ ]     <-- vector pre header.
|/    |
|     v
|    [  ] \
|    [  ]_|   <-- vector loop.
|     |
|     v
|   -[ ]   <--- middle-block.
|  /  |
| /   v
-|- >[ ]     <--- new preheader.
 |    |
 |    v
 |   [ ] \
 |   [ ]_|   <-- old scalar loop to handle remainder.
  \   |
   \  v
    >[ ]     <-- exit block.
 ...
 */

BasicBlock *OldBasicBlock = OrigLoop->getHeader();
BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
BasicBlock *ExitBlock = OrigLoop->getExitBlock();
assert(VectorPH && "Invalid loop structure")(static_cast <bool> (VectorPH && "Invalid loop structure"
) ? void (0) : __assert_fail ("VectorPH && \"Invalid loop structure\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2828, __extension__ __PRETTY_FUNCTION__));
assert(ExitBlock && "Must have an exit block")(static_cast <bool> (ExitBlock && "Must have an exit block"
) ? void (0) : __assert_fail ("ExitBlock && \"Must have an exit block\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2829, __extension__ __PRETTY_FUNCTION__));

// Some loops have a single integer induction variable, while other loops
// don't. One example is c++ iterators that often have multiple pointer
// induction variables. In the code below we also support a case where we
// don't have a single induction variable.
//
// We try to obtain an induction variable from the original loop as hard
// as possible. However if we don't find one that:
//   - is an integer
//   - counts from zero, stepping by one
//   - is the size of the widest induction variable type
// then we create a new one.
OldInduction = Legal->getPrimaryInduction();
Type *IdxTy = Legal->getWidestInductionType();

// Split the single block loop into the two loop structure described above.
BasicBlock *VecBody =
    VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
BasicBlock *MiddleBlock =
    VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
BasicBlock *ScalarPH =
    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");

// Create and register the new vector loop.
Loop *Lp = LI->AllocateLoop();
Loop *ParentLoop = OrigLoop->getParentLoop();

// Insert the new loop into the loop nest and register the new basic blocks
// before calling any utilities such as SCEV that require valid LoopInfo.
if (ParentLoop) {
  ParentLoop->addChildLoop(Lp);
  ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
  ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
} else {
  LI->addTopLevelLoop(Lp);
}
Lp->addBasicBlockToLoop(VecBody, *LI);

// Find the loop boundaries.
Value *Count = getOrCreateTripCount(Lp);

Value *StartIdx = ConstantInt::get(IdxTy, 0);

// Now, compare the new count to zero. If it is zero skip the vector loop and
// jump to the scalar loop. This check also covers the case where the
// backedge-taken count is uint##_max: adding one to it will overflow leading
// to an incorrect trip count of zero. In this (rare) case we will also jump
// to the scalar loop.
emitMinimumIterationCountCheck(Lp, ScalarPH);

// Generate the code to check any assumptions that we've made for SCEV
// expressions.
emitSCEVChecks(Lp, ScalarPH);

// Generate the code that checks in runtime if arrays overlap. We put the
// checks into a separate block to make the more common case of few elements
// faster.
emitMemRuntimeChecks(Lp, ScalarPH);

// Generate the induction variable.
// The loop step is equal to the vectorization factor (num of SIMD elements)
// times the unroll factor (num of SIMD instructions).
Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
Constant *Step = ConstantInt::get(IdxTy, VF * UF);
Induction =
    createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                            getDebugLocFromInstOrOperands(OldInduction));

// We are going to resume the execution of the scalar loop.
// Go over all of the induction variables that we found and fix the
// PHIs that are left in the scalar version of the loop.
// The starting values of PHI nodes depend on the counter of the last
// iteration in the vectorized loop.
// If we come from a bypass edge then we need to start from the original
// start value.

// This variable saves the new starting index for the scalar loop. It is used
// to test if there are any tail iterations left once the vector loop has
// completed.
LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
for (auto &InductionEntry : *List) {
  PHINode *OrigPhi = InductionEntry.first;
  InductionDescriptor II = InductionEntry.second;

  // Create phi nodes to merge from the  backedge-taken check block.
  PHINode *BCResumeVal = PHINode::Create(
      OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
  // Copy original phi DL over to the new one.
  BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
  Value *&EndValue = IVEndValues[OrigPhi];
  if (OrigPhi == OldInduction) {
    // We know what the end value is.
    EndValue = CountRoundDown;
  } else {
    IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
    Type *StepType = II.getStep()->getType();
    Instruction::CastOps CastOp =
      CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
    Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
    const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
    EndValue = II.transform(B, CRD, PSE.getSE(), DL);
    EndValue->setName("ind.end");
  }

  // The new PHI merges the original incoming value, in case of a bypass,
  // or the value at the end of the vectorized loop.
  BCResumeVal->addIncoming(EndValue, MiddleBlock);

  // Fix the scalar body counter (PHI node).
  unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);

  // The old induction's phi node in the scalar body needs the truncated
  // value.
  for (BasicBlock *BB : LoopBypassBlocks)
    BCResumeVal->addIncoming(II.getStartValue(), BB);
  OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
}

// Add a check in the middle block to see if we have completed
// all of the iterations in the first vector loop.
// If (N - N%VF) == N, then we *don't* need to run the remainder.
Value *CmpN =
    CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
                    CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
ReplaceInstWithInst(MiddleBlock->getTerminator(),
                    BranchInst::Create(ExitBlock, ScalarPH, CmpN));

// Get ready to start creating new instructions into the vectorized body.
Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());

// Save the state.
LoopVectorPreHeader = Lp->getLoopPreheader();
LoopScalarPreHeader = ScalarPH;
LoopMiddleBlock = MiddleBlock;
LoopExitBlock = ExitBlock;
LoopVectorBody = VecBody;
LoopScalarBody = OldBasicBlock;

// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
if (MDNode *LID = OrigLoop->getLoopID())
  Lp->setLoopID(LID);

LoopVectorizeHints Hints(Lp, true, *ORE);
Hints.setAlreadyVectorized();

return LoopVectorPreHeader;
2977}

2979// Fix up external users of the induction variable. At this point, we are
2980// in LCSSA form, with all external PHIs that use the IV having one input value,
2981// coming from the remainder loop. We need those PHIs to also have a correct
2982// value for the IV when arriving directly from the middle block.
2983void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                     const InductionDescriptor &II,
                                     Value *CountRoundDown, Value *EndValue,
                                     BasicBlock *MiddleBlock) {
// There are two kinds of external IV usages - those that use the value
// computed in the last iteration (the PHI) and those that use the penultimate
// value (the value that feeds into the phi from the loop latch).
// We allow both, but they, obviously, have different values.

assert(OrigLoop->getExitBlock() && "Expected a single exit block")(static_cast <bool> (OrigLoop->getExitBlock() &&
 "Expected a single exit block") ? void (0) : __assert_fail (
"OrigLoop->getExitBlock() && \"Expected a single exit block\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2992, __extension__ __PRETTY_FUNCTION__));

DenseMap<Value *, Value *> MissingVals;

// An external user of the last iteration's value should see the value that
// the remainder loop uses to initialize its own IV.
Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
for (User *U : PostInc->users()) {
  Instruction *UI = cast<Instruction>(U);
  if (!OrigLoop->contains(UI)) {
    assert(isa<PHINode>(UI) && "Expected LCSSA form")(static_cast <bool> (isa<PHINode>(UI) && "Expected LCSSA form"
) ? void (0) : __assert_fail ("isa<PHINode>(UI) && \"Expected LCSSA form\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3002, __extension__ __PRETTY_FUNCTION__));
    MissingVals[UI] = EndValue;
  }
}

// An external user of the penultimate value need to see EndValue - Step.
// The simplest way to get this is to recompute it from the constituent SCEVs,
// that is Start + (Step * (CRD - 1)).
for (User *U : OrigPhi->users()) {
  auto *UI = cast<Instruction>(U);
  if (!OrigLoop->contains(UI)) {
    const DataLayout &DL =
        OrigLoop->getHeader()->getModule()->getDataLayout();
    assert(isa<PHINode>(UI) && "Expected LCSSA form")(static_cast <bool> (isa<PHINode>(UI) && "Expected LCSSA form"
) ? void (0) : __assert_fail ("isa<PHINode>(UI) && \"Expected LCSSA form\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3015, __extension__ __PRETTY_FUNCTION__));

    IRBuilder<> B(MiddleBlock->getTerminator());
    Value *CountMinusOne = B.CreateSub(
        CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
    Value *CMO =
        !II.getStep()->getType()->isIntegerTy()
            ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                           II.getStep()->getType())
            : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
    CMO->setName("cast.cmo");
    Value *Escape = II.transform(B, CMO, PSE.getSE(), DL);
    Escape->setName("ind.escape");
    MissingVals[UI] = Escape;
  }
}

for (auto &I : MissingVals) {
  PHINode *PHI = cast<PHINode>(I.first);
  // One corner case we have to handle is two IVs "chasing" each-other,
  // that is %IV2 = phi [...], [ %IV1, %latch ]
  // In this case, if IV1 has an external use, we need to avoid adding both
  // "last value of IV1" and "penultimate value of IV2". So, verify that we
  // don't already have an incoming value for the middle block.
  if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
    PHI->addIncoming(I.second, MiddleBlock);
}
3042}

3044namespace {

3046struct CSEDenseMapInfo {
static bool canHandle(const Instruction *I) {
  return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
         isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
}

static inline Instruction *getEmptyKey() {
  return DenseMapInfo<Instruction *>::getEmptyKey();
}

static inline Instruction *getTombstoneKey() {
  return DenseMapInfo<Instruction *>::getTombstoneKey();
}

static unsigned getHashValue(const Instruction *I) {
  assert(canHandle(I) && "Unknown instruction!")(static_cast <bool> (canHandle(I) && "Unknown instruction!"
) ? void (0) : __assert_fail ("canHandle(I) && \"Unknown instruction!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3061, __extension__ __PRETTY_FUNCTION__));
  return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                         I->value_op_end()));
}

static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
  if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
      LHS == getTombstoneKey() || RHS == getTombstoneKey())
    return LHS == RHS;
  return LHS->isIdenticalTo(RHS);
}
3072};

3074} // end anonymous namespace

3076///Perform cse of induction variable instructions.
3077static void cse(BasicBlock *BB) {
// Perform simple cse.
SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
  Instruction *In = &*I++;

  if (!CSEDenseMapInfo::canHandle(In))
    continue;

  // Check if we can replace this instruction with any of the
  // visited instructions.
  if (Instruction *V = CSEMap.lookup(In)) {
    In->replaceAllUsesWith(V);
    In->eraseFromParent();
    continue;
  }

  CSEMap[In] = In;
}
3096}

3098/// Estimate the overhead of scalarizing an instruction. This is a
3099/// convenience wrapper for the type-based getScalarizationOverhead API.
3100static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
                                       const TargetTransformInfo &TTI) {
if (VF == 1)
  return 0;

unsigned Cost = 0;
Type *RetTy = ToVectorTy(I->getType(), VF);
if (!RetTy->isVoidTy() &&
    (!isa<LoadInst>(I) ||
     !TTI.supportsEfficientVectorElementLoadStore()))
  Cost += TTI.getScalarizationOverhead(RetTy, true, false);

if (CallInst *CI = dyn_cast<CallInst>(I)) {
  SmallVector<const Value *, 4> Operands(CI->arg_operands());
  Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
}
else if (!isa<StoreInst>(I) ||
         !TTI.supportsEfficientVectorElementLoadStore()) {
  SmallVector<const Value *, 4> Operands(I->operand_values());
  Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
}

return Cost;
3123}

3125// Estimate cost of a call instruction CI if it were vectorized with factor VF.
3126// Return the cost of the instruction, including scalarization overhead if it's
3127// needed. The flag NeedToScalarize shows if the call needs to be scalarized -
3128// i.e. either vector version isn't available, or is too expensive.
3129static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
                                const TargetTransformInfo &TTI,
                                const TargetLibraryInfo *TLI,
                                bool &NeedToScalarize) {
Function *F = CI->getCalledFunction();
StringRef FnName = CI->getCalledFunction()->getName();
Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;
for (auto &ArgOp : CI->arg_operands())
  ScalarTys.push_back(ArgOp->getType());

// Estimate cost of scalarized vector call. The source operands are assumed
// to be vectors, so we need to extract individual elements from there,
// execute VF scalar calls, and then gather the result into the vector return
// value.
unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
if (VF == 1)
  return ScalarCallCost;

// Compute corresponding vector type for return value and arguments.
Type *RetTy = ToVectorTy(ScalarRetTy, VF);
for (Type *ScalarTy : ScalarTys)
  Tys.push_back(ToVectorTy(ScalarTy, VF));

// Compute costs of unpacking argument values for the scalar calls and
// packing the return values to a vector.
unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);

unsigned Cost = ScalarCallCost * VF + ScalarizationCost;

// If we can't emit a vector call for this function, then the currently found
// cost is the cost we need to return.
NeedToScalarize = true;
if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
  return Cost;

// If the corresponding vector cost is cheaper, return its cost.
unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
if (VectorCallCost < Cost) {
  NeedToScalarize = false;
  return VectorCallCost;
}
return Cost;
3172}

3174// Estimate cost of an intrinsic call instruction CI if it were vectorized with
3175// factor VF.  Return the cost of the instruction, including scalarization
3176// overhead if it's needed.
3177static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
                                     const TargetTransformInfo &TTI,
                                     const TargetLibraryInfo *TLI) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
assert(ID && "Expected intrinsic call!")(static_cast <bool> (ID && "Expected intrinsic call!"
) ? void (0) : __assert_fail ("ID && \"Expected intrinsic call!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3181, __extension__ __PRETTY_FUNCTION__));

FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
  FMF = FPMO->getFastMathFlags();

SmallVector<Value *, 4> Operands(CI->arg_operands());
return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3189}

3191static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
auto *I1 = cast<IntegerType>(T1->getVectorElementType());
auto *I2 = cast<IntegerType>(T2->getVectorElementType());
return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3195}
3196static Type *largestIntegerVectorType(Type *T1, Type *T2) {
auto *I1 = cast<IntegerType>(T1->getVectorElementType());
auto *I2 = cast<IntegerType>(T2->getVectorElementType());
return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3200}

3202void InnerLoopVectorizer::truncateToMinimalBitwidths() {
// For every instruction `I` in MinBWs, truncate the operands, create a
// truncated version of `I` and reextend its result. InstCombine runs
// later and will remove any ext/trunc pairs.
SmallPtrSet<Value *, 4> Erased;
for (const auto &KV : Cost->getMinimalBitwidths()) {
  // If the value wasn't vectorized, we must maintain the original scalar
  // type. The absence of the value from VectorLoopValueMap indicates that it
  // wasn't vectorized.
  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
    continue;
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *I = getOrCreateVectorValue(KV.first, Part);
    if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
      continue;
    Type *OriginalTy = I->getType();
    Type *ScalarTruncatedTy =
        IntegerType::get(OriginalTy->getContext(), KV.second);
    Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
                                        OriginalTy->getVectorNumElements());
    if (TruncatedTy == OriginalTy)
      continue;

    IRBuilder<> B(cast<Instruction>(I));
    auto ShrinkOperand = [&](Value *V) -> Value * {
      if (auto *ZI = dyn_cast<ZExtInst>(V))
        if (ZI->getSrcTy() == TruncatedTy)
          return ZI->getOperand(0);
      return B.CreateZExtOrTrunc(V, TruncatedTy);
    };

    // The actual instruction modification depends on the instruction type,
    // unfortunately.
    Value *NewI = nullptr;
    if (auto *BO = dyn_cast<BinaryOperator>(I)) {
      NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                           ShrinkOperand(BO->getOperand(1)));

      // Any wrapping introduced by shrinking this operation shouldn't be
      // considered undefined behavior. So, we can't unconditionally copy
      // arithmetic wrapping flags to NewI.
      cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
    } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
      NewI =
          B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                       ShrinkOperand(CI->getOperand(1)));
    } else if (auto *SI = dyn_cast<SelectInst>(I)) {
      NewI = B.CreateSelect(SI->getCondition(),
                            ShrinkOperand(SI->getTrueValue()),
                            ShrinkOperand(SI->getFalseValue()));
    } else if (auto *CI = dyn_cast<CastInst>(I)) {
      switch (CI->getOpcode()) {
      default:
        llvm_unreachable("Unhandled cast!")::llvm::llvm_unreachable_internal("Unhandled cast!", "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3255);
      case Instruction::Trunc:
        NewI = ShrinkOperand(CI->getOperand(0));
        break;
      case Instruction::SExt:
        NewI = B.CreateSExtOrTrunc(
            CI->getOperand(0),
            smallestIntegerVectorType(OriginalTy, TruncatedTy));
        break;
      case Instruction::ZExt:
        NewI = B.CreateZExtOrTrunc(
            CI->getOperand(0),
            smallestIntegerVectorType(OriginalTy, TruncatedTy));
        break;
      }
    } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
      auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
      auto *O0 = B.CreateZExtOrTrunc(
          SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
      auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
      auto *O1 = B.CreateZExtOrTrunc(
          SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));

      NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
    } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
      // Don't do anything with the operands, just extend the result.
      continue;
    } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
      auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
      auto *O0 = B.CreateZExtOrTrunc(
          IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
      auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
      NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
    } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
      auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
      auto *O0 = B.CreateZExtOrTrunc(
          EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
      NewI = B.CreateExtractElement(O0, EE->getOperand(2));
    } else {
      // If we don't know what to do, be conservative and don't do anything.
      continue;
    }

    // Lastly, extend the result.
    NewI->takeName(cast<Instruction>(I));
    Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
    I->replaceAllUsesWith(Res);
    cast<Instruction>(I)->eraseFromParent();
    Erased.insert(I);
    VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
  }
}

// We'll have created a bunch of ZExts that are now parentless. Clean up.
for (const auto &KV : Cost->getMinimalBitwidths()) {
  // If the value wasn't vectorized, we must maintain the original scalar
  // type. The absence of the value from VectorLoopValueMap indicates that it
  // wasn't vectorized.
  if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
    continue;
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *I = getOrCreateVectorValue(KV.first, Part);
    ZExtInst *Inst = dyn_cast<ZExtInst>(I);
    if (Inst && Inst->use_empty()) {
      Value *NewI = Inst->getOperand(0);
      Inst->eraseFromParent();
      VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
    }
  }
}
3325}

3327void InnerLoopVectorizer::fixVectorizedLoop() {
// Insert truncates and extends for any truncated instructions as hints to
// InstCombine.
if (VF > 1)
  truncateToMinimalBitwidths();

// At this point every instruction in the original loop is widened to a
// vector form. Now we need to fix the recurrences in the loop. These PHI
// nodes are currently empty because we did not want to introduce cycles.
// This is the second stage of vectorizing recurrences.
fixCrossIterationPHIs();

// Update the dominator tree.
//
// FIXME: After creating the structure of the new loop, the dominator tree is
//        no longer up-to-date, and it remains that way until we update it
//        here. An out-of-date dominator tree is problematic for SCEV,
//        because SCEVExpander uses it to guide code generation. The
//        vectorizer use SCEVExpanders in several places. Instead, we should
//        keep the dominator tree up-to-date as we go.
updateAnalysis();

// Fix-up external users of the induction variables.
for (auto &Entry : *Legal->getInductionVars())
  fixupIVUsers(Entry.first, Entry.second,
               getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
               IVEndValues[Entry.first], LoopMiddleBlock);

fixLCSSAPHIs();
for (Instruction *PI : PredicatedInstructions)
  sinkScalarOperands(&*PI);

// Remove redundant induction instructions.
cse(LoopVectorBody);
3361}

3363void InnerLoopVectorizer::fixCrossIterationPHIs() {
// In order to support recurrences we need to be able to vectorize Phi nodes.
// Phi nodes have cycles, so we need to vectorize them in two stages. This is
// stage #2: We now need to fix the recurrences by adding incoming edges to
// the currently empty PHI nodes. At this point every instruction in the
// original loop is widened to a vector form so we can use them to construct
// the incoming edges.
for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
  // Handle first-order recurrences and reductions that need to be fixed.
  if (Legal->isFirstOrderRecurrence(&Phi))
    fixFirstOrderRecurrence(&Phi);
  else if (Legal->isReductionVariable(&Phi))
    fixReduction(&Phi);
}
3377}

3379void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// This is the second phase of vectorizing first-order recurrences. An
// overview of the transformation is described below. Suppose we have the
// following loop.
//
//   for (int i = 0; i < n; ++i)
//     b[i] = a[i] - a[i - 1];
//
// There is a first-order recurrence on "a". For this loop, the shorthand
// scalar IR looks like:
//
//   scalar.ph:
//     s_init = a[-1]
//     br scalar.body
//
//   scalar.body:
//     i = phi [0, scalar.ph], [i+1, scalar.body]
//     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
//     s2 = a[i]
//     b[i] = s2 - s1
//     br cond, scalar.body, ...
//
// In this example, s1 is a recurrence because it's value depends on the
// previous iteration. In the first phase of vectorization, we created a
// temporary value for s1. We now complete the vectorization and produce the
// shorthand vector IR shown below (for VF = 4, UF = 1).
//
//   vector.ph:
//     v_init = vector(..., ..., ..., a[-1])
//     br vector.body
//
//   vector.body
//     i = phi [0, vector.ph], [i+4, vector.body]
//     v1 = phi [v_init, vector.ph], [v2, vector.body]
//     v2 = a[i, i+1, i+2, i+3];
//     v3 = vector(v1(3), v2(0, 1, 2))
//     b[i, i+1, i+2, i+3] = v2 - v3
//     br cond, vector.body, middle.block
//
//   middle.block:
//     x = v2(3)
//     br scalar.ph
//
//   scalar.ph:
//     s_init = phi [x, middle.block], [a[-1], otherwise]
//     br scalar.body
//
// After execution completes the vector loop, we extract the next value of
// the recurrence (x) to use as the initial value in the scalar loop.

// Get the original loop preheader and single loop latch.
auto *Preheader = OrigLoop->getLoopPreheader();
auto *Latch = OrigLoop->getLoopLatch();

// Get the initial and previous values of the scalar recurrence.
auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
auto *Previous = Phi->getIncomingValueForBlock(Latch);

// Create a vector from the initial value.
auto *VectorInit = ScalarInit;
if (VF > 1) {
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  VectorInit = Builder.CreateInsertElement(
      UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
      Builder.getInt32(VF - 1), "vector.recur.init");
}

// We constructed a temporary phi node in the first phase of vectorization.
// This phi node will eventually be deleted.
Builder.SetInsertPoint(
    cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));

// Create a phi node for the new recurrence. The current value will either be
// the initial value inserted into a vector or loop-varying vector value.
auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);

// Get the vectorized previous value of the last part UF - 1. It appears last
// among all unrolled iterations, due to the order of their construction.
Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);

// Set the insertion point after the previous value if it is an instruction.
// Note that the previous value may have been constant-folded so it is not
// guaranteed to be an instruction in the vector loop. Also, if the previous
// value is a phi node, we should insert after all the phi nodes to avoid
// breaking basic block verification.
if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
    isa<PHINode>(PreviousLastPart))
  Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
else
  Builder.SetInsertPoint(
      &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));

// We will construct a vector for the recurrence by combining the values for
// the current and previous iterations. This is the required shuffle mask.
SmallVector<Constant *, 8> ShuffleMask(VF);
ShuffleMask[0] = Builder.getInt32(VF - 1);
for (unsigned I = 1; I < VF; ++I)
  ShuffleMask[I] = Builder.getInt32(I + VF - 1);

// The vector from which to take the initial value for the current iteration
// (actual or unrolled). Initially, this is the vector phi node.
Value *Incoming = VecPhi;

// Shuffle the current and previous vector and update the vector parts.
for (unsigned Part = 0; Part < UF; ++Part) {
  Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
  Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
  auto *Shuffle =
      VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
                                           ConstantVector::get(ShuffleMask))
             : Incoming;
  PhiPart->replaceAllUsesWith(Shuffle);
  cast<Instruction>(PhiPart)->eraseFromParent();
  VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
  Incoming = PreviousPart;
}

// Fix the latch value of the new recurrence in the vector loop.
VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());

// Extract the last vector element in the middle block. This will be the
// initial value for the recurrence when jumping to the scalar loop.
auto *ExtractForScalar = Incoming;
if (VF > 1) {
  Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
  ExtractForScalar = Builder.CreateExtractElement(
      ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
}
// Extract the second last element in the middle block if the
// Phi is used outside the loop. We need to extract the phi itself
// and not the last element (the phi update in the current iteration). This
// will be the value when jumping to the exit block from the LoopMiddleBlock,
// when the scalar loop is not run at all.
Value *ExtractForPhiUsedOutsideLoop = nullptr;
if (VF > 1)
  ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
      Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
// When loop is unrolled without vectorizing, initialize
// ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
// `Incoming`. This is analogous to the vectorized case above: extracting the
// second last element when VF > 1.
else if (UF > 1)
  ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);

// Fix the initial value of the original recurrence in the scalar loop.
Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
for (auto *BB : predecessors(LoopScalarPreHeader)) {
  auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
  Start->addIncoming(Incoming, BB);
}

Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start);
Phi->setName("scalar.recur");

// Finally, fix users of the recurrence outside the loop. The users will need
// either the last value of the scalar recurrence or the last value of the
// vector recurrence we extracted in the middle block. Since the loop is in
// LCSSA form, we just need to find all the phi nodes for the original scalar
// recurrence in the exit block, and then add an edge for the middle block.
for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
  if (LCSSAPhi.getIncomingValue(0) == Phi) {
    LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
  }
}
3545}

3547void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
Constant *Zero = Builder.getInt32(0);

// Get it's reduction variable descriptor.
assert(Legal->isReductionVariable(Phi) &&(static_cast <bool> (Legal->isReductionVariable(Phi)
 && "Unable to find the reduction variable") ? void (
0) : __assert_fail ("Legal->isReductionVariable(Phi) && \"Unable to find the reduction variable\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3552, __extension__ __PRETTY_FUNCTION__))
       "Unable to find the reduction variable")(static_cast <bool> (Legal->isReductionVariable(Phi)
 && "Unable to find the reduction variable") ? void (
0) : __assert_fail ("Legal->isReductionVariable(Phi) && \"Unable to find the reduction variable\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3552, __extension__ __PRETTY_FUNCTION__));
RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];

RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
  RdxDesc.getMinMaxRecurrenceKind();
setDebugLocFromInst(Builder, ReductionStartValue);

// We need to generate a reduction vector from the incoming scalar.
// To do so, we need to generate the 'identity' vector and override
// one of the elements with the incoming scalar reduction. We need
// to do it in the vector-loop preheader.
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

// This is the vector-clone of the value that leaves the loop.
Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();

// Find the reduction identity variable. Zero for addition, or, xor,
// one for multiplication, -1 for And.
Value *Identity;
Value *VectorStart;
if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
    RK == RecurrenceDescriptor::RK_FloatMinMax) {
  // MinMax reduction have the start value as their identify.
  if (VF == 1) {
    VectorStart = Identity = ReductionStartValue;
  } else {
    VectorStart = Identity =
      Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
  }
} else {
  // Handle other reduction kinds:
  Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
      RK, VecTy->getScalarType());
  if (VF == 1) {
    Identity = Iden;
    // This vector is the Identity vector where the first element is the
    // incoming scalar reduction.
    VectorStart = ReductionStartValue;
  } else {
    Identity = ConstantVector::getSplat(VF, Iden);

    // This vector is the Identity vector where the first element is the
    // incoming scalar reduction.
    VectorStart =
      Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
  }
}

// Fix the vector-loop phi.

// Reductions do not have to start at zero. They can start with
// any loop invariant values.
BasicBlock *Latch = OrigLoop->getLoopLatch();
Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
for (unsigned Part = 0; Part < UF; ++Part) {
  Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
  Value *Val = getOrCreateVectorValue(LoopVal, Part);
  // Make sure to add the reduction stat value only to the
  // first unroll part.
  Value *StartVal = (Part == 0) ? VectorStart : Identity;
  cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
  cast<PHINode>(VecRdxPhi)
    ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
}

// Before each round, move the insertion point right between
// the PHIs and the values we are going to write.
// This allows us to write both PHINodes and the extractelement
// instructions.
Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

setDebugLocFromInst(Builder, LoopExitInst);

// If the vector reduction can be performed in a smaller type, we truncate
// then extend the loop exit value to enable InstCombine to evaluate the
// entire expression in the smaller type.
if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
  Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
  Builder.SetInsertPoint(
      LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
  VectorParts RdxParts(UF);
  for (unsigned Part = 0; Part < UF; ++Part) {
    RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
    Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
                                      : Builder.CreateZExt(Trunc, VecTy);
    for (Value::user_iterator UI = RdxParts[Part]->user_begin();
         UI != RdxParts[Part]->user_end();)
      if (*UI != Trunc) {
        (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
        RdxParts[Part] = Extnd;
      } else {
        ++UI;
      }
  }
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
  for (unsigned Part = 0; Part < UF; ++Part) {
    RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
  }
}

// Reduce all of the unrolled parts into a single vector.
Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
setDebugLocFromInst(Builder, ReducedPartRdx);
for (unsigned Part = 1; Part < UF; ++Part) {
  Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
  if (Op != Instruction::ICmp && Op != Instruction::FCmp)
    // Floating point operations had to be 'fast' to enable the reduction.
    ReducedPartRdx = addFastMathFlag(
        Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
                            ReducedPartRdx, "bin.rdx"));
  else
    ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
        Builder, MinMaxKind, ReducedPartRdx, RdxPart);
}

if (VF > 1) {
  bool NoNaN = Legal->hasFunNoNaNAttr();
  ReducedPartRdx =
      createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
  // If the reduction can be performed in a smaller type, we need to extend
  // the reduction to the wider type before we branch to the original loop.
  if (Phi->getType() != RdxDesc.getRecurrenceType())
    ReducedPartRdx =
      RdxDesc.isSigned()
      ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
      : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
}

// Create a phi node that merges control-flow from the backedge-taken check
// block and the middle block.
PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
                                      LoopScalarPreHeader->getTerminator());
for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
  BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

// Now, we need to fix the users of the reduction variable
// inside and outside of the scalar remainder loop.
// We know that the loop is in LCSSA form. We need to update the
// PHI nodes in the exit blocks.
for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
  // All PHINodes need to have a single entry edge, or two if
  // we already fixed them.
  assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI")(static_cast <bool> (LCSSAPhi.getNumIncomingValues() <
&& "Invalid LCSSA PHI") ? void (0) : __assert_fail
 ("LCSSAPhi.getNumIncomingValues() < 3 && \"Invalid LCSSA PHI\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3701, __extension__ __PRETTY_FUNCTION__));

  // We found a reduction value exit-PHI. Update it with the
  // incoming bypass edge.
  if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
    LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
} // end of the LCSSA phi scan.

  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
int IncomingEdgeBlockIdx =
  Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index")(static_cast <bool> (IncomingEdgeBlockIdx >= 0 &&
 "Invalid block index") ? void (0) : __assert_fail ("IncomingEdgeBlockIdx >= 0 && \"Invalid block index\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3713, __extension__ __PRETTY_FUNCTION__));
// Pick the other block.
int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3718}

3720void InnerLoopVectorizer::fixLCSSAPHIs() {
for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
  if (LCSSAPhi.getNumIncomingValues() == 1) {
    assert(OrigLoop->isLoopInvariant(LCSSAPhi.getIncomingValue(0)) &&(static_cast <bool> (OrigLoop->isLoopInvariant(LCSSAPhi
.getIncomingValue(0)) && "Incoming value isn't loop invariant"
) ? void (0) : __assert_fail ("OrigLoop->isLoopInvariant(LCSSAPhi.getIncomingValue(0)) && \"Incoming value isn't loop invariant\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3724, __extension__ __PRETTY_FUNCTION__))
           "Incoming value isn't loop invariant")(static_cast <bool> (OrigLoop->isLoopInvariant(LCSSAPhi
.getIncomingValue(0)) && "Incoming value isn't loop invariant"
) ? void (0) : __assert_fail ("OrigLoop->isLoopInvariant(LCSSAPhi.getIncomingValue(0)) && \"Incoming value isn't loop invariant\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3724, __extension__ __PRETTY_FUNCTION__));
    LCSSAPhi.addIncoming(LCSSAPhi.getIncomingValue(0), LoopMiddleBlock);
  }
}
3728}

3730void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
// The basic block and loop containing the predicated instruction.
auto *PredBB = PredInst->getParent();
auto *VectorLoop = LI->getLoopFor(PredBB);

// Initialize a worklist with the operands of the predicated instruction.
SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

// Holds instructions that we need to analyze again. An instruction may be
// reanalyzed if we don't yet know if we can sink it or not.
SmallVector<Instruction *, 8> InstsToReanalyze;

// Returns true if a given use occurs in the predicated block. Phi nodes use
// their operands in their corresponding predecessor blocks.
auto isBlockOfUsePredicated = [&](Use &U) -> bool {
  auto *I = cast<Instruction>(U.getUser());
  BasicBlock *BB = I->getParent();
  if (auto *Phi = dyn_cast<PHINode>(I))
    BB = Phi->getIncomingBlock(
        PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
  return BB == PredBB;
};

// Iteratively sink the scalarized operands of the predicated instruction
// into the block we created for it. When an instruction is sunk, it's
// operands are then added to the worklist. The algorithm ends after one pass
// through the worklist doesn't sink a single instruction.
bool Changed;
do {
  // Add the instructions that need to be reanalyzed to the worklist, and
  // reset the changed indicator.
  Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
  InstsToReanalyze.clear();
  Changed = false;

  while (!Worklist.empty()) {
    auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

    // We can't sink an instruction if it is a phi node, is already in the
    // predicated block, is not in the loop, or may have side effects.
    if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
        !VectorLoop->contains(I) || I->mayHaveSideEffects())
      continue;

    // It's legal to sink the instruction if all its uses occur in the
    // predicated block. Otherwise, there's nothing to do yet, and we may
    // need to reanalyze the instruction.
    if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
      InstsToReanalyze.push_back(I);
      continue;
    }

    // Move the instruction to the beginning of the predicated block, and add
    // it's operands to the worklist.
    I->moveBefore(&*PredBB->getFirstInsertionPt());
    Worklist.insert(I->op_begin(), I->op_end());

    // The sinking may have enabled other instructions to be sunk, so we will
    // need to iterate.
    Changed = true;
  }
} while (Changed);
3792}

3794void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
                                            unsigned VF) {
assert(PN->getParent() == OrigLoop->getHeader() &&(static_cast <bool> (PN->getParent() == OrigLoop->
getHeader() && "Non-header phis should have been handled elsewhere"
) ? void (0) : __assert_fail ("PN->getParent() == OrigLoop->getHeader() && \"Non-header phis should have been handled elsewhere\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3797, __extension__ __PRETTY_FUNCTION__))
       "Non-header phis should have been handled elsewhere")(static_cast <bool> (PN->getParent() == OrigLoop->
getHeader() && "Non-header phis should have been handled elsewhere"
) ? void (0) : __assert_fail ("PN->getParent() == OrigLoop->getHeader() && \"Non-header phis should have been handled elsewhere\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3797, __extension__ __PRETTY_FUNCTION__));

PHINode *P = cast<PHINode>(PN);
// In order to support recurrences we need to be able to vectorize Phi nodes.
// Phi nodes have cycles, so we need to vectorize them in two stages. This is
// stage #1: We create a new vector PHI node with no incoming edges. We'll use
// this value when we vectorize all of the instructions that use the PHI.
if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
  for (unsigned Part = 0; Part < UF; ++Part) {
    // This is phase one of vectorizing PHIs.
    Type *VecTy =
        (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
    Value *EntryPart = PHINode::Create(
        VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
    VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
  }
  return;
}

setDebugLocFromInst(Builder, P);

// This PHINode must be an induction variable.
// Make sure that we know about it.
assert(Legal->getInductionVars()->count(P) && "Not an induction variable")(static_cast <bool> (Legal->getInductionVars()->count
(P) && "Not an induction variable") ? void (0) : __assert_fail
 ("Legal->getInductionVars()->count(P) && \"Not an induction variable\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3820, __extension__ __PRETTY_FUNCTION__));

InductionDescriptor II = Legal->getInductionVars()->lookup(P);
const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

// FIXME: The newly created binary instructions should contain nsw/nuw flags,
// which can be found from the original scalar operations.
switch (II.getKind()) {
case InductionDescriptor::IK_NoInduction:
  llvm_unreachable("Unknown induction")::llvm::llvm_unreachable_internal("Unknown induction", "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3829);
case InductionDescriptor::IK_IntInduction:
case InductionDescriptor::IK_FpInduction:
  llvm_unreachable("Integer/fp induction is handled elsewhere.")::llvm::llvm_unreachable_internal("Integer/fp induction is handled elsewhere."
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3832);
case InductionDescriptor::IK_PtrInduction: {
  // Handle the pointer induction variable case.
  assert(P->getType()->isPointerTy() && "Unexpected type.")(static_cast <bool> (P->getType()->isPointerTy() &&
 "Unexpected type.") ? void (0) : __assert_fail ("P->getType()->isPointerTy() && \"Unexpected type.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3835, __extension__ __PRETTY_FUNCTION__));
  // This is the normalized GEP that starts counting at zero.
  Value *PtrInd = Induction;
  PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
  // Determine the number of scalars we need to generate for each unroll
  // iteration. If the instruction is uniform, we only need to generate the
  // first lane. Otherwise, we generate all VF values.
  unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
  // These are the scalar results. Notice that we don't generate vector GEPs
  // because scalar GEPs result in better code.
  for (unsigned Part = 0; Part < UF; ++Part) {
    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
      Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
      Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
      Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
      SclrGep->setName("next.gep");
      VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
    }
  }
  return;
}
}
3857}

3859/// A helper function for checking whether an integer division-related
3860/// instruction may divide by zero (in which case it must be predicated if
3861/// executed conditionally in the scalar code).
3862/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
3863/// Non-zero divisors that are non compile-time constants will not be
3864/// converted into multiplication, so we will still end up scalarizing
3865/// the division, but can do so w/o predication.
3866static bool mayDivideByZero(Instruction &I) {
assert((I.getOpcode() == Instruction::UDiv ||(static_cast <bool> ((I.getOpcode() == Instruction::UDiv
 || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction
::URem || I.getOpcode() == Instruction::SRem) && "Unexpected instruction"
) ? void (0) : __assert_fail ("(I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode() == Instruction::SRem) && \"Unexpected instruction\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3871, __extension__ __PRETTY_FUNCTION__))
        I.getOpcode() == Instruction::SDiv ||(static_cast <bool> ((I.getOpcode() == Instruction::UDiv
 || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction
::URem || I.getOpcode() == Instruction::SRem) && "Unexpected instruction"
) ? void (0) : __assert_fail ("(I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode() == Instruction::SRem) && \"Unexpected instruction\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3871, __extension__ __PRETTY_FUNCTION__))
        I.getOpcode() == Instruction::URem ||(static_cast <bool> ((I.getOpcode() == Instruction::UDiv
 || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction
::URem || I.getOpcode() == Instruction::SRem) && "Unexpected instruction"
) ? void (0) : __assert_fail ("(I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode() == Instruction::SRem) && \"Unexpected instruction\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3871, __extension__ __PRETTY_FUNCTION__))
        I.getOpcode() == Instruction::SRem) &&(static_cast <bool> ((I.getOpcode() == Instruction::UDiv
 || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction
::URem || I.getOpcode() == Instruction::SRem) && "Unexpected instruction"
) ? void (0) : __assert_fail ("(I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode() == Instruction::SRem) && \"Unexpected instruction\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3871, __extension__ __PRETTY_FUNCTION__))
       "Unexpected instruction")(static_cast <bool> ((I.getOpcode() == Instruction::UDiv
 || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction
::URem || I.getOpcode() == Instruction::SRem) && "Unexpected instruction"
) ? void (0) : __assert_fail ("(I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::URem || I.getOpcode() == Instruction::SRem) && \"Unexpected instruction\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3871, __extension__ __PRETTY_FUNCTION__));
Value *Divisor = I.getOperand(1);
auto *CInt = dyn_cast<ConstantInt>(Divisor);
return !CInt || CInt->isZero();
3875}

3877void InnerLoopVectorizer::widenInstruction(Instruction &I) {
switch (I.getOpcode()) {
case Instruction::Br:
case Instruction::PHI:
  llvm_unreachable("This instruction is handled by a different recipe.")::llvm::llvm_unreachable_internal("This instruction is handled by a different recipe."
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3881);
case Instruction::GetElementPtr: {
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.
  auto *GEP = cast<GetElementPtrInst>(&I);

  if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    auto *Clone = Builder.Insert(GEP->clone());
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
      VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
      addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr =
          OrigLoop->isLoopInvariant(GEP->getPointerOperand())
              ? GEP->getPointerOperand()
              : getOrCreateVectorValue(GEP->getPointerOperand(), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
        if (OrigLoop->isLoopInvariant(U.get()))
          Indices.push_back(U.get());
        else
          Indices.push_back(getOrCreateVectorValue(U.get(), Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP = GEP->isInBounds()
                         ? Builder.CreateInBoundsGEP(Ptr, Indices)
                         : Builder.CreateGEP(Ptr, Indices);
      assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&(static_cast <bool> ((VF == 1 || NewGEP->getType()->
isVectorTy()) && "NewGEP is not a pointer vector") ? void
 (0) : __assert_fail ("(VF == 1 || NewGEP->getType()->isVectorTy()) && \"NewGEP is not a pointer vector\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3941, __extension__ __PRETTY_FUNCTION__))
             "NewGEP is not a pointer vector")(static_cast <bool> ((VF == 1 || NewGEP->getType()->
isVectorTy()) && "NewGEP is not a pointer vector") ? void
 (0) : __assert_fail ("(VF == 1 || NewGEP->getType()->isVectorTy()) && \"NewGEP is not a pointer vector\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3941, __extension__ __PRETTY_FUNCTION__));
      VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
      addMetadata(NewGEP, GEP);
    }
  }

  break;
}
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::SRem:
case Instruction::URem:
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::FDiv:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
  // Just widen binops.
  auto *BinOp = cast<BinaryOperator>(&I);
  setDebugLocFromInst(Builder, BinOp);

  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part);
    Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part);
    Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);

    if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
      VecOp->copyIRFlags(BinOp);

    // Use this vector value for all users of the original instruction.
    VectorLoopValueMap.setVectorValue(&I, Part, V);
    addMetadata(V, BinOp);
  }

  break;
}
case Instruction::Select: {
  // Widen selects.
  // If the selector is loop invariant we can create a select
  // instruction with a scalar condition. Otherwise, use vector-select.
  auto *SE = PSE.getSE();
  bool InvariantCond =
      SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
  setDebugLocFromInst(Builder, &I);

  // The condition can be loop invariant  but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.

  auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});

  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
    Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
    Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
    Value *Sel =
        Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
    VectorLoopValueMap.setVectorValue(&I, Part, Sel);
    addMetadata(Sel, &I);
  }

  break;
}

case Instruction::ICmp:
case Instruction::FCmp: {
  // Widen compares. Generate vector compares.
  bool FCmp = (I.getOpcode() == Instruction::FCmp);
  auto *Cmp = dyn_cast<CmpInst>(&I);
  setDebugLocFromInst(Builder, Cmp);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
    Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
    Value *C = nullptr;
    if (FCmp) {
      // Propagate fast math flags.
      IRBuilder<>::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(Cmp->getFastMathFlags());
      C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
    } else {
      C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
    }
    VectorLoopValueMap.setVectorValue(&I, Part, C);
    addMetadata(C, &I);
  }

  break;
}

case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
  auto *CI = dyn_cast<CastInst>(&I);
  setDebugLocFromInst(Builder, CI);

  /// Vectorize casts.
  Type *DestTy =
      (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);

  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
    Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
    VectorLoopValueMap.setVectorValue(&I, Part, Cast);
    addMetadata(Cast, &I);
  }
  break;
}

case Instruction::Call: {
  // Ignore dbg intrinsics.
  if (isa<DbgInfoIntrinsic>(I))
    break;
  setDebugLocFromInst(Builder, &I);

  Module *M = I.getParent()->getParent()->getParent();
  auto *CI = cast<CallInst>(&I);

  StringRef FnName = CI->getCalledFunction()->getName();
  Function *F = CI->getCalledFunction();
  Type *RetTy = ToVectorTy(CI->getType(), VF);
  SmallVector<Type *, 4> Tys;
  for (Value *ArgOperand : CI->arg_operands())
    Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));

  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

  // The flag shows whether we use Intrinsic or a usual Call for vectorized
  // version of the instruction.
  // Is it beneficial to perform intrinsic call compared to lib call?
  bool NeedToScalarize;
  unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
  bool UseVectorIntrinsic =
      ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
  assert((UseVectorIntrinsic || !NeedToScalarize) &&(static_cast <bool> ((UseVectorIntrinsic || !NeedToScalarize
) && "Instruction should be scalarized elsewhere.") ?
 void (0) : __assert_fail ("(UseVectorIntrinsic || !NeedToScalarize) && \"Instruction should be scalarized elsewhere.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4094, __extension__ __PRETTY_FUNCTION__))
         "Instruction should be scalarized elsewhere.")(static_cast <bool> ((UseVectorIntrinsic || !NeedToScalarize
) && "Instruction should be scalarized elsewhere.") ?
 void (0) : __assert_fail ("(UseVectorIntrinsic || !NeedToScalarize) && \"Instruction should be scalarized elsewhere.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4094, __extension__ __PRETTY_FUNCTION__));

  for (unsigned Part = 0; Part < UF; ++Part) {
    SmallVector<Value *, 4> Args;
    for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
      Value *Arg = CI->getArgOperand(i);
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
        Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseVectorIntrinsic) {
      // Use vector version of the intrinsic.
      Type *TysForDecl[] = {CI->getType()};
      if (VF > 1)
        TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
      VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
    } else {
      // Use vector version of the library call.
      StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
      assert(!VFnName.empty() && "Vector function name is empty.")(static_cast <bool> (!VFnName.empty() && "Vector function name is empty."
) ? void (0) : __assert_fail ("!VFnName.empty() && \"Vector function name is empty.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4117, __extension__ __PRETTY_FUNCTION__));
      VectorF = M->getFunction(VFnName);
      if (!VectorF) {
        // Generate a declaration
        FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
        VectorF =
            Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
        VectorF->copyAttributesFrom(F);
      }
    }
    assert(VectorF && "Can't create vector function.")(static_cast <bool> (VectorF && "Can't create vector function."
) ? void (0) : __assert_fail ("VectorF && \"Can't create vector function.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4127, __extension__ __PRETTY_FUNCTION__));

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    VectorLoopValueMap.setVectorValue(&I, Part, V);
    addMetadata(V, &I);
  }

  break;
}

default:
  // This instruction is not vectorized by simple widening.
  LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found an unhandled instruction: "
 << I; } } while (false);
  llvm_unreachable("Unhandled instruction!")::llvm::llvm_unreachable_internal("Unhandled instruction!", "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4146);
} // end of switch.
4148}

4150void InnerLoopVectorizer::updateAnalysis() {
// Forget the original basic block.
PSE.getSE()->forgetLoop(OrigLoop);

// Update the dominator tree information.
assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&(static_cast <bool> (DT->properlyDominates(LoopBypassBlocks
.front(), LoopExitBlock) && "Entry does not dominate exit."
) ? void (0) : __assert_fail ("DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && \"Entry does not dominate exit.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4156, __extension__ __PRETTY_FUNCTION__))
       "Entry does not dominate exit.")(static_cast <bool> (DT->properlyDominates(LoopBypassBlocks
.front(), LoopExitBlock) && "Entry does not dominate exit."
) ? void (0) : __assert_fail ("DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && \"Entry does not dominate exit.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4156, __extension__ __PRETTY_FUNCTION__));

DT->addNewBlock(LoopMiddleBlock,
                LI->getLoopFor(LoopVectorBody)->getLoopLatch());
DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
assert(DT->verify(DominatorTree::VerificationLevel::Fast))(static_cast <bool> (DT->verify(DominatorTree::VerificationLevel
::Fast)) ? void (0) : __assert_fail ("DT->verify(DominatorTree::VerificationLevel::Fast)"
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4163, __extension__ __PRETTY_FUNCTION__));
4164}

4166void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// We should not collect Scalars more than once per VF. Right now, this
// function is called from collectUniformsAndScalars(), which already does
// this check. Collecting Scalars for VF=1 does not make any sense.
assert(VF >= 2 && !Scalars.count(VF) &&(static_cast <bool> (VF >= 2 && !Scalars.count
(VF) && "This function should not be visited twice for the same VF"
) ? void (0) : __assert_fail ("VF >= 2 && !Scalars.count(VF) && \"This function should not be visited twice for the same VF\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4171, __extension__ __PRETTY_FUNCTION__))
       "This function should not be visited twice for the same VF")(static_cast <bool> (VF >= 2 && !Scalars.count
(VF) && "This function should not be visited twice for the same VF"
) ? void (0) : __assert_fail ("VF >= 2 && !Scalars.count(VF) && \"This function should not be visited twice for the same VF\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4171, __extension__ __PRETTY_FUNCTION__));

SmallSetVector<Instruction *, 8> Worklist;

// These sets are used to seed the analysis with pointers used by memory
// accesses that will remain scalar.
SmallSetVector<Instruction *, 8> ScalarPtrs;
SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;

// A helper that returns true if the use of Ptr by MemAccess will be scalar.
// The pointer operands of loads and stores will be scalar as long as the
// memory access is not a gather or scatter operation. The value operand of a
// store will remain scalar if the store is scalarized.
auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
  InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
  assert(WideningDecision != CM_Unknown &&(static_cast <bool> (WideningDecision != CM_Unknown &&
 "Widening decision should be ready at this moment") ? void (
0) : __assert_fail ("WideningDecision != CM_Unknown && \"Widening decision should be ready at this moment\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4187, __extension__ __PRETTY_FUNCTION__))
         "Widening decision should be ready at this moment")(static_cast <bool> (WideningDecision != CM_Unknown &&
 "Widening decision should be ready at this moment") ? void (
0) : __assert_fail ("WideningDecision != CM_Unknown && \"Widening decision should be ready at this moment\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4187, __extension__ __PRETTY_FUNCTION__));
  if (auto *Store = dyn_cast<StoreInst>(MemAccess))
    if (Ptr == Store->getValueOperand())
      return WideningDecision == CM_Scalarize;
  assert(Ptr == getLoadStorePointerOperand(MemAccess) &&(static_cast <bool> (Ptr == getLoadStorePointerOperand(
MemAccess) && "Ptr is neither a value or pointer operand"
) ? void (0) : __assert_fail ("Ptr == getLoadStorePointerOperand(MemAccess) && \"Ptr is neither a value or pointer operand\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4192, __extension__ __PRETTY_FUNCTION__))
         "Ptr is neither a value or pointer operand")(static_cast <bool> (Ptr == getLoadStorePointerOperand(
MemAccess) && "Ptr is neither a value or pointer operand"
) ? void (0) : __assert_fail ("Ptr == getLoadStorePointerOperand(MemAccess) && \"Ptr is neither a value or pointer operand\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4192, __extension__ __PRETTY_FUNCTION__));
  return WideningDecision != CM_GatherScatter;
};

// A helper that returns true if the given value is a bitcast or
// getelementptr instruction contained in the loop.
auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
  return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
          isa<GetElementPtrInst>(V)) &&
         !TheLoop->isLoopInvariant(V);
};

// A helper that evaluates a memory access's use of a pointer. If the use
// will be a scalar use, and the pointer is only used by memory accesses, we
// place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
// PossibleNonScalarPtrs.
auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
  // We only care about bitcast and getelementptr instructions contained in
  // the loop.
  if (!isLoopVaryingBitCastOrGEP(Ptr))
    return;

  // If the pointer has already been identified as scalar (e.g., if it was
  // also identified as uniform), there's nothing to do.
  auto *I = cast<Instruction>(Ptr);
  if (Worklist.count(I))
    return;

  // If the use of the pointer will be a scalar use, and all users of the
  // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
  // place the pointer in PossibleNonScalarPtrs.
  if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
        return isa<LoadInst>(U) || isa<StoreInst>(U);
      }))
    ScalarPtrs.insert(I);
  else
    PossibleNonScalarPtrs.insert(I);
};

// We seed the scalars analysis with three classes of instructions: (1)
// instructions marked uniform-after-vectorization, (2) bitcast and
// getelementptr instructions used by memory accesses requiring a scalar use,
// and (3) pointer induction variables and their update instructions (we
// currently only scalarize these).
//
// (1) Add to the worklist all instructions that have been identified as
// uniform-after-vectorization.
Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

// (2) Add to the worklist all bitcast and getelementptr instructions used by
// memory accesses requiring a scalar use. The pointer operands of loads and
// stores will be scalar as long as the memory accesses is not a gather or
// scatter operation. The value operand of a store will remain scalar if the
// store is scalarized.
for (auto *BB : TheLoop->blocks())
  for (auto &I : *BB) {
    if (auto *Load = dyn_cast<LoadInst>(&I)) {
      evaluatePtrUse(Load, Load->getPointerOperand());
    } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
      evaluatePtrUse(Store, Store->getPointerOperand());
      evaluatePtrUse(Store, Store->getValueOperand());
    }
  }
for (auto *I : ScalarPtrs)
  if (!PossibleNonScalarPtrs.count(I)) {
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
 << *I << "\n"; } } while (false);
    Worklist.insert(I);
  }

// (3) Add to the worklist all pointer induction variables and their update
// instructions.
//
// TODO: Once we are able to vectorize pointer induction variables we should
//       no longer insert them into the worklist here.
auto *Latch = TheLoop->getLoopLatch();
for (auto &Induction : *Legal->getInductionVars()) {
  auto *Ind = Induction.first;
  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
  if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
    continue;
  Worklist.insert(Ind);
  Worklist.insert(IndUpdate);
  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
 << *Ind << "\n"; } } while (false);
  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdatedo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
 << *IndUpdate << "\n"; } } while (false)
                    << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
 << *IndUpdate << "\n"; } } while (false);
}

// Insert the forced scalars.
// FIXME: Currently widenPHIInstruction() often creates a dead vector
// induction variable when the PHI user is scalarized.
if (ForcedScalars.count(VF))
  for (auto *I : ForcedScalars.find(VF)->second)
    Worklist.insert(I);

// Expand the worklist by looking through any bitcasts and getelementptr
// instructions we've already identified as scalar. This is similar to the
// expansion step in collectLoopUniforms(); however, here we're only
// expanding to include additional bitcasts and getelementptr instructions.
unsigned Idx = 0;
while (Idx != Worklist.size()) {
  Instruction *Dst = Worklist[Idx++];
  if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
    continue;
  auto *Src = cast<Instruction>(Dst->getOperand(0));
  if (llvm::all_of(Src->users(), [&](User *U) -> bool {
        auto *J = cast<Instruction>(U);
        return !TheLoop->contains(J) || Worklist.count(J) ||
               ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
                isScalarUse(J, Src));
      })) {
    Worklist.insert(Src);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
 << *Src << "\n"; } } while (false);
  }
}

// An induction variable will remain scalar if all users of the induction
// variable and induction variable update remain scalar.
for (auto &Induction : *Legal->getInductionVars()) {
  auto *Ind = Induction.first;
  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

  // We already considered pointer induction variables, so there's no reason
  // to look at their users again.
  //
  // TODO: Once we are able to vectorize pointer induction variables we
  //       should no longer skip over them here.
  if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
    continue;

  // Determine if all users of the induction variable are scalar after
  // vectorization.
  auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
  });
  if (!ScalarInd)
    continue;

  // Determine if all users of the induction variable update instruction are
  // scalar after vectorization.
  auto ScalarIndUpdate =
      llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
        auto *I = cast<Instruction>(U);
        return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
      });
  if (!ScalarIndUpdate)
    continue;

  // The induction variable and its update instruction will remain scalar.
  Worklist.insert(Ind);
  Worklist.insert(IndUpdate);
  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
 << *Ind << "\n"; } } while (false);
  LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdatedo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
 << *IndUpdate << "\n"; } } while (false)
                    << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found scalar instruction: "
 << *IndUpdate << "\n"; } } while (false);
}

Scalars[VF].insert(Worklist.begin(), Worklist.end());
4349}

4351bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) {
if (!Legal->blockNeedsPredication(I->getParent()))
  return false;
switch(I->getOpcode()) {
default:
  break;
case Instruction::Load:
case Instruction::Store: {
  if (!Legal->isMaskRequired(I))
    return false;
  auto *Ptr = getLoadStorePointerOperand(I);
  auto *Ty = getMemInstValueType(I);
  return isa<LoadInst>(I) ?
      !(isLegalMaskedLoad(Ty, Ptr)  || isLegalMaskedGather(Ty))
    : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
}
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::SRem:
case Instruction::URem:
  return mayDivideByZero(*I);
}
return false;
4374}

4376bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
                                                             unsigned VF) {
// Get and ensure we have a valid memory instruction.
LoadInst *LI = dyn_cast<LoadInst>(I);
StoreInst *SI = dyn_cast<StoreInst>(I);
assert((LI || SI) && "Invalid memory instruction")(static_cast <bool> ((LI || SI) && "Invalid memory instruction"
) ? void (0) : __assert_fail ("(LI || SI) && \"Invalid memory instruction\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4381, __extension__ __PRETTY_FUNCTION__));

auto *Ptr = getLoadStorePointerOperand(I);

// In order to be widened, the pointer should be consecutive, first of all.
if (!Legal->isConsecutivePtr(Ptr))
  return false;

// If the instruction is a store located in a predicated block, it will be
// scalarized.
if (isScalarWithPredication(I))
  return false;

// If the instruction's allocated size doesn't equal it's type size, it
// requires padding and will be scalarized.
auto &DL = I->getModule()->getDataLayout();
auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
if (hasIrregularType(ScalarTy, DL, VF))
  return false;

return true;
4402}

4404void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// We should not collect Uniforms more than once per VF. Right now,
// this function is called from collectUniformsAndScalars(), which
// already does this check. Collecting Uniforms for VF=1 does not make any
// sense.

assert(VF >= 2 && !Uniforms.count(VF) &&(static_cast <bool> (VF >= 2 && !Uniforms.count
(VF) && "This function should not be visited twice for the same VF"
) ? void (0) : __assert_fail ("VF >= 2 && !Uniforms.count(VF) && \"This function should not be visited twice for the same VF\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4411, __extension__ __PRETTY_FUNCTION__))
       "This function should not be visited twice for the same VF")(static_cast <bool> (VF >= 2 && !Uniforms.count
(VF) && "This function should not be visited twice for the same VF"
) ? void (0) : __assert_fail ("VF >= 2 && !Uniforms.count(VF) && \"This function should not be visited twice for the same VF\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4411, __extension__ __PRETTY_FUNCTION__));

// Visit the list of Uniforms. If we'll not find any uniform value, we'll
// not analyze again.  Uniforms.count(VF) will return 1.
Uniforms[VF].clear();

// We now know that the loop is vectorizable!
// Collect instructions inside the loop that will remain uniform after
// vectorization.

// Global values, params and instructions outside of current loop are out of
// scope.
auto isOutOfScope = [&](Value *V) -> bool {
  Instruction *I = dyn_cast<Instruction>(V);
  return (!I || !TheLoop->contains(I));
};

SetVector<Instruction *> Worklist;
BasicBlock *Latch = TheLoop->getLoopLatch();

// Start with the conditional branch. If the branch condition is an
// instruction contained in the loop that is only used by the branch, it is
// uniform.
auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
  Worklist.insert(Cmp);
  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found uniform instruction: "
 << *Cmp << "\n"; } } while (false);
}

// Holds consecutive and consecutive-like pointers. Consecutive-like pointers
// are pointers that are treated like consecutive pointers during
// vectorization. The pointer operands of interleaved accesses are an
// example.
SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;

// Holds pointer operands of instructions that are possibly non-uniform.
SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;

auto isUniformDecision = [&](Instruction *I, unsigned VF) {
  InstWidening WideningDecision = getWideningDecision(I, VF);
  assert(WideningDecision != CM_Unknown &&(static_cast <bool> (WideningDecision != CM_Unknown &&
 "Widening decision should be ready at this moment") ? void (
0) : __assert_fail ("WideningDecision != CM_Unknown && \"Widening decision should be ready at this moment\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4452, __extension__ __PRETTY_FUNCTION__))
         "Widening decision should be ready at this moment")(static_cast <bool> (WideningDecision != CM_Unknown &&
 "Widening decision should be ready at this moment") ? void (
0) : __assert_fail ("WideningDecision != CM_Unknown && \"Widening decision should be ready at this moment\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4452, __extension__ __PRETTY_FUNCTION__));

  return (WideningDecision == CM_Widen ||
          WideningDecision == CM_Widen_Reverse ||
          WideningDecision == CM_Interleave);
};
// Iterate over the instructions in the loop, and collect all
// consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
// that a consecutive-like pointer operand will be scalarized, we collect it
// in PossibleNonUniformPtrs instead. We use two sets here because a single
// getelementptr instruction can be used by both vectorized and scalarized
// memory instructions. For example, if a loop loads and stores from the same
// location, but the store is conditional, the store will be scalarized, and
// the getelementptr won't remain uniform.
for (auto *BB : TheLoop->blocks())
  for (auto &I : *BB) {
    // If there's no pointer operand, there's nothing to do.
    auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
    if (!Ptr)
      continue;

    // True if all users of Ptr are memory accesses that have Ptr as their
    // pointer operand.
    auto UsersAreMemAccesses =
        llvm::all_of(Ptr->users(), [&](User *U) -> bool {
          return getLoadStorePointerOperand(U) == Ptr;
        });

    // Ensure the memory instruction will not be scalarized or used by
    // gather/scatter, making its pointer operand non-uniform. If the pointer
    // operand is used by any instruction other than a memory access, we
    // conservatively assume the pointer operand may be non-uniform.
    if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
      PossibleNonUniformPtrs.insert(Ptr);

    // If the memory instruction will be vectorized and its pointer operand
    // is consecutive-like, or interleaving - the pointer operand should
    // remain uniform.
    else
      ConsecutiveLikePtrs.insert(Ptr);
  }

// Add to the Worklist all consecutive and consecutive-like pointers that
// aren't also identified as possibly non-uniform.
for (auto *V : ConsecutiveLikePtrs)
  if (!PossibleNonUniformPtrs.count(V)) {
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found uniform instruction: "
 << *V << "\n"; } } while (false);
    Worklist.insert(V);
  }

// Expand Worklist in topological order: whenever a new instruction
// is added , its users should be either already inside Worklist, or
// out of scope. It ensures a uniform instruction will only be used
// by uniform instructions or out of scope instructions.
unsigned idx = 0;
while (idx != Worklist.size()) {
  Instruction *I = Worklist[idx++];

  for (auto OV : I->operand_values()) {
    if (isOutOfScope(OV))
      continue;
    auto *OI = cast<Instruction>(OV);
    if (llvm::all_of(OI->users(), [&](User *U) -> bool {
          auto *J = cast<Instruction>(U);
          return !TheLoop->contains(J) || Worklist.count(J) ||
                 (OI == getLoadStorePointerOperand(J) &&
                  isUniformDecision(J, VF));
        })) {
      Worklist.insert(OI);
      LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found uniform instruction: "
 << *OI << "\n"; } } while (false);
    }
  }
}

// Returns true if Ptr is the pointer operand of a memory access instruction
// I, and I is known to not require scalarization.
auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
  return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
};

// For an instruction to be added into Worklist above, all its users inside
// the loop should also be in Worklist. However, this condition cannot be
// true for phi nodes that form a cyclic dependence. We must process phi
// nodes separately. An induction variable will remain uniform if all users
// of the induction variable and induction variable update remain uniform.
// The code below handles both pointer and non-pointer induction variables.
for (auto &Induction : *Legal->getInductionVars()) {
  auto *Ind = Induction.first;
  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

  // Determine if all users of the induction variable are uniform after
  // vectorization.
  auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
           isVectorizedMemAccessUse(I, Ind);
  });
  if (!UniformInd)
    continue;

  // Determine if all users of the induction variable update instruction are
  // uniform after vectorization.
  auto UniformIndUpdate =
      llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
        auto *I = cast<Instruction>(U);
        return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
               isVectorizedMemAccessUse(I, IndUpdate);
      });
  if (!UniformIndUpdate)
    continue;

  // The induction variable and its update instruction will remain uniform.
  Worklist.insert(Ind);
  Worklist.insert(IndUpdate);
  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found uniform instruction: "
 << *Ind << "\n"; } } while (false);
  LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdatedo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found uniform instruction: "
 << *IndUpdate << "\n"; } } while (false)
                    << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found uniform instruction: "
 << *IndUpdate << "\n"; } } while (false);
}

Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4572}

4574void InterleavedAccessInfo::collectConstStrideAccesses(
  MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
  const ValueToValueMap &Strides) {
auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();

// Since it's desired that the load/store instructions be maintained in
// "program order" for the interleaved access analysis, we have to visit the
// blocks in the loop in reverse postorder (i.e., in a topological order).
// Such an ordering will ensure that any load/store that may be executed
// before a second load/store will precede the second load/store in
// AccessStrideInfo.
LoopBlocksDFS DFS(TheLoop);
DFS.perform(LI);
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
  for (auto &I : *BB) {
    auto *LI = dyn_cast<LoadInst>(&I);
    auto *SI = dyn_cast<StoreInst>(&I);
    if (!LI && !SI)
      continue;

    Value *Ptr = getLoadStorePointerOperand(&I);
    // We don't check wrapping here because we don't know yet if Ptr will be
    // part of a full group or a group with gaps. Checking wrapping for all
    // pointers (even those that end up in groups with no gaps) will be overly
    // conservative. For full groups, wrapping should be ok since if we would
    // wrap around the address space we would do a memory access at nullptr
    // even without the transformation. The wrapping checks are therefore
    // deferred until after we've formed the interleaved groups.
    int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides,
                                  /*Assume=*/true, /*ShouldCheckWrap=*/false);

    const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
    PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
    uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());

    // An alignment of 0 means target ABI alignment.
    unsigned Align = getMemInstAlignment(&I);
    if (!Align)
      Align = DL.getABITypeAlignment(PtrTy->getElementType());

    AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, Align);
  }
4616}

4618// Analyze interleaved accesses and collect them into interleaved load and
4619// store groups.
4620//
4621// When generating code for an interleaved load group, we effectively hoist all
4622// loads in the group to the location of the first load in program order. When
4623// generating code for an interleaved store group, we sink all stores to the
4624// location of the last store. This code motion can change the order of load
4625// and store instructions and may break dependences.
4626//
4627// The code generation strategy mentioned above ensures that we won't violate
4628// any write-after-read (WAR) dependences.
4629//
4630// E.g., for the WAR dependence:  a = A[i];      // (1)
4631//                                A[i] = b;      // (2)
4632//
4633// The store group of (2) is always inserted at or below (2), and the load
4634// group of (1) is always inserted at or above (1). Thus, the instructions will
4635// never be reordered. All other dependences are checked to ensure the
4636// correctness of the instruction reordering.
4637//
4638// The algorithm visits all memory accesses in the loop in bottom-up program
4639// order. Program order is established by traversing the blocks in the loop in
4640// reverse postorder when collecting the accesses.
4641//
4642// We visit the memory accesses in bottom-up order because it can simplify the
4643// construction of store groups in the presence of write-after-write (WAW)
4644// dependences.
4645//
4646// E.g., for the WAW dependence:  A[i] = a;      // (1)
4647//                                A[i] = b;      // (2)
4648//                                A[i + 1] = c;  // (3)
4649//
4650// We will first create a store group with (3) and (2). (1) can't be added to
4651// this group because it and (2) are dependent. However, (1) can be grouped
4652// with other accesses that may precede it in program order. Note that a
4653// bottom-up order does not imply that WAW dependences should not be checked.
4654void InterleavedAccessInfo::analyzeInterleaving() {
LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Analyzing interleaved accesses...\n"
; } } while (false);
const ValueToValueMap &Strides = LAI->getSymbolicStrides();

// Holds all accesses with a constant stride.
MapVector<Instruction *, StrideDescriptor> AccessStrideInfo;
collectConstStrideAccesses(AccessStrideInfo, Strides);

if (AccessStrideInfo.empty())
  return;

// Collect the dependences in the loop.
collectDependences();

// Holds all interleaved store groups temporarily.
SmallSetVector<InterleaveGroup *, 4> StoreGroups;
// Holds all interleaved load groups temporarily.
SmallSetVector<InterleaveGroup *, 4> LoadGroups;

// Search in bottom-up program order for pairs of accesses (A and B) that can
// form interleaved load or store groups. In the algorithm below, access A
// precedes access B in program order. We initialize a group for B in the
// outer loop of the algorithm, and then in the inner loop, we attempt to
// insert each A into B's group if:
//
//  1. A and B have the same stride,
//  2. A and B have the same memory object size, and
//  3. A belongs in B's group according to its distance from B.
//
// Special care is taken to ensure group formation will not break any
// dependences.
for (auto BI = AccessStrideInfo.rbegin(), E = AccessStrideInfo.rend();
     BI != E; ++BI) {
  Instruction *B = BI->first;
  StrideDescriptor DesB = BI->second;

  // Initialize a group for B if it has an allowable stride. Even if we don't
  // create a group for B, we continue with the bottom-up algorithm to ensure
  // we don't break any of B's dependences.
  InterleaveGroup *Group = nullptr;
  if (isStrided(DesB.Stride)) {
    Group = getInterleaveGroup(B);
    if (!Group) {
      LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *Bdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Creating an interleave group with:"
 << *B << '\n'; } } while (false)
                        << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Creating an interleave group with:"
 << *B << '\n'; } } while (false);
      Group = createInterleaveGroup(B, DesB.Stride, DesB.Align);
    }
    if (B->mayWriteToMemory())
      StoreGroups.insert(Group);
    else
      LoadGroups.insert(Group);
  }

  for (auto AI = std::next(BI); AI != E; ++AI) {
    Instruction *A = AI->first;
    StrideDescriptor DesA = AI->second;

    // Our code motion strategy implies that we can't have dependences
    // between accesses in an interleaved group and other accesses located
    // between the first and last member of the group. Note that this also
    // means that a group can't have more than one member at a given offset.
    // The accesses in a group can have dependences with other accesses, but
    // we must ensure we don't extend the boundaries of the group such that
    // we encompass those dependent accesses.
    //
    // For example, assume we have the sequence of accesses shown below in a
    // stride-2 loop:
    //
    //  (1, 2) is a group | A[i]   = a;  // (1)
    //                    | A[i-1] = b;  // (2) |
    //                      A[i-3] = c;  // (3)
    //                      A[i]   = d;  // (4) | (2, 4) is not a group
    //
    // Because accesses (2) and (3) are dependent, we can group (2) with (1)
    // but not with (4). If we did, the dependent access (3) would be within
    // the boundaries of the (2, 4) group.
    if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI)) {
      // If a dependence exists and A is already in a group, we know that A
      // must be a store since A precedes B and WAR dependences are allowed.
      // Thus, A would be sunk below B. We release A's group to prevent this
      // illegal code motion. A will then be free to form another group with
      // instructions that precede it.
      if (isInterleaved(A)) {
        InterleaveGroup *StoreGroup = getInterleaveGroup(A);
        StoreGroups.remove(StoreGroup);
        releaseGroup(StoreGroup);
      }

      // If a dependence exists and A is not already in a group (or it was
      // and we just released it), B might be hoisted above A (if B is a
      // load) or another store might be sunk below A (if B is a store). In
      // either case, we can't add additional instructions to B's group. B
      // will only form a group with instructions that it precedes.
      break;
    }

    // At this point, we've checked for illegal code motion. If either A or B
    // isn't strided, there's nothing left to do.
    if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride))
      continue;

    // Ignore A if it's already in a group or isn't the same kind of memory
    // operation as B.
    // Note that mayReadFromMemory() isn't mutually exclusive to mayWriteToMemory
    // in the case of atomic loads. We shouldn't see those here, canVectorizeMemory()
    // should have returned false - except for the case we asked for optimization
    // remarks.
    if (isInterleaved(A) || (A->mayReadFromMemory() != B->mayReadFromMemory())
        || (A->mayWriteToMemory() != B->mayWriteToMemory()))
      continue;

    // Check rules 1 and 2. Ignore A if its stride or size is different from
    // that of B.
    if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)
      continue;

    // Ignore A if the memory object of A and B don't belong to the same
    // address space
    if (getMemInstAddressSpace(A) != getMemInstAddressSpace(B))
      continue;

    // Calculate the distance from A to B.
    const SCEVConstant *DistToB = dyn_cast<SCEVConstant>(
        PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
    if (!DistToB)
      continue;
    int64_t DistanceToB = DistToB->getAPInt().getSExtValue();

    // Check rule 3. Ignore A if its distance to B is not a multiple of the
    // size.
    if (DistanceToB % static_cast<int64_t>(DesB.Size))
      continue;

    // Ignore A if either A or B is in a predicated block. Although we
    // currently prevent group formation for predicated accesses, we may be
    // able to relax this limitation in the future once we handle more
    // complicated blocks.
    if (isPredicated(A->getParent()) || isPredicated(B->getParent()))
      continue;

    // The index of A is the index of B plus A's distance to B in multiples
    // of the size.
    int IndexA =
        Group->getIndex(B) + DistanceToB / static_cast<int64_t>(DesB.Size);

    // Try to insert A into B's group.
    if (Group->insertMember(A, IndexA, DesA.Align)) {
      LLVM_DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Inserted:" <<
 *A << '\n' << "    into the interleave group with"
 << *B << '\n'; } } while (false)
                        << "    into the interleave group with" << *Bdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Inserted:" <<
 *A << '\n' << "    into the interleave group with"
 << *B << '\n'; } } while (false)
                        << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Inserted:" <<
 *A << '\n' << "    into the interleave group with"
 << *B << '\n'; } } while (false);
      InterleaveGroupMap[A] = Group;

      // Set the first load in program order as the insert position.
      if (A->mayReadFromMemory())
        Group->setInsertPos(A);
    }
  } // Iteration over A accesses.
} // Iteration over B accesses.

// Remove interleaved store groups with gaps.
for (InterleaveGroup *Group : StoreGroups)
  if (Group->getNumMembers() != Group->getFactor()) {
    LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Invalidate candidate interleaved store group due "
 "to gaps.\n"; } } while (false)
        dbgs() << "LV: Invalidate candidate interleaved store group due "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Invalidate candidate interleaved store group due "
 "to gaps.\n"; } } while (false)
                  "to gaps.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Invalidate candidate interleaved store group due "
 "to gaps.\n"; } } while (false);
    releaseGroup(Group);
  }
// Remove interleaved groups with gaps (currently only loads) whose memory
// accesses may wrap around. We have to revisit the getPtrStride analysis,
// this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
// not check wrapping (see documentation there).
// FORNOW we use Assume=false;
// TODO: Change to Assume=true but making sure we don't exceed the threshold
// of runtime SCEV assumptions checks (thereby potentially failing to
// vectorize altogether).
// Additional optional optimizations:
// TODO: If we are peeling the loop and we know that the first pointer doesn't
// wrap then we can deduce that all pointers in the group don't wrap.
// This means that we can forcefully peel the loop in order to only have to
// check the first pointer for no-wrap. When we'll change to use Assume=true
// we'll only need at most one runtime check per interleaved group.
for (InterleaveGroup *Group : LoadGroups) {
  // Case 1: A full group. Can Skip the checks; For full groups, if the wide
  // load would wrap around the address space we would do a memory access at
  // nullptr even without the transformation.
  if (Group->getNumMembers() == Group->getFactor())
    continue;

  // Case 2: If first and last members of the group don't wrap this implies
  // that all the pointers in the group don't wrap.
  // So we check only group member 0 (which is always guaranteed to exist),
  // and group member Factor - 1; If the latter doesn't exist we rely on
  // peeling (if it is a non-reveresed accsess -- see Case 3).
  Value *FirstMemberPtr = getLoadStorePointerOperand(Group->getMember(0));
  if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
                    /*ShouldCheckWrap=*/true)) {
    LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Invalidate candidate interleaved group due to "
 "first group member potentially pointer-wrapping.\n"; } } while
 (false)
        dbgs() << "LV: Invalidate candidate interleaved group due to "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Invalidate candidate interleaved group due to "
 "first group member potentially pointer-wrapping.\n"; } } while
 (false)
                  "first group member potentially pointer-wrapping.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Invalidate candidate interleaved group due to "
 "first group member potentially pointer-wrapping.\n"; } } while
 (false);
    releaseGroup(Group);
    continue;
  }
  Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
  if (LastMember) {
    Value *LastMemberPtr = getLoadStorePointerOperand(LastMember);
    if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
                      /*ShouldCheckWrap=*/true)) {
      LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Invalidate candidate interleaved group due to "
 "last group member potentially pointer-wrapping.\n"; } } while
 (false)
          dbgs() << "LV: Invalidate candidate interleaved group due to "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Invalidate candidate interleaved group due to "
 "last group member potentially pointer-wrapping.\n"; } } while
 (false)
                    "last group member potentially pointer-wrapping.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Invalidate candidate interleaved group due to "
 "last group member potentially pointer-wrapping.\n"; } } while
 (false);
      releaseGroup(Group);
    }
  } else {
    // Case 3: A non-reversed interleaved load group with gaps: We need
    // to execute at least one scalar epilogue iteration. This will ensure
    // we don't speculatively access memory out-of-bounds. We only need
    // to look for a member at index factor - 1, since every group must have
    // a member at index zero.
    if (Group->isReverse()) {
      LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Invalidate candidate interleaved group due to "
 "a reverse access with gaps.\n"; } } while (false)
          dbgs() << "LV: Invalidate candidate interleaved group due to "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Invalidate candidate interleaved group due to "
 "a reverse access with gaps.\n"; } } while (false)
                    "a reverse access with gaps.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Invalidate candidate interleaved group due to "
 "a reverse access with gaps.\n"; } } while (false);
      releaseGroup(Group);
      continue;
    }
    LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleaved group requires epilogue iteration.\n"
; } } while (false)
        dbgs() << "LV: Interleaved group requires epilogue iteration.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleaved group requires epilogue iteration.\n"
; } } while (false);
    RequiresScalarEpilogue = true;
  }
}
4884}

4886Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
  // TODO: It may by useful to do since it's still likely to be dynamically
  // uniform if the target can skip.
  LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not inserting runtime ptr check for divergent target"
; } } while (false)
      dbgs() << "LV: Not inserting runtime ptr check for divergent target")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not inserting runtime ptr check for divergent target"
; } } while (false);

  ORE->emit(
    createMissedAnalysis("CantVersionLoopWithDivergentTarget")
    << "runtime pointer checks needed. Not enabled for divergent target");

  return None;
}

unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
  return computeFeasibleMaxVF(OptForSize, TC);

if (Legal->getRuntimePointerChecking()->Need) {
  ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
            << "runtime pointer checks needed. Enable vectorization of this "
               "loop with '#pragma clang loop vectorize(enable)' when "
               "compiling with -Os/-Oz");
  LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"
; } } while (false)
      dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"
; } } while (false)
      << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"
; } } while (false);
  return None;
}

// If we optimize the program for size, avoid creating the tail loop.
LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found trip count: "
 << TC << '\n'; } } while (false);

// If we don't know the precise trip count, don't try to vectorize.
if (TC < 2) {
  ORE->emit(
      createMissedAnalysis("UnknownLoopCountComplexCFG")
      << "unable to calculate the loop count due to complex control flow");
  LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"
; } } while (false)
      dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"
; } } while (false);
  return None;
}

unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);

if (TC % MaxVF != 0) {
  // If the trip count that we found modulo the vectorization factor is not
  // zero then we require a tail.
  // FIXME: look for a smaller MaxVF that does divide TC rather than give up.
  // FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a
  //        smaller MaxVF that does not require a scalar epilog.

  ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
            << "cannot optimize for size and vectorize at the "
               "same time. Enable vectorization of this loop "
               "with '#pragma clang loop vectorize(enable)' "
               "when compiling with -Os/-Oz");
  LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"
; } } while (false)
      dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"
; } } while (false);
  return None;
}

return MaxVF;
4948}

4950unsigned
4951LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
                                               unsigned ConstTripCount) {
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
unsigned WidestRegister = TTI.getRegisterBitWidth(true);

// Get the maximum safe dependence distance in bits computed by LAA.
// It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
// the memory accesses that is most restrictive (involved in the smallest
// dependence distance).
unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();

WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);

unsigned MaxVectorSize = WidestRegister / WidestType;

LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestTypedo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The Smallest and Widest types: "
 << SmallestType << " / " << WidestType <<
 " bits.\n"; } } while (false)
                  << " / " << WidestType << " bits.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The Smallest and Widest types: "
 << SmallestType << " / " << WidestType <<
 " bits.\n"; } } while (false);
LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The Widest register safe to use is: "
 << WidestRegister << " bits.\n"; } } while (false
)
                  << WidestRegister << " bits.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The Widest register safe to use is: "
 << WidestRegister << " bits.\n"; } } while (false
);

assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"(static_cast <bool> (MaxVectorSize <= 256 &&
 "Did not expect to pack so many elements" " into one vector!"
) ? void (0) : __assert_fail ("MaxVectorSize <= 256 && \"Did not expect to pack so many elements\" \" into one vector!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4974, __extension__ __PRETTY_FUNCTION__))
                               " into one vector!")(static_cast <bool> (MaxVectorSize <= 256 &&
 "Did not expect to pack so many elements" " into one vector!"
) ? void (0) : __assert_fail ("MaxVectorSize <= 256 && \"Did not expect to pack so many elements\" \" into one vector!\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 4974, __extension__ __PRETTY_FUNCTION__));
if (MaxVectorSize == 0) {
  LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The target has no vector registers.\n"
; } } while (false);
  MaxVectorSize = 1;
  return MaxVectorSize;
} else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
           isPowerOf2_32(ConstTripCount)) {
  // We need to clamp the VF to be the ConstTripCount. There is no point in
  // choosing a higher viable VF as done in the loop below.
  LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
 << ConstTripCount << "\n"; } } while (false)
                    << ConstTripCount << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
 << ConstTripCount << "\n"; } } while (false);
  MaxVectorSize = ConstTripCount;
  return MaxVectorSize;
}

unsigned MaxVF = MaxVectorSize;
if (TTI.shouldMaximizeVectorBandwidth(OptForSize) ||
    (MaximizeBandwidth && !OptForSize)) {
  // Collect all viable vectorization factors larger than the default MaxVF
  // (i.e. MaxVectorSize).
  SmallVector<unsigned, 8> VFs;
  unsigned NewMaxVectorSize = WidestRegister / SmallestType;
  for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
    VFs.push_back(VS);

  // For each VF calculate its register usage.
  auto RUs = calculateRegisterUsage(VFs);

  // Select the largest VF which doesn't require more registers than existing
  // ones.
  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
  for (int i = RUs.size() - 1; i >= 0; --i) {
    if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
      MaxVF = VFs[i];
      break;
    }
  }
  if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
    if (MaxVF < MinVF) {
      LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVFdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Overriding calculated MaxVF("
 << MaxVF << ") with target's minimum: " <<
 MinVF << '\n'; } } while (false)
                        << ") with target's minimum: " << MinVF << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Overriding calculated MaxVF("
 << MaxVF << ") with target's minimum: " <<
 MinVF << '\n'; } } while (false);
      MaxVF = MinVF;
    }
  }
}
return MaxVF;
5020}

5022VectorizationFactor
5023LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
float Cost = expectedCost(1).first;
const float ScalarCost = Cost;
unsigned Width = 1;
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Scalar loop costs: "
 << (int)ScalarCost << ".\n"; } } while (false);

bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
if (ForceVectorization && MaxVF > 1) {
  // Ignore scalar width, because the user explicitly wants vectorization.
  // Initialize cost to max so that VF = 2 is, at least, chosen during cost
  // evaluation.
  Cost = std::numeric_limits<float>::max();
}

for (unsigned i = 2; i <= MaxVF; i *= 2) {
  // Notice that the vector loop needs to be executed less times, so
  // we need to divide the cost of the vector loops by the width of
  // the vector elements.
  VectorizationCostTy C = expectedCost(i);
  float VectorCost = C.first / (float)i;
  LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << ido { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Vector loop of width "
 << i << " costs: " << (int)VectorCost <<
 ".\n"; } } while (false)
                    << " costs: " << (int)VectorCost << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Vector loop of width "
 << i << " costs: " << (int)VectorCost <<
 ".\n"; } } while (false);
  if (!C.second && !ForceVectorization) {
    LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not considering vector loop of width "
 << i << " because it will not generate any vector instructions.\n"
; } } while (false)
        dbgs() << "LV: Not considering vector loop of width " << ido { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not considering vector loop of width "
 << i << " because it will not generate any vector instructions.\n"
; } } while (false)
               << " because it will not generate any vector instructions.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not considering vector loop of width "
 << i << " because it will not generate any vector instructions.\n"
; } } while (false);
    continue;
  }
  if (VectorCost < Cost) {
    Cost = VectorCost;
    Width = i;
  }
}

if (!EnableCondStoresVectorization && NumPredStores) {
  ORE->emit(createMissedAnalysis("ConditionalStore")
            << "store that is conditionally executed prevents vectorization");
  LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: No vectorization. There are conditional stores.\n"
; } } while (false)
      dbgs() << "LV: No vectorization. There are conditional stores.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: No vectorization. There are conditional stores.\n"
; } } while (false);
  Width = 1;
  Cost = ScalarCost;
}

LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { if (ForceVectorization && Width
 > 1 && Cost >= ScalarCost) dbgs() << "LV: Vectorization seems to be not beneficial, "
 << "but was forced by a user.\n"; } } while (false)
           << "LV: Vectorization seems to be not beneficial, "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { if (ForceVectorization && Width
 > 1 && Cost >= ScalarCost) dbgs() << "LV: Vectorization seems to be not beneficial, "
 << "but was forced by a user.\n"; } } while (false)
           << "but was forced by a user.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { if (ForceVectorization && Width
 > 1 && Cost >= ScalarCost) dbgs() << "LV: Vectorization seems to be not beneficial, "
 << "but was forced by a user.\n"; } } while (false);
LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Selecting VF: " <<
 Width << ".\n"; } } while (false);
VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
return Factor;
5072}

5074std::pair<unsigned, unsigned>
5075LoopVectorizationCostModel::getSmallestAndWidestTypes() {
unsigned MinWidth = -1U;
unsigned MaxWidth = 8;
const DataLayout &DL = TheFunction->getParent()->getDataLayout();

// For each block.
for (BasicBlock *BB : TheLoop->blocks()) {
  // For each instruction in the loop.
  for (Instruction &I : *BB) {
    Type *T = I.getType();

    // Skip ignored values.
    if (ValuesToIgnore.count(&I))
      continue;

    // Only examine Loads, Stores and PHINodes.
    if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
      continue;

    // Examine PHI nodes that are reduction variables. Update the type to
    // account for the recurrence type.
    if (auto *PN = dyn_cast<PHINode>(&I)) {
      if (!Legal->isReductionVariable(PN))
        continue;
      RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
      T = RdxDesc.getRecurrenceType();
    }

    // Examine the stored values.
    if (auto *ST = dyn_cast<StoreInst>(&I))
      T = ST->getValueOperand()->getType();

    // Ignore loaded pointer types and stored pointer types that are not
    // vectorizable.
    //
    // FIXME: The check here attempts to predict whether a load or store will
    //        be vectorized. We only know this for certain after a VF has
    //        been selected. Here, we assume that if an access can be
    //        vectorized, it will be. We should also look at extending this
    //        optimization to non-pointer types.
    //
    if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
        !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
      continue;

    MinWidth = std::min(MinWidth,
                        (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
    MaxWidth = std::max(MaxWidth,
                        (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
  }
}

return {MinWidth, MaxWidth};
5128}

5130unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
                                                         unsigned VF,
                                                         unsigned LoopCost) {
// -- The interleave heuristics --
// We interleave the loop in order to expose ILP and reduce the loop overhead.
// There are many micro-architectural considerations that we can't predict
// at this level. For example, frontend pressure (on decode or fetch) due to
// code size, or the number and capabilities of the execution ports.
//
// We use the following heuristics to select the interleave count:
// 1. If the code has reductions, then we interleave to break the cross
// iteration dependency.
// 2. If the loop is really small, then we interleave to reduce the loop
// overhead.
// 3. We don't interleave if we think that we will spill registers to memory
// due to the increased register pressure.

// When we optimize for size, we don't interleave.
if (OptForSize)
  return 1;

// We used the distance for the interleave count.
if (Legal->getMaxSafeDepDistBytes() != -1U)
  return 1;

// Do not interleave loops with a relatively small trip count.
unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
  return 1;

unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegistersdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The target has " <<
 TargetNumRegisters << " registers\n"; } } while (false
)
                  << " registers\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The target has " <<
 TargetNumRegisters << " registers\n"; } } while (false
);

if (VF == 1) {
  if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
    TargetNumRegisters = ForceTargetNumScalarRegs;
} else {
  if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
    TargetNumRegisters = ForceTargetNumVectorRegs;
}

RegisterUsage R = calculateRegisterUsage({VF})[0];
// We divide by these constants so assume that we have at least one
// instruction that uses at least one register.
R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);

// We calculate the interleave count using the following formula.
// Subtract the number of loop invariants from the number of available
// registers. These registers are used by all of the interleaved instances.
// Next, divide the remaining registers by the number of registers that is
// required by the loop, in order to estimate how many parallel instances
// fit without causing spills. All of this is rounded down if necessary to be
// a power of two. We want power of two interleave count to simplify any
// addressing operations or alignment considerations.
unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
                            R.MaxLocalUsers);

// Don't count the induction variable as interleaved.
if (EnableIndVarRegisterHeur)
  IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
                     std::max(1U, (R.MaxLocalUsers - 1)));

// Clamp the interleave ranges to reasonable counts.
unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);

// Check if the user has overridden the max.
if (VF == 1) {
  if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
    MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
} else {
  if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
    MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
}

// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0)
  LoopCost = expectedCost(VF).first;

// Clamp the calculated IC to be between the 1 and the max interleave count
// that the target allows.
if (IC > MaxInterleaveCount)
  IC = MaxInterleaveCount;
else if (IC < 1)
  IC = 1;

// Interleave if we vectorized this loop and there is a reduction that could
// benefit from interleaving.
if (VF > 1 && !Legal->getReductionVars()->empty()) {
  LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleaving because of reductions.\n"
; } } while (false);
  return IC;
}

// Note that if we've already vectorized the loop we will have done the
// runtime check and so interleaving won't require further checks.
bool InterleavingRequiresRuntimePointerCheck =
    (VF == 1 && Legal->getRuntimePointerChecking()->Need);

// We want to interleave small loops in order to reduce the loop overhead and
// potentially expose ILP opportunities.
LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop cost is " <<
 LoopCost << '\n'; } } while (false);
if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
  // We assume that the cost overhead is 1 and we use the cost model
  // to estimate the cost of the loop and interleave until the cost of the
  // loop overhead is about 5% of the cost of the loop.
  unsigned SmallIC =
      std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

  // Interleave until store/load ports (estimated by max interleave count) are
  // saturated.
  unsigned NumStores = Legal->getNumStores();
  unsigned NumLoads = Legal->getNumLoads();
  unsigned StoresIC = IC / (NumStores ? NumStores : 1);
  unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

  // If we have a scalar reduction (vector reductions are already dealt with
  // by this point), we can increase the critical path length if the loop
  // we're interleaving is inside another loop. Limit, by default to 2, so the
  // critical path only gets increased by one reduction operation.
  if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
    unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
    SmallIC = std::min(SmallIC, F);
    StoresIC = std::min(StoresIC, F);
    LoadsIC = std::min(LoadsIC, F);
  }

  if (EnableLoadStoreRuntimeInterleave &&
      std::max(StoresIC, LoadsIC) > SmallIC) {
    LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleaving to saturate store or load ports.\n"
; } } while (false)
        dbgs() << "LV: Interleaving to saturate store or load ports.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleaving to saturate store or load ports.\n"
; } } while (false);
    return std::max(StoresIC, LoadsIC);
  }

  LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleaving to reduce branch cost.\n"
; } } while (false);
  return SmallIC;
}

// Interleave if this is a large loop (small loops are already dealt with by
// this point) that could benefit from interleaving.
bool HasReductions = !Legal->getReductionVars()->empty();
if (TTI.enableAggressiveInterleaving(HasReductions)) {
  LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleaving to expose ILP.\n"
; } } while (false);
  return IC;
}

LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not Interleaving.\n"
; } } while (false);
return 1;
5278}

5280SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5281LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
// This function calculates the register usage by measuring the highest number
// of values that are alive at a single location. Obviously, this is a very
// rough estimation. We scan the loop in a topological order in order and
// assign a number to each instruction. We use RPO to ensure that defs are
// met before their users. We assume that each instruction that has in-loop
// users starts an interval. We record every time that an in-loop value is
// used, so we have a list of the first and last occurrences of each
// instruction. Next, we transpose this data structure into a multi map that
// holds the list of intervals that *end* at a specific location. This multi
// map allows us to perform a linear search. We scan the instructions linearly
// and record each time that a new interval starts, by placing it in a set.
// If we find this value in the multi-map then we remove it from the set.
// The max register usage is the maximum size of the set.
// We also search for instructions that are defined outside the loop, but are
// used inside the loop. We need this number separately from the max-interval
// usage number because when we unroll, loop-invariant values do not take
// more register.
LoopBlocksDFS DFS(TheLoop);
DFS.perform(LI);

RegisterUsage RU;

// Each 'key' in the map opens a new interval. The values
// of the map are the index of the 'last seen' usage of the
// instruction that is the key.
using IntervalMap = DenseMap<Instruction *, unsigned>;

// Maps instruction to its index.
DenseMap<unsigned, Instruction *> IdxToInstr;
// Marks the end of each interval.
IntervalMap EndPoint;
// Saves the list of instruction indices that are used in the loop.
SmallPtrSet<Instruction *, 8> Ends;
// Saves the list of values that are used in the loop but are
// defined outside the loop, such as arguments and constants.
SmallPtrSet<Value *, 8> LoopInvariants;

unsigned Index = 0;
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
  for (Instruction &I : *BB) {
    IdxToInstr[Index++] = &I;

    // Save the end location of each USE.
    for (Value *U : I.operands()) {
      auto *Instr = dyn_cast<Instruction>(U);

      // Ignore non-instruction values such as arguments, constants, etc.
      if (!Instr)
        continue;

      // If this instruction is outside the loop then record it and continue.
      if (!TheLoop->contains(Instr)) {
        LoopInvariants.insert(Instr);
        continue;
      }

      // Overwrite previous end points.
      EndPoint[Instr] = Index;
      Ends.insert(Instr);
    }
  }
}

// Saves the list of intervals that end with the index in 'key'.
using InstrList = SmallVector<Instruction *, 2>;
DenseMap<unsigned, InstrList> TransposeEnds;

// Transpose the EndPoints to a list of values that end at each index.
for (auto &Interval : EndPoint)
  TransposeEnds[Interval.second].push_back(Interval.first);

SmallPtrSet<Instruction *, 8> OpenIntervals;

// Get the size of the widest register.
unsigned MaxSafeDepDist = -1U;
if (Legal->getMaxSafeDepDistBytes() != -1U)
  MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
unsigned WidestRegister =
    std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
const DataLayout &DL = TheFunction->getParent()->getDataLayout();

SmallVector<RegisterUsage, 8> RUs(VFs.size());
SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);

LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV(REG): Calculating max register usage:\n"
; } } while (false);

// A lambda that gets the register usage for the given type and VF.
auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
  if (Ty->isTokenTy())
    return 0U;
  unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
  return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
};

for (unsigned int i = 0; i < Index; ++i) {
  Instruction *I = IdxToInstr[i];

  // Remove all of the instructions that end at this location.
  InstrList &List = TransposeEnds[i];
  for (Instruction *ToRemove : List)
    OpenIntervals.erase(ToRemove);

  // Ignore instructions that are never used within the loop.
  if (!Ends.count(I))
    continue;

  // Skip ignored values.
  if (ValuesToIgnore.count(I))
    continue;

  // For each VF find the maximum usage of registers.
  for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
    if (VFs[j] == 1) {
      MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
      continue;
    }
    collectUniformsAndScalars(VFs[j]);
    // Count the number of live intervals.
    unsigned RegUsage = 0;
    for (auto Inst : OpenIntervals) {
      // Skip ignored values for VF > 1.
      if (VecValuesToIgnore.count(Inst) ||
          isScalarAfterVectorization(Inst, VFs[j]))
        continue;
      RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
    }
    MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
  }

  LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV(REG): At #" <<
 i << " Interval # " << OpenIntervals.size() <<
 '\n'; } } while (false)
                    << OpenIntervals.size() << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV(REG): At #" <<
 i << " Interval # " << OpenIntervals.size() <<
 '\n'; } } while (false);

  // Add the current instruction to the list of open intervals.
  OpenIntervals.insert(I);
}

for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
  unsigned Invariant = 0;
  if (VFs[i] == 1)
    Invariant = LoopInvariants.size();
  else {
    for (auto Inst : LoopInvariants)
      Invariant += GetRegUsage(Inst->getType(), VFs[i]);
  }

  LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV(REG): VF = " <<
 VFs[i] << '\n'; } } while (false);
  LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV(REG): Found max usage: "
 << MaxUsages[i] << '\n'; } } while (false);
  LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariantdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV(REG): Found invariant usage: "
 << Invariant << '\n'; } } while (false)
                    << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV(REG): Found invariant usage: "
 << Invariant << '\n'; } } while (false);

  RU.LoopInvariantRegs = Invariant;
  RU.MaxLocalUsers = MaxUsages[i];
  RUs[i] = RU;
}

return RUs;
5438}

5440bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
// TODO: Cost model for emulated masked load/store is completely
// broken. This hack guides the cost model to use an artificially
// high enough value to practically disable vectorization with such
// operations, except where previously deployed legality hack allowed
// using very low cost values. This is to avoid regressions coming simply
// from moving "masked load/store" check from legality to cost model. 
// Masked Load/Gather emulation was previously never allowed.
// Limited number of Masked Store/Scatter emulation was allowed.
assert(isScalarWithPredication(I) &&(static_cast <bool> (isScalarWithPredication(I) &&
 "Expecting a scalar emulated instruction") ? void (0) : __assert_fail
 ("isScalarWithPredication(I) && \"Expecting a scalar emulated instruction\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5450, __extension__ __PRETTY_FUNCTION__))
       "Expecting a scalar emulated instruction")(static_cast <bool> (isScalarWithPredication(I) &&
 "Expecting a scalar emulated instruction") ? void (0) : __assert_fail
 ("isScalarWithPredication(I) && \"Expecting a scalar emulated instruction\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5450, __extension__ __PRETTY_FUNCTION__));
return isa<LoadInst>(I) ||
       (isa<StoreInst>(I) &&
        NumPredStores > NumberOfStoresToPredicate);
5454}

5456void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
// If we aren't vectorizing the loop, or if we've already collected the
// instructions to scalarize, there's nothing to do. Collection may already
// have occurred if we have a user-selected VF and are now computing the
// expected cost for interleaving.
if (VF < 2 || InstsToScalarize.count(VF))
  return;

// Initialize a mapping for VF in InstsToScalalarize. If we find that it's
// not profitable to scalarize any instructions, the presence of VF in the
// map will indicate that we've analyzed it already.
ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

// Find all the instructions that are scalar with predication in the loop and
// determine if it would be better to not if-convert the blocks they are in.
// If so, we also record the instructions to scalarize.
for (BasicBlock *BB : TheLoop->blocks()) {
  if (!Legal->blockNeedsPredication(BB))
    continue;
  for (Instruction &I : *BB)
    if (isScalarWithPredication(&I)) {
      ScalarCostsTy ScalarCosts;
      // Do not apply discount logic if hacked cost is needed
      // for emulated masked memrefs.
      if (!useEmulatedMaskMemRefHack(&I) &&
          computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
        ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
      // Remember that BB will remain after vectorization.
      PredicatedBBsAfterVectorization.insert(BB);
    }
}
5487}

5489int LoopVectorizationCostModel::computePredInstDiscount(
  Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
  unsigned VF) {
assert(!isUniformAfterVectorization(PredInst, VF) &&(static_cast <bool> (!isUniformAfterVectorization(PredInst
, VF) && "Instruction marked uniform-after-vectorization will be predicated"
) ? void (0) : __assert_fail ("!isUniformAfterVectorization(PredInst, VF) && \"Instruction marked uniform-after-vectorization will be predicated\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5493, __extension__ __PRETTY_FUNCTION__))
       "Instruction marked uniform-after-vectorization will be predicated")(static_cast <bool> (!isUniformAfterVectorization(PredInst
, VF) && "Instruction marked uniform-after-vectorization will be predicated"
) ? void (0) : __assert_fail ("!isUniformAfterVectorization(PredInst, VF) && \"Instruction marked uniform-after-vectorization will be predicated\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5493, __extension__ __PRETTY_FUNCTION__));

// Initialize the discount to zero, meaning that the scalar version and the
// vector version cost the same.
int Discount = 0;

// Holds instructions to analyze. The instructions we visit are mapped in
// ScalarCosts. Those instructions are the ones that would be scalarized if
// we find that the scalar version costs less.
SmallVector<Instruction *, 8> Worklist;

// Returns true if the given instruction can be scalarized.
auto canBeScalarized = [&](Instruction *I) -> bool {
  // We only attempt to scalarize instructions forming a single-use chain
  // from the original predicated block that would otherwise be vectorized.
  // Although not strictly necessary, we give up on instructions we know will
  // already be scalar to avoid traversing chains that are unlikely to be
  // beneficial.
  if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
      isScalarAfterVectorization(I, VF))
    return false;

  // If the instruction is scalar with predication, it will be analyzed
  // separately. We ignore it within the context of PredInst.
  if (isScalarWithPredication(I))
    return false;

  // If any of the instruction's operands are uniform after vectorization,
  // the instruction cannot be scalarized. This prevents, for example, a
  // masked load from being scalarized.
  //
  // We assume we will only emit a value for lane zero of an instruction
  // marked uniform after vectorization, rather than VF identical values.
  // Thus, if we scalarize an instruction that uses a uniform, we would
  // create uses of values corresponding to the lanes we aren't emitting code
  // for. This behavior can be changed by allowing getScalarValue to clone
  // the lane zero values for uniforms rather than asserting.
  for (Use &U : I->operands())
    if (auto *J = dyn_cast<Instruction>(U.get()))
      if (isUniformAfterVectorization(J, VF))
        return false;

  // Otherwise, we can scalarize the instruction.
  return true;
};

// Returns true if an operand that cannot be scalarized must be extracted
// from a vector. We will account for this scalarization overhead below. Note
// that the non-void predicated instructions are placed in their own blocks,
// and their return values are inserted into vectors. Thus, an extract would
// still be required.
auto needsExtract = [&](Instruction *I) -> bool {
  return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
};

// Compute the expected cost discount from scalarizing the entire expression
// feeding the predicated instruction. We currently only consider expressions
// that are single-use instruction chains.
Worklist.push_back(PredInst);
while (!Worklist.empty()) {
  Instruction *I = Worklist.pop_back_val();

  // If we've already analyzed the instruction, there's nothing to do.
  if (ScalarCosts.count(I))
    continue;

  // Compute the cost of the vector instruction. Note that this cost already
  // includes the scalarization overhead of the predicated instruction.
  unsigned VectorCost = getInstructionCost(I, VF).first;

  // Compute the cost of the scalarized instruction. This cost is the cost of
  // the instruction as if it wasn't if-converted and instead remained in the
  // predicated block. We will scale this cost by block probability after
  // computing the scalarization overhead.
  unsigned ScalarCost = VF * getInstructionCost(I, 1).first;

  // Compute the scalarization overhead of needed insertelement instructions
  // and phi nodes.
  if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
    ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
                                               true, false);
    ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
  }

  // Compute the scalarization overhead of needed extractelement
  // instructions. For each of the instruction's operands, if the operand can
  // be scalarized, add it to the worklist; otherwise, account for the
  // overhead.
  for (Use &U : I->operands())
    if (auto *J = dyn_cast<Instruction>(U.get())) {
      assert(VectorType::isValidElementType(J->getType()) &&(static_cast <bool> (VectorType::isValidElementType(J->
getType()) && "Instruction has non-scalar type") ? void
 (0) : __assert_fail ("VectorType::isValidElementType(J->getType()) && \"Instruction has non-scalar type\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5584, __extension__ __PRETTY_FUNCTION__))
             "Instruction has non-scalar type")(static_cast <bool> (VectorType::isValidElementType(J->
getType()) && "Instruction has non-scalar type") ? void
 (0) : __assert_fail ("VectorType::isValidElementType(J->getType()) && \"Instruction has non-scalar type\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5584, __extension__ __PRETTY_FUNCTION__));
      if (canBeScalarized(J))
        Worklist.push_back(J);
      else if (needsExtract(J))
        ScalarCost += TTI.getScalarizationOverhead(
                            ToVectorTy(J->getType(),VF), false, true);
    }

  // Scale the total scalar cost by block probability.
  ScalarCost /= getReciprocalPredBlockProb();

  // Compute the discount. A non-negative discount means the vector version
  // of the instruction costs more, and scalarizing would be beneficial.
  Discount += VectorCost - ScalarCost;
  ScalarCosts[I] = ScalarCost;
}

return Discount;
5602}

5604LoopVectorizationCostModel::VectorizationCostTy
5605LoopVectorizationCostModel::expectedCost(unsigned VF) {
VectorizationCostTy Cost;

// For each block.
for (BasicBlock *BB : TheLoop->blocks()) {
  VectorizationCostTy BlockCost;

  // For each instruction in the old loop.
  for (Instruction &I : BB->instructionsWithoutDebug()) {
    // Skip ignored values.
    if (ValuesToIgnore.count(&I) ||
        (VF > 1 && VecValuesToIgnore.count(&I)))
      continue;

    VectorizationCostTy C = getInstructionCost(&I, VF);

    // Check if we should override the cost.
    if (ForceTargetInstructionCost.getNumOccurrences() > 0)
      C.first = ForceTargetInstructionCost;

    BlockCost.first += C.first;
    BlockCost.second |= C.second;
    LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.firstdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found an estimated cost of "
 << C.first << " for VF " << VF << " For instruction: "
 << I << '\n'; } } while (false)
                      << " for VF " << VF << " For instruction: " << Ido { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found an estimated cost of "
 << C.first << " for VF " << VF << " For instruction: "
 << I << '\n'; } } while (false)
                      << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found an estimated cost of "
 << C.first << " for VF " << VF << " For instruction: "
 << I << '\n'; } } while (false);
  }

  // If we are vectorizing a predicated block, it will have been
  // if-converted. This means that the block's instructions (aside from
  // stores and instructions that may divide by zero) will now be
  // unconditionally executed. For the scalar case, we may not always execute
  // the predicated block. Thus, scale the block's cost by the probability of
  // executing it.
  if (VF == 1 && Legal->blockNeedsPredication(BB))
    BlockCost.first /= getReciprocalPredBlockProb();

  Cost.first += BlockCost.first;
  Cost.second |= BlockCost.second;
}

return Cost;
5646}

5648/// Gets Address Access SCEV after verifying that the access pattern
5649/// is loop invariant except the induction variable dependence.
5650///
5651/// This SCEV can be sent to the Target in order to estimate the address
5652/// calculation cost.
5653static const SCEV *getAddressAccessSCEV(
            Value *Ptr,
            LoopVectorizationLegality *Legal,
            PredicatedScalarEvolution &PSE,
            const Loop *TheLoop) {

auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
if (!Gep)
  return nullptr;

// We are looking for a gep with all loop invariant indices except for one
// which should be an induction variable.
auto SE = PSE.getSE();
unsigned NumOperands = Gep->getNumOperands();
for (unsigned i = 1; i < NumOperands; ++i) {
  Value *Opd = Gep->getOperand(i);
  if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
      !Legal->isInductionVariable(Opd))
    return nullptr;
}

// Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
return PSE.getSCEV(Ptr);
5676}

5678static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
return Legal->hasStride(I->getOperand(0)) ||
       Legal->hasStride(I->getOperand(1));
5681}

5683unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                               unsigned VF) {
Type *ValTy = getMemInstValueType(I);
auto SE = PSE.getSE();

unsigned Alignment = getMemInstAlignment(I);
unsigned AS = getMemInstAddressSpace(I);
Value *Ptr = getLoadStorePointerOperand(I);
Type *PtrTy = ToVectorTy(Ptr->getType(), VF);

// Figure out whether the access is strided and get the stride value
// if it's known in compile time
const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);

// Get the cost of the scalar memory instruction and address computation.
unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);

Cost += VF *
        TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
                            AS, I);

// Get the overhead of the extractelement and insertelement instructions
// we might create due to scalarization.
Cost += getScalarizationOverhead(I, VF, TTI);

// If we have a predicated store, it may not be executed for each vector
// lane. Scale the cost by the probability of executing the predicated
// block.
if (isScalarWithPredication(I)) {
  Cost /= getReciprocalPredBlockProb();

  if (useEmulatedMaskMemRefHack(I))
    // Artificially setting to a high enough value to practically disable
    // vectorization with such operations.
    Cost = 3000000;
}

return Cost;
5721}

5723unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
                                                           unsigned VF) {
Type *ValTy = getMemInstValueType(I);
Type *VectorTy = ToVectorTy(ValTy, VF);
unsigned Alignment = getMemInstAlignment(I);
Value *Ptr = getLoadStorePointerOperand(I);
unsigned AS = getMemInstAddressSpace(I);
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);

assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&(static_cast <bool> ((ConsecutiveStride == 1 || ConsecutiveStride
 == -1) && "Stride should be 1 or -1 for consecutive memory access"
) ? void (0) : __assert_fail ("(ConsecutiveStride == 1 || ConsecutiveStride == -1) && \"Stride should be 1 or -1 for consecutive memory access\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5733, __extension__ __PRETTY_FUNCTION__))
       "Stride should be 1 or -1 for consecutive memory access")(static_cast <bool> ((ConsecutiveStride == 1 || ConsecutiveStride
 == -1) && "Stride should be 1 or -1 for consecutive memory access"
) ? void (0) : __assert_fail ("(ConsecutiveStride == 1 || ConsecutiveStride == -1) && \"Stride should be 1 or -1 for consecutive memory access\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5733, __extension__ __PRETTY_FUNCTION__));
unsigned Cost = 0;
if (Legal->isMaskRequired(I))
  Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
else
  Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);

bool Reverse = ConsecutiveStride < 0;
if (Reverse)
  Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
return Cost;
5744}

5746unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                       unsigned VF) {
LoadInst *LI = cast<LoadInst>(I);
Type *ValTy = LI->getType();
Type *VectorTy = ToVectorTy(ValTy, VF);
unsigned Alignment = LI->getAlignment();
unsigned AS = LI->getPointerAddressSpace();

return TTI.getAddressComputationCost(ValTy) +
       TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
       TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5757}

5759unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                        unsigned VF) {
Type *ValTy = getMemInstValueType(I);
Type *VectorTy = ToVectorTy(ValTy, VF);
unsigned Alignment = getMemInstAlignment(I);
Value *Ptr = getLoadStorePointerOperand(I);

return TTI.getAddressComputationCost(VectorTy) +
       TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
                                  Legal->isMaskRequired(I), Alignment);
5769}

5771unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                          unsigned VF) {
Type *ValTy = getMemInstValueType(I);
Type *VectorTy = ToVectorTy(ValTy, VF);
unsigned AS = getMemInstAddressSpace(I);

auto Group = getInterleavedAccessGroup(I);
assert(Group && "Fail to get an interleaved access group.")(static_cast <bool> (Group && "Fail to get an interleaved access group."
) ? void (0) : __assert_fail ("Group && \"Fail to get an interleaved access group.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5778, __extension__ __PRETTY_FUNCTION__));

unsigned InterleaveFactor = Group->getFactor();
Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

// Holds the indices of existing members in an interleaved load group.
// An interleaved store group doesn't need this as it doesn't allow gaps.
SmallVector<unsigned, 4> Indices;
if (isa<LoadInst>(I)) {
  for (unsigned i = 0; i < InterleaveFactor; i++)
    if (Group->getMember(i))
      Indices.push_back(i);
}

// Calculate the cost of the whole interleaved group.
unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy,
                                               Group->getFactor(), Indices,
                                               Group->getAlignment(), AS);

if (Group->isReverse())
  Cost += Group->getNumMembers() *
          TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
return Cost;
5801}

5803unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
                                                            unsigned VF) {
// Calculate scalar cost only. Vectorization cost should be ready at this
// moment.
if (VF == 1) {
  Type *ValTy = getMemInstValueType(I);
  unsigned Alignment = getMemInstAlignment(I);
  unsigned AS = getMemInstAddressSpace(I);

  return TTI.getAddressComputationCost(ValTy) +
         TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
}
return getWideningCost(I, VF);
5816}

5818LoopVectorizationCostModel::VectorizationCostTy
5819LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
// If we know that this instruction will remain uniform, check the cost of
// the scalar version.
if (isUniformAfterVectorization(I, VF))
  VF = 1;

if (VF > 1 && isProfitableToScalarize(I, VF))
  return VectorizationCostTy(InstsToScalarize[VF][I], false);

// Forced scalars do not have any scalarization overhead.
if (VF > 1 && ForcedScalars.count(VF) &&
    ForcedScalars.find(VF)->second.count(I))
  return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);

Type *VectorTy;
unsigned C = getInstructionCost(I, VF, VectorTy);

bool TypeNotScalarized =
    VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
return VectorizationCostTy(C, TypeNotScalarized);
5839}

5841void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
if (VF == 1)
  return;
NumPredStores = 0;
for (BasicBlock *BB : TheLoop->blocks()) {
  // For each instruction in the old loop.
  for (Instruction &I : *BB) {
    Value *Ptr =  getLoadStorePointerOperand(&I);
    if (!Ptr)
      continue;

    if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
      NumPredStores++;
    if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
      // Scalar load + broadcast
      unsigned Cost = getUniformMemOpCost(&I, VF);
      setWideningDecision(&I, VF, CM_Scalarize, Cost);
      continue;
    }

    // We assume that widening is the best solution when possible.
    if (memoryInstructionCanBeWidened(&I, VF)) {
      unsigned Cost = getConsecutiveMemOpCost(&I, VF);
      int ConsecutiveStride =
             Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
      assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&(static_cast <bool> ((ConsecutiveStride == 1 || ConsecutiveStride
 == -1) && "Expected consecutive stride.") ? void (0)
 : __assert_fail ("(ConsecutiveStride == 1 || ConsecutiveStride == -1) && \"Expected consecutive stride.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5867, __extension__ __PRETTY_FUNCTION__))
             "Expected consecutive stride.")(static_cast <bool> ((ConsecutiveStride == 1 || ConsecutiveStride
 == -1) && "Expected consecutive stride.") ? void (0)
 : __assert_fail ("(ConsecutiveStride == 1 || ConsecutiveStride == -1) && \"Expected consecutive stride.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5867, __extension__ __PRETTY_FUNCTION__));
      InstWidening Decision =
          ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
      setWideningDecision(&I, VF, Decision, Cost);
      continue;
    }

    // Choose between Interleaving, Gather/Scatter or Scalarization.
    unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
    unsigned NumAccesses = 1;
    if (isAccessInterleaved(&I)) {
      auto Group = getInterleavedAccessGroup(&I);
      assert(Group && "Fail to get an interleaved access group.")(static_cast <bool> (Group && "Fail to get an interleaved access group."
) ? void (0) : __assert_fail ("Group && \"Fail to get an interleaved access group.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5879, __extension__ __PRETTY_FUNCTION__));

      // Make one decision for the whole group.
      if (getWideningDecision(&I, VF) != CM_Unknown)
        continue;

      NumAccesses = Group->getNumMembers();
      InterleaveCost = getInterleaveGroupCost(&I, VF);
    }

    unsigned GatherScatterCost =
        isLegalGatherOrScatter(&I)
            ? getGatherScatterCost(&I, VF) * NumAccesses
            : std::numeric_limits<unsigned>::max();

    unsigned ScalarizationCost =
        getMemInstScalarizationCost(&I, VF) * NumAccesses;

    // Choose better solution for the current VF,
    // write down this decision and use it during vectorization.
    unsigned Cost;
    InstWidening Decision;
    if (InterleaveCost <= GatherScatterCost &&
        InterleaveCost < ScalarizationCost) {
      Decision = CM_Interleave;
      Cost = InterleaveCost;
    } else if (GatherScatterCost < ScalarizationCost) {
      Decision = CM_GatherScatter;
      Cost = GatherScatterCost;
    } else {
      Decision = CM_Scalarize;
      Cost = ScalarizationCost;
    }
    // If the instructions belongs to an interleave group, the whole group
    // receives the same decision. The whole group receives the cost, but
    // the cost will actually be assigned to one instruction.
    if (auto Group = getInterleavedAccessGroup(&I))
      setWideningDecision(Group, VF, Decision, Cost);
    else
      setWideningDecision(&I, VF, Decision, Cost);
  }
}

// Make sure that any load of address and any other address computation
// remains scalar unless there is gather/scatter support. This avoids
// inevitable extracts into address registers, and also has the benefit of
// activating LSR more, since that pass can't optimize vectorized
// addresses.
if (TTI.prefersVectorizedAddressing())
  return;

// Start with all scalar pointer uses.
SmallPtrSet<Instruction *, 8> AddrDefs;
for (BasicBlock *BB : TheLoop->blocks())
  for (Instruction &I : *BB) {
    Instruction *PtrDef =
      dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
    if (PtrDef && TheLoop->contains(PtrDef) &&
        getWideningDecision(&I, VF) != CM_GatherScatter)
      AddrDefs.insert(PtrDef);
  }

// Add all instructions used to generate the addresses.
SmallVector<Instruction *, 4> Worklist;
for (auto *I : AddrDefs)
  Worklist.push_back(I);
while (!Worklist.empty()) {
  Instruction *I = Worklist.pop_back_val();
  for (auto &Op : I->operands())
    if (auto *InstOp = dyn_cast<Instruction>(Op))
      if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
          AddrDefs.insert(InstOp).second)
        Worklist.push_back(InstOp);
}

for (auto *I : AddrDefs) {
  if (isa<LoadInst>(I)) {
    // Setting the desired widening decision should ideally be handled in
    // by cost functions, but since this involves the task of finding out
    // if the loaded register is involved in an address computation, it is
    // instead changed here when we know this is the case.
    InstWidening Decision = getWideningDecision(I, VF);
    if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
      // Scalarize a widened load of address.
      setWideningDecision(I, VF, CM_Scalarize,
                          (VF * getMemoryInstructionCost(I, 1)));
    else if (auto Group = getInterleavedAccessGroup(I)) {
      // Scalarize an interleave group of address loads.
      for (unsigned I = 0; I < Group->getFactor(); ++I) {
        if (Instruction *Member = Group->getMember(I))
          setWideningDecision(Member, VF, CM_Scalarize,
                              (VF * getMemoryInstructionCost(Member, 1)));
      }
    }
  } else
    // Make sure I gets scalarized and a cost estimate without
    // scalarization overhead.
    ForcedScalars[VF].insert(I);
}
5978}

5980unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                      unsigned VF,
                                                      Type *&VectorTy) {
Type *RetTy = I->getType();
if (canTruncateToMinimalBitwidth(I, VF))
  RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
auto SE = PSE.getSE();

// TODO: We need to estimate the cost of intrinsic calls.
switch (I->getOpcode()) {
case Instruction::GetElementPtr:
  // We mark this instruction as zero-cost because the cost of GEPs in
  // vectorized code depends on whether the corresponding memory instruction
  // is scalarized or not. Therefore, we handle GEPs with the memory
  // instruction cost.
  return 0;
case Instruction::Br: {
  // In cases of scalarized and predicated instructions, there will be VF
  // predicated blocks in the vectorized loop. Each branch around these
  // blocks requires also an extract of its vector compare i1 element.
  bool ScalarPredicatedBB = false;
  BranchInst *BI = cast<BranchInst>(I);
  if (VF > 1 && BI->isConditional() &&
      (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
       PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
    ScalarPredicatedBB = true;

  if (ScalarPredicatedBB) {
    // Return cost for branches around scalarized and predicated blocks.
    Type *Vec_i1Ty =
        VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
    return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
            (TTI.getCFInstrCost(Instruction::Br) * VF));
  } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
    // The back-edge branch will remain, as will all scalar branches.
    return TTI.getCFInstrCost(Instruction::Br);
  else
    // This branch will be eliminated by if-conversion.
    return 0;
  // Note: We currently assume zero cost for an unconditional branch inside
  // a predicated block since it will become a fall-through, although we
  // may decide in the future to call TTI for all branches.
}
case Instruction::PHI: {
  auto *Phi = cast<PHINode>(I);

  // First-order recurrences are replaced by vector shuffles inside the loop.
  if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
    return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
                              VectorTy, VF - 1, VectorTy);

  // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
  // converted into select instructions. We require N - 1 selects per phi
  // node, where N is the number of incoming values.
  if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
    return (Phi->getNumIncomingValues() - 1) *
           TTI.getCmpSelInstrCost(
               Instruction::Select, ToVectorTy(Phi->getType(), VF),
               ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));

  return TTI.getCFInstrCost(Instruction::PHI);
}
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::URem:
case Instruction::SRem:
  // If we have a predicated instruction, it may not be executed for each
  // vector lane. Get the scalarization cost and scale this amount by the
  // probability of executing the predicated block. If the instruction is not
  // predicated, we fall through to the next case.
  if (VF > 1 && isScalarWithPredication(I)) {
    unsigned Cost = 0;

    // These instructions have a non-void type, so account for the phi nodes
    // that we will create. This cost is likely to be zero. The phi node
    // cost, if any, should be scaled by the block probability because it
    // models a copy at the end of each predicated block.
    Cost += VF * TTI.getCFInstrCost(Instruction::PHI);

    // The cost of the non-predicated instruction.
    Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);

    // The cost of insertelement and extractelement instructions needed for
    // scalarization.
    Cost += getScalarizationOverhead(I, VF, TTI);

    // Scale the cost by the probability of executing the predicated blocks.
    // This assumes the predicated block for each vector lane is equally
    // likely.
    return Cost / getReciprocalPredBlockProb();
  }
  LLVM_FALLTHROUGH[[clang::fallthrough]];
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::FDiv:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
  // Since we will replace the stride by 1 the multiplication should go away.
  if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
    return 0;
  // Certain instructions can be cheaper to vectorize if they have a constant
  // second vector operand. One example of this are shifts on x86.
  TargetTransformInfo::OperandValueKind Op1VK =
      TargetTransformInfo::OK_AnyValue;
  TargetTransformInfo::OperandValueKind Op2VK =
      TargetTransformInfo::OK_AnyValue;
  TargetTransformInfo::OperandValueProperties Op1VP =
      TargetTransformInfo::OP_None;
  TargetTransformInfo::OperandValueProperties Op2VP =
      TargetTransformInfo::OP_None;
  Value *Op2 = I->getOperand(1);

  // Check for a splat or for a non uniform vector of constants.
  if (isa<ConstantInt>(Op2)) {
    ConstantInt *CInt = cast<ConstantInt>(Op2);
    if (CInt && CInt->getValue().isPowerOf2())
      Op2VP = TargetTransformInfo::OP_PowerOf2;
    Op2VK = TargetTransformInfo::OK_UniformConstantValue;
  } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
    Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
    Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
    if (SplatValue) {
      ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
      if (CInt && CInt->getValue().isPowerOf2())
        Op2VP = TargetTransformInfo::OP_PowerOf2;
      Op2VK = TargetTransformInfo::OK_UniformConstantValue;
    }
  } else if (Legal->isUniform(Op2)) {
    Op2VK = TargetTransformInfo::OK_UniformValue;
  }
  SmallVector<const Value *, 4> Operands(I->operand_values());
  unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
  return N * TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK,
                                        Op2VK, Op1VP, Op2VP, Operands);
}
case Instruction::Select: {
  SelectInst *SI = cast<SelectInst>(I);
  const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
  bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
  Type *CondTy = SI->getCondition()->getType();
  if (!ScalarCond)
    CondTy = VectorType::get(CondTy, VF);

  return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
}
case Instruction::ICmp:
case Instruction::FCmp: {
  Type *ValTy = I->getOperand(0)->getType();
  Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
  if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
    ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
  VectorTy = ToVectorTy(ValTy, VF);
  return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
}
case Instruction::Store:
case Instruction::Load: {
  unsigned Width = VF;
  if (Width > 1) {
    InstWidening Decision = getWideningDecision(I, Width);
    assert(Decision != CM_Unknown &&(static_cast <bool> (Decision != CM_Unknown && "CM decision should be taken at this point"
) ? void (0) : __assert_fail ("Decision != CM_Unknown && \"CM decision should be taken at this point\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6150, __extension__ __PRETTY_FUNCTION__))
           "CM decision should be taken at this point")(static_cast <bool> (Decision != CM_Unknown && "CM decision should be taken at this point"
) ? void (0) : __assert_fail ("Decision != CM_Unknown && \"CM decision should be taken at this point\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6150, __extension__ __PRETTY_FUNCTION__));
    if (Decision == CM_Scalarize)
      Width = 1;
  }
  VectorTy = ToVectorTy(getMemInstValueType(I), Width);
  return getMemoryInstructionCost(I, VF);
}
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
  // We optimize the truncation of induction variables having constant
  // integer steps. The cost of these truncations is the same as the scalar
  // operation.
  if (isOptimizableIVTruncate(I, VF)) {
    auto *Trunc = cast<TruncInst>(I);
    return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
                                Trunc->getSrcTy(), Trunc);
  }

  Type *SrcScalarTy = I->getOperand(0)->getType();
  Type *SrcVecTy =
      VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
  if (canTruncateToMinimalBitwidth(I, VF)) {
    // This cast is going to be shrunk. This may remove the cast or it might
    // turn it into slightly different cast. For example, if MinBW == 16,
    // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
    //
    // Calculate the modified src and dest types.
    Type *MinVecTy = VectorTy;
    if (I->getOpcode() == Instruction::Trunc) {
      SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
      VectorTy =
          largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
    } else if (I->getOpcode() == Instruction::ZExt ||
               I->getOpcode() == Instruction::SExt) {
      SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
      VectorTy =
          smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
    }
  }

  unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
  return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
}
case Instruction::Call: {
  bool NeedToScalarize;
  CallInst *CI = cast<CallInst>(I);
  unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
  if (getVectorIntrinsicIDForCall(CI, TLI))
    return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
  return CallCost;
}
default:
  // The cost of executing VF copies of the scalar instruction. This opcode
  // is unknown. Assume that it is the same as 'mul'.
  return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
         getScalarizationOverhead(I, VF, TTI);
} // end of switch.
6217}

6219char LoopVectorize::ID = 0;

6221static const char lv_name[] = "Loop Vectorization";

6223INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)static void *initializeLoopVectorizePassOnce(PassRegistry &
Registry) {
6224INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)initializeTargetTransformInfoWrapperPassPass(Registry);
6225INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)initializeBasicAAWrapperPassPass(Registry);
6226INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)initializeAAResultsWrapperPassPass(Registry);
6227INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)initializeGlobalsAAWrapperPassPass(Registry);
6228INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)initializeAssumptionCacheTrackerPass(Registry);
6229INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)initializeBlockFrequencyInfoWrapperPassPass(Registry);
6230INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)initializeDominatorTreeWrapperPassPass(Registry);
6231INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)initializeScalarEvolutionWrapperPassPass(Registry);
6232INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)initializeLoopInfoWrapperPassPass(Registry);
6233INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)initializeLoopAccessLegacyAnalysisPass(Registry);
6234INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)initializeDemandedBitsWrapperPassPass(Registry);
6235INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)initializeOptimizationRemarkEmitterWrapperPassPass(Registry);
6236INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)PassInfo *PI = new PassInfo( lv_name, "loop-vectorize", &
LoopVectorize::ID, PassInfo::NormalCtor_t(callDefaultCtor<
LoopVectorize>), false, false); Registry.registerPass(*PI,
 true); return PI; } static llvm::once_flag InitializeLoopVectorizePassFlag
; void llvm::initializeLoopVectorizePass(PassRegistry &Registry
) { llvm::call_once(InitializeLoopVectorizePassFlag, initializeLoopVectorizePassOnce
, std::ref(Registry)); }

6238namespace llvm {

6240Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
return new LoopVectorize(NoUnrolling, AlwaysVectorize);
6242}

6244} // end namespace llvm

6246bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
// Check if the pointer operand of a load or store instruction is
// consecutive.
if (auto *Ptr = getLoadStorePointerOperand(Inst))
  return Legal->isConsecutivePtr(Ptr);
return false;
6252}

6254void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore ephemeral values.
CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);

// Ignore type-promoting instructions we identified during reduction
// detection.
for (auto &Reduction : *Legal->getReductionVars()) {
  RecurrenceDescriptor &RedDes = Reduction.second;
  SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
  VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
// Ignore type-casting instructions we identified during induction
// detection.
for (auto &Induction : *Legal->getInductionVars()) {
  InductionDescriptor &IndDes = Induction.second;
  const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
  VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
6272}

6274VectorizationFactor
6275LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
                                              unsigned UserVF) {
// Width 1 means no vectorization, cost 0 means uncomputed cost.
const VectorizationFactor NoVectorization = {1U, 0U};

// Outer loop handling: They may require CFG and instruction level
// transformations before even evaluating whether vectorization is profitable.
// Since we cannot modify the incoming IR, we need to build VPlan upfront in
// the vectorization pipeline.
if (!OrigLoop->empty()) {
  // TODO: If UserVF is not provided, we set UserVF to 4 for stress testing.
  // This won't be necessary when UserVF is not required in the VPlan-native
  // path.
  if (VPlanBuildStressTest && !UserVF)
    UserVF = 4;

  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.")(static_cast <bool> (EnableVPlanNativePath && "VPlan-native path is not enabled."
) ? void (0) : __assert_fail ("EnableVPlanNativePath && \"VPlan-native path is not enabled.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6291, __extension__ __PRETTY_FUNCTION__));
  assert(UserVF && "Expected UserVF for outer loop vectorization.")(static_cast <bool> (UserVF && "Expected UserVF for outer loop vectorization."
) ? void (0) : __assert_fail ("UserVF && \"Expected UserVF for outer loop vectorization.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6292, __extension__ __PRETTY_FUNCTION__));
  assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two")(static_cast <bool> (isPowerOf2_32(UserVF) && "VF needs to be a power of two"
) ? void (0) : __assert_fail ("isPowerOf2_32(UserVF) && \"VF needs to be a power of two\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6293, __extension__ __PRETTY_FUNCTION__));
  LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Using user VF " <<
 UserVF << ".\n"; } } while (false);
  buildVPlans(UserVF, UserVF);

  // For VPlan build stress testing, we bail out after VPlan construction.
  if (VPlanBuildStressTest)
    return NoVectorization;

  return {UserVF, 0};
}

LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
 "VPlan-native path.\n"; } } while (false)
    dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
 "VPlan-native path.\n"; } } while (false)
              "VPlan-native path.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
 "VPlan-native path.\n"; } } while (false);
return NoVectorization;
6308}

6310VectorizationFactor
6311LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
assert(OrigLoop->empty() && "Inner loop expected.")(static_cast <bool> (OrigLoop->empty() && "Inner loop expected."
) ? void (0) : __assert_fail ("OrigLoop->empty() && \"Inner loop expected.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6312, __extension__ __PRETTY_FUNCTION__));
// Width 1 means no vectorization, cost 0 means uncomputed cost.
const VectorizationFactor NoVectorization = {1U, 0U};
Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
  return NoVectorization;

if (UserVF) {
  LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Using user VF " <<
 UserVF << ".\n"; } } while (false);
  assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two")(static_cast <bool> (isPowerOf2_32(UserVF) && "VF needs to be a power of two"
) ? void (0) : __assert_fail ("isPowerOf2_32(UserVF) && \"VF needs to be a power of two\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6321, __extension__ __PRETTY_FUNCTION__));
  // Collect the instructions (and their associated costs) that will be more
  // profitable to scalarize.
  CM.selectUserVectorizationFactor(UserVF);
  buildVPlansWithVPRecipes(UserVF, UserVF);
  LLVM_DEBUG(printPlans(dbgs()))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { printPlans(dbgs()); } } while (false);
  return {UserVF, 0};
}

unsigned MaxVF = MaybeMaxVF.getValue();
assert(MaxVF != 0 && "MaxVF is zero.")(static_cast <bool> (MaxVF != 0 && "MaxVF is zero."
) ? void (0) : __assert_fail ("MaxVF != 0 && \"MaxVF is zero.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6331, __extension__ __PRETTY_FUNCTION__));

for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
  // Collect Uniform and Scalar instructions after vectorization with VF.
  CM.collectUniformsAndScalars(VF);

  // Collect the instructions (and their associated costs) that will be more
  // profitable to scalarize.
  if (VF > 1)
    CM.collectInstsToScalarize(VF);
}

buildVPlansWithVPRecipes(1, MaxVF);
LLVM_DEBUG(printPlans(dbgs()))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { printPlans(dbgs()); } } while (false);
if (MaxVF == 1)
  return NoVectorization;

// Select the optimal vectorization factor.
return CM.selectVectorizationFactor(MaxVF);
6350}

6352void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UFdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Setting best plan to VF="
 << VF << ", UF=" << UF << '\n'; } } while
 (false)
                  << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Setting best plan to VF="
 << VF << ", UF=" << UF << '\n'; } } while
 (false);
BestVF = VF;
BestUF = UF;

erase_if(VPlans, [VF](const VPlanPtr &Plan) {
  return !Plan->hasVF(VF);
});
assert(VPlans.size() == 1 && "Best VF has not a single VPlan.")(static_cast <bool> (VPlans.size() == 1 && "Best VF has not a single VPlan."
) ? void (0) : __assert_fail ("VPlans.size() == 1 && \"Best VF has not a single VPlan.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6361, __extension__ __PRETTY_FUNCTION__));
6362}

6364void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                                         DominatorTree *DT) {
// Perform the actual loop transformation.

// 1. Create a new empty loop. Unlink the old loop and connect the new one.
VPCallbackILV CallbackILV(ILV);

VPTransformState State{BestVF, BestUF,      LI,
                       DT,     ILV.Builder, ILV.VectorLoopValueMap,
                       &ILV,   CallbackILV};
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();

//===------------------------------------------------===//
//
// Notice: any optimization or new instruction that go
// into the code below should also be implemented in
// the cost-model.
//
//===------------------------------------------------===//

// 2. Copy and widen instructions from the old loop into the new loop.
assert(VPlans.size() == 1 && "Not a single VPlan to execute.")(static_cast <bool> (VPlans.size() == 1 && "Not a single VPlan to execute."
) ? void (0) : __assert_fail ("VPlans.size() == 1 && \"Not a single VPlan to execute.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6385, __extension__ __PRETTY_FUNCTION__));
VPlans.front()->execute(&State);

// 3. Fix the vectorized code: take care of header phi's, live-outs,
//    predication, updating analyses.
ILV.fixVectorizedLoop();
6391}

6393void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
  SmallPtrSetImpl<Instruction *> &DeadInstructions) {
BasicBlock *Latch = OrigLoop->getLoopLatch();

// We create new control-flow for the vectorized loop, so the original
// condition will be dead after vectorization if it's only used by the
// branch.
auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
if (Cmp && Cmp->hasOneUse())
  DeadInstructions.insert(Cmp);

// We create new "steps" for induction variable updates to which the original
// induction variables map. An original update instruction will be dead if
// all its users except the induction variable are dead.
for (auto &Induction : *Legal->getInductionVars()) {
  PHINode *Ind = Induction.first;
  auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
  if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
        return U == Ind || DeadInstructions.count(cast<Instruction>(U));
      }))
    DeadInstructions.insert(IndUpdate);

  // We record as "Dead" also the type-casting instructions we had identified 
  // during induction analysis. We don't need any handling for them in the
  // vectorized loop because we have proven that, under a proper runtime 
  // test guarding the vectorized loop, the value of the phi, and the casted 
  // value of the phi, are the same. The last instruction in this casting chain
  // will get its scalar/vector/widened def from the scalar/vector/widened def 
  // of the respective phi node. Any other casts in the induction def-use chain
  // have no other uses outside the phi update chain, and will be ignored.
  InductionDescriptor &IndDes = Induction.second;
  const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
  DeadInstructions.insert(Casts.begin(), Casts.end());
}
6427}

6429Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

6431Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

6433Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
                                      Instruction::BinaryOps BinOp) {
// When unrolling and the VF is 1, we only need to add a simple scalar.
Type *Ty = Val->getType();
assert(!Ty->isVectorTy() && "Val must be a scalar")(static_cast <bool> (!Ty->isVectorTy() && "Val must be a scalar"
) ? void (0) : __assert_fail ("!Ty->isVectorTy() && \"Val must be a scalar\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6437, __extension__ __PRETTY_FUNCTION__));

if (Ty->isFloatingPointTy()) {
  Constant *C = ConstantFP::get(Ty, (double)StartIdx);

  // Floating point operations had to be 'fast' to enable the unrolling.
  Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
  return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
}
Constant *C = ConstantInt::get(Ty, StartIdx);
return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6448}

6450static void AddRuntimeUnrollDisableMetaData(Loop *L) {
SmallVector<Metadata *, 4> MDs;
// Reserve first location for self reference to the LoopID metadata node.
MDs.push_back(nullptr);
bool IsUnrollMetadata = false;
MDNode *LoopID = L->getLoopID();
if (LoopID) {
  // First find existing loop unrolling disable metadata.
  for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
    auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
    if (MD) {
      const auto *S = dyn_cast<MDString>(MD->getOperand(0));
      IsUnrollMetadata =
          S && S->getString().startswith("llvm.loop.unroll.disable");
    }
    MDs.push_back(LoopID->getOperand(i));
  }
}

if (!IsUnrollMetadata) {
  // Add runtime unroll disable metadata.
  LLVMContext &Context = L->getHeader()->getContext();
  SmallVector<Metadata *, 1> DisableOperands;
  DisableOperands.push_back(
      MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
  MDNode *DisableNode = MDNode::get(Context, DisableOperands);
  MDs.push_back(DisableNode);
  MDNode *NewLoopID = MDNode::get(Context, MDs);
  // Set operand 0 to refer to the loop id itself.
  NewLoopID->replaceOperandWith(0, NewLoopID);
  L->setLoopID(NewLoopID);
}
6482}

6484bool LoopVectorizationPlanner::getDecisionAndClampRange(
  const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
assert(Range.End > Range.Start && "Trying to test an empty VF range.")(static_cast <bool> (Range.End > Range.Start &&
 "Trying to test an empty VF range.") ? void (0) : __assert_fail
 ("Range.End > Range.Start && \"Trying to test an empty VF range.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6486, __extension__ __PRETTY_FUNCTION__));
bool PredicateAtRangeStart = Predicate(Range.Start);

for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
  if (Predicate(TmpVF) != PredicateAtRangeStart) {
    Range.End = TmpVF;
    break;
  }

return PredicateAtRangeStart;
6496}

6498/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6499/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6500/// of VF's starting at a given VF and extending it as much as possible. Each
6501/// vectorization decision can potentially shorten this sub-range during
6502/// buildVPlan().
6503void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
for (unsigned VF = MinVF; VF < MaxVF + 1;) {
  VFRange SubRange = {VF, MaxVF + 1};
  VPlans.push_back(buildVPlan(SubRange));
  VF = SubRange.End;
}
6509}

6511VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
                                       VPlanPtr &Plan) {
assert(is_contained(predecessors(Dst), Src) && "Invalid edge")(static_cast <bool> (is_contained(predecessors(Dst), Src
) && "Invalid edge") ? void (0) : __assert_fail ("is_contained(predecessors(Dst), Src) && \"Invalid edge\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6513, __extension__ __PRETTY_FUNCTION__));

// Look for cached value.
std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
if (ECEntryIt != EdgeMaskCache.end())
7
←
Assuming the condition is false→
8
←
Taking false branch→
  return ECEntryIt->second;

VPValue *SrcMask = createBlockInMask(Src, Plan);
9
←
Calling 'VPRecipeBuilder::createBlockInMask'→

// The terminator has to be a branch inst!
BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
assert(BI && "Unexpected terminator found")(static_cast <bool> (BI && "Unexpected terminator found"
) ? void (0) : __assert_fail ("BI && \"Unexpected terminator found\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6525, __extension__ __PRETTY_FUNCTION__));

if (!BI->isConditional())
  return EdgeMaskCache[Edge] = SrcMask;

VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
assert(EdgeMask && "No Edge Mask found for condition")(static_cast <bool> (EdgeMask && "No Edge Mask found for condition"
) ? void (0) : __assert_fail ("EdgeMask && \"No Edge Mask found for condition\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6531, __extension__ __PRETTY_FUNCTION__));

if (BI->getSuccessor(0) != Dst)
  EdgeMask = Builder.createNot(EdgeMask);

if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
  EdgeMask = Builder.createAnd(EdgeMask, SrcMask);

return EdgeMaskCache[Edge] = EdgeMask;
6540}

6542VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
assert(OrigLoop->contains(BB) && "Block is not a part of a loop")(static_cast <bool> (OrigLoop->contains(BB) &&
 "Block is not a part of a loop") ? void (0) : __assert_fail (
"OrigLoop->contains(BB) && \"Block is not a part of a loop\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6543, __extension__ __PRETTY_FUNCTION__));

// Look for cached value.
BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
if (BCEntryIt != BlockMaskCache.end())
2
←
Assuming the condition is false→
3
←
Taking false branch→
10
←
Assuming the condition is false→
11
←
Taking false branch→
  return BCEntryIt->second;

// All-one mask is modelled as no-mask following the convention for masked
// load/store/gather/scatter. Initialize BlockMask to no-mask.
VPValue *BlockMask = nullptr;

// Loop incoming mask is all-one.
if (OrigLoop->getHeader() == BB)
4
←
Assuming the condition is false→
5
←
Taking false branch→
12
←
Assuming the condition is false→
13
←
Taking false branch→
  return BlockMaskCache[BB] = BlockMask;

// This is the block mask. We OR all incoming edges.
for (auto *Predecessor : predecessors(BB)) {
  VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6
←
Calling 'VPRecipeBuilder::createEdgeMask'→
  if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
14
←
Assuming 'EdgeMask' is non-null→
15
←
Taking false branch→
18
←
Assuming 'EdgeMask' is non-null→
19
←
Taking false branch→
30
←
Assuming 'EdgeMask' is null→
31
←
Taking true branch→
    return BlockMaskCache[BB] = EdgeMask;
32
←
Potential leak of memory pointed to by 'BlockMask'

  if (!BlockMask) { // BlockMask has its initialized nullptr value.
16
←
Taking true branch→
20
←
Taking false branch→
    BlockMask = EdgeMask;
    continue;
17
←
 Execution continues on line 6559→
  }

  BlockMask = Builder.createOr(BlockMask, EdgeMask);
21
←
Calling 'VPBuilder::createOr'→
29
←
Returned allocated memory→
}

return BlockMaskCache[BB] = BlockMask;
6573}

6575VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
                                                         VFRange &Range) {
const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I);
if (!IG)
  return nullptr;

// Now check if IG is relevant for VF's in the given range.
auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
  return [=](unsigned VF) -> bool {
    return (VF >= 2 && // Query is illegal for VF == 1
            CM.getWideningDecision(I, VF) ==
                LoopVectorizationCostModel::CM_Interleave);
  };
};
if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
  return nullptr;

// I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
// range. If it's the primary member of the IG construct a VPInterleaveRecipe.
// Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
assert(I == IG->getInsertPos() &&(static_cast <bool> (I == IG->getInsertPos() &&
 "Generating a recipe for an adjunct member of an interleave group"
) ? void (0) : __assert_fail ("I == IG->getInsertPos() && \"Generating a recipe for an adjunct member of an interleave group\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6596, __extension__ __PRETTY_FUNCTION__))
       "Generating a recipe for an adjunct member of an interleave group")(static_cast <bool> (I == IG->getInsertPos() &&
 "Generating a recipe for an adjunct member of an interleave group"
) ? void (0) : __assert_fail ("I == IG->getInsertPos() && \"Generating a recipe for an adjunct member of an interleave group\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6596, __extension__ __PRETTY_FUNCTION__));

return new VPInterleaveRecipe(IG);
6599}

6601VPWidenMemoryInstructionRecipe *
6602VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
                                VPlanPtr &Plan) {
if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
  return nullptr;

auto willWiden = [&](unsigned VF) -> bool {
  if (VF == 1)
    return false;
  if (CM.isScalarAfterVectorization(I, VF) ||
      CM.isProfitableToScalarize(I, VF))
    return false;
  LoopVectorizationCostModel::InstWidening Decision =
      CM.getWideningDecision(I, VF);
  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&(static_cast <bool> (Decision != LoopVectorizationCostModel
::CM_Unknown && "CM decision should be taken at this point."
) ? void (0) : __assert_fail ("Decision != LoopVectorizationCostModel::CM_Unknown && \"CM decision should be taken at this point.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6616, __extension__ __PRETTY_FUNCTION__))
         "CM decision should be taken at this point.")(static_cast <bool> (Decision != LoopVectorizationCostModel
::CM_Unknown && "CM decision should be taken at this point."
) ? void (0) : __assert_fail ("Decision != LoopVectorizationCostModel::CM_Unknown && \"CM decision should be taken at this point.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6616, __extension__ __PRETTY_FUNCTION__));
  assert(Decision != LoopVectorizationCostModel::CM_Interleave &&(static_cast <bool> (Decision != LoopVectorizationCostModel
::CM_Interleave && "Interleave memory opportunity should be caught earlier."
) ? void (0) : __assert_fail ("Decision != LoopVectorizationCostModel::CM_Interleave && \"Interleave memory opportunity should be caught earlier.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6618, __extension__ __PRETTY_FUNCTION__))
         "Interleave memory opportunity should be caught earlier.")(static_cast <bool> (Decision != LoopVectorizationCostModel
::CM_Interleave && "Interleave memory opportunity should be caught earlier."
) ? void (0) : __assert_fail ("Decision != LoopVectorizationCostModel::CM_Interleave && \"Interleave memory opportunity should be caught earlier.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6618, __extension__ __PRETTY_FUNCTION__));
  return Decision != LoopVectorizationCostModel::CM_Scalarize;
};

if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
  return nullptr;

VPValue *Mask = nullptr;
if (Legal->isMaskRequired(I))
  Mask = createBlockInMask(I->getParent(), Plan);

return new VPWidenMemoryInstructionRecipe(*I, Mask);
6630}

6632VPWidenIntOrFpInductionRecipe *
6633VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
if (PHINode *Phi = dyn_cast<PHINode>(I)) {
  // Check if this is an integer or fp induction. If so, build the recipe that
  // produces its scalar and vector values.
  InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
  if (II.getKind() == InductionDescriptor::IK_IntInduction ||
      II.getKind() == InductionDescriptor::IK_FpInduction)
    return new VPWidenIntOrFpInductionRecipe(Phi);

  return nullptr;
}

// Optimize the special case where the source is a constant integer
// induction variable. Notice that we can only optimize the 'trunc' case
// because (a) FP conversions lose precision, (b) sext/zext may wrap, and
// (c) other casts depend on pointer size.

// Determine whether \p K is a truncation based on an induction variable that
// can be optimized.
auto isOptimizableIVTruncate =
    [&](Instruction *K) -> std::function<bool(unsigned)> {
  return
      [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
};

if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
                             isOptimizableIVTruncate(I), Range))
  return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
                                           cast<TruncInst>(I));
return nullptr;
6663}

6665VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
PHINode *Phi = dyn_cast<PHINode>(I);
if (!Phi || Phi->getParent() == OrigLoop->getHeader())
  return nullptr;

// We know that all PHIs in non-header blocks are converted into selects, so
// we don't have to worry about the insertion order and we can just use the
// builder. At this point we generate the predication tree. There may be
// duplications since this is a simple recursive scan, but future
// optimizations will clean it up.

SmallVector<VPValue *, 2> Masks;
unsigned NumIncoming = Phi->getNumIncomingValues();
for (unsigned In = 0; In < NumIncoming; In++) {
  VPValue *EdgeMask =
    createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
  assert((EdgeMask || NumIncoming == 1) &&(static_cast <bool> ((EdgeMask || NumIncoming == 1) &&
 "Multiple predecessors with one having a full mask") ? void (
0) : __assert_fail ("(EdgeMask || NumIncoming == 1) && \"Multiple predecessors with one having a full mask\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6682, __extension__ __PRETTY_FUNCTION__))
         "Multiple predecessors with one having a full mask")(static_cast <bool> ((EdgeMask || NumIncoming == 1) &&
 "Multiple predecessors with one having a full mask") ? void (
0) : __assert_fail ("(EdgeMask || NumIncoming == 1) && \"Multiple predecessors with one having a full mask\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6682, __extension__ __PRETTY_FUNCTION__));
  if (EdgeMask)
    Masks.push_back(EdgeMask);
}
return new VPBlendRecipe(Phi, Masks);
6687}

6689bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
                               VFRange &Range) {
if (CM.isScalarWithPredication(I))
  return false;

auto IsVectorizableOpcode = [](unsigned Opcode) {
  switch (Opcode) {
  case Instruction::Add:
  case Instruction::And:
  case Instruction::AShr:
  case Instruction::BitCast:
  case Instruction::Br:
  case Instruction::Call:
  case Instruction::FAdd:
  case Instruction::FCmp:
  case Instruction::FDiv:
  case Instruction::FMul:
  case Instruction::FPExt:
  case Instruction::FPToSI:
  case Instruction::FPToUI:
  case Instruction::FPTrunc:
  case Instruction::FRem:
  case Instruction::FSub:
  case Instruction::GetElementPtr:
  case Instruction::ICmp:
  case Instruction::IntToPtr:
  case Instruction::Load:
  case Instruction::LShr:
  case Instruction::Mul:
  case Instruction::Or:
  case Instruction::PHI:
  case Instruction::PtrToInt:
  case Instruction::SDiv:
  case Instruction::Select:
  case Instruction::SExt:
  case Instruction::Shl:
  case Instruction::SIToFP:
  case Instruction::SRem:
  case Instruction::Store:
  case Instruction::Sub:
  case Instruction::Trunc:
  case Instruction::UDiv:
  case Instruction::UIToFP:
  case Instruction::URem:
  case Instruction::Xor:
  case Instruction::ZExt:
    return true;
  }
  return false;
};

if (!IsVectorizableOpcode(I->getOpcode()))
  return false;

if (CallInst *CI = dyn_cast<CallInst>(I)) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
    return false;
}

auto willWiden = [&](unsigned VF) -> bool {
  if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
                           CM.isProfitableToScalarize(I, VF)))
    return false;
  if (CallInst *CI = dyn_cast<CallInst>(I)) {
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag shows whether we use Intrinsic or a usual Call for vectorized
    // version of the instruction.
    // Is it beneficial to perform intrinsic call compared to lib call?
    bool NeedToScalarize;
    unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
    bool UseVectorIntrinsic =
        ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
    return UseVectorIntrinsic || !NeedToScalarize;
  }
  if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
    assert(CM.getWideningDecision(I, VF) ==(static_cast <bool> (CM.getWideningDecision(I, VF) == LoopVectorizationCostModel
::CM_Scalarize && "Memory widening decisions should have been taken care by now"
) ? void (0) : __assert_fail ("CM.getWideningDecision(I, VF) == LoopVectorizationCostModel::CM_Scalarize && \"Memory widening decisions should have been taken care by now\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6769, __extension__ __PRETTY_FUNCTION__))
               LoopVectorizationCostModel::CM_Scalarize &&(static_cast <bool> (CM.getWideningDecision(I, VF) == LoopVectorizationCostModel
::CM_Scalarize && "Memory widening decisions should have been taken care by now"
) ? void (0) : __assert_fail ("CM.getWideningDecision(I, VF) == LoopVectorizationCostModel::CM_Scalarize && \"Memory widening decisions should have been taken care by now\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6769, __extension__ __PRETTY_FUNCTION__))
           "Memory widening decisions should have been taken care by now")(static_cast <bool> (CM.getWideningDecision(I, VF) == LoopVectorizationCostModel
::CM_Scalarize && "Memory widening decisions should have been taken care by now"
) ? void (0) : __assert_fail ("CM.getWideningDecision(I, VF) == LoopVectorizationCostModel::CM_Scalarize && \"Memory widening decisions should have been taken care by now\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6769, __extension__ __PRETTY_FUNCTION__));
    return false;
  }
  return true;
};

if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
  return false;

// Success: widen this instruction. We optimize the common case where
// consecutive instructions can be represented by a single recipe.
if (!VPBB->empty()) {
  VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
  if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
    return true;
}

VPBB->appendRecipe(new VPWidenRecipe(I));
return true;
6788}

6790VPBasicBlock *VPRecipeBuilder::handleReplication(
  Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
  DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
  VPlanPtr &Plan) {
bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
    [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
    Range);

bool IsPredicated = CM.isScalarWithPredication(I);
auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);

// Find if I uses a predicated instruction. If so, it will use its scalar
// value. Avoid hoisting the insert-element which packs the scalar value into
// a vector value, as that happens iff all users use the vector value.
for (auto &Op : I->operands())
  if (auto *PredInst = dyn_cast<Instruction>(Op))
    if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
      PredInst2Recipe[PredInst]->setAlsoPack(false);

// Finalize the recipe for Instr, first if it is not predicated.
if (!IsPredicated) {
  LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Scalarizing:" <<
 *I << "\n"; } } while (false);
  VPBB->appendRecipe(Recipe);
  return VPBB;
}
LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Scalarizing and predicating:"
 << *I << "\n"; } } while (false);
assert(VPBB->getSuccessors().empty() &&(static_cast <bool> (VPBB->getSuccessors().empty() &&
 "VPBB has successors when handling predicated replication.")
 ? void (0) : __assert_fail ("VPBB->getSuccessors().empty() && \"VPBB has successors when handling predicated replication.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6817, __extension__ __PRETTY_FUNCTION__))
       "VPBB has successors when handling predicated replication.")(static_cast <bool> (VPBB->getSuccessors().empty() &&
 "VPBB has successors when handling predicated replication.")
 ? void (0) : __assert_fail ("VPBB->getSuccessors().empty() && \"VPBB has successors when handling predicated replication.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6817, __extension__ __PRETTY_FUNCTION__));
// Record predicated instructions for above packing optimizations.
PredInst2Recipe[I] = Recipe;
VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
VPBlockUtils::insertBlockAfter(Region, VPBB);
auto *RegSucc = new VPBasicBlock();
VPBlockUtils::insertBlockAfter(RegSucc, Region);
return RegSucc;
6825}

6827VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                    VPRecipeBase *PredRecipe,
                                                    VPlanPtr &Plan) {
// Instructions marked for predication are replicated and placed under an
// if-then construct to prevent side-effects.

// Generate recipes to compute the block mask for this region.
VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
1
Calling 'VPRecipeBuilder::createBlockInMask'→

// Build the triangular if-then region.
std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
assert(Instr->getParent() && "Predicated instruction not in any basic block")(static_cast <bool> (Instr->getParent() && "Predicated instruction not in any basic block"
) ? void (0) : __assert_fail ("Instr->getParent() && \"Predicated instruction not in any basic block\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6838, __extension__ __PRETTY_FUNCTION__));
auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
auto *PHIRecipe =
    Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

// Note: first set Entry as region entry and then connect successors starting
// from it in order, to propagate the "parent" of each VPBasicBlock.
VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
VPBlockUtils::connectBlocks(Pred, Exit);

return Region;
6853}

6855bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
                                      VPlanPtr &Plan, VPBasicBlock *VPBB) {
VPRecipeBase *Recipe = nullptr;
// Check if Instr should belong to an interleave memory recipe, or already
// does. In the latter case Instr is irrelevant.
if ((Recipe = tryToInterleaveMemory(Instr, Range))) {
  VPBB->appendRecipe(Recipe);
  return true;
}

// Check if Instr is a memory operation that should be widened.
if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
  VPBB->appendRecipe(Recipe);
  return true;
}

// Check if Instr should form some PHI recipe.
if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
  VPBB->appendRecipe(Recipe);
  return true;
}
if ((Recipe = tryToBlend(Instr, Plan))) {
  VPBB->appendRecipe(Recipe);
  return true;
}
if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
  VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
  return true;
}

// Check if Instr is to be widened by a general VPWidenRecipe, after
// having first checked for specific widening recipes that deal with
// Interleave Groups, Inductions and Phi nodes.
if (tryToWiden(Instr, VPBB, Range))
  return true;

return false;
6892}

6894void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
                                                      unsigned MaxVF) {
assert(OrigLoop->empty() && "Inner loop expected.")(static_cast <bool> (OrigLoop->empty() && "Inner loop expected."
) ? void (0) : __assert_fail ("OrigLoop->empty() && \"Inner loop expected.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 6896, __extension__ __PRETTY_FUNCTION__));

// Collect conditions feeding internal conditional branches; they need to be
// represented in VPlan for it to model masking.
SmallPtrSet<Value *, 1> NeedDef;

auto *Latch = OrigLoop->getLoopLatch();
for (BasicBlock *BB : OrigLoop->blocks()) {
  if (BB == Latch)
    continue;
  BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
  if (Branch && Branch->isConditional())
    NeedDef.insert(Branch->getCondition());
}

// Collect instructions from the original loop that will become trivially dead
// in the vectorized loop. We don't need to vectorize these instructions. For
// example, original induction update instructions can become dead because we
// separately emit induction "steps" when generating code for the new loop.
// Similarly, we create a new latch condition when setting up the structure
// of the new loop, so the old one can become dead.
SmallPtrSet<Instruction *, 4> DeadInstructions;
collectTriviallyDeadInstructions(DeadInstructions);

for (unsigned VF = MinVF; VF < MaxVF + 1;) {
  VFRange SubRange = {VF, MaxVF + 1};
  VPlans.push_back(
      buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
  VF = SubRange.End;
}
6926}

6928LoopVectorizationPlanner::VPlanPtr
6929LoopVectorizationPlanner::buildVPlanWithVPRecipes(
  VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
  SmallPtrSetImpl<Instruction *> &DeadInstructions) {
// Hold a mapping from predicated instructions to their recipes, in order to
// fix their AlsoPack behavior if a user is determined to replicate and use a
// scalar instead of vector value.
DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
DenseMap<Instruction *, Instruction *> SinkAfterInverse;

// Create a dummy pre-entry VPBasicBlock to start building the VPlan.
VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
auto Plan = llvm::make_unique<VPlan>(VPBB);

VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, TTI, Legal, CM, Builder);
// Represent values that will have defs inside VPlan.
for (Value *V : NeedDef)
  Plan->addVPValue(V);

// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
LoopBlocksDFS DFS(OrigLoop);
DFS.perform(LI);

for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
  // Relevant instructions from basic block BB will be grouped into VPRecipe
  // ingredients and fill a new VPBasicBlock.
  unsigned VPBBsForBB = 0;
  auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
  VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
  VPBB = FirstVPBBForBB;
  Builder.setInsertPoint(VPBB);

  std::vector<Instruction *> Ingredients;

  // Organize the ingredients to vectorize from current basic block in the
  // right order.
  for (Instruction &I : BB->instructionsWithoutDebug()) {
    Instruction *Instr = &I;

    // First filter out irrelevant instructions, to ensure no recipes are
    // built for them.
    if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
      continue;

    // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
    // member of the IG, do not construct any Recipe for it.
    const InterleaveGroup *IG = CM.getInterleavedAccessGroup(Instr);
    if (IG && Instr != IG->getInsertPos() &&
        Range.Start >= 2 && // Query is illegal for VF == 1
        CM.getWideningDecision(Instr, Range.Start) ==
            LoopVectorizationCostModel::CM_Interleave) {
      if (SinkAfterInverse.count(Instr))
        Ingredients.push_back(SinkAfterInverse.find(Instr)->second);
      continue;
    }

    // Move instructions to handle first-order recurrences, step 1: avoid
    // handling this instruction until after we've handled the instruction it
    // should follow.
    auto SAIt = SinkAfter.find(Instr);
    if (SAIt != SinkAfter.end()) {
      LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Sinking" << *SAIt
->first << " after" << *SAIt->second <<
 " to vectorize a 1st order recurrence.\n"; } } while (false)
                        << *SAIt->seconddo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Sinking" << *SAIt
->first << " after" << *SAIt->second <<
 " to vectorize a 1st order recurrence.\n"; } } while (false)
                        << " to vectorize a 1st order recurrence.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "Sinking" << *SAIt
->first << " after" << *SAIt->second <<
 " to vectorize a 1st order recurrence.\n"; } } while (false);
      SinkAfterInverse[SAIt->second] = Instr;
      continue;
    }

    Ingredients.push_back(Instr);

    // Move instructions to handle first-order recurrences, step 2: push the
    // instruction to be sunk at its insertion point.
    auto SAInvIt = SinkAfterInverse.find(Instr);
    if (SAInvIt != SinkAfterInverse.end())
      Ingredients.push_back(SAInvIt->second);
  }

  // Introduce each ingredient into VPlan.
  for (Instruction *Instr : Ingredients) {
    if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
      continue;

    // Otherwise, if all widening options failed, Instruction is to be
    // replicated. This may create a successor for VPBB.
    VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
        Instr, Range, VPBB, PredInst2Recipe, Plan);
    if (NextVPBB != VPBB) {
      VPBB = NextVPBB;
      VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                  : "");
    }
  }
}

// Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
// may also be empty, such as the last one VPBB, reflecting original
// basic-blocks with no recipes.
VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
assert(PreEntry->empty() && "Expecting empty pre-entry block.")(static_cast <bool> (PreEntry->empty() && "Expecting empty pre-entry block."
) ? void (0) : __assert_fail ("PreEntry->empty() && \"Expecting empty pre-entry block.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7029, __extension__ __PRETTY_FUNCTION__));
VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
VPBlockUtils::disconnectBlocks(PreEntry, Entry);
delete PreEntry;

std::string PlanName;
raw_string_ostream RSO(PlanName);
unsigned VF = Range.Start;
Plan->addVF(VF);
RSO << "Initial VPlan for VF={" << VF;
for (VF *= 2; VF < Range.End; VF *= 2) {
  Plan->addVF(VF);
  RSO << "," << VF;
}
RSO << "},UF>=1";
RSO.flush();
Plan->setName(PlanName);

return Plan;
7048}

7050LoopVectorizationPlanner::VPlanPtr
7051LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
// Outer loop handling: They may require CFG and instruction level
// transformations before even evaluating whether vectorization is profitable.
// Since we cannot modify the incoming IR, we need to build VPlan upfront in
// the vectorization pipeline.
assert(!OrigLoop->empty())(static_cast <bool> (!OrigLoop->empty()) ? void (0) :
 __assert_fail ("!OrigLoop->empty()", "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7056, __extension__ __PRETTY_FUNCTION__));
assert(EnableVPlanNativePath && "VPlan-native path is not enabled.")(static_cast <bool> (EnableVPlanNativePath && "VPlan-native path is not enabled."
) ? void (0) : __assert_fail ("EnableVPlanNativePath && \"VPlan-native path is not enabled.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7057, __extension__ __PRETTY_FUNCTION__));

// Create new empty VPlan
auto Plan = llvm::make_unique<VPlan>();

// Build hierarchical CFG
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI);
HCFGBuilder.buildHierarchicalCFG(*Plan.get());

return Plan;
7067}

7069Value* LoopVectorizationPlanner::VPCallbackILV::
7070getOrCreateVectorValues(Value *V, unsigned Part) {
    return ILV.getOrCreateVectorValue(V, Part);
7072}

7074void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
O << " +\n"
  << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
IG->getInsertPos()->printAsOperand(O, false);
O << "\\l\"";
for (unsigned i = 0; i < IG->getFactor(); ++i)
  if (Instruction *I = IG->getMember(i))
    O << " +\n"
      << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7083}

7085void VPWidenRecipe::execute(VPTransformState &State) {
for (auto &Instr : make_range(Begin, End))
  State.ILV->widenInstruction(Instr);
7088}

7090void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Int or FP induction being replicated.")(static_cast <bool> (!State.Instance && "Int or FP induction being replicated."
) ? void (0) : __assert_fail ("!State.Instance && \"Int or FP induction being replicated.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7091, __extension__ __PRETTY_FUNCTION__));
State.ILV->widenIntOrFpInduction(IV, Trunc);
7093}

7095void VPWidenPHIRecipe::execute(VPTransformState &State) {
State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7097}

7099void VPBlendRecipe::execute(VPTransformState &State) {
State.ILV->setDebugLocFromInst(State.Builder, Phi);
// We know that all PHIs in non-header blocks are converted into
// selects, so we don't have to worry about the insertion order and we
// can just use the builder.
// At this point we generate the predication tree. There may be
// duplications since this is a simple recursive scan, but future
// optimizations will clean it up.

unsigned NumIncoming = Phi->getNumIncomingValues();

assert((User || NumIncoming == 1) &&(static_cast <bool> ((User || NumIncoming == 1) &&
 "Multiple predecessors with predecessors having a full mask"
) ? void (0) : __assert_fail ("(User || NumIncoming == 1) && \"Multiple predecessors with predecessors having a full mask\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7111, __extension__ __PRETTY_FUNCTION__))
       "Multiple predecessors with predecessors having a full mask")(static_cast <bool> ((User || NumIncoming == 1) &&
 "Multiple predecessors with predecessors having a full mask"
) ? void (0) : __assert_fail ("(User || NumIncoming == 1) && \"Multiple predecessors with predecessors having a full mask\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7111, __extension__ __PRETTY_FUNCTION__));
// Generate a sequence of selects of the form:
// SELECT(Mask3, In3,
//      SELECT(Mask2, In2,
//                   ( ...)))
InnerLoopVectorizer::VectorParts Entry(State.UF);
for (unsigned In = 0; In < NumIncoming; ++In) {
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    // We might have single edge PHIs (blocks) - use an identity
    // 'select' for the first PHI operand.
    Value *In0 =
        State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
    if (In == 0)
      Entry[Part] = In0; // Initialize with the first incoming value.
    else {
      // Select between the current value and the previous incoming edge
      // based on the incoming mask.
      Value *Cond = State.get(User->getOperand(In), Part);
      Entry[Part] =
          State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
    }
  }
}
for (unsigned Part = 0; Part < State.UF; ++Part)
  State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7136}

7138void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Interleave group being replicated.")(static_cast <bool> (!State.Instance && "Interleave group being replicated."
) ? void (0) : __assert_fail ("!State.Instance && \"Interleave group being replicated.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7139, __extension__ __PRETTY_FUNCTION__));
State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7141}

7143void VPReplicateRecipe::execute(VPTransformState &State) {
if (State.Instance) { // Generate a single instance.
  State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
  // Insert scalar instance packing it into a vector.
  if (AlsoPack && State.VF > 1) {
    // If we're constructing lane 0, initialize to start from undef.
    if (State.Instance->Lane == 0) {
      Value *Undef =
          UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
      State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
    }
    State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
  }
  return;
}

// Generate scalar instances for all VF lanes of all UF parts, unless the
// instruction is uniform inwhich case generate only the first lane for each
// of the UF parts.
unsigned EndLane = IsUniform ? 1 : State.VF;
for (unsigned Part = 0; Part < State.UF; ++Part)
  for (unsigned Lane = 0; Lane < EndLane; ++Lane)
    State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7166}

7168void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
assert(State.Instance && "Branch on Mask works only on single instance.")(static_cast <bool> (State.Instance && "Branch on Mask works only on single instance."
) ? void (0) : __assert_fail ("State.Instance && \"Branch on Mask works only on single instance.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7169, __extension__ __PRETTY_FUNCTION__));

unsigned Part = State.Instance->Part;
unsigned Lane = State.Instance->Lane;

Value *ConditionBit = nullptr;
if (!User) // Block in mask is all-one.
  ConditionBit = State.Builder.getTrue();
else {
  VPValue *BlockInMask = User->getOperand(0);
  ConditionBit = State.get(BlockInMask, Part);
  if (ConditionBit->getType()->isVectorTy())
    ConditionBit = State.Builder.CreateExtractElement(
        ConditionBit, State.Builder.getInt32(Lane));
}

// Replace the temporary unreachable terminator with a new conditional branch,
// whose two destinations will be set later when they are created.
auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
assert(isa<UnreachableInst>(CurrentTerminator) &&(static_cast <bool> (isa<UnreachableInst>(CurrentTerminator
) && "Expected to replace unreachable terminator with conditional branch."
) ? void (0) : __assert_fail ("isa<UnreachableInst>(CurrentTerminator) && \"Expected to replace unreachable terminator with conditional branch.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7189, __extension__ __PRETTY_FUNCTION__))
       "Expected to replace unreachable terminator with conditional branch.")(static_cast <bool> (isa<UnreachableInst>(CurrentTerminator
) && "Expected to replace unreachable terminator with conditional branch."
) ? void (0) : __assert_fail ("isa<UnreachableInst>(CurrentTerminator) && \"Expected to replace unreachable terminator with conditional branch.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7189, __extension__ __PRETTY_FUNCTION__));
auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
CondBr->setSuccessor(0, nullptr);
ReplaceInstWithInst(CurrentTerminator, CondBr);
7193}

7195void VPPredInstPHIRecipe::execute(VPTransformState &State) {
assert(State.Instance && "Predicated instruction PHI works per instance.")(static_cast <bool> (State.Instance && "Predicated instruction PHI works per instance."
) ? void (0) : __assert_fail ("State.Instance && \"Predicated instruction PHI works per instance.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7196, __extension__ __PRETTY_FUNCTION__));
Instruction *ScalarPredInst = cast<Instruction>(
    State.ValueMap.getScalarValue(PredInst, *State.Instance));
BasicBlock *PredicatedBB = ScalarPredInst->getParent();
BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
assert(PredicatingBB && "Predicated block has no single predecessor.")(static_cast <bool> (PredicatingBB && "Predicated block has no single predecessor."
) ? void (0) : __assert_fail ("PredicatingBB && \"Predicated block has no single predecessor.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7201, __extension__ __PRETTY_FUNCTION__));

// By current pack/unpack logic we need to generate only a single phi node: if
// a vector value for the predicated instruction exists at this point it means
// the instruction has vector users only, and a phi for the vector value is
// needed. In this case the recipe of the predicated instruction is marked to
// also do that packing, thereby "hoisting" the insert-element sequence.
// Otherwise, a phi node for the scalar value is needed.
unsigned Part = State.Instance->Part;
if (State.ValueMap.hasVectorValue(PredInst, Part)) {
  Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
  InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
  PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
  VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
  VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
  State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
} else {
  Type *PredInstType = PredInst->getType();
  PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
  Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
  Phi->addIncoming(ScalarPredInst, PredicatedBB);
  State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
}
7224}

7226void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
if (!User)
  return State.ILV->vectorizeMemoryInstruction(&Instr);

// Last (and currently only) operand is a mask.
InnerLoopVectorizer::VectorParts MaskValues(State.UF);
VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
for (unsigned Part = 0; Part < State.UF; ++Part)
  MaskValues[Part] = State.get(Mask, Part);
State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
7236}

7238// Process the loop in the VPlan-native vectorization path. This path builds
7239// VPlan upfront in the vectorization pipeline, which allows to apply
7240// VPlan-to-VPlan transformations from the very beginning without modifying the
7241// input LLVM IR.
7242static bool processLoopInVPlanNativePath(
  Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
  LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
  TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
  OptimizationRemarkEmitter *ORE, LoopVectorizeHints &Hints) {

assert(EnableVPlanNativePath && "VPlan-native path is disabled.")(static_cast <bool> (EnableVPlanNativePath && "VPlan-native path is disabled."
) ? void (0) : __assert_fail ("EnableVPlanNativePath && \"VPlan-native path is disabled.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7248, __extension__ __PRETTY_FUNCTION__));
Function *F = L->getHeader()->getParent();
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                              &Hints, IAI);
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);

// Get user vectorization factor.
unsigned UserVF = Hints.getWidth();

// Check the function attributes to find out if this function should be
// optimized for size.
bool OptForSize =
    Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();

// Plan how to best vectorize, return the best VF and its cost.
LVP.planInVPlanNativePath(OptForSize, UserVF);

// Returning false. We are currently not generating vector code in the VPlan
// native path.
return false;
7272}

7274bool LoopVectorizePass::processLoop(Loop *L) {
assert((EnableVPlanNativePath || L->empty()) &&(static_cast <bool> ((EnableVPlanNativePath || L->empty
()) && "VPlan-native path is not enabled. Only process inner loops."
) ? void (0) : __assert_fail ("(EnableVPlanNativePath || L->empty()) && \"VPlan-native path is not enabled. Only process inner loops.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7276, __extension__ __PRETTY_FUNCTION__))
       "VPlan-native path is not enabled. Only process inner loops.")(static_cast <bool> ((EnableVPlanNativePath || L->empty
()) && "VPlan-native path is not enabled. Only process inner loops."
) ? void (0) : __assert_fail ("(EnableVPlanNativePath || L->empty()) && \"VPlan-native path is not enabled. Only process inner loops.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7276, __extension__ __PRETTY_FUNCTION__));

7278#ifndef NDEBUG
const std::string DebugLocStr = getDebugLocString(L);
7280#endif /* NDEBUG */

LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "\nLV: Checking a loop in \""
 << L->getHeader()->getParent()->getName() <<
 "\" from " << DebugLocStr << "\n"; } } while (false
)
                  << L->getHeader()->getParent()->getName() << "\" from "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "\nLV: Checking a loop in \""
 << L->getHeader()->getParent()->getName() <<
 "\" from " << DebugLocStr << "\n"; } } while (false
)
                  << DebugLocStr << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "\nLV: Checking a loop in \""
 << L->getHeader()->getParent()->getName() <<
 "\" from " << DebugLocStr << "\n"; } } while (false
);

LoopVectorizeHints Hints(L, DisableUnrolling, *ORE);

LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints:" <<
 " force=" << (Hints.getForce() == LoopVectorizeHints::
FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints
::FK_Enabled ? "enabled" : "?")) << " width=" << Hints
.getWidth() << " unroll=" << Hints.getInterleave(
) << "\n"; } } while (false)
    dbgs() << "LV: Loop hints:"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints:" <<
 " force=" << (Hints.getForce() == LoopVectorizeHints::
FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints
::FK_Enabled ? "enabled" : "?")) << " width=" << Hints
.getWidth() << " unroll=" << Hints.getInterleave(
) << "\n"; } } while (false)
           << " force="do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints:" <<
 " force=" << (Hints.getForce() == LoopVectorizeHints::
FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints
::FK_Enabled ? "enabled" : "?")) << " width=" << Hints
.getWidth() << " unroll=" << Hints.getInterleave(
) << "\n"; } } while (false)
           << (Hints.getForce() == LoopVectorizeHints::FK_Disableddo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints:" <<
 " force=" << (Hints.getForce() == LoopVectorizeHints::
FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints
::FK_Enabled ? "enabled" : "?")) << " width=" << Hints
.getWidth() << " unroll=" << Hints.getInterleave(
) << "\n"; } } while (false)
                   ? "disabled"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints:" <<
 " force=" << (Hints.getForce() == LoopVectorizeHints::
FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints
::FK_Enabled ? "enabled" : "?")) << " width=" << Hints
.getWidth() << " unroll=" << Hints.getInterleave(
) << "\n"; } } while (false)
                   : (Hints.getForce() == LoopVectorizeHints::FK_Enableddo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints:" <<
 " force=" << (Hints.getForce() == LoopVectorizeHints::
FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints
::FK_Enabled ? "enabled" : "?")) << " width=" << Hints
.getWidth() << " unroll=" << Hints.getInterleave(
) << "\n"; } } while (false)
                          ? "enabled"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints:" <<
 " force=" << (Hints.getForce() == LoopVectorizeHints::
FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints
::FK_Enabled ? "enabled" : "?")) << " width=" << Hints
.getWidth() << " unroll=" << Hints.getInterleave(
) << "\n"; } } while (false)
                          : "?"))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints:" <<
 " force=" << (Hints.getForce() == LoopVectorizeHints::
FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints
::FK_Enabled ? "enabled" : "?")) << " width=" << Hints
.getWidth() << " unroll=" << Hints.getInterleave(
) << "\n"; } } while (false)
           << " width=" << Hints.getWidth()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints:" <<
 " force=" << (Hints.getForce() == LoopVectorizeHints::
FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints
::FK_Enabled ? "enabled" : "?")) << " width=" << Hints
.getWidth() << " unroll=" << Hints.getInterleave(
) << "\n"; } } while (false)
           << " unroll=" << Hints.getInterleave() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints:" <<
 " force=" << (Hints.getForce() == LoopVectorizeHints::
FK_Disabled ? "disabled" : (Hints.getForce() == LoopVectorizeHints
::FK_Enabled ? "enabled" : "?")) << " width=" << Hints
.getWidth() << " unroll=" << Hints.getInterleave(
) << "\n"; } } while (false);

// Function containing loop
Function *F = L->getHeader()->getParent();

// Looking at the diagnostic output is the only way to determine if a loop
// was vectorized (other than looking at the IR or machine code), so it
// is important to generate an optimization remark for each loop. Most of
// these messages are generated as OptimizationRemarkAnalysis. Remarks
// generated as OptimizationRemark and OptimizationRemarkMissed are
// less verbose reporting vectorized loops and unvectorized loops that may
// benefit from vectorization, respectively.

if (!Hints.allowVectorization(F, L, AlwaysVectorize)) {
  LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop hints prevent vectorization.\n"
; } } while (false);
  return false;
}

PredicatedScalarEvolution PSE(*SE, *L);

// Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements(*ORE);
LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, GetLAA, LI, ORE,
                              &Requirements, &Hints, DB, AC);
if (!LVL.canVectorize(EnableVPlanNativePath)) {
  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"
; } } while (false);
  emitMissedWarning(F, L, Hints, ORE);
  return false;
}

// Check the function attributes to find out if this function should be
// optimized for size.
bool OptForSize =
    Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();

// Entrance to the VPlan-native vectorization path. Outer loops are processed
// here. They may require CFG and instruction level transformations before
// even evaluating whether vectorization is profitable. Since we cannot modify
// the incoming IR, we need to build VPlan upfront in the vectorization
// pipeline.
if (!L->empty())
  return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                      ORE, Hints);

assert(L->empty() && "Inner loop expected.")(static_cast <bool> (L->empty() && "Inner loop expected."
) ? void (0) : __assert_fail ("L->empty() && \"Inner loop expected.\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7341, __extension__ __PRETTY_FUNCTION__));
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
// count by optimizing for size, to minimize overheads.
// Prefer constant trip counts over profile data, over upper bound estimate.
unsigned ExpectedTC = 0;
bool HasExpectedTC = false;
if (const SCEVConstant *ConstExits =
    dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
  const APInt &ExitsCount = ConstExits->getAPInt();
  // We are interested in small values for ExpectedTC. Skip over those that
  // can't fit an unsigned.
  if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
    ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
    HasExpectedTC = true;
  }
}
// ExpectedTC may be large because it's bound by a variable. Check
// profiling information to validate we should vectorize.
if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
  auto EstimatedTC = getLoopEstimatedTripCount(L);
  if (EstimatedTC) {
    ExpectedTC = *EstimatedTC;
    HasExpectedTC = true;
  }
}
if (!HasExpectedTC) {
  ExpectedTC = SE->getSmallConstantMaxTripCount(L);
  HasExpectedTC = (ExpectedTC > 0);
}

if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
  LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found a loop with a very small trip count. "
 << "This loop is worth vectorizing only if no scalar "
 << "iteration overheads are incurred."; } } while (false
)
                    << "This loop is worth vectorizing only if no scalar "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found a loop with a very small trip count. "
 << "This loop is worth vectorizing only if no scalar "
 << "iteration overheads are incurred."; } } while (false
)
                    << "iteration overheads are incurred.")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found a loop with a very small trip count. "
 << "This loop is worth vectorizing only if no scalar "
 << "iteration overheads are incurred."; } } while (false
);
  if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
    LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << " But vectorizing was explicitly forced.\n"
; } } while (false);
  else {
    LLVM_DEBUG(dbgs() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "\n"; } } while (false);
    // Loops with a very small trip count are considered for vectorization
    // under OptForSize, thereby making sure the cost of their loop body is
    // dominant, free of runtime guards and scalar iteration overheads.
    OptForSize = true;
  }
}

// Check the function attributes to see if implicit floats are allowed.
// FIXME: This check doesn't seem possibly correct -- what if the loop is
// an integer loop and the vector instructions selected are purely integer
// vector instructions?
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
  LLVM_DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
 "attribute is used.\n"; } } while (false)
                       "attribute is used.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
 "attribute is used.\n"; } } while (false);
  ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(),
                                   "NoImplicitFloat", L)
            << "loop not vectorized due to NoImplicitFloat attribute");
  emitMissedWarning(F, L, Hints, ORE);
  return false;
}

// Check if the target supports potentially unsafe FP vectorization.
// FIXME: Add a check for the type of safety issue (denormal, signaling)
// for the target we're vectorizing for, to make sure none of the
// additional fp-math flags can help.
if (Hints.isPotentiallyUnsafe() &&
    TTI->isFPVectorizationPotentiallyUnsafe()) {
  LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n"
; } } while (false)
      dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n"
; } } while (false);
  ORE->emit(
      createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
      << "loop not vectorized due to unsafe FP support.");
  emitMissedWarning(F, L, Hints, ORE);
  return false;
}

bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

// If an override option has been passed in for interleaved accesses, use it.
if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
  UseInterleaved = EnableInterleavedMemAccesses;

// Analyze interleaved memory accesses.
if (UseInterleaved) {
  IAI.analyzeInterleaving();
}

// Use the cost model.
LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
                              &Hints, IAI);
CM.collectValuesToIgnore();

// Use the planner for vectorization.
LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);

// Get user vectorization factor.
unsigned UserVF = Hints.getWidth();

// Plan how to best vectorize, return the best VF and its cost.
VectorizationFactor VF = LVP.plan(OptForSize, UserVF);

// Select the interleave count.
unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);

// Get user interleave count.
unsigned UserIC = Hints.getInterleave();

// Identify the diagnostic messages that should be produced.
std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
bool VectorizeLoop = true, InterleaveLoop = true;
if (Requirements.doesNotMeet(F, L, Hints)) {
  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
 "requirements.\n"; } } while (false)
                       "requirements.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
 "requirements.\n"; } } while (false);
  emitMissedWarning(F, L, Hints, ORE);
  return false;
}

if (VF.Width == 1) {
  LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Vectorization is possible but not beneficial.\n"
; } } while (false);
  VecDiagMsg = std::make_pair(
      "VectorizationNotBeneficial",
      "the cost-model indicates that vectorization is not beneficial");
  VectorizeLoop = false;
}

if (IC == 1 && UserIC <= 1) {
  // Tell the user interleaving is not beneficial.
  LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleaving is not beneficial.\n"
; } } while (false);
  IntDiagMsg = std::make_pair(
      "InterleavingNotBeneficial",
      "the cost-model indicates that interleaving is not beneficial");
  InterleaveLoop = false;
  if (UserIC == 1) {
    IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
    IntDiagMsg.second +=
        " and is explicitly disabled or interleave count is set to 1";
  }
} else if (IC > 1 && UserIC == 1) {
  // Tell the user interleaving is beneficial, but it is explicitly disabled.
  LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."
; } } while (false)
      dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleaving is beneficial but is explicitly disabled."
; } } while (false);
  IntDiagMsg = std::make_pair(
      "InterleavingBeneficialButDisabled",
      "the cost-model indicates that interleaving is beneficial "
      "but is explicitly disabled or interleave count is set to 1");
  InterleaveLoop = false;
}

// Override IC if user provided an interleave count.
IC = UserIC > 0 ? UserIC : IC;

// Emit diagnostic messages, if any.
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
  // Do not vectorize or interleave the loop.
  ORE->emit([&]() {
    return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                    L->getStartLoc(), L->getHeader())
           << VecDiagMsg.second;
  });
  ORE->emit([&]() {
    return OptimizationRemarkMissed(LV_NAME"loop-vectorize", IntDiagMsg.first,
                                    L->getStartLoc(), L->getHeader())
           << IntDiagMsg.second;
  });
  return false;
} else if (!VectorizeLoop && InterleaveLoop) {
  LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleave Count is "
 << IC << '\n'; } } while (false);
  ORE->emit([&]() {
    return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
           << VecDiagMsg.second;
  });
} else if (VectorizeLoop && !InterleaveLoop) {
  LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Widthdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found a vectorizable loop ("
 << VF.Width << ") in " << DebugLocStr <<
 '\n'; } } while (false)
                    << ") in " << DebugLocStr << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found a vectorizable loop ("
 << VF.Width << ") in " << DebugLocStr <<
 '\n'; } } while (false);
  ORE->emit([&]() {
    return OptimizationRemarkAnalysis(LV_NAME"loop-vectorize", IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
           << IntDiagMsg.second;
  });
} else if (VectorizeLoop && InterleaveLoop) {
  LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Widthdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found a vectorizable loop ("
 << VF.Width << ") in " << DebugLocStr <<
 '\n'; } } while (false)
                    << ") in " << DebugLocStr << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found a vectorizable loop ("
 << VF.Width << ") in " << DebugLocStr <<
 '\n'; } } while (false);
  LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Interleave Count is "
 << IC << '\n'; } } while (false);
}

LVP.setBestPlan(VF.Width, IC);

using namespace ore;

if (!VectorizeLoop) {
  assert(IC > 1 && "interleave count should not be 1 or 0")(static_cast <bool> (IC > 1 && "interleave count should not be 1 or 0"
) ? void (0) : __assert_fail ("IC > 1 && \"interleave count should not be 1 or 0\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 7532, __extension__ __PRETTY_FUNCTION__));
  // If we decided that it is not beneficial to vectorize the loop (the
  // selected VF is 1), then only interleave it.
  InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                             &CM);
  LVP.executePlan(Unroller, DT);

  ORE->emit([&]() {
    return OptimizationRemark(LV_NAME"loop-vectorize", "Interleaved", L->getStartLoc(),
                              L->getHeader())
           << "interleaved loop (interleaved count: "
           << NV("InterleaveCount", IC) << ")";
  });
} else {
  // If we decided that it is *legal* to vectorize the loop, then do it.
  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                         &LVL, &CM);
  LVP.executePlan(LB, DT);
  ++LoopsVectorized;

  // Add metadata to disable runtime unrolling a scalar loop when there are
  // no runtime checks about strides and memory. A scalar loop that is
  // rarely used is not worth unrolling.
  if (!LB.areSafetyChecksAdded())
    AddRuntimeUnrollDisableMetaData(L);

  // Report the vectorization decision.
  ORE->emit([&]() {
    return OptimizationRemark(LV_NAME"loop-vectorize", "Vectorized", L->getStartLoc(),
                              L->getHeader())
           << "vectorized loop (vectorization width: "
           << NV("VectorizationFactor", VF.Width)
           << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
  });
}

// Mark the loop as already vectorized to avoid vectorizing again.
Hints.setAlreadyVectorized();

LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { verifyFunction(*L->getHeader()->getParent
()); } } while (false);
return true;
7573}

7575bool LoopVectorizePass::runImpl(
  Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
  DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
  DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
  std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
  OptimizationRemarkEmitter &ORE_) {
  // Stash the supplied analyses in the pass members so the per-loop helpers
  // invoked below can reach them.
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;

  // Give up early only when both of the following hold:
  //   1. the target reports that it has no vector registers, and
  //   2. interleaving cannot improve ILP either.
  // The second test matters because scalar interleaving may still pay off on
  // a target without any vector registers.
  const bool TargetHasVectorRegs = TTI->getNumberOfRegisters(true) != 0;
  if (!TargetHasVectorRegs && TTI->getMaxInterleaveFactor(1) < 2)
    return false;

  bool MadeChange = false;

  // Loops must be in simplified form before vectorization. Simplification can
  // create new inner loops, so it runs ahead of the legality and profitability
  // checks; as a consequence every loop gets simplified even if nothing ends
  // up being vectorized.
  for (auto &TopLevel : *LI)
    if (simplifyLoop(TopLevel, DT, LI, SE, AC, false /* PreserveLCSSA */))
      MadeChange = true;

  // Gather the candidate loops up front: vectorizing or partially unrolling a
  // loop creates new loops and can invalidate iterators over the loop list.
  SmallVector<Loop *, 8> Pending;
  for (Loop *TopLevel : *LI)
    collectSupportedLoops(*TopLevel, LI, ORE, Pending);

  LoopsAnalyzed += Pending.size();

  // Drain the worklist from the back.
  while (!Pending.empty()) {
    Loop *Candidate = Pending.pop_back_val();

    // Put each loop we actually process into LCSSA form first; this
    // simplifies the transform.
    const bool FormedLCSSA = formLCSSARecursively(*Candidate, *DT, LI, SE);
    const bool Transformed = processLoop(Candidate);
    MadeChange = MadeChange || FormedLCSSA || Transformed;
  }

  // Report whether any of the steps above modified the function.
  return MadeChange;
7636}

7638PreservedAnalyses LoopVectorizePass::run(Function &F,
                                       FunctionAnalysisManager &AM) {
  // Query every function-level analysis the vectorizer consumes. The queries
  // are issued in a fixed order.
  auto &ScEv = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &Loops = AM.getResult<LoopAnalysis>(F);
  auto &TargetInfo = AM.getResult<TargetIRAnalysis>(F);
  auto &DomTree = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BlockFreq = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &LibInfo = AM.getResult<TargetLibraryAnalysis>(F);
  auto &Aliasing = AM.getResult<AAManager>(F);
  auto &Assumptions = AM.getResult<AssumptionAnalysis>(F);
  auto &Demanded = AM.getResult<DemandedBitsAnalysis>(F);
  auto &Remarks = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  // Loop access information lives in the loop analysis manager; wrap the
  // lookup in a callback so runImpl can request it per loop.
  auto &LoopMgr = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> LoopAccessGetter =
      [&](Loop &TheLoop) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults Standard = {Aliasing, Assumptions, DomTree,
                                            Loops,    ScEv,        LibInfo,
                                            TargetInfo, nullptr};
    return LoopMgr.getResult<LoopAccessAnalysis>(TheLoop, Standard);
  };

  // Nothing changed: every analysis remains valid.
  if (!runImpl(F, ScEv, Loops, TargetInfo, DomTree, BlockFreq, &LibInfo,
               Demanded, Aliasing, Assumptions, LoopAccessGetter, Remarks))
    return PreservedAnalyses::all();

  // Otherwise only the analyses listed here are reported as preserved.
  PreservedAnalyses Kept;
  Kept.preserve<LoopAnalysis>();
  Kept.preserve<DominatorTreeAnalysis>();
  Kept.preserve<BasicAA>();
  Kept.preserve<GlobalsAA>();
  return Kept;
7667}

←

/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

1//===- LoopVectorizationPlanner.h - Planner for LoopVectorization ---------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9///
10/// \file
11/// This file provides a LoopVectorizationPlanner class.
12/// InnerLoopVectorizer vectorizes loops which contain only one basic block.
13/// LoopVectorizationPlanner - drives the vectorization process after having
14/// passed Legality checks.
15/// The planner builds and optimizes the Vectorization Plans which record the
16/// decisions how to vectorize the given loop. In particular, represent the
17/// control-flow of the vectorized version, the replication of instructions that
18/// are to be scalarized, and interleave access groups.
19///
20/// Also provides a VPlan-based builder utility analogous to IRBuilder.
21/// It provides an instruction-level API for generating VPInstructions while
22/// abstracting away the Recipe manipulation details.
23//===----------------------------------------------------------------------===//

25#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
26#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H

28#include "VPlan.h"
29#include "llvm/Analysis/LoopInfo.h"
30#include "llvm/Analysis/TargetLibraryInfo.h"
31#include "llvm/Analysis/TargetTransformInfo.h"

33namespace llvm {

35/// VPlan-based builder utility analogous to IRBuilder.
36class VPBuilder {
// Creates VPInstructions and, when an insertion point is set, inserts them
// into a VPBasicBlock at that point. The insertion point is the pair
// (BB, InsertPt); BB == nullptr means "no insertion point".
37private:
// Block that newly created instructions are inserted into, or nullptr when
// no insertion point is set.
VPBasicBlock *BB = nullptr;
// Position within BB used for insertion.
VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();

// Heap-allocates a VPInstruction with the given opcode and operands. If an
// insertion block is set, the instruction is inserted there at InsertPt;
// otherwise it is returned without being inserted anywhere.
// NOTE(review): on the BB == nullptr path nothing in this class records the
// allocation, so the caller is the only holder of the pointer. The analyzer
// path notes interleaved below trace exactly this path for the reported
// "potential leak" warning.
VPInstruction *createInstruction(unsigned Opcode,
                                 ArrayRef<VPValue *> Operands) {
  VPInstruction *Instr = new VPInstruction(Opcode, Operands);
24
←
Memory is allocated→
  if (BB)
25
←
Assuming the condition is false→
26
←
Taking false branch→
    BB->insert(Instr, InsertPt);
  return Instr;
}

// Convenience overload: forwards a braced operand list to the ArrayRef
// overload above.
VPInstruction *createInstruction(unsigned Opcode,
                                 std::initializer_list<VPValue *> Operands) {
  return createInstruction(Opcode, ArrayRef<VPValue *>(Operands));
23
←
Calling 'VPBuilder::createInstruction'→
27
←
Returned allocated memory→
}

54public:
// Constructs a builder with no insertion point (BB is nullptr by default).
VPBuilder() {}

/// Clear the insertion point: created instructions will not be inserted into
/// a block.
void clearInsertionPoint() {
  BB = nullptr;
  InsertPt = VPBasicBlock::iterator();
}

// Accessors for the current insertion block (may be nullptr) and position.
VPBasicBlock *getInsertBlock() const { return BB; }
VPBasicBlock::iterator getInsertPoint() const { return InsertPt; }

/// InsertPoint - A saved insertion point.
class VPInsertPoint {
  // A null Block marks this saved point as "not set" (see isSet()).
  VPBasicBlock *Block = nullptr;
  VPBasicBlock::iterator Point;

public:
  /// Creates a new insertion point which doesn't point to anything.
  VPInsertPoint() = default;

  /// Creates a new insertion point at the given location.
  VPInsertPoint(VPBasicBlock *InsertBlock, VPBasicBlock::iterator InsertPoint)
      : Block(InsertBlock), Point(InsertPoint) {}

  /// Returns true if this insert point is set.
  bool isSet() const { return Block != nullptr; }

  VPBasicBlock *getBlock() const { return Block; }
  VPBasicBlock::iterator getPoint() const { return Point; }
};

/// Sets the current insert point to a previously-saved location.
/// An unset \p IP clears the builder's insertion point instead.
void restoreIP(VPInsertPoint IP) {
  if (IP.isSet())
    setInsertPoint(IP.getBlock(), IP.getPoint());
  else
    clearInsertionPoint();
}

/// This specifies that created VPInstructions should be appended to the end
/// of the specified block.
/// \p TheBB must be non-null; this is checked by the assert below.
void setInsertPoint(VPBasicBlock *TheBB) {
  assert(TheBB && "Attempting to set a null insert point")(static_cast <bool> (TheBB && "Attempting to set a null insert point"
) ? void (0) : __assert_fail ("TheBB && \"Attempting to set a null insert point\""
, "/build/llvm-toolchain-snapshot-7~svn338205/lib/Transforms/Vectorize/LoopVectorizationPlanner.h"
, 98, __extension__ __PRETTY_FUNCTION__));
  BB = TheBB;
  InsertPt = BB->end();
}

/// This specifies that created instructions should be inserted at the
/// specified point.
/// Unlike the single-argument overload, \p TheBB is not checked for null here.
void setInsertPoint(VPBasicBlock *TheBB, VPBasicBlock::iterator IP) {
  BB = TheBB;
  InsertPt = IP;
}

/// Insert and return the specified instruction.
/// BB is dereferenced unconditionally, so an insertion point must have been
/// set before calling this.
VPInstruction *insert(VPInstruction *I) const {
  BB->insert(I, InsertPt);
  return I;
}

/// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
/// its underlying Instruction.
/// \p Inst defaults to nullptr, in which case nullptr is what gets recorded
/// as the underlying value.
VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
                      Instruction *Inst = nullptr) {
  VPInstruction *NewVPInst = createInstruction(Opcode, Operands);
  NewVPInst->setUnderlyingValue(Inst);
  return NewVPInst;
}
// Braced-list overload of createNaryOp; forwards to the ArrayRef overload.
VPValue *createNaryOp(unsigned Opcode,
                      std::initializer_list<VPValue *> Operands,
                      Instruction *Inst = nullptr) {
  return createNaryOp(Opcode, ArrayRef<VPValue *>(Operands), Inst);
}

// Creates a VPInstruction::Not with \p Operand as its single operand.
VPValue *createNot(VPValue *Operand) {
  return createInstruction(VPInstruction::Not, {Operand});
}

// Creates an Instruction::BinaryOps::And of \p LHS and \p RHS.
VPValue *createAnd(VPValue *LHS, VPValue *RHS) {
  return createInstruction(Instruction::BinaryOps::And, {LHS, RHS});
}

// Creates an Instruction::BinaryOps::Or of \p LHS and \p RHS. Like every
// create* helper, the result is only inserted into a block if an insertion
// point is currently set (see createInstruction).
VPValue *createOr(VPValue *LHS, VPValue *RHS) {
  return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
22
←
Calling 'VPBuilder::createInstruction'→
28
←
Returned allocated memory→
}

//===--------------------------------------------------------------------===//
// RAII helpers.
//===--------------------------------------------------------------------===//

/// RAII object that stores the current insertion point and restores it when
/// the object is destroyed.
class InsertPointGuard {
  // Builder whose insertion point is saved, plus the saved block/position
  // captured at construction time.
  VPBuilder &Builder;
  VPBasicBlock *Block;
  VPBasicBlock::iterator Point;

public:
  InsertPointGuard(VPBuilder &B)
      : Builder(B), Block(B.getInsertBlock()), Point(B.getInsertPoint()) {}

  // Copying is disabled.
  InsertPointGuard(const InsertPointGuard &) = delete;
  InsertPointGuard &operator=(const InsertPointGuard &) = delete;

  // Restores the saved point; if the saved block was null, restoreIP clears
  // the builder's insertion point.
  ~InsertPointGuard() { Builder.restoreIP(VPInsertPoint(Block, Point)); }
};
162};

164/// TODO: The following VectorizationFactor was pulled out of
165/// LoopVectorizationCostModel class. LV also deals with
166/// VectorizerParams::VectorizationFactor and VectorizationCostTy.
167/// We need to streamline them.

169/// Information about vectorization costs
170struct VectorizationFactor {
// Plain aggregate pairing a vectorization width with its cost; it is the
// return type of LoopVectorizationPlanner::plan and planInVPlanNativePath.
// Neither member has a default initializer.

// Vector width with best cost
unsigned Width;
// Cost of the loop with that width
// NOTE(review): the unit/scale of the cost is not visible here -- confirm
// against the cost model.
unsigned Cost;
175};

177/// Planner drives the vectorization process after having passed
178/// Legality checks.
179class LoopVectorizationPlanner {
// Holds the candidate VPlans for one loop together with the analyses needed
// to build them, and exposes the plan / setBestPlan / executePlan steps
// declared in the public section below.

/// The loop that we evaluate.
Loop *OrigLoop;

/// Loop Info analysis.
LoopInfo *LI;

/// Target Library Info.
const TargetLibraryInfo *TLI;

/// Target Transform Info.
const TargetTransformInfo *TTI;

/// The legality analysis.
LoopVectorizationLegality *Legal;

/// The profitability analysis.
LoopVectorizationCostModel &CM;

/// Owning handle for a VPlan.
using VPlanPtr = std::unique_ptr<VPlan>;

/// The VPlans currently held by this planner.
SmallVector<VPlanPtr, 4> VPlans;

/// This class is used to enable the VPlan to invoke a method of ILV. This is
/// needed until the method is refactored out of ILV and becomes reusable.
struct VPCallbackILV : public VPCallback {
  InnerLoopVectorizer &ILV;

  VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}

  Value *getOrCreateVectorValues(Value *V, unsigned Part) override;
};

/// A builder used to construct the current plan.
VPBuilder Builder;

// Both default to 0.
// NOTE(review): presumably recorded by setBestPlan(VF, UF) -- confirm in the
// implementation file.
unsigned BestVF = 0;
unsigned BestUF = 0;

218public:
// Stores the given loop and analyses; the constructor does no other work.
LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
                         const TargetTransformInfo *TTI,
                         LoopVectorizationLegality *Legal,
                         LoopVectorizationCostModel &CM)
    : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {}

/// Plan how to best vectorize, return the best VF and its cost.
VectorizationFactor plan(bool OptForSize, unsigned UserVF);

/// Use the VPlan-native path to plan how to best vectorize, return the best
/// VF and its cost.
VectorizationFactor planInVPlanNativePath(bool OptForSize, unsigned UserVF);

/// Finalize the best decision and dispose of all other VPlans.
void setBestPlan(unsigned VF, unsigned UF);

/// Generate the IR code for the body of the vectorized loop according to the
/// best selected VPlan.
void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);

/// Print every VPlan currently held by the planner to \p O.
void printPlans(raw_ostream &O) {
  for (const auto &Plan : VPlans)
    O << *Plan;
}

/// Test a \p Predicate on a \p Range of VF's. Return the value of applying
/// \p Predicate on Range.Start, possibly decreasing Range.End such that the
/// returned value holds for the entire \p Range.
static bool
getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
                         VFRange &Range);

251protected:
/// Collect the instructions from the original loop that would be trivially
/// dead in the vectorized loop if generated.
void collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions);

/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
/// legal to vectorize the loop.
void buildVPlans(unsigned MinVF, unsigned MaxVF);

262private:
/// Build a VPlan according to the information gathered by Legal. \return a
/// VPlan for vectorization factors \p Range.Start and up to \p Range.End
/// exclusive, possibly decreasing \p Range.End.
VPlanPtr buildVPlan(VFRange &Range);

/// Build a VPlan using VPRecipes according to the information gathered by
/// Legal. This method is only used for the legacy inner loop vectorizer.
VPlanPtr
buildVPlanWithVPRecipes(VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
                        SmallPtrSetImpl<Instruction *> &DeadInstructions);

/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
/// legal to vectorize the loop. This method creates VPlans using VPRecipes.
void buildVPlansWithVPRecipes(unsigned MinVF, unsigned MaxVF);
278};

280} // namespace llvm

282#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H