/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp

Bug Summary

File:	lib/Transforms/Vectorize/LoopVectorize.cpp
Location:	line 4869, column 75
Description:	Division by zero

Annotated Source Code

//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//

// The LLVM Compiler Infrastructure

// This file is distributed under the University of Illinois Open Source

// License. See LICENSE.TXT for details.

//===----------------------------------------------------------------------===//

// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops

// and generates target-independent LLVM-IR.

// The vectorizer uses the TargetTransformInfo analysis to estimate the costs

// of instructions in order to estimate the profitability of vectorization.

// The loop vectorizer combines consecutive loop iterations into a single

// 'wide' iteration. After this transformation the index is incremented

// by the SIMD vector width, and not by one.

// This pass has four parts:

// 1. The main loop pass that drives the different parts.

// 2. LoopVectorizationLegality - A unit that checks for the legality

// of the vectorization.

// 3. InnerLoopVectorizer - A unit that performs the actual

// widening of instructions.

// 4. LoopVectorizationCostModel - A unit that checks for the profitability

// of vectorization. It decides on the optimal vector width, which

// can be one, if vectorization is not profitable.

//===----------------------------------------------------------------------===//

// The reduction-variable vectorization is based on the paper:

// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.

// Variable uniformity checks are inspired by:

// Karrenberg, R. and Hack, S. Whole Function Vectorization.

// The interleaved access vectorization is based on the paper:

// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved

// Data for SIMD

// Other ideas/concepts are from:

// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.

// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of

// Vectorizing Compilers.

//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize.h"

#include "llvm/ADT/DenseMap.h"

#include "llvm/ADT/EquivalenceClasses.h"

#include "llvm/ADT/Hashing.h"

#include "llvm/ADT/MapVector.h"

#include "llvm/ADT/SetVector.h"

#include "llvm/ADT/SmallPtrSet.h"

#include "llvm/ADT/SmallSet.h"

#include "llvm/ADT/SmallVector.h"

#include "llvm/ADT/Statistic.h"

#include "llvm/ADT/StringExtras.h"

#include "llvm/Analysis/AliasAnalysis.h"

#include "llvm/Analysis/AliasSetTracker.h"

#include "llvm/Analysis/AssumptionCache.h"

#include "llvm/Analysis/BlockFrequencyInfo.h"

#include "llvm/Analysis/CodeMetrics.h"

#include "llvm/Analysis/LoopAccessAnalysis.h"

#include "llvm/Analysis/LoopInfo.h"

#include "llvm/Analysis/LoopIterator.h"

#include "llvm/Analysis/LoopPass.h"

#include "llvm/Analysis/ScalarEvolution.h"

#include "llvm/Analysis/ScalarEvolutionExpander.h"

#include "llvm/Analysis/ScalarEvolutionExpressions.h"

#include "llvm/Analysis/TargetTransformInfo.h"

#include "llvm/Analysis/ValueTracking.h"

#include "llvm/IR/Constants.h"

#include "llvm/IR/DataLayout.h"

#include "llvm/IR/DebugInfo.h"

#include "llvm/IR/DerivedTypes.h"

#include "llvm/IR/DiagnosticInfo.h"

#include "llvm/IR/Dominators.h"

#include "llvm/IR/Function.h"

#include "llvm/IR/IRBuilder.h"

#include "llvm/IR/Instructions.h"

#include "llvm/IR/IntrinsicInst.h"

#include "llvm/IR/LLVMContext.h"

#include "llvm/IR/Module.h"

#include "llvm/IR/PatternMatch.h"

#include "llvm/IR/Type.h"

#include "llvm/IR/Value.h"

#include "llvm/IR/ValueHandle.h"

#include "llvm/IR/Verifier.h"

#include "llvm/Pass.h"

#include "llvm/Support/BranchProbability.h"

#include "llvm/Support/CommandLine.h"

#include "llvm/Support/Debug.h"

#include "llvm/Support/raw_ostream.h"

#include "llvm/Transforms/Scalar.h"

#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#include "llvm/Transforms/Utils/Local.h"

#include "llvm/Analysis/VectorUtils.h"

100

#include "llvm/Transforms/Utils/LoopUtils.h"

101

#include <algorithm>

102

#include <map>

103

#include <tuple>

104

105

using namespace llvm;

106

using namespace llvm::PatternMatch;

107

108

#define LV_NAME"loop-vectorize" "loop-vectorize"

109

#define DEBUG_TYPE"loop-vectorize" LV_NAME"loop-vectorize"

110

111

STATISTIC(LoopsVectorized, "Number of loops vectorized")static llvm::Statistic LoopsVectorized = { "loop-vectorize", "Number of loops vectorized"
, 0, 0 };

112

STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization")static llvm::Statistic LoopsAnalyzed = { "loop-vectorize", "Number of loops analyzed for vectorization"
, 0, 0 };

113

114

// Hidden command-line knobs and fixed thresholds used by the loop vectorizer.

static cl::opt<bool>
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                   cl::desc("Enable if-conversion during vectorization."));

/// We don't vectorize loops with a known constant trip count below this number.
static cl::opt<unsigned>
TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
                             cl::Hidden,
                             cl::desc("Don't vectorize loops with a constant "
                                      "trip count that is smaller than this "
                                      "value."));

/// This enables versioning on the strides of symbolically striding memory
/// accesses in code like the following.
///   for (i = 0; i < N; ++i)
///     A[i * Stride1] += B[i * Stride2] ...
///
/// Will be roughly translated to
///   if (Stride1 == 1 && Stride2 == 1) {
///     for (i = 0; i < N; i+=4)
///       A[i:i+3] += ...
///   } else
///     ...
static cl::opt<bool> EnableMemAccessVersioning(
    "enable-mem-access-versioning", cl::init(true), cl::Hidden,
    cl::desc("Enable symbolic stride memory access versioning"));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// Maximum factor for an interleaved memory access.
static cl::opt<unsigned> MaxInterleaveGroupFactor(
    "max-interleave-group-factor", cl::Hidden,
    cl::desc("Maximum factor for an interleaved access group (default = 8)"),
    cl::init(8));

/// We don't unroll loops with a known constant trip count below this number.
static const unsigned TinyTripCountUnrollThreshold = 128;

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc("The cost of a loop that is considered 'small' by the unroller."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime unroll loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeUnroll(
    "enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden,
    cl::desc("Enable runtime unrolling until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when unrolling"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(false), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionUF(
    "max-nested-scalar-reduction-unroll", cl::init(2), cl::Hidden,
    cl::desc("The maximum unroll factor to use when unrolling a scalar "
             "reduction in a nested loop."));

213

214

namespace {

215

216

// Forward declarations.

217

class LoopVectorizationLegality;

218

class LoopVectorizationCostModel;

219

class LoopVectorizeHints;

220

221

/// \brief This modifies LoopAccessReport to initialize message with

222

/// loop-vectorizer-specific part.

223

class VectorizationReport : public LoopAccessReport {

224

public:

225

VectorizationReport(Instruction *I = nullptr)

226

: LoopAccessReport("loop not vectorized: ", I) {}

227

228

/// \brief This allows promotion of the loop-access analysis report into the

229

/// loop-vectorizer report. It modifies the message to add the

230

/// loop-vectorizer-specific part of the message.

231

explicit VectorizationReport(const LoopAccessReport &R)

232

: LoopAccessReport(Twine("loop not vectorized: ") + R.str(),

233

R.getInstr()) {}

234

};

235

236

/// A helper function for converting Scalar types to vector types.

237

/// If the incoming type is void, we return void. If the VF is 1, we return

238

/// the scalar type.

239

static Type* ToVectorTy(Type *Scalar, unsigned VF) {

240

if (Scalar->isVoidTy() || VF == 1)

241

return Scalar;

242

return VectorType::get(Scalar, VF);

243

}

244

245

/// InnerLoopVectorizer vectorizes loops which contain only one basic

246

/// block to a specified vectorization factor (VF).

247

/// This class performs the widening of scalars into vectors, or multiple

248

/// scalars. This class also implements the following features:

249

/// * It inserts an epilogue loop for handling loops that don't have iteration

250

/// counts that are known to be a multiple of the vectorization factor.

251

/// * It handles the code generation for reduction variables.

252

/// * Scalarization (implementation using scalars) of un-vectorizable

253

/// instructions.

254

/// InnerLoopVectorizer does not perform any vectorization-legality

255

/// checks, and relies on the caller to check for the different legality

256

/// aspects. The InnerLoopVectorizer relies on the

257

/// LoopVectorizationLegality class to provide information about the induction

258

/// and reduction variables that were found to a given vectorization factor.

259

class InnerLoopVectorizer {

260

public:

261

InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,

262

DominatorTree *DT, const TargetLibraryInfo *TLI,

263

const TargetTransformInfo *TTI, unsigned VecWidth,

264

unsigned UnrollFactor)

265

: OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),

266

VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),

267

Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),

268

Legal(nullptr), AddedSafetyChecks(false) {}

269

270

// Perform the actual loop widening (vectorization).

271

void vectorize(LoopVectorizationLegality *L) {

272

Legal = L;

273

// Create a new empty loop. Unlink the old loop and connect the new one.

274

createEmptyLoop();

275

// Widen each instruction in the old loop to a new one in the new loop.

276

// Use the Legality module to find the induction and reduction variables.

277

vectorizeLoop();

278

// Register the new loop and update the analysis passes.

279

updateAnalysis();

280

}

281

282

// Return true if any runtime check is added.

283

bool IsSafetyChecksAdded() {

284

return AddedSafetyChecks;

285

}

286

287

virtual ~InnerLoopVectorizer() {}

288

289

protected:

290

/// A small list of PHINodes.

291

typedef SmallVector<PHINode*, 4> PhiVector;

292

/// When we unroll loops we have multiple vector values for each scalar.

293

/// This data structure holds the unrolled and vectorized values that

294

/// originated from one scalar instruction.

295

typedef SmallVector<Value*, 2> VectorParts;

296

297

// When we if-convert we need to create edge masks. We have to cache values

298

// so that we don't end up with exponential recursion/IR.

299

typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>,

300

VectorParts> EdgeMaskCache;

301

302

/// \brief Add checks for strides that were assumed to be 1.

303

///

304

/// Returns the last check instruction and the first check instruction in the

305

/// pair as (first, last).

306

std::pair<Instruction *, Instruction *> addStrideCheck(Instruction *Loc);

307

308

/// Create an empty loop, based on the loop ranges of the old loop.

309

void createEmptyLoop();

310

/// Copy and widen the instructions from the old loop.

311

virtual void vectorizeLoop();

312

313

/// \brief The Loop exit block may have single value PHI nodes where the

314

/// incoming value is 'Undef'. While vectorizing we only handled real values

315

/// that were defined inside the loop. Here we fix the 'undef case'.

316

/// See PR14725.

317

void fixLCSSAPHIs();

318

319

/// A helper function that computes the predicate of the block BB, assuming

320

/// that the header block of the loop is set to True. It returns the *entry*

321

/// mask for the block BB.

322

VectorParts createBlockInMask(BasicBlock *BB);

323

/// A helper function that computes the predicate of the edge between SRC

324

/// and DST.

325

VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);

326

327

/// A helper function to vectorize a single BB within the innermost loop.

328

void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);

329

330

/// Vectorize a single PHINode in a block. This method handles the induction

331

/// variable canonicalization. It supports both VF = 1 for unrolled loops and

332

/// arbitrary length vectors.

333

void widenPHIInstruction(Instruction *PN, VectorParts &Entry,

334

unsigned UF, unsigned VF, PhiVector *PV);

335

336

/// Insert the new loop to the loop hierarchy and pass manager

337

/// and update the analysis passes.

338

void updateAnalysis();

339

340

/// This instruction is un-vectorizable. Implement it as a sequence

341

/// of scalars. If \p IfPredicateStore is true we need to 'hide' each

342

/// scalarized instruction behind an if block predicated on the control

343

/// dependence of the instruction.

344

virtual void scalarizeInstruction(Instruction *Instr,

345

bool IfPredicateStore=false);

346

347

/// Vectorize Load and Store instructions,

348

virtual void vectorizeMemoryInstruction(Instruction *Instr);

349

350

/// Create a broadcast instruction. This method generates a broadcast

351

/// instruction (shuffle) for loop invariant values and for the induction

352

/// value. If this is the induction variable then we extend it to N, N+1, ...

353

/// this is needed because each iteration in the loop corresponds to a SIMD

354

/// element.

355

virtual Value *getBroadcastInstrs(Value *V);

356

357

/// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)

358

/// to each vector element of Val. The sequence starts at StartIndex.

359

virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step);

360

361

/// When we go over instructions in the basic block we rely on previous

362

/// values within the current basic block or on loop invariant values.

363

/// When we widen (vectorize) values we place them in the map. If the values

364

/// are not within the map, they have to be loop invariant, so we simply

365

/// broadcast them into a vector.

366

VectorParts &getVectorValue(Value *V);

367

368

/// Try to vectorize the interleaved access group that \p Instr belongs to.

369

void vectorizeInterleaveGroup(Instruction *Instr);

370

371

/// Generate a shuffle sequence that will reverse the vector Vec.

372

virtual Value *reverseVector(Value *Vec);

373

374

/// This is a helper class that holds the vectorizer state. It maps scalar

375

/// instructions to vector instructions. When the code is 'unrolled' then

376

/// then a single scalar value is mapped to multiple vector parts. The parts

377

/// are stored in the VectorPart type.

378

struct ValueMap {

379

/// C'tor. UnrollFactor controls the number of vectors ('parts') that

380

/// are mapped.

381

ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}

382

383

/// \return True if 'Key' is saved in the Value Map.

384

bool has(Value *Key) const { return MapStorage.count(Key); }

385

386

/// Initializes a new entry in the map. Sets all of the vector parts to the

387

/// save value in 'Val'.

388

/// \return A reference to a vector with splat values.

389

VectorParts &splat(Value *Key, Value *Val) {

390

VectorParts &Entry = MapStorage[Key];

391

Entry.assign(UF, Val);

392

return Entry;

393

}

394

395

///\return A reference to the value that is stored at 'Key'.

396

VectorParts &get(Value *Key) {

397

VectorParts &Entry = MapStorage[Key];

398

if (Entry.empty())

399

Entry.resize(UF);

400

assert(Entry.size() == UF)((Entry.size() == UF) ? static_cast<void> (0) : __assert_fail
("Entry.size() == UF", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 400, __PRETTY_FUNCTION__));

401

return Entry;

402

}

403

404

private:

405

/// The unroll factor. Each entry in the map stores this number of vector

406

/// elements.

407

unsigned UF;

408

409

/// Map storage. We use std::map and not DenseMap because insertions to a

410

/// dense map invalidates its iterators.

411

std::map<Value *, VectorParts> MapStorage;

412

};

413

414

/// The original loop.

415

Loop *OrigLoop;

416

/// Scev analysis to use.

417

ScalarEvolution *SE;

418

/// Loop Info.

419

LoopInfo *LI;

420

/// Dominator Tree.

421

DominatorTree *DT;

422

/// Alias Analysis.

423

AliasAnalysis *AA;

424

/// Target Library Info.

425

const TargetLibraryInfo *TLI;

426

/// Target Transform Info.

427

const TargetTransformInfo *TTI;

428

429

/// The vectorization SIMD factor to use. Each vector will have this many

430

/// vector elements.

431

unsigned VF;

432

433

protected:

434

/// The vectorization unroll factor to use. Each scalar is vectorized to this

435

/// many different vector instructions.

436

unsigned UF;

437

438

/// The builder that we use

439

IRBuilder<> Builder;

440

441

// --- Vectorization state ---

442

443

/// The vector-loop preheader.

444

BasicBlock *LoopVectorPreHeader;

445

/// The scalar-loop preheader.

446

BasicBlock *LoopScalarPreHeader;

447

/// Middle Block between the vector and the scalar.

448

BasicBlock *LoopMiddleBlock;

449

///The ExitBlock of the scalar loop.

450

BasicBlock *LoopExitBlock;

451

///The vector loop body.

452

SmallVector<BasicBlock *, 4> LoopVectorBody;

453

///The scalar loop body.

454

BasicBlock *LoopScalarBody;

455

/// A list of all bypass blocks. The first block is the entry of the loop.

456

SmallVector<BasicBlock *, 4> LoopBypassBlocks;

457

458

/// The new Induction variable which was added to the new block.

459

PHINode *Induction;

460

/// The induction variable of the old basic block.

461

PHINode *OldInduction;

462

/// Holds the extended (to the widest induction type) start index.

463

Value *ExtendedIdx;

464

/// Maps scalars to widened vectors.

465

ValueMap WidenMap;

466

EdgeMaskCache MaskCache;

467

468

LoopVectorizationLegality *Legal;

469

470

// Record whether runtime check is added.

471

bool AddedSafetyChecks;

472

};

473

474

class InnerLoopUnroller : public InnerLoopVectorizer {

475

public:

476

InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,

477

DominatorTree *DT, const TargetLibraryInfo *TLI,

478

const TargetTransformInfo *TTI, unsigned UnrollFactor)

479

: InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {}

480

481

private:

482

void scalarizeInstruction(Instruction *Instr,

483

bool IfPredicateStore = false) override;

484

void vectorizeMemoryInstruction(Instruction *Instr) override;

485

Value *getBroadcastInstrs(Value *V) override;

486

Value *getStepVector(Value *Val, int StartIdx, Value *Step) override;

487

Value *reverseVector(Value *Vec) override;

488

};

489

490

/// \brief Look for a meaningful debug location on the instruction or it's

491

/// operands.

492

static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {

493

if (!I)

494

return I;

495

496

DebugLoc Empty;

497

if (I->getDebugLoc() != Empty)

498

return I;

499

500

for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {

501

if (Instruction *OpInst = dyn_cast<Instruction>(*OI))

502

if (OpInst->getDebugLoc() != Empty)

503

return OpInst;

504

}

505

506

return I;

507

}

508

509

/// \brief Set the debug location in the builder using the debug location in the

510

/// instruction.

511

static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {

512

if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr))

513

B.SetCurrentDebugLocation(Inst->getDebugLoc());

514

else

515

B.SetCurrentDebugLocation(DebugLoc());

516

}

517

518

#ifndef NDEBUG

519

/// \return string containing a file name and a line # for the given loop.

520

static std::string getDebugLocString(const Loop *L) {

521

std::string Result;

522

if (L) {

523

raw_string_ostream OS(Result);

524

if (const DebugLoc LoopDbgLoc = L->getStartLoc())

525

LoopDbgLoc.print(OS);

526

else

527

// Just print the module name.

528

OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();

529

OS.flush();

530

}

531

return Result;

532

}

533

#endif

534

535

/// \brief Propagate known metadata from one instruction to another.

536

static void propagateMetadata(Instruction *To, const Instruction *From) {

537

SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;

538

From->getAllMetadataOtherThanDebugLoc(Metadata);

539

540

for (auto M : Metadata) {

541

unsigned Kind = M.first;

542

543

// These are safe to transfer (this is safe for TBAA, even when we

544

// if-convert, because should that metadata have had a control dependency

545

// on the condition, and thus actually aliased with some other

546

// non-speculated memory access when the condition was false, this would be

547

// caught by the runtime overlap checks).

548

if (Kind != LLVMContext::MD_tbaa &&

549

Kind != LLVMContext::MD_alias_scope &&

550

Kind != LLVMContext::MD_noalias &&

551

Kind != LLVMContext::MD_fpmath)

552

continue;

553

554

To->setMetadata(Kind, M.second);

555

}

556

}

557

558

/// \brief Propagate known metadata from one instruction to a vector of others.

559

static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *From) {

560

for (Value *V : To)

561

if (Instruction *I = dyn_cast<Instruction>(V))

562

propagateMetadata(I, From);

563

}

564

565

/// \brief The group of interleaved loads/stores sharing the same stride and

566

/// close to each other.

567

///

568

/// Each member in this group has an index starting from 0, and the largest

569

/// index should be less than interleaved factor, which is equal to the absolute

570

/// value of the access's stride.

571

///

572

/// E.g. An interleaved load group of factor 4:

573

/// for (unsigned i = 0; i < 1024; i+=4) {

574

/// a = A[i]; // Member of index 0

575

/// b = A[i+1]; // Member of index 1

576

/// d = A[i+3]; // Member of index 3

577

/// ...

578

/// }

579

///

580

/// An interleaved store group of factor 4:

581

/// for (unsigned i = 0; i < 1024; i+=4) {

582

/// ...

583

/// A[i] = a; // Member of index 0

584

/// A[i+1] = b; // Member of index 1

585

/// A[i+2] = c; // Member of index 2

586

/// A[i+3] = d; // Member of index 3

587

/// }

588

///

589

/// Note: the interleaved load group could have gaps (missing members), but

590

/// the interleaved store group doesn't allow gaps.

591

class InterleaveGroup {

592

public:

593

InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)

594

: Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) {

595

assert(Align && "The alignment should be non-zero")((Align && "The alignment should be non-zero") ? static_cast
<void> (0) : __assert_fail ("Align && \"The alignment should be non-zero\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 595, __PRETTY_FUNCTION__));

596

597

Factor = std::abs(Stride);

598

assert(Factor > 1 && "Invalid interleave factor")((Factor > 1 && "Invalid interleave factor") ? static_cast
<void> (0) : __assert_fail ("Factor > 1 && \"Invalid interleave factor\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 598, __PRETTY_FUNCTION__));

599

600

Reverse = Stride < 0;

601

Members[0] = Instr;

602

}

603

604

bool isReverse() const { return Reverse; }

605

unsigned getFactor() const { return Factor; }

606

unsigned getAlignment() const { return Align; }

607

unsigned getNumMembers() const { return Members.size(); }

608

609

/// \brief Try to insert a new member \p Instr with index \p Index and

610

/// alignment \p NewAlign. The index is related to the leader and it could be

611

/// negative if it is the new leader.

612

///

613

/// \returns false if the instruction doesn't belong to the group.

614

bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {

615

assert(NewAlign && "The new member's alignment should be non-zero")((NewAlign && "The new member's alignment should be non-zero"
) ? static_cast<void> (0) : __assert_fail ("NewAlign && \"The new member's alignment should be non-zero\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 615, __PRETTY_FUNCTION__));

616

617

int Key = Index + SmallestKey;

618

619

// Skip if there is already a member with the same index.

620

if (Members.count(Key))

621

return false;

622

623

if (Key > LargestKey) {

624

// The largest index is always less than the interleave factor.

625

if (Index >= static_cast<int>(Factor))

626

return false;

627

628

LargestKey = Key;

629

} else if (Key < SmallestKey) {

630

// The largest index is always less than the interleave factor.

631

if (LargestKey - Key >= static_cast<int>(Factor))

632

return false;

633

634

SmallestKey = Key;

635

}

636

637

// It's always safe to select the minimum alignment.

638

Align = std::min(Align, NewAlign);

639

Members[Key] = Instr;

640

return true;

641

}

642

643

/// \brief Get the member with the given index \p Index

644

///

645

/// \returns nullptr if contains no such member.

646

Instruction *getMember(unsigned Index) const {

647

int Key = SmallestKey + Index;

648

if (!Members.count(Key))

649

return nullptr;

650

651

return Members.find(Key)->second;

652

}

653

654

/// \brief Get the index for the given member. Unlike the key in the member

655

/// map, the index starts from 0.

656

unsigned getIndex(Instruction *Instr) const {

657

for (auto I : Members)

658

if (I.second == Instr)

659

return I.first - SmallestKey;

660

661

llvm_unreachable("InterleaveGroup contains no such member")::llvm::llvm_unreachable_internal("InterleaveGroup contains no such member"
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 661);

662

}

663

664

Instruction *getInsertPos() const { return InsertPos; }

665

void setInsertPos(Instruction *Inst) { InsertPos = Inst; }

666

667

private:

668

unsigned Factor; // Interleave Factor.

669

bool Reverse;

670

unsigned Align;

671

DenseMap<int, Instruction *> Members;

672

int SmallestKey;

673

int LargestKey;

674

675

// To avoid breaking dependences, vectorized instructions of an interleave

676

// group should be inserted at either the first load or the last store in

677

// program order.

678

679

// E.g. %even = load i32 // Insert Position

680

// %add = add i32 %even // Use of %even

681

// %odd = load i32

682

683

// store i32 %even

684

// %odd = add i32 // Def of %odd

685

// store i32 %odd // Insert Position

686

Instruction *InsertPos;

687

};

688

689

/// \brief Drive the analysis of interleaved memory accesses in the loop.

690

///

691

/// Use this class to analyze interleaved accesses only when we can vectorize

692

/// a loop. Otherwise it's meaningless to do analysis as the vectorization

693

/// on interleaved accesses is unsafe.

694

///

695

/// The analysis collects interleave groups and records the relationships

696

/// between the member and the group in a map.

697

class InterleavedAccessInfo {

698

public:

699

InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT)

700

: SE(SE), TheLoop(L), DT(DT) {}

701

702

~InterleavedAccessInfo() {

703

SmallSet<InterleaveGroup *, 4> DelSet;

704

// Avoid releasing a pointer twice.

705

for (auto &I : InterleaveGroupMap)

706

DelSet.insert(I.second);

707

for (auto *Ptr : DelSet)

708

delete Ptr;

709

}

710

711

/// \brief Analyze the interleaved accesses and collect them in interleave

712

/// groups. Substitute symbolic strides using \p Strides.

713

void analyzeInterleaving(const ValueToValueMap &Strides);

714

715

/// \brief Check if \p Instr belongs to any interleave group.

716

bool isInterleaved(Instruction *Instr) const {

717

return InterleaveGroupMap.count(Instr);

718

}

719

720

/// \brief Get the interleave group that \p Instr belongs to.

721

///

722

/// \returns nullptr if doesn't have such group.

723

InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {

724

if (InterleaveGroupMap.count(Instr))

725

return InterleaveGroupMap.find(Instr)->second;

726

return nullptr;

727

}

728

729

private:

730

ScalarEvolution *SE;

731

Loop *TheLoop;

732

DominatorTree *DT;

733

734

/// Holds the relationships between the members and the interleave group.

735

DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;

736

737

/// \brief The descriptor for a strided memory access.

738

struct StrideDescriptor {

739

StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size,

740

unsigned Align)

741

: Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}

742

743

StrideDescriptor() : Stride(0), Scev(nullptr), Size(0), Align(0) {}

744

745

int Stride; // The access's stride. It is negative for a reverse access.

746

const SCEV *Scev; // The scalar expression of this access

747

unsigned Size; // The size of the memory object.

748

unsigned Align; // The alignment of this access.

749

};

750

751

/// \brief Create a new interleave group with the given instruction \p Instr,

752

/// stride \p Stride and alignment \p Align.

753

///

754

/// \returns the newly created interleave group.

755

InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,

756

unsigned Align) {

757

assert(!InterleaveGroupMap.count(Instr) &&((!InterleaveGroupMap.count(Instr) && "Already in an interleaved access group"
) ? static_cast<void> (0) : __assert_fail ("!InterleaveGroupMap.count(Instr) && \"Already in an interleaved access group\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 758, __PRETTY_FUNCTION__))

758

"Already in an interleaved access group")((!InterleaveGroupMap.count(Instr) && "Already in an interleaved access group"
) ? static_cast<void> (0) : __assert_fail ("!InterleaveGroupMap.count(Instr) && \"Already in an interleaved access group\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 758, __PRETTY_FUNCTION__));

759

InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align);

760

return InterleaveGroupMap[Instr];

761

}

762

763

/// \brief Release the group and remove all the relationships.

764

void releaseGroup(InterleaveGroup *Group) {

765

for (unsigned i = 0; i < Group->getFactor(); i++)

766

if (Instruction *Member = Group->getMember(i))

767

InterleaveGroupMap.erase(Member);

768

769

delete Group;

770

}

771

772

/// \brief Collect all the accesses with a constant stride in program order.

773

void collectConstStridedAccesses(

774

MapVector<Instruction *, StrideDescriptor> &StrideAccesses,

775

const ValueToValueMap &Strides);

776

};

777

778

/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and

779

/// to what vectorization factor.

780

/// This class does not look at the profitability of vectorization, only the

781

/// legality. This class has two main kinds of checks:

782

/// * Memory checks - The code in canVectorizeMemory checks if vectorization

783

/// will change the order of memory accesses in a way that will change the

784

/// correctness of the program.

785

/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory

786

/// checks for a number of different conditions, such as the availability of a

787

/// single induction variable, that all types are supported and vectorize-able,

788

/// etc. This code reflects the capabilities of InnerLoopVectorizer.

789

/// This class is also used by InnerLoopVectorizer for identifying

790

/// induction variable and the different reduction variables.

791

class LoopVectorizationLegality {

792

public:

793

LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DominatorTree *DT,

794

TargetLibraryInfo *TLI, AliasAnalysis *AA,

795

Function *F, const TargetTransformInfo *TTI,

796

LoopAccessAnalysis *LAA)

797

: NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F),

798

TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(SE, L, DT),

799

Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {}

800

801

/// This enum represents the kinds of inductions that we support.

802

enum InductionKind {

803

IK_NoInduction, ///< Not an induction variable.

804

IK_IntInduction, ///< Integer induction variable. Step = C.

805

IK_PtrInduction ///< Pointer induction var. Step = C / sizeof(elem).

806

};

807

808

/// A struct for saving information about induction variables.

809

struct InductionInfo {

810

InductionInfo(Value *Start, InductionKind K, ConstantInt *Step)

811

: StartValue(Start), IK(K), StepValue(Step) {

812

assert(IK != IK_NoInduction && "Not an induction")((IK != IK_NoInduction && "Not an induction") ? static_cast
<void> (0) : __assert_fail ("IK != IK_NoInduction && \"Not an induction\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 812, __PRETTY_FUNCTION__));

813

assert(StartValue && "StartValue is null")((StartValue && "StartValue is null") ? static_cast<
void> (0) : __assert_fail ("StartValue && \"StartValue is null\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 813, __PRETTY_FUNCTION__));

814

assert(StepValue && !StepValue->isZero() && "StepValue is zero")((StepValue && !StepValue->isZero() && "StepValue is zero"
) ? static_cast<void> (0) : __assert_fail ("StepValue && !StepValue->isZero() && \"StepValue is zero\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 814, __PRETTY_FUNCTION__));

815

assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) &&(((IK != IK_PtrInduction || StartValue->getType()->isPointerTy
()) && "StartValue is not a pointer for pointer induction"
) ? static_cast<void> (0) : __assert_fail ("(IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) && \"StartValue is not a pointer for pointer induction\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 816, __PRETTY_FUNCTION__))

816

"StartValue is not a pointer for pointer induction")(((IK != IK_PtrInduction || StartValue->getType()->isPointerTy
()) && "StartValue is not a pointer for pointer induction"
) ? static_cast<void> (0) : __assert_fail ("(IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) && \"StartValue is not a pointer for pointer induction\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 816, __PRETTY_FUNCTION__));

817

assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) &&(((IK != IK_IntInduction || StartValue->getType()->isIntegerTy
()) && "StartValue is not an integer for integer induction"
) ? static_cast<void> (0) : __assert_fail ("(IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) && \"StartValue is not an integer for integer induction\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 818, __PRETTY_FUNCTION__))

818

"StartValue is not an integer for integer induction")(((IK != IK_IntInduction || StartValue->getType()->isIntegerTy
()) && "StartValue is not an integer for integer induction"
) ? static_cast<void> (0) : __assert_fail ("(IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) && \"StartValue is not an integer for integer induction\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 818, __PRETTY_FUNCTION__));

819

assert(StepValue->getType()->isIntegerTy() &&((StepValue->getType()->isIntegerTy() && "StepValue is not an integer"
) ? static_cast<void> (0) : __assert_fail ("StepValue->getType()->isIntegerTy() && \"StepValue is not an integer\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 820, __PRETTY_FUNCTION__))

820

"StepValue is not an integer")((StepValue->getType()->isIntegerTy() && "StepValue is not an integer"
) ? static_cast<void> (0) : __assert_fail ("StepValue->getType()->isIntegerTy() && \"StepValue is not an integer\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 820, __PRETTY_FUNCTION__));

821

}

822

InductionInfo()

823

: StartValue(nullptr), IK(IK_NoInduction), StepValue(nullptr) {}

824

825

/// Get the consecutive direction. Returns:

826

/// 0 - unknown or non-consecutive.

827

/// 1 - consecutive and increasing.

828

/// -1 - consecutive and decreasing.

829

int getConsecutiveDirection() const {

830

if (StepValue && (StepValue->isOne() || StepValue->isMinusOne()))

831

return StepValue->getSExtValue();

832

return 0;

833

}

834

835

/// Compute the transformed value of Index at offset StartValue using step

836

/// StepValue.

837

/// For integer induction, returns StartValue + Index * StepValue.

838

/// For pointer induction, returns StartValue[Index * StepValue].

839

/// FIXME: The newly created binary instructions should contain nsw/nuw

840

/// flags, which can be found from the original scalar operations.

841

Value *transform(IRBuilder<> &B, Value *Index) const {

842

switch (IK) {

843

case IK_IntInduction:

844

assert(Index->getType() == StartValue->getType() &&((Index->getType() == StartValue->getType() && "Index type does not match StartValue type"
) ? static_cast<void> (0) : __assert_fail ("Index->getType() == StartValue->getType() && \"Index type does not match StartValue type\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 845, __PRETTY_FUNCTION__))

845

"Index type does not match StartValue type")((Index->getType() == StartValue->getType() && "Index type does not match StartValue type"
) ? static_cast<void> (0) : __assert_fail ("Index->getType() == StartValue->getType() && \"Index type does not match StartValue type\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 845, __PRETTY_FUNCTION__));

846

if (StepValue->isMinusOne())

847

return B.CreateSub(StartValue, Index);

848

if (!StepValue->isOne())

849

Index = B.CreateMul(Index, StepValue);

850

return B.CreateAdd(StartValue, Index);

851

852

case IK_PtrInduction:

853

assert(Index->getType() == StepValue->getType() &&((Index->getType() == StepValue->getType() && "Index type does not match StepValue type"
) ? static_cast<void> (0) : __assert_fail ("Index->getType() == StepValue->getType() && \"Index type does not match StepValue type\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 854, __PRETTY_FUNCTION__))

854

"Index type does not match StepValue type")((Index->getType() == StepValue->getType() && "Index type does not match StepValue type"
) ? static_cast<void> (0) : __assert_fail ("Index->getType() == StepValue->getType() && \"Index type does not match StepValue type\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 854, __PRETTY_FUNCTION__));

855

if (StepValue->isMinusOne())

856

Index = B.CreateNeg(Index);

857

else if (!StepValue->isOne())

858

Index = B.CreateMul(Index, StepValue);

859

return B.CreateGEP(nullptr, StartValue, Index);

860

861

case IK_NoInduction:

862

return nullptr;

863

}

864

llvm_unreachable("invalid enum")::llvm::llvm_unreachable_internal("invalid enum", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 864);

865

}

866

867

/// Start value.

868

TrackingVH<Value> StartValue;

869

/// Induction kind.

870

InductionKind IK;

871

/// Step value.

872

ConstantInt *StepValue;

873

};

874

875

/// ReductionList contains the reduction descriptors for all

876

/// of the reductions that were found in the loop.

877

typedef DenseMap<PHINode *, RecurrenceDescriptor> ReductionList;

878

879

/// InductionList saves induction variables and maps them to the

880

/// induction descriptor.

881

typedef MapVector<PHINode*, InductionInfo> InductionList;

882

883

/// Returns true if it is legal to vectorize this loop.

884

/// This does not mean that it is profitable to vectorize this

885

/// loop, only that it is legal to do so.

886

bool canVectorize();

887

888

/// Returns the Induction variable.

889

PHINode *getInduction() { return Induction; }

890

891

/// Returns the reduction variables found in the loop.

892

ReductionList *getReductionVars() { return &Reductions; }

893

894

/// Returns the induction variables found in the loop.

895

InductionList *getInductionVars() { return &Inductions; }

896

897

/// Returns the widest induction type.

898

Type *getWidestInductionType() { return WidestIndTy; }

899

900

/// Returns True if V is an induction variable in this loop.

901

bool isInductionVariable(const Value *V);

902

903

/// Return true if the block BB needs to be predicated in order for the loop

904

/// to be vectorized.

905

bool blockNeedsPredication(BasicBlock *BB);

906

907

/// Check if this pointer is consecutive when vectorizing. This happens

908

/// when the last index of the GEP is the induction variable, or that the

909

/// pointer itself is an induction variable.

910

/// This check allows us to vectorize A[idx] into a wide load/store.

911

/// Returns:

912

/// 0 - Stride is unknown or non-consecutive.

913

/// 1 - Address is consecutive.

914

/// -1 - Address is consecutive, and decreasing.

915

int isConsecutivePtr(Value *Ptr);

916

917

/// Returns true if the value V is uniform within the loop.

918

bool isUniform(Value *V);

919

920

/// Returns true if this instruction will remain scalar after vectorization.

921

bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }

922

923

/// Returns the information that we collected about runtime memory check.

924

const LoopAccessInfo::RuntimePointerCheck *getRuntimePointerCheck() const {

925

return LAI->getRuntimePointerCheck();

926

}

927

928

const LoopAccessInfo *getLAI() const {

929

return LAI;

930

}

931

932

/// \brief Check if \p Instr belongs to any interleaved access group.

933

bool isAccessInterleaved(Instruction *Instr) {

934

return InterleaveInfo.isInterleaved(Instr);

935

}

936

937

/// \brief Get the interleaved access group that \p Instr belongs to.

938

const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {

939

return InterleaveInfo.getInterleaveGroup(Instr);

940

}

941

942

unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }

943

944

bool hasStride(Value *V) { return StrideSet.count(V); }

945

bool mustCheckStrides() { return !StrideSet.empty(); }

946

SmallPtrSet<Value *, 8>::iterator strides_begin() {

947

return StrideSet.begin();

948

}

949

SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); }

950

951

/// Returns true if the target machine supports masked store operation

952

/// for the given \p DataType and kind of access to \p Ptr.

953

bool isLegalMaskedStore(Type *DataType, Value *Ptr) {

954

return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr));

955

}

956

/// Returns true if the target machine supports masked load operation

957

/// for the given \p DataType and kind of access to \p Ptr.

958

bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {

959

return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr));

960

}

961

/// Returns true if vector representation of the instruction \p I

962

/// requires mask.

963

bool isMaskRequired(const Instruction* I) {

964

return (MaskedOp.count(I) != 0);

965

}

966

unsigned getNumStores() const {

967

return LAI->getNumStores();

968

}

969

unsigned getNumLoads() const {

970

return LAI->getNumLoads();

971

}

972

unsigned getNumPredStores() const {

973

return NumPredStores;

974

}

975

private:

976

/// Check if a single basic block loop is vectorizable.

977

/// At this point we know that this is a loop with a constant trip count

978

/// and we only need to check individual instructions.

979

bool canVectorizeInstrs();

980

981

/// When we vectorize loops we may change the order in which

982

/// we read and write from memory. This method checks if it is

983

/// legal to vectorize the code, considering only memory constrains.

984

/// Returns true if the loop is vectorizable

985

bool canVectorizeMemory();

986

987

/// Return true if we can vectorize this loop using the IF-conversion

988

/// transformation.

989

bool canVectorizeWithIfConvert();

990

991

/// Collect the variables that need to stay uniform after vectorization.

992

void collectLoopUniforms();

993

994

/// Return true if all of the instructions in the block can be speculatively

995

/// executed. \p SafePtrs is a list of addresses that are known to be legal

996

/// and we know that we can read from them without segfault.

997

bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);

998

999

/// Returns the induction kind of Phi and record the step. This function may

1000

/// return NoInduction if the PHI is not an induction variable.

1001

InductionKind isInductionVariable(PHINode *Phi, ConstantInt *&StepValue);

1002

1003

/// \brief Collect memory access with loop invariant strides.

1004

///

1005

/// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop

1006

/// invariant.

1007

void collectStridedAccess(Value *LoadOrStoreInst);

1008

1009

/// Report an analysis message to assist the user in diagnosing loops that are

1010

/// not vectorized. These are handled as LoopAccessReport rather than

1011

/// VectorizationReport because the << operator of VectorizationReport returns

1012

/// LoopAccessReport.

1013

void emitAnalysis(const LoopAccessReport &Message) {

1014

LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME"loop-vectorize");

1015

}

1016

1017

unsigned NumPredStores;

1018

1019

/// The loop that we evaluate.

1020

Loop *TheLoop;

1021

/// Scev analysis.

1022

ScalarEvolution *SE;

1023

/// Target Library Info.

1024

TargetLibraryInfo *TLI;

1025

/// Parent function

1026

Function *TheFunction;

1027

/// Target Transform Info

1028

const TargetTransformInfo *TTI;

1029

/// Dominator Tree.

1030

DominatorTree *DT;

1031

// LoopAccess analysis.

1032

LoopAccessAnalysis *LAA;

1033

// And the loop-accesses info corresponding to this loop. This pointer is

1034

// null until canVectorizeMemory sets it up.

1035

const LoopAccessInfo *LAI;

1036

1037

/// The interleave access information contains groups of interleaved accesses

1038

/// with the same stride and close to each other.

1039

InterleavedAccessInfo InterleaveInfo;

1040

1041

// --- vectorization state --- //

1042

1043

/// Holds the integer induction variable. This is the counter of the

1044

/// loop.

1045

PHINode *Induction;

1046

/// Holds the reduction variables.

1047

ReductionList Reductions;

1048

/// Holds all of the induction variables that we found in the loop.

1049

/// Notice that inductions don't need to start at zero and that induction

1050

/// variables can be pointers.

1051

InductionList Inductions;

1052

/// Holds the widest induction type encountered.

1053

Type *WidestIndTy;

1054

1055

/// Allowed outside users. This holds the reduction

1056

/// vars which can be accessed from outside the loop.

1057

SmallPtrSet<Value*, 4> AllowedExit;

1058

/// This set holds the variables which are known to be uniform after

1059

/// vectorization.

1060

SmallPtrSet<Instruction*, 4> Uniforms;

1061

1062

/// Can we assume the absence of NaNs.

1063

bool HasFunNoNaNAttr;

1064

1065

ValueToValueMap Strides;

1066

SmallPtrSet<Value *, 8> StrideSet;

1067

1068

/// While vectorizing these instructions we have to generate a

1069

/// call to the appropriate masked intrinsic

1070

SmallPtrSet<const Instruction*, 8> MaskedOp;

1071

};

1072

1073

/// LoopVectorizationCostModel - estimates the expected speedups due to

1074

/// vectorization.

1075

/// In many cases vectorization is not profitable. This can happen because of

1076

/// a number of reasons. In this class we mainly attempt to predict the

1077

/// expected speedup/slowdowns due to the supported instruction set. We use the

1078

/// TargetTransformInfo to query the different backends for the cost of

1079

/// different operations.

1080

class LoopVectorizationCostModel {

1081

public:

1082

LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,

1083

LoopVectorizationLegality *Legal,

1084

const TargetTransformInfo &TTI,

1085

const TargetLibraryInfo *TLI, AssumptionCache *AC,

1086

const Function *F, const LoopVectorizeHints *Hints)

1087

: TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI),

1088

TheFunction(F), Hints(Hints) {

1089

CodeMetrics::collectEphemeralValues(L, AC, EphValues);

1090

}

1091

1092

/// Information about vectorization costs

1093

struct VectorizationFactor {

1094

unsigned Width; // Vector width with best cost

1095

unsigned Cost; // Cost of the loop with that width

1096

};

1097

/// \return The most profitable vectorization factor and the cost of that VF.

1098

/// This method checks every power of two up to VF. If UserVF is not ZERO

1099

/// then this vectorization factor will be selected if vectorization is

1100

/// possible.

1101

VectorizationFactor selectVectorizationFactor(bool OptForSize);

1102

1103

/// \return The size (in bits) of the widest type in the code that

1104

/// needs to be vectorized. We ignore values that remain scalar such as

1105

/// 64 bit loop indices.

1106

unsigned getWidestType();

1107

1108

/// \return The most profitable unroll factor.

1109

/// If UserUF is non-zero then this method finds the best unroll-factor

1110

/// based on register pressure and other parameters.

1111

/// VF and LoopCost are the selected vectorization factor and the cost of the

1112

/// selected VF.

1113

unsigned selectUnrollFactor(bool OptForSize, unsigned VF, unsigned LoopCost);

1114

1115

/// \brief A struct that represents some properties of the register usage

1116

/// of a loop.

1117

struct RegisterUsage {

1118

/// Holds the number of loop invariant values that are used in the loop.

1119

unsigned LoopInvariantRegs;

1120

/// Holds the maximum number of concurrent live intervals in the loop.

1121

unsigned MaxLocalUsers;

1122

/// Holds the number of instructions in the loop.

1123

unsigned NumInstructions;

1124

};

1125

1126

/// \return information about the register usage of the loop.

1127

RegisterUsage calculateRegisterUsage();

1128

1129

private:

1130

/// Returns the expected execution cost. The unit of the cost does

1131

/// not matter because we use the 'cost' units to compare different

1132

/// vector widths. The cost that is returned is *not* normalized by

1133

/// the factor width.

1134

unsigned expectedCost(unsigned VF);

1135

1136

/// Returns the execution time cost of an instruction for a given vector

1137

/// width. Vector width of one means scalar.

1138

unsigned getInstructionCost(Instruction *I, unsigned VF);

1139

1140

/// Returns whether the instruction is a load or store and will be a emitted

1141

/// as a vector operation.

1142

bool isConsecutiveLoadOrStore(Instruction *I);

1143

1144

/// Report an analysis message to assist the user in diagnosing loops that are

1145

/// not vectorized. These are handled as LoopAccessReport rather than

1146

/// VectorizationReport because the << operator of VectorizationReport returns

1147

/// LoopAccessReport.

1148

void emitAnalysis(const LoopAccessReport &Message) {

1149

LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME"loop-vectorize");

1150

}

1151

1152

/// Values used only by @llvm.assume calls.

1153

SmallPtrSet<const Value *, 32> EphValues;

1154

1155

/// The loop that we evaluate.

1156

Loop *TheLoop;

1157

/// Scev analysis.

1158

ScalarEvolution *SE;

1159

/// Loop Info analysis.

1160

LoopInfo *LI;

1161

/// Vectorization legality.

1162

LoopVectorizationLegality *Legal;

1163

/// Vector target information.

1164

const TargetTransformInfo &TTI;

1165

/// Target Library Info.

1166

const TargetLibraryInfo *TLI;

1167

const Function *TheFunction;

1168

// Loop Vectorize Hint.

1169

const LoopVectorizeHints *Hints;

1170

};

1171

1172

/// Utility class for getting and setting loop vectorizer hints in the form

1173

/// of loop metadata.

1174

/// This class keeps a number of loop annotations locally (as member variables)

1175

/// and can, upon request, write them back as metadata on the loop. It will

1176

/// initially scan the loop for existing metadata, and will update the local

1177

/// values based on information in the loop.

1178

/// We cannot write all values to metadata, as the mere presence of some info,

1179

/// for example 'force', means a decision has been made. So, we need to be

1180

/// careful NOT to add them if the user hasn't specifically asked so.

1181

class LoopVectorizeHints {

1182

enum HintKind {

1183

HK_WIDTH,

1184

HK_UNROLL,

1185

HK_FORCE

1186

};

1187

1188

/// Hint - associates name and validation with the hint value.

1189

struct Hint {

1190

const char * Name;

1191

unsigned Value; // This may have to change for non-numeric values.

1192

HintKind Kind;

1193

1194

Hint(const char * Name, unsigned Value, HintKind Kind)

1195

: Name(Name), Value(Value), Kind(Kind) { }

1196

1197

bool validate(unsigned Val) {

1198

switch (Kind) {

1199

case HK_WIDTH:

1200

return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;

1201

case HK_UNROLL:

1202

return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;

1203

case HK_FORCE:

1204

return (Val <= 1);

1205

}

1206

return false;

1207

}

1208

};

1209

1210

/// Vectorization width.

1211

Hint Width;

1212

/// Vectorization interleave factor.

1213

Hint Interleave;

1214

/// Vectorization forced

1215

Hint Force;

1216

1217

/// Return the loop metadata prefix.

1218

static StringRef Prefix() { return "llvm.loop."; }

1219

1220

public:

1221

enum ForceKind {

1222

FK_Undefined = -1, ///< Not selected.

1223

FK_Disabled = 0, ///< Forcing disabled.

1224

FK_Enabled = 1, ///< Forcing enabled.

1225

};

1226

1227

LoopVectorizeHints(const Loop *L, bool DisableInterleaving)

1228

: Width("vectorize.width", VectorizerParams::VectorizationFactor,

1229

HK_WIDTH),

1230

Interleave("interleave.count", DisableInterleaving, HK_UNROLL),

1231

Force("vectorize.enable", FK_Undefined, HK_FORCE),

1232

TheLoop(L) {

1233

// Populate values with existing loop metadata.

1234

getHintsFromMetadata();

1235

1236

// force-vector-interleave overrides DisableInterleaving.

1237

if (VectorizerParams::isInterleaveForced())

1238

Interleave.Value = VectorizerParams::VectorizationInterleave;

1239

1240

DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { if (DisableInterleaving && Interleave
.Value == 1) dbgs() << "LV: Interleaving disabled by the pass manager\n"
; } } while (0)

1241

<< "LV: Interleaving disabled by the pass manager\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { if (DisableInterleaving && Interleave
.Value == 1) dbgs() << "LV: Interleaving disabled by the pass manager\n"
; } } while (0);

1242

}

1243

1244

/// Mark the loop L as already vectorized by setting the width to 1.

1245

void setAlreadyVectorized() {

1246

Width.Value = Interleave.Value = 1;

1247

Hint Hints[] = {Width, Interleave};

1248

writeHintsToMetadata(Hints);

1249

}

1250

1251

/// Dumps all the hint information.

1252

std::string emitRemark() const {

1253

VectorizationReport R;

1254

if (Force.Value == LoopVectorizeHints::FK_Disabled)

1255

R << "vectorization is explicitly disabled";

1256

else {

1257

R << "use -Rpass-analysis=loop-vectorize for more info";

1258

if (Force.Value == LoopVectorizeHints::FK_Enabled) {

1259

R << " (Force=true";

1260

if (Width.Value != 0)

1261

R << ", Vector Width=" << Width.Value;

1262

if (Interleave.Value != 0)

1263

R << ", Interleave Count=" << Interleave.Value;

1264

R << ")";

1265

}

1266

}

1267

1268

return R.str();

1269

}

1270

1271

unsigned getWidth() const { return Width.Value; }

1272

unsigned getInterleave() const { return Interleave.Value; }

1273

enum ForceKind getForce() const { return (ForceKind)Force.Value; }

1274

1275

private:

1276

/// Find hints specified in the loop metadata and update local values.

1277

void getHintsFromMetadata() {

1278

MDNode *LoopID = TheLoop->getLoopID();

1279

if (!LoopID)

1280

return;

1281

1282

// First operand should refer to the loop id itself.

1283

assert(LoopID->getNumOperands() > 0 && "requires at least one operand")((LoopID->getNumOperands() > 0 && "requires at least one operand"
) ? static_cast<void> (0) : __assert_fail ("LoopID->getNumOperands() > 0 && \"requires at least one operand\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1283, __PRETTY_FUNCTION__));

1284

assert(LoopID->getOperand(0) == LoopID && "invalid loop id")((LoopID->getOperand(0) == LoopID && "invalid loop id"
) ? static_cast<void> (0) : __assert_fail ("LoopID->getOperand(0) == LoopID && \"invalid loop id\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1284, __PRETTY_FUNCTION__));

1285

1286

for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {

1287

const MDString *S = nullptr;

1288

SmallVector<Metadata *, 4> Args;

1289

1290

// The expected hint is either a MDString or a MDNode with the first

1291

// operand a MDString.

1292

if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {

1293

if (!MD || MD->getNumOperands() == 0)

1294

continue;

1295

S = dyn_cast<MDString>(MD->getOperand(0));

1296

for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)

1297

Args.push_back(MD->getOperand(i));

1298

} else {

1299

S = dyn_cast<MDString>(LoopID->getOperand(i));

1300

assert(Args.size() == 0 && "too many arguments for MDString")((Args.size() == 0 && "too many arguments for MDString"
) ? static_cast<void> (0) : __assert_fail ("Args.size() == 0 && \"too many arguments for MDString\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1300, __PRETTY_FUNCTION__));

1301

}

1302

1303

if (!S)

1304

continue;

1305

1306

// Check if the hint starts with the loop metadata prefix.

1307

StringRef Name = S->getString();

1308

if (Args.size() == 1)

1309

setHint(Name, Args[0]);

1310

}

1311

}

1312

1313

/// Checks string hint with one operand and set value if valid.

1314

void setHint(StringRef Name, Metadata *Arg) {

1315

if (!Name.startswith(Prefix()))

1316

return;

1317

Name = Name.substr(Prefix().size(), StringRef::npos);

1318

1319

const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);

1320

if (!C) return;

1321

unsigned Val = C->getZExtValue();

1322

1323

Hint *Hints[] = {&Width, &Interleave, &Force};

1324

for (auto H : Hints) {

1325

if (Name == H->Name) {

1326

if (H->validate(Val))

1327

H->Value = Val;

1328

else

1329

DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: ignoring invalid hint '"
<< Name << "'\n"; } } while (0);

1330

break;

1331

}

1332

}

1333

}

1334

1335

/// Create a new hint from name / value pair.

1336

MDNode *createHintMetadata(StringRef Name, unsigned V) const {

1337

LLVMContext &Context = TheLoop->getHeader()->getContext();

1338

Metadata *MDs[] = {MDString::get(Context, Name),

1339

ConstantAsMetadata::get(

1340

ConstantInt::get(Type::getInt32Ty(Context), V))};

1341

return MDNode::get(Context, MDs);

1342

}

1343

1344

/// Matches metadata with hint name.

1345

bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {

1346

MDString* Name = dyn_cast<MDString>(Node->getOperand(0));

1347

if (!Name)

1348

return false;

1349

1350

for (auto H : HintTypes)

1351

if (Name->getString().endswith(H.Name))

1352

return true;

1353

return false;

1354

}

1355

1356

/// Sets current hints into loop metadata, keeping other values intact.

1357

void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {

1358

if (HintTypes.size() == 0)

1359

return;

1360

1361

// Reserve the first element to LoopID (see below).

1362

SmallVector<Metadata *, 4> MDs(1);

1363

// If the loop already has metadata, then ignore the existing operands.

1364

MDNode *LoopID = TheLoop->getLoopID();

1365

if (LoopID) {

1366

for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {

1367

MDNode *Node = cast<MDNode>(LoopID->getOperand(i));

1368

// If node in update list, ignore old value.

1369

if (!matchesHintMetadataName(Node, HintTypes))

1370

MDs.push_back(Node);

1371

}

1372

}

1373

1374

// Now, add the missing hints.

1375

for (auto H : HintTypes)

1376

MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));

1377

1378

// Replace current metadata node with new one.

1379

LLVMContext &Context = TheLoop->getHeader()->getContext();

1380

MDNode *NewLoopID = MDNode::get(Context, MDs);

1381

// Set operand 0 to refer to the loop id itself.

1382

NewLoopID->replaceOperandWith(0, NewLoopID);

1383

1384

TheLoop->setLoopID(NewLoopID);

1385

}

1386

1387

/// The loop these hints belong to.

1388

const Loop *TheLoop;

1389

};

1390

1391

static void emitMissedWarning(Function *F, Loop *L,

1392

const LoopVectorizeHints &LH) {

1393

emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE"loop-vectorize", *F,

1394

L->getStartLoc(), LH.emitRemark());

1395

1396

if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {

1397

if (LH.getWidth() != 1)

1398

emitLoopVectorizeWarning(

1399

F->getContext(), *F, L->getStartLoc(),

1400

"failed explicitly specified loop vectorization");

1401

else if (LH.getInterleave() != 1)

1402

emitLoopInterleaveWarning(

1403

F->getContext(), *F, L->getStartLoc(),

1404

"failed explicitly specified loop interleaving");

1405

}

1406

}

1407

1408

/// Appends to \p V every innermost loop nested in \p L (or \p L itself when
/// it has no sub-loops), visiting sub-loops in their iteration order.
static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
  if (L.empty()) {
    // No sub-loops: L is an innermost loop.
    V.push_back(&L);
    return;
  }

  for (Loop *SubLoop : L)
    addInnerLoop(*SubLoop, V);
}

1415

1416

/// The LoopVectorize Pass.

1417

struct LoopVectorize : public FunctionPass {

1418

/// Pass identification, replacement for typeid

1419

static char ID;

1420

1421

explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)

1422

: FunctionPass(ID),

1423

DisableUnrolling(NoUnrolling),

1424

AlwaysVectorize(AlwaysVectorize) {

1425

initializeLoopVectorizePass(*PassRegistry::getPassRegistry());

1426

}

1427

1428

ScalarEvolution *SE;

1429

LoopInfo *LI;

1430

TargetTransformInfo *TTI;

1431

DominatorTree *DT;

1432

BlockFrequencyInfo *BFI;

1433

TargetLibraryInfo *TLI;

1434

AliasAnalysis *AA;

1435

AssumptionCache *AC;

1436

LoopAccessAnalysis *LAA;

1437

bool DisableUnrolling;

1438

bool AlwaysVectorize;

1439

1440

BlockFrequency ColdEntryFreq;

1441

1442

/// Pass entry point: caches the required analyses, collects the innermost
/// loops of \p F and runs processLoop() on each of them.
///
/// \returns true if any loop was changed.
bool runOnFunction(Function &F) override {
  // Cache the analyses used while processing the loops of this function.
  SE = &getAnalysis<ScalarEvolution>();
  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  BFI = &getAnalysis<BlockFrequencyInfo>();

  // Target library info is optional.
  TLI = nullptr;
  if (auto *TLIWrapper = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>())
    TLI = &TLIWrapper->getTLI();

  AA = &getAnalysis<AliasAnalysis>();
  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  LAA = &getAnalysis<LoopAccessAnalysis>();

  // The "cold" threshold is 20% of the function entry frequency. It is
  // computed once here, using BranchProbability for the scaling math.
  const BranchProbability ColdProb(1, 5);
  ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;

  // Nothing to do for targets that report no vector registers.
  if (TTI->getNumberOfRegisters(true) == 0)
    return false;

  // Collect the innermost loops up front: vectorizing or unrolling a loop
  // creates new loops and can invalidate iterators over LoopInfo.
  SmallVector<Loop *, 8> InnerLoops;
  for (Loop *TopLevel : *LI)
    addInnerLoop(*TopLevel, InnerLoops);

  LoopsAnalyzed += InnerLoops.size();

  // Process the collected loops, last one first.
  bool AnyChange = false;
  while (!InnerLoops.empty()) {
    Loop *Candidate = InnerLoops.pop_back_val();
    if (processLoop(Candidate))
      AnyChange = true;
  }

  return AnyChange;
}

1482

1483

/// Attaches "llvm.loop.unroll.runtime.disable" to the loop ID of \p L.
///
/// All existing loop-ID operands are preserved. Nothing is changed when any
/// existing operand is a node whose first operand is a string starting with
/// "llvm.loop.unroll.disable".
static void AddRuntimeUnrollDisableMetaData(Loop *L) {
  SmallVector<Metadata *, 4> MDs;
  // Reserve first location for self reference to the LoopID metadata node.
  MDs.push_back(nullptr);

  bool IsUnrollMetadata = false;
  if (MDNode *LoopID = L->getLoopID()) {
    // Copy the existing operands and look for unroll-disable metadata.
    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
      MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
      // Skip nodes without operands: there is no name to inspect and reading
      // operand 0 would be out of range.
      if (MD && MD->getNumOperands() > 0) {
        const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
        // The flag is sticky: once unroll-disable metadata has been seen,
        // later unrelated operands must not reset it.
        if (S && S->getString().startswith("llvm.loop.unroll.disable"))
          IsUnrollMetadata = true;
      }
      MDs.push_back(LoopID->getOperand(i));
    }
  }

  if (IsUnrollMetadata)
    return;

  // Add runtime unroll disable metadata.
  LLVMContext &Context = L->getHeader()->getContext();
  SmallVector<Metadata *, 1> DisableOperands;
  DisableOperands.push_back(
      MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
  MDNode *DisableNode = MDNode::get(Context, DisableOperands);
  MDs.push_back(DisableNode);

  MDNode *NewLoopID = MDNode::get(Context, MDs);
  // Set operand 0 to refer to the loop id itself.
  NewLoopID->replaceOperandWith(0, NewLoopID);
  L->setLoopID(NewLoopID);
}

1516

1517

/// Tries to vectorize (or at least interleave) the innermost loop \p L.
///
/// The loop is rejected early, with an optimization remark, when the hints
/// disable vectorization, when vectorization is opt-in and not requested,
/// when width and interleave count are both 1, when the constant trip count
/// is tiny and vectorization is not forced, when legality cannot be proven,
/// or when the function carries the NoImplicitFloat attribute. Otherwise the
/// cost model picks a vectorization factor and an interleave factor and the
/// loop is transformed accordingly.
///
/// \returns true if the loop was transformed.
bool processLoop(Loop *L) {
  assert(L->empty() && "Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  DEBUG(dbgs() << "\nLV: Checking a loop in \""
               << L->getHeader()->getParent()->getName() << "\" from "
               << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, DisableUnrolling);

  DEBUG(dbgs() << "LV: Loop hints:"
               << " force="
               << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                       ? "disabled"
                       : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                              ? "enabled"
                              : "?"))
               << " width=" << Hints.getWidth()
               << " unroll=" << Hints.getInterleave() << "\n");

  // Function containing the loop, and its context.
  Function *F = L->getHeader()->getParent();
  LLVMContext &Ctx = F->getContext();

  // Small local helpers to keep the early exits below readable.
  auto IsForced = [&]() {
    return Hints.getForce() == LoopVectorizeHints::FK_Enabled;
  };
  auto RemarkAnalysis = [&](const Twine &Msg) {
    emitOptimizationRemarkAnalysis(Ctx, DEBUG_TYPE, *F, L->getStartLoc(), Msg);
  };

  // Diagnostic output is the only way, short of reading the IR or machine
  // code, to tell whether a loop was vectorized, so every loop gets a
  // remark. Most come from emitOptimizationRemarkAnalysis; the terser
  // emitOptimizationRemark / emitOptimizationRemarkMissed report vectorized
  // loops and loops that may benefit from vectorization, respectively.

  if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) {
    DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
    RemarkAnalysis(Hints.emitRemark());
    return false;
  }

  if (!AlwaysVectorize && !IsForced()) {
    DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
    RemarkAnalysis(Hints.emitRemark());
    return false;
  }

  if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) {
    DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
    RemarkAnalysis("loop not vectorized: vector width and interleave count are "
                   "explicitly set to 1");
    return false;
  }

  // Loops with a known, tiny trip count are not worth vectorizing unless the
  // user forces it.
  const unsigned TC = SE->getSmallConstantTripCount(L);
  if (TC > 0u && TC < TinyTripCountVectorThreshold) {
    DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                 << "This loop is not worth vectorizing.");
    if (!IsForced()) {
      DEBUG(dbgs() << "\n");
      RemarkAnalysis(
          "vectorization is not beneficial and is not explicitly forced");
      return false;
    }
    DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
  }

  // Check if it is legal to vectorize the loop.
  LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA);
  if (!LVL.canVectorize()) {
    DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    emitMissedWarning(F, L, Hints);
    return false;
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, AC, F, &Hints);

  // Optimize for size when the function asks for it and vectorization is not
  // forced.
  bool OptForSize =
      !IsForced() && F->hasFnAttribute(Attribute::OptimizeForSize);

  // Also optimize for size when the loop is entered less often than 20% of
  // the function entry frequency. The loop is canonical at this point
  // because legality has been established.
  // FIXME: This is hidden behind a flag due to pervasive problems with
  // exactly what block frequency models.
  if (LoopVectorizeWithBlockFrequency) {
    BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
    if (!IsForced() && LoopEntryFreq < ColdEntryFreq)
      OptForSize = true;
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
                    "attribute is used.\n");
    RemarkAnalysis("loop not vectorized due to NoImplicitFloat attribute");
    emitMissedWarning(F, L, Hints);
    return false;
  }

  // Select the optimal vectorization factor.
  const LoopVectorizationCostModel::VectorizationFactor VF =
      CM.selectVectorizationFactor(OptForSize);

  // Select the unroll (interleave) factor.
  const unsigned UF = CM.selectUnrollFactor(OptForSize, VF.Width, VF.Cost);

  DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
               << DebugLocStr << '\n');
  DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');

  if (VF.Width != 1) {
    // Vectorization is legal and profitable: do it.
    InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, UF);
    LB.vectorize(&LVL);
    ++LoopsVectorized;

    // Without runtime stride/memory checks the scalar loop is rarely
    // executed, so tell the unroller not to runtime-unroll it.
    if (!LB.IsSafetyChecksAdded())
      AddRuntimeUnrollDisableMetaData(L);

    // Report the vectorization decision.
    emitOptimizationRemark(
        F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
        Twine("vectorized loop (vectorization factor: ") + Twine(VF.Width) +
            ", unrolling interleave factor: " + Twine(UF) + ")");
  } else {
    DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n");

    if (UF == 1) {
      RemarkAnalysis(
          "not beneficial to vectorize and user disabled interleaving");
      return false;
    }
    DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n");

    // Report the unrolling decision.
    emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
                           Twine("unrolled with interleaving factor " +
                                 Twine(UF) +
                                 " (vectorization not beneficial)"));

    // Not vectorizing, but interleaving may still pay off.
    InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, UF);
    Unroller.vectorize(&LVL);
  }

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();

  DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}

1690

1691

/// Declares the analyses and passes this pass requires and the analyses it
/// preserves.
void getAnalysisUsage(AnalysisUsage &AU) const override {
  // Required inputs (registration order is kept as-is).
  AU.addRequired<AssumptionCacheTracker>();
  AU.addRequiredID(LoopSimplifyID);
  AU.addRequiredID(LCSSAID);
  AU.addRequired<BlockFrequencyInfo>();
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<LoopInfoWrapperPass>();
  AU.addRequired<ScalarEvolution>();
  AU.addRequired<TargetTransformInfoWrapperPass>();
  AU.addRequired<AliasAnalysis>();
  AU.addRequired<LoopAccessAnalysis>();

  // Analyses declared as preserved by this pass.
  AU.addPreserved<LoopInfoWrapperPass>();
  AU.addPreserved<DominatorTreeWrapperPass>();
  AU.addPreserved<AliasAnalysis>();
}

1706

1707

};

1708

1709

} // end anonymous namespace

1710

1711

//===----------------------------------------------------------------------===//

1712

// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and

1713

// LoopVectorizationCostModel.

1714

//===----------------------------------------------------------------------===//

1715

1716

/// Creates a vector of VF lanes with every lane set to the scalar \p V.
///
/// When \p V is invariant in the original loop and is not an instruction
/// living in one of the new vector body blocks, the splat is emitted before
/// the terminator of the vector preheader; otherwise it is emitted at the
/// builder's current insertion point. The builder's insertion point is
/// restored on return.
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // Instructions that already live in the new vector body must not be
  // hoisted out of it.
  bool DefinedInVectorBody = false;
  if (Instruction *Instr = dyn_cast<Instruction>(V))
    DefinedInVectorBody =
        std::find(LoopVectorBody.begin(), LoopVectorBody.end(),
                  Instr->getParent()) != LoopVectorBody.end();

  const bool HoistToPreheader =
      OrigLoop->isLoopInvariant(V) && !DefinedInVectorBody;

  // The guard restores the insertion point when this function returns.
  IRBuilder<>::InsertPointGuard Guard(Builder);
  if (HoistToPreheader)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Broadcast the scalar into all lanes of the vector.
  return Builder.CreateVectorSplat(VF, V, "broadcast");
}

1734

1735

/// Returns \p Val + <StartIdx, StartIdx+1, ...> * splat(\p Step).
///
/// \p Val must be an integer vector and \p Step a scalar of its element
/// type. The result has the type of \p Val and is named "induction".
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
                                          Value *Step) {
  assert(Val->getType()->isVectorTy() && "Must be a vector");
  assert(Val->getType()->getScalarType()->isIntegerTy() &&
         "Elem must be an integer");
  assert(Step->getType() == Val->getType()->getScalarType() &&
         "Step has wrong type");

  Type *ElemTy = Val->getType()->getScalarType();
  VectorType *VecTy = cast<VectorType>(Val->getType());
  const int NumLanes = VecTy->getNumElements();

  // Constant vector <StartIdx, StartIdx + 1, ..., StartIdx + NumLanes - 1>.
  // The sum is computed as 'int' on purpose so that a negative start index
  // is converted exactly as before.
  SmallVector<Constant *, 8> LaneIndices;
  for (int Lane = 0; Lane < NumLanes; ++Lane)
    LaneIndices.push_back(ConstantInt::get(ElemTy, StartIdx + Lane));

  Constant *IndexVec = ConstantVector::get(LaneIndices);
  assert(IndexVec->getType() == Val->getType() && "Invalid consecutive vec");

  // Scale each lane index by the step, then offset the incoming value.
  Step = Builder.CreateVectorSplat(NumLanes, Step);
  assert(Step->getType() == Val->getType() && "Invalid step vec");

  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
  // which can be found from the original scalar operations.
  Value *ScaledIndices = Builder.CreateMul(IndexVec, Step);
  return Builder.CreateAdd(Val, ScaledIndices, "induction");
}

1762

1763

/// \brief Find the operand of the GEP that should be checked for consecutive

1764

/// stores. This ignores trailing indices that have no effect on the final

1765

/// pointer.

1766

static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) {

1767

const DataLayout &DL = Gep->getModule()->getDataLayout();

1768

unsigned LastOperand = Gep->getNumOperands() - 1;

1769

unsigned GEPAllocSize = DL.getTypeAllocSize(

1770

cast<PointerType>(Gep->getType()->getScalarType())->getElementType());

1771

1772

// Walk backwards and try to peel off zeros.

1773

while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {

1774

// Find the type we're currently indexing into.

1775

gep_type_iterator GEPTI = gep_type_begin(Gep);

1776

std::advance(GEPTI, LastOperand - 1);

1777

1778

// If it's a type with the same allocation size as the result of the GEP we

1779

// can peel off the zero index.

1780

if (DL.getTypeAllocSize(*GEPTI) != GEPAllocSize)

1781

break;

1782

--LastOperand;

1783

}

1784

1785

return LastOperand;

1786

}

1787

1788

/// Classifies how the pointer \p Ptr advances across loop iterations.
///
/// \returns 1 or -1 when the step of the access is one or all-ones
/// respectively, the direction recorded for an induction PHI when \p Ptr is
/// (or is a loop-invariantly indexed GEP of) such a PHI, and 0 otherwise.
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
  assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");

  // Pointers to aggregates are never treated as consecutive.
  if (Ptr->getType()->getPointerElementType()->isAggregateType())
    return 0;

  // A pointer induction variable carries its own direction.
  if (PHINode *PtrPhi = dyn_cast_or_null<PHINode>(Ptr))
    if (Inductions.count(PtrPhi))
      return Inductions[PtrPhi].getConsecutiveDirection();

  GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
  if (!Gep)
    return 0;

  const unsigned NumOperands = Gep->getNumOperands();
  Value *BasePtr = Gep->getPointerOperand();

  // A GEP whose base is a pointer induction variable is consecutive when all
  // of its indices are loop invariant.
  PHINode *BasePhi = dyn_cast<PHINode>(BasePtr);
  if (BasePhi && Inductions.count(BasePhi)) {
    // The base must not point to an aggregate either.
    PointerType *BasePtrTy = cast<PointerType>(BasePtr->getType());
    if (BasePtrTy->getElementType()->isAggregateType())
      return 0;

    for (unsigned OpIdx = 1; OpIdx < NumOperands; ++OpIdx) {
      const SCEV *IndexExpr = SE->getSCEV(Gep->getOperand(OpIdx));
      if (!SE->isLoopInvariant(IndexExpr, TheLoop))
        return 0;
    }

    return Inductions[BasePhi].getConsecutiveDirection();
  }

  const unsigned InductionOperand = getGEPInductionOperand(Gep);

  // Every operand other than the induction operand must be loop invariant.
  for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
    if (OpIdx == InductionOperand)
      continue;
    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(OpIdx)), TheLoop))
      return 0;
  }

  // Wide loads/stores are only possible when the last non-zero index is the
  // induction variable.
  const SCEV *Last = nullptr;
  if (Strides.count(Gep)) {
    // The multiplication by a symbolic stride may leave a sext/zext around
    // the index. The stride is going to be replaced by 1, so that cast is
    // safe to look through:
    //
    //  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
    //  %0 = trunc i64 %indvars.iv to i32
    //  %mul = mul i32 %0, %Stride1
    //  %idxprom = zext i32 %mul to i64  << Safe cast.
    //  %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom
    //
    Last = replaceSymbolicStrideSCEV(SE, Strides,
                                     Gep->getOperand(InductionOperand), Gep);
    if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(Last)) {
      const bool IsExtension = Cast->getSCEVType() == scSignExtend ||
                               Cast->getSCEVType() == scZeroExtend;
      if (IsExtension)
        Last = Cast->getOperand();
    }
  } else {
    Last = SE->getSCEV(Gep->getOperand(InductionOperand));
  }

  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last);
  if (!AR)
    return 0;

  // The memory is consecutive because the last index is consecutive and all
  // other indices are loop invariant.
  const SCEV *Step = AR->getStepRecurrence(*SE);
  if (Step->isOne())
    return 1;
  if (Step->isAllOnesValue())
    return -1;

  return 0;
}

1871

1872

/// Forwards the uniformity query for \p V to the loop access info (LAI).
bool LoopVectorizationLegality::isUniform(Value *V) {
  const bool Uniform = LAI->isUniform(V);
  return Uniform;
}

1875

1876

/// Returns the widened parts for the scalar \p V.
///
/// A value that Legal reports as a stride is first replaced by the constant
/// 1 of the same type. If the (possibly replaced) value is already in
/// WidenMap its entry is returned; otherwise the value is broadcast and the
/// splat is recorded in WidenMap.
InnerLoopVectorizer::VectorParts&
InnerLoopVectorizer::getVectorValue(Value *V) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");

  // Strides that are being versioned are treated as the constant one.
  Value *Scalar = V;
  if (Legal->hasStride(Scalar))
    Scalar = ConstantInt::get(Scalar->getType(), 1);

  // Unknown scalars are assumed to be constants or loop invariant: broadcast
  // them once and remember the result for future uses.
  if (!WidenMap.has(Scalar)) {
    Value *Broadcast = getBroadcastInstrs(Scalar);
    return WidenMap.splat(Scalar, Broadcast);
  }

  return WidenMap.get(Scalar);
}

1894

1895

/// Emits a shuffle named "reverse" that reverses the first VF lanes of the
/// vector \p Vec, i.e. uses the mask <VF-1, VF-2, ..., 0>.
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
  assert(Vec->getType()->isVectorTy() && "Invalid type");

  // Count the source lane down from VF-1 to 0.
  SmallVector<Constant *, 8> ReversedLanes;
  for (unsigned Remaining = VF; Remaining != 0; --Remaining)
    ReversedLanes.push_back(Builder.getInt32(Remaining - 1));

  Value *Unused = UndefValue::get(Vec->getType());
  Constant *Mask = ConstantVector::get(ReversedLanes);
  return Builder.CreateShuffleVector(Vec, Unused, Mask, "reverse");
}

1905

1906

// Get a mask to interleave \p NumVec vectors into a wide vector.

1907

// I.e. <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...>

1908

// E.g. For 2 interleaved vectors, if VF is 4, the mask is:

1909

// <0, 4, 1, 5, 2, 6, 3, 7>

1910

static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF,

1911

unsigned NumVec) {

1912

SmallVector<Constant *, 16> Mask;

1913

for (unsigned i = 0; i < VF; i++)

1914

for (unsigned j = 0; j < NumVec; j++)

1915

Mask.push_back(Builder.getInt32(j * VF + i));

1916

1917

return ConstantVector::get(Mask);

1918

}

1919

1920

// Get the strided mask starting from index \p Start.

1921

// I.e. <Start, Start + Stride, ..., Start + Stride*(VF-1)>

1922

static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start,

1923

unsigned Stride, unsigned VF) {

1924

SmallVector<Constant *, 16> Mask;

1925

for (unsigned i = 0; i < VF; i++)

1926

Mask.push_back(Builder.getInt32(Start + i * Stride));

1927

1928

return ConstantVector::get(Mask);

1929

}

1930

1931

// Get a mask of two parts: The first part consists of sequential integers

1932

// starting from 0, The second part consists of UNDEFs.

1933

// I.e. <0, 1, 2, ..., NumInt - 1, undef, ..., undef>

1934

static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt,

1935

unsigned NumUndef) {

1936

SmallVector<Constant *, 16> Mask;

1937

for (unsigned i = 0; i < NumInt; i++)

1938

Mask.push_back(Builder.getInt32(i));

1939

1940

Constant *Undef = UndefValue::get(Builder.getInt32Ty());

1941

for (unsigned i = 0; i < NumUndef; i++)

1942

Mask.push_back(Undef);

1943

1944

return ConstantVector::get(Mask);

1945

}

1946

1947

// Concatenate two vectors with the same element type. The 2nd vector should

1948

// not have more elements than the 1st vector. If the 2nd vector has less

1949

// elements, extend it with UNDEFs.

1950

static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1,

1951

Value *V2) {

1952

VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType());

1953

VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType());

1954

assert(VecTy1 && VecTy2 &&((VecTy1 && VecTy2 && VecTy1->getScalarType
() == VecTy2->getScalarType() && "Expect two vectors with the same element type"
) ? static_cast<void> (0) : __assert_fail ("VecTy1 && VecTy2 && VecTy1->getScalarType() == VecTy2->getScalarType() && \"Expect two vectors with the same element type\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1956, __PRETTY_FUNCTION__))

1955

VecTy1->getScalarType() == VecTy2->getScalarType() &&((VecTy1 && VecTy2 && VecTy1->getScalarType
() == VecTy2->getScalarType() && "Expect two vectors with the same element type"
) ? static_cast<void> (0) : __assert_fail ("VecTy1 && VecTy2 && VecTy1->getScalarType() == VecTy2->getScalarType() && \"Expect two vectors with the same element type\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1956, __PRETTY_FUNCTION__))

1956

"Expect two vectors with the same element type")((VecTy1 && VecTy2 && VecTy1->getScalarType
() == VecTy2->getScalarType() && "Expect two vectors with the same element type"
) ? static_cast<void> (0) : __assert_fail ("VecTy1 && VecTy2 && VecTy1->getScalarType() == VecTy2->getScalarType() && \"Expect two vectors with the same element type\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1956, __PRETTY_FUNCTION__));

1957

1958

unsigned NumElts1 = VecTy1->getNumElements();

1959

unsigned NumElts2 = VecTy2->getNumElements();

1960

assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements")((NumElts1 >= NumElts2 && "Unexpect the first vector has less elements"
) ? static_cast<void> (0) : __assert_fail ("NumElts1 >= NumElts2 && \"Unexpect the first vector has less elements\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1960, __PRETTY_FUNCTION__));

1961

1962

if (NumElts1 > NumElts2) {

1963

// Extend with UNDEFs.

1964

Constant *ExtMask =

1965

getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2);

1966

V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask);

1967

}

1968

1969

Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0);

1970

return Builder.CreateShuffleVector(V1, V2, Mask);

1971

}

1972

1973

// Concatenate vectors in the given list. All vectors have the same type.

1974

static Value *ConcatenateVectors(IRBuilder<> &Builder,

1975

ArrayRef<Value *> InputList) {

1976

unsigned NumVec = InputList.size();

1977

assert(NumVec > 1 && "Should be at least two vectors")((NumVec > 1 && "Should be at least two vectors") ?
static_cast<void> (0) : __assert_fail ("NumVec > 1 && \"Should be at least two vectors\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1977, __PRETTY_FUNCTION__));

1978

1979

SmallVector<Value *, 8> ResList;

1980

ResList.append(InputList.begin(), InputList.end());

1981

do {

1982

SmallVector<Value *, 8> TmpList;

1983

for (unsigned i = 0; i < NumVec - 1; i += 2) {

1984

Value *V0 = ResList[i], *V1 = ResList[i + 1];

1985

assert((V0->getType() == V1->getType() || i == NumVec - 2) &&(((V0->getType() == V1->getType() || i == NumVec - 2) &&
"Only the last vector may have a different type") ? static_cast
<void> (0) : __assert_fail ("(V0->getType() == V1->getType() || i == NumVec - 2) && \"Only the last vector may have a different type\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1986, __PRETTY_FUNCTION__))

1986

"Only the last vector may have a different type")(((V0->getType() == V1->getType() || i == NumVec - 2) &&
"Only the last vector may have a different type") ? static_cast
<void> (0) : __assert_fail ("(V0->getType() == V1->getType() || i == NumVec - 2) && \"Only the last vector may have a different type\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 1986, __PRETTY_FUNCTION__));

1987

1988

TmpList.push_back(ConcatenateTwoVectors(Builder, V0, V1));

1989

}

1990

1991

// Push the last vector if the total number of vectors is odd.

1992

if (NumVec % 2 != 0)

1993

TmpList.push_back(ResList[NumVec - 1]);

1994

1995

ResList = TmpList;

1996

NumVec = ResList.size();

1997

} while (NumVec > 1);

1998

1999

return ResList[0];

2000

}

2001

2002

// Try to vectorize the interleave group that \p Instr belongs to.

2003

2004

// E.g. Translate following interleaved load group (factor = 3):

2005

// for (i = 0; i < N; i+=3) {

2006

// R = Pic[i]; // Member of index 0

2007

// G = Pic[i+1]; // Member of index 1

2008

// B = Pic[i+2]; // Member of index 2

2009

// ... // do something to R, G, B

2010

// }

2011

// To:

2012

// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B

2013

// %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements

2014

// %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements

2015

// %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements

2016

2017

// Or translate following interleaved store group (factor = 3):

2018

// for (i = 0; i < N; i+=3) {

2019

// ... do something to R, G, B

2020

// Pic[i] = R; // Member of index 0

2021

// Pic[i+1] = G; // Member of index 1

2022

// Pic[i+2] = B; // Member of index 2

2023

// }

2024

// To:

2025

// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>

2026

// %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>

2027

// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,

2028

// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements

2029

// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B

2030

void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {

2031

const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);

2032

assert(Group && "Fail to get an interleaved access group.")((Group && "Fail to get an interleaved access group."
) ? static_cast<void> (0) : __assert_fail ("Group && \"Fail to get an interleaved access group.\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2032, __PRETTY_FUNCTION__));

2033

2034

// Skip if current instruction is not the insert position.

2035

if (Instr != Group->getInsertPos())

2036

return;

2037

2038

LoadInst *LI = dyn_cast<LoadInst>(Instr);

2039

StoreInst *SI = dyn_cast<StoreInst>(Instr);

2040

Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();

2041

2042

// Prepare for the vector type of the interleaved load/store.

2043

Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();

2044

unsigned InterleaveFactor = Group->getFactor();

2045

Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);

2046

Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());

2047

2048

// Prepare for the new pointers.

2049

setDebugLocFromInst(Builder, Ptr);

2050

VectorParts &PtrParts = getVectorValue(Ptr);

2051

SmallVector<Value *, 2> NewPtrs;

2052

unsigned Index = Group->getIndex(Instr);

2053

for (unsigned Part = 0; Part < UF; Part++) {

2054

// Extract the pointer for current instruction from the pointer vector. A

2055

// reverse access uses the pointer in the last lane.

2056

Value *NewPtr = Builder.CreateExtractElement(

2057

PtrParts[Part],

2058

Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0));

2059

2060

// Notice current instruction could be any index. Need to adjust the address

2061

// to the member of index 0.

2062

2063

// E.g. a = A[i+1]; // Member of index 1 (Current instruction)

2064

// b = A[i]; // Member of index 0

2065

// Current pointer is pointed to A[i+1], adjust it to A[i].

2066

2067

// E.g. A[i+1] = a; // Member of index 1

2068

// A[i] = b; // Member of index 0

2069

// A[i+2] = c; // Member of index 2 (Current instruction)

2070

// Current pointer is pointed to A[i+2], adjust it to A[i].

2071

NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));

2072

2073

// Cast to the vector pointer type.

2074

NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));

2075

}

2076

2077

setDebugLocFromInst(Builder, Instr);

2078

Value *UndefVec = UndefValue::get(VecTy);

2079

2080

// Vectorize the interleaved load group.

2081

if (LI) {

2082

for (unsigned Part = 0; Part < UF; Part++) {

2083

Instruction *NewLoadInstr = Builder.CreateAlignedLoad(

2084

NewPtrs[Part], Group->getAlignment(), "wide.vec");

2085

2086

for (unsigned i = 0; i < InterleaveFactor; i++) {

2087

Instruction *Member = Group->getMember(i);

2088

2089

// Skip the gaps in the group.

2090

if (!Member)

2091

continue;

2092

2093

Constant *StrideMask = getStridedMask(Builder, i, InterleaveFactor, VF);

2094

Value *StridedVec = Builder.CreateShuffleVector(

2095

NewLoadInstr, UndefVec, StrideMask, "strided.vec");

2096

2097

// If this member has different type, cast the result type.

2098

if (Member->getType() != ScalarTy) {

2099

VectorType *OtherVTy = VectorType::get(Member->getType(), VF);

2100

StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);

2101

}

2102

2103

VectorParts &Entry = WidenMap.get(Member);

2104

Entry[Part] =

2105

Group->isReverse() ? reverseVector(StridedVec) : StridedVec;

2106

}

2107

2108

propagateMetadata(NewLoadInstr, Instr);

2109

}

2110

return;

2111

}

2112

2113

// The sub vector type for current instruction.

2114

VectorType *SubVT = VectorType::get(ScalarTy, VF);

2115

2116

// Vectorize the interleaved store group.

2117

for (unsigned Part = 0; Part < UF; Part++) {

2118

// Collect the stored vector from each member.

2119

SmallVector<Value *, 4> StoredVecs;

2120

for (unsigned i = 0; i < InterleaveFactor; i++) {

2121

// Interleaved store group doesn't allow a gap, so each index has a member

2122

Instruction *Member = Group->getMember(i);

2123

assert(Member && "Fail to get a member from an interleaved store group")((Member && "Fail to get a member from an interleaved store group"
) ? static_cast<void> (0) : __assert_fail ("Member && \"Fail to get a member from an interleaved store group\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2123, __PRETTY_FUNCTION__));

2124

2125

Value *StoredVec =

2126

getVectorValue(dyn_cast<StoreInst>(Member)->getValueOperand())[Part];

2127

if (Group->isReverse())

2128

StoredVec = reverseVector(StoredVec);

2129

2130

// If this member has different type, cast it to an unified type.

2131

if (StoredVec->getType() != SubVT)

2132

StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT);

2133

2134

StoredVecs.push_back(StoredVec);

2135

}

2136

2137

// Concatenate all vectors into a wide vector.

2138

Value *WideVec = ConcatenateVectors(Builder, StoredVecs);

2139

2140

// Interleave the elements in the wide vector.

2141

Constant *IMask = getInterleavedMask(Builder, VF, InterleaveFactor);

2142

Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,

2143

"interleaved.vec");

2144

2145

Instruction *NewStoreInstr =

2146

Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());

2147

propagateMetadata(NewStoreInstr, Instr);

2148

}

2149

}

2150

2151

// Widen the load or store \p Instr.
//  - An interleaved access is forwarded to vectorizeInterleaveGroup.
//  - A store in a block that needs predication but for which no mask is
//    required, an access whose scalar alloc size differs from the per-element
//    vector store size, a non-consecutive access, and a load from a uniform
//    pointer are all forwarded to scalarizeInstruction.
//  - Otherwise one wide (optionally masked, optionally reversed) load or store
//    is emitted per unroll part.
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");

  // Try to vectorize the interleave group if this access is interleaved.
  if (Legal->isAccessInterleaved(Instr))
    return vectorizeInterleaveGroup(Instr);

  Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  Type *DataTy = VectorType::get(ScalarDataTy, VF);
  Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
  const DataLayout &DL = Instr->getModule()->getDataLayout();

  // An alignment of 0 means target abi alignment. We need to use the scalar's
  // target abi alignment in such a case.
  unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
  if (Alignment == 0)
    Alignment = DL.getABITypeAlignment(ScalarDataTy);

  unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
  unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy);
  unsigned VectorElementSize = DL.getTypeStoreSize(DataTy) / VF;

  if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
      !Legal->isMaskRequired(SI))
    return scalarizeInstruction(Instr, true);

  if (ScalarAllocatedSize != VectorElementSize)
    return scalarizeInstruction(Instr);

  // If the pointer is loop invariant or if it is non-consecutive,
  // scalarize the load.
  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
  bool Reverse = ConsecutiveStride < 0;
  bool UniformLoad = LI && Legal->isUniform(Ptr);
  if (ConsecutiveStride == 0 || UniformLoad)
    return scalarizeInstruction(Instr);

  Constant *Zero = Builder.getInt32(0);
  VectorParts &Entry = WidenMap.get(Instr);

  // Handle consecutive loads/stores: rebuild the scalar address of lane 0 of
  // the first unroll part in terms of the widened values.
  GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  if (!Gep) {
    // Use the induction element ptr.
    assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
    setDebugLocFromInst(Builder, Ptr);
    VectorParts &PtrVal = getVectorValue(Ptr);
    Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
  } else if (Legal->isInductionVariable(Gep->getPointerOperand())) {
    setDebugLocFromInst(Builder, Gep);
    Value *PtrOperand = Gep->getPointerOperand();
    Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
    FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);

    // Create the new GEP with the new induction variable.
    GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
    Gep2->setOperand(0, FirstBasePtr);
    Gep2->setName("gep.indvar.base");
    Ptr = Builder.Insert(Gep2);
  } else {
    setDebugLocFromInst(Builder, Gep);
    assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
                               OrigLoop) && "Base ptr must be invariant");

    // The last index does not have to be the induction. It can be
    // consecutive and be a function of the index. For example A[I+1];
    unsigned NumOperands = Gep->getNumOperands();
    unsigned InductionOperand = getGEPInductionOperand(Gep);
    // Create the new GEP with the new induction variable.
    GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());

    for (unsigned i = 0; i < NumOperands; ++i) {
      Value *GepOperand = Gep->getOperand(i);
      Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);

      // Update last index or loop invariant instruction anchored in loop.
      bool NeedsRewrite =
          i == InductionOperand ||
          (GepOperandInst && OrigLoop->contains(GepOperandInst));
      if (!NeedsRewrite)
        continue;

      assert((i == InductionOperand ||
             SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&
             "Must be last index or loop invariant");

      VectorParts &GEPParts = getVectorValue(GepOperand);
      Value *Index = Builder.CreateExtractElement(GEPParts[0], Zero);
      Gep2->setOperand(i, Index);
      Gep2->setName("gep.indvar.idx");
    }
    Ptr = Builder.Insert(Gep2);
  }

  VectorParts Mask = createBlockInMask(Instr->getParent());

  // Handle Stores:
  if (SI) {
    assert(!Legal->isUniform(SI->getPointerOperand()) &&
           "We do not allow storing to uniform addresses");
    setDebugLocFromInst(Builder, SI);
    // We don't want to update the value in the map as it might be used in
    // another expression. So don't use a reference type for "StoredVal".
    VectorParts StoredVal = getVectorValue(SI->getValueOperand());

    for (unsigned Part = 0; Part < UF; ++Part) {
      // Calculate the pointer for the specific unroll-part.
      Value *PartPtr =
          Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));

      if (Reverse) {
        // If we store to reverse consecutive memory locations then we need
        // to reverse the order of elements in the stored value.
        StoredVal[Part] = reverseVector(StoredVal[Part]);
        // If the address is consecutive but reversed, then the
        // wide store needs to start at the last vector element.
        PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
        PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
        Mask[Part] = reverseVector(Mask[Part]);
      }

      Value *VecPtr =
          Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));

      Instruction *NewSI =
          Legal->isMaskRequired(SI)
              ? Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
                                          Mask[Part])
              : Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
      propagateMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  setDebugLocFromInst(Builder, LI);
  for (unsigned Part = 0; Part < UF; ++Part) {
    // Calculate the pointer for the specific unroll-part.
    Value *PartPtr =
        Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));

    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide load needs to start at the last vector element.
      PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
      PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
      Mask[Part] = reverseVector(Mask[Part]);
    }

    Value *VecPtr =
        Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));

    Instruction *NewLI =
        Legal->isMaskRequired(LI)
            ? Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
                                       UndefValue::get(DataTy),
                                       "wide.masked.load")
            : Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
    propagateMetadata(NewLI, LI);

    // A reversed access reads lanes back to front; restore program order.
    Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
  }
}

2314

2315

// Replace \p Instr by UF * VF scalar clones, one per lane of every unroll
// part. Operands that are already widened are read back lane by lane with
// extractelement; a non-void result is reassembled with insertelement and
// recorded in WidenMap. When \p IfPredicateStore is set, each clone is placed
// in its own "cond.store" block that is entered only if the matching lane of
// the edge mask compares equal to 1.
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
  // Holds vector parameters or scalars, in case of uniform vals.
  SmallVector<VectorParts, 4> Params;

  setDebugLocFromInst(Builder, Instr);

  // Find all of the vectorized parameters.
  for (unsigned OpIdx = 0, NumOps = Instr->getNumOperands(); OpIdx != NumOps;
       ++OpIdx) {
    Value *SrcOp = Instr->getOperand(OpIdx);

    // If we are accessing the old induction variable, use the new one.
    if (SrcOp == OldInduction) {
      Params.push_back(getVectorValue(SrcOp));
      continue;
    }

    // Try using previously calculated values.
    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);

    // Anything that is not an instruction inside the original loop is a
    // scalar from outside the loop, maybe even a constant: use it as-is for
    // every unroll part.
    if (!SrcInst || !OrigLoop->contains(SrcInst)) {
      VectorParts Scalars;
      Scalars.append(UF, SrcOp);
      Params.push_back(Scalars);
      continue;
    }

    // If the src is an instruction that appeared earlier in the basic block
    // then it should already be vectorized.
    assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
    // The parameter is a vector value from earlier.
    Params.push_back(WidenMap.get(SrcInst));
  }

  assert(Params.size() == Instr->getNumOperands() &&
         "Invalid number of operands");

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Value *UndefVec = nullptr;
  if (!IsVoidRetTy)
    UndefVec = UndefValue::get(VectorType::get(Instr->getType(), VF));
  // Create a new entry in the WidenMap and initialize it to Undef or Null.
  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);

  Instruction *InsertPt = Builder.GetInsertPoint();
  BasicBlock *IfBlock = Builder.GetInsertBlock();
  BasicBlock *CondBlock = nullptr;

  VectorParts Cond;
  Loop *VectorLp = nullptr;
  if (IfPredicateStore) {
    assert(Instr->getParent()->getSinglePredecessor() &&
           "Only support single predecessor blocks");
    Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
                          Instr->getParent());
    // NOTE(review): LI here is a class member queried with getLoopFor
    // (presumably the LoopInfo analysis), not a LoadInst - confirm.
    VectorLp = LI->getLoopFor(IfBlock);
    assert(VectorLp && "Must have a loop for this block");
  }

  // For each vector unroll 'part':
  for (unsigned Part = 0; Part < UF; ++Part) {
    // For each scalar that we create:
    for (unsigned Lane = 0; Lane < VF; ++Lane) {
      Constant *LaneIdx = Builder.getInt32(Lane);

      // Start if-block.
      Value *Cmp = nullptr;
      if (IfPredicateStore) {
        Cmp = Builder.CreateExtractElement(Cond[Part], LaneIdx);
        Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
                                 ConstantInt::get(Cmp->getType(), 1));
        CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
        LoopVectorBody.push_back(CondBlock);
        VectorLp->addBasicBlockToLoop(CondBlock, *LI);
        // Update Builder with newly created basic block.
        Builder.SetInsertPoint(InsertPt);
      }

      Instruction *Cloned = Instr->clone();
      if (!IsVoidRetTy)
        Cloned->setName(Instr->getName() + ".cloned");

      // Replace the operands of the cloned instructions with extracted scalars.
      for (unsigned OpIdx = 0, NumOps = Instr->getNumOperands();
           OpIdx != NumOps; ++OpIdx) {
        Value *Op = Params[OpIdx][Part];
        // Param is a vector. Need to extract the right lane.
        if (Op->getType()->isVectorTy())
          Op = Builder.CreateExtractElement(Op, LaneIdx);
        Cloned->setOperand(OpIdx, Op);
      }

      // Place the cloned scalar in the new loop.
      Builder.Insert(Cloned);

      // If the original scalar returns a value we need to place it in a vector
      // so that future users will be able to use it.
      if (!IsVoidRetTy)
        VecResults[Part] =
            Builder.CreateInsertElement(VecResults[Part], Cloned, LaneIdx);

      // End if-block.
      if (IfPredicateStore) {
        BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
        LoopVectorBody.push_back(NewIfBlock);
        VectorLp->addBasicBlockToLoop(NewIfBlock, *LI);
        Builder.SetInsertPoint(InsertPt);

        // Swap the branch that splitBasicBlock left in IfBlock for a
        // conditional one that skips the clone when the lane is inactive.
        Instruction *OldBr = IfBlock->getTerminator();
        BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
        OldBr->eraseFromParent();
        IfBlock = NewIfBlock;
      }
    }
  }
}

2426

2427

// Track the first instruction created while emitting a check sequence.
// Returns \p FirstInst unchanged once it is set. Otherwise returns \p V if it
// is an instruction in the same basic block as \p Loc, and nullptr if not
// (e.g. when \p V is not an instruction at all).
static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
                                 Instruction *Loc) {
  if (FirstInst)
    return FirstInst;

  Instruction *Candidate = dyn_cast<Instruction>(V);
  if (Candidate && Candidate->getParent() == Loc->getParent())
    return Candidate;

  return nullptr;
}

2435

2436

std::pair<Instruction *, Instruction *>

2437

InnerLoopVectorizer::addStrideCheck(Instruction *Loc) {

2438

Instruction *tnullptr = nullptr;

2439

if (!Legal->mustCheckStrides())

2440

return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);

2441

2442

IRBuilder<> ChkBuilder(Loc);

2443

2444

// Emit checks.

2445

Value *Check = nullptr;

2446

Instruction *FirstInst = nullptr;

2447

for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(),

2448

SE = Legal->strides_end();

2449

SI != SE; ++SI) {

2450

Value *Ptr = stripIntegerCast(*SI);

2451

Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1),

2452

"stride.chk");

2453

// Store the first instruction we create.

2454

FirstInst = getFirstInst(FirstInst, C, Loc);

2455

if (Check)

2456

Check = ChkBuilder.CreateOr(Check, C);

2457

else

2458

Check = C;

2459

}

2460

2461

// We have to do this trickery because the IRBuilder might fold the check to a

2462

// constant expression in which case there is no Instruction anchored in a

2463

// the block.

2464

LLVMContext &Ctx = Loc->getContext();

2465

Instruction *TheCheck =

2466

BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx));

2467

ChkBuilder.Insert(TheCheck, "stride.not.one");

2468

FirstInst = getFirstInst(FirstInst, TheCheck, Loc);

2469

2470

return std::make_pair(FirstInst, TheCheck);

2471

}

2472

2473

void InnerLoopVectorizer::createEmptyLoop() {

2474

2475

In this function we generate a new loop. The new loop will contain

2476

the vectorized instructions while the old loop will continue to run the

2477

scalar remainder.

2478

2479

[ ] <-- Back-edge taken count overflow check.

2480

/ |

2481

/ v

2482

| [ ] <-- vector loop bypass (may consist of multiple blocks).

2483

| / |

2484

| / v

2485

|| [ ] <-- vector pre header.

2486

|| |

2487

|| v

2488

|| [ ] \

2489

|| [ ]_| <-- vector loop.

2490

|| |

2491

| \ v

2492

| >[ ] <--- middle-block.

2493

| / |

2494

| / v

2495

-|- >[ ] <--- new preheader.

2496

| |

2497

| v

2498

| [ ] \

2499

| [ ]_| <-- old scalar loop to handle remainder.

2500

\ |

2501

\ v

2502

>[ ] <-- exit block.

2503

...

2504

2505

2506

BasicBlock *OldBasicBlock = OrigLoop->getHeader();

2507

BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();

2508

BasicBlock *ExitBlock = OrigLoop->getExitBlock();

2509

assert(BypassBlock && "Invalid loop structure")((BypassBlock && "Invalid loop structure") ? static_cast
<void> (0) : __assert_fail ("BypassBlock && \"Invalid loop structure\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2509, __PRETTY_FUNCTION__));

2510

assert(ExitBlock && "Must have an exit block")((ExitBlock && "Must have an exit block") ? static_cast
<void> (0) : __assert_fail ("ExitBlock && \"Must have an exit block\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2510, __PRETTY_FUNCTION__));

2511

2512

// Some loops have a single integer induction variable, while other loops

2513

// don't. One example is c++ iterators that often have multiple pointer

2514

// induction variables. In the code below we also support a case where we

2515

// don't have a single induction variable.

2516

OldInduction = Legal->getInduction();

2517

Type *IdxTy = Legal->getWidestInductionType();

2518

2519

// Find the loop boundaries.

2520

const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);

2521

assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count")((ExitCount != SE->getCouldNotCompute() && "Invalid loop count"
) ? static_cast<void> (0) : __assert_fail ("ExitCount != SE->getCouldNotCompute() && \"Invalid loop count\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2521, __PRETTY_FUNCTION__));

2522

2523

// The exit count might have the type of i64 while the phi is i32. This can

2524

// happen if we have an induction variable that is sign extended before the

2525

// compare. The only way that we get a backedge taken count is that the

2526

// induction variable was signed and as such will not overflow. In such a case

2527

// truncation is legal.

2528

if (ExitCount->getType()->getPrimitiveSizeInBits() >

2529

IdxTy->getPrimitiveSizeInBits())

2530

ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);

2531

2532

const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);

2533

// Get the total trip count from the count by adding 1.

2534

ExitCount = SE->getAddExpr(BackedgeTakeCount,

2535

SE->getConstant(BackedgeTakeCount->getType(), 1));

2536

2537

const DataLayout &DL = OldBasicBlock->getModule()->getDataLayout();

2538

2539

// Expand the trip count and place the new instructions in the preheader.

2540

// Notice that the pre-header does not change, only the loop body.

2541

SCEVExpander Exp(*SE, DL, "induction");

2542

2543

// We need to test whether the backedge-taken count is uint##_max. Adding one

2544

// to it will cause overflow and an incorrect loop trip count in the vector

2545

// body. In case of overflow we want to directly jump to the scalar remainder

2546

// loop.

2547

Value *BackedgeCount =

2548

Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(),

2549

BypassBlock->getTerminator());

2550

if (BackedgeCount->getType()->isPointerTy())

2551

BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy,

2552

"backedge.ptrcnt.to.int",

2553

BypassBlock->getTerminator());

2554

Instruction *CheckBCOverflow =

2555

CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount,

2556

Constant::getAllOnesValue(BackedgeCount->getType()),

2557

"backedge.overflow", BypassBlock->getTerminator());

2558

2559

// The loop index does not have to start at Zero. Find the original start

2560

// value from the induction PHI node. If we don't have an induction variable

2561

// then we know that it starts at zero.

2562

Builder.SetInsertPoint(BypassBlock->getTerminator());

2563

Value *StartIdx = ExtendedIdx = OldInduction ?

2564

Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock),

2565

IdxTy):

2566

ConstantInt::get(IdxTy, 0);

2567

2568

// We need an instruction to anchor the overflow check on. StartIdx needs to

2569

// be defined before the overflow check branch. Because the scalar preheader

2570

// is going to merge the start index and so the overflow branch block needs to

2571

// contain a definition of the start index.

2572

Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd(

2573

StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor",

2574

BypassBlock->getTerminator());

2575

2576

// Count holds the overall loop count (N).

2577

Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),

2578

BypassBlock->getTerminator());

2579

2580

LoopBypassBlocks.push_back(BypassBlock);

2581

2582

// Split the single block loop into the two loop structure described above.

2583

BasicBlock *VectorPH =

2584

BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");

2585

BasicBlock *VecBody =

2586

VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");

2587

BasicBlock *MiddleBlock =

2588

VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");

2589

BasicBlock *ScalarPH =

2590

MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");

2591

2592

// Create and register the new vector loop.

2593

Loop* Lp = new Loop();

2594

Loop *ParentLoop = OrigLoop->getParentLoop();

2595

2596

// Insert the new loop into the loop nest and register the new basic blocks

2597

// before calling any utilities such as SCEV that require valid LoopInfo.

2598

if (ParentLoop) {

2599

ParentLoop->addChildLoop(Lp);

2600

ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);

2601

ParentLoop->addBasicBlockToLoop(VectorPH, *LI);

2602

ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);

2603

} else {

2604

LI->addTopLevelLoop(Lp);

2605

}

2606

Lp->addBasicBlockToLoop(VecBody, *LI);

2607

2608

// Use this IR builder to create the loop instructions (Phi, Br, Cmp)

2609

// inside the loop.

2610

Builder.SetInsertPoint(VecBody->getFirstNonPHI());

2611

2612

// Generate the induction variable.

2613

setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));

2614

Induction = Builder.CreatePHI(IdxTy, 2, "index");

2615

// The loop step is equal to the vectorization factor (num of SIMD elements)

2616

// times the unroll factor (num of SIMD instructions).

2617

Constant *Step = ConstantInt::get(IdxTy, VF * UF);

2618

2619

// This is the IR builder that we use to add all of the logic for bypassing

2620

// the new vector loop.

2621

IRBuilder<> BypassBuilder(BypassBlock->getTerminator());

2622

setDebugLocFromInst(BypassBuilder,

2623

getDebugLocFromInstOrOperands(OldInduction));

2624

2625

// We may need to extend the index in case there is a type mismatch.

2626

// We know that the count starts at zero and does not overflow.

2627

if (Count->getType() != IdxTy) {

2628

// The exit count can be of pointer type. Convert it to the correct

2629

// integer type.

2630

if (ExitCount->getType()->isPointerTy())

2631

Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");

2632

else

2633

Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");

2634

}

2635

2636

// Add the start index to the loop count to get the new end index.

2637

Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");

2638

2639

// Now we need to generate the expression for N - (N % VF), which is

2640

// the part that the vectorized body will execute.

2641

Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");

2642

Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");

2643

Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,

2644

"end.idx.rnd.down");

2645

2646

// Now, compare the new count to zero. If it is zero skip the vector loop and

2647

// jump to the scalar loop.

2648

Value *Cmp =

2649

BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero");

2650

2651

BasicBlock *LastBypassBlock = BypassBlock;

2652

2653

// Generate code to check that the loops trip count that we computed by adding

2654

// one to the backedge-taken count will not overflow.

2655

{

2656

auto PastOverflowCheck =

2657

std::next(BasicBlock::iterator(OverflowCheckAnchor));

2658

BasicBlock *CheckBlock =

2659

LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked");

2660

if (ParentLoop)

2661

ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);

2662

LoopBypassBlocks.push_back(CheckBlock);

2663

Instruction *OldTerm = LastBypassBlock->getTerminator();

2664

BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm);

2665

OldTerm->eraseFromParent();

2666

LastBypassBlock = CheckBlock;

2667

}

2668

2669

// Generate the code to check that the strides we assumed to be one are really

2670

// one. We want the new basic block to start at the first instruction in a

2671

// sequence of instructions that form a check.

2672

Instruction *StrideCheck;

2673

Instruction *FirstCheckInst;

2674

std::tie(FirstCheckInst, StrideCheck) =

2675

addStrideCheck(LastBypassBlock->getTerminator());

2676

if (StrideCheck) {

2677

AddedSafetyChecks = true;

2678

// Create a new block containing the stride check.

2679

BasicBlock *CheckBlock =

2680

LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");

2681

if (ParentLoop)

2682

ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);

2683

LoopBypassBlocks.push_back(CheckBlock);

2684

2685

// Replace the branch into the memory check block with a conditional branch

2686

// for the "few elements case".

2687

Instruction *OldTerm = LastBypassBlock->getTerminator();

2688

BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);

2689

OldTerm->eraseFromParent();

2690

2691

Cmp = StrideCheck;

2692

LastBypassBlock = CheckBlock;

2693

}

2694

2695

// Generate the code that checks in runtime if arrays overlap. We put the

2696

// checks into a separate block to make the more common case of few elements

2697

// faster.

2698

Instruction *MemRuntimeCheck;

2699

std::tie(FirstCheckInst, MemRuntimeCheck) =

2700

Legal->getLAI()->addRuntimeCheck(LastBypassBlock->getTerminator());

2701

if (MemRuntimeCheck) {

2702

AddedSafetyChecks = true;

2703

// Create a new block containing the memory check.

2704

BasicBlock *CheckBlock =

2705

LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.memcheck");

2706

if (ParentLoop)

2707

ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);

2708

LoopBypassBlocks.push_back(CheckBlock);

2709

2710

// Replace the branch into the memory check block with a conditional branch

2711

// for the "few elements case".

2712

Instruction *OldTerm = LastBypassBlock->getTerminator();

2713

BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);

2714

OldTerm->eraseFromParent();

2715

2716

Cmp = MemRuntimeCheck;

2717

LastBypassBlock = CheckBlock;

2718

}

2719

2720

LastBypassBlock->getTerminator()->eraseFromParent();

2721

BranchInst::Create(MiddleBlock, VectorPH, Cmp,

2722

LastBypassBlock);

2723

2724

// We are going to resume the execution of the scalar loop.

2725

// Go over all of the induction variables that we found and fix the

2726

// PHIs that are left in the scalar version of the loop.

2727

// The starting values of PHI nodes depend on the counter of the last

2728

// iteration in the vectorized loop.

2729

// If we come from a bypass edge then we need to start from the original

2730

// start value.

2731

2732

// This variable saves the new starting index for the scalar loop.

2733

PHINode *ResumeIndex = nullptr;

2734

LoopVectorizationLegality::InductionList::iterator I, E;

2735

LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();

2736

// Set builder to point to last bypass block.

2737

BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());

2738

for (I = List->begin(), E = List->end(); I != E; ++I) {

2739

PHINode *OrigPhi = I->first;

2740

LoopVectorizationLegality::InductionInfo II = I->second;

2741

2742

Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType();

2743

PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val",

2744

MiddleBlock->getTerminator());

2745

// We might have extended the type of the induction variable but we need a

2746

// truncated version for the scalar loop.

2747

PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?

2748

PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",

2749

MiddleBlock->getTerminator()) : nullptr;

2750

2751

// Create phi nodes to merge from the backedge-taken check block.

2752

PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val",

2753

ScalarPH->getTerminator());

2754

BCResumeVal->addIncoming(ResumeVal, MiddleBlock);

2755

2756

PHINode *BCTruncResumeVal = nullptr;

2757

if (OrigPhi == OldInduction) {

2758

BCTruncResumeVal =

2759

PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val",

2760

ScalarPH->getTerminator());

2761

BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock);

2762

}

2763

2764

Value *EndValue = nullptr;

2765

switch (II.IK) {

2766

case LoopVectorizationLegality::IK_NoInduction:

2767

llvm_unreachable("Unknown induction")::llvm::llvm_unreachable_internal("Unknown induction", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2767);

2768

case LoopVectorizationLegality::IK_IntInduction: {

2769

// Handle the integer induction counter.

2770

assert(OrigPhi->getType()->isIntegerTy() && "Invalid type")((OrigPhi->getType()->isIntegerTy() && "Invalid type"
) ? static_cast<void> (0) : __assert_fail ("OrigPhi->getType()->isIntegerTy() && \"Invalid type\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2770, __PRETTY_FUNCTION__));

2771

2772

// We have the canonical induction variable.

2773

if (OrigPhi == OldInduction) {

2774

// Create a truncated version of the resume value for the scalar loop,

2775

// we might have promoted the type to a larger width.

2776

EndValue =

2777

BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());

2778

// The new PHI merges the original incoming value, in case of a bypass,

2779

// or the value at the end of the vectorized loop.

2780

for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)

2781

TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);

2782

TruncResumeVal->addIncoming(EndValue, VecBody);

2783

2784

BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);

2785

2786

// We know what the end value is.

2787

EndValue = IdxEndRoundDown;

2788

// We also know which PHI node holds it.

2789

ResumeIndex = ResumeVal;

2790

break;

2791

}

2792

2793

// Not the canonical induction variable - add the vector loop count to the

2794

// start value.

2795

Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,

2796

II.StartValue->getType(),

2797

"cast.crd");

2798

EndValue = II.transform(BypassBuilder, CRD);

2799

EndValue->setName("ind.end");

2800

break;

2801

}

2802

case LoopVectorizationLegality::IK_PtrInduction: {

2803

Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,

2804

II.StepValue->getType(),

2805

"cast.crd");

2806

EndValue = II.transform(BypassBuilder, CRD);

2807

EndValue->setName("ptr.ind.end");

2808

break;

2809

}

2810

}// end of case

2811

2812

// The new PHI merges the original incoming value, in case of a bypass,

2813

// or the value at the end of the vectorized loop.

2814

for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) {

2815

if (OrigPhi == OldInduction)

2816

ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);

2817

else

2818

ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);

2819

}

2820

ResumeVal->addIncoming(EndValue, VecBody);

2821

2822

// Fix the scalar body counter (PHI node).

2823

unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);

2824

2825

// The old induction's phi node in the scalar body needs the truncated

2826

// value.

2827

if (OrigPhi == OldInduction) {

2828

BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]);

2829

OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal);

2830

} else {

2831

BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);

2832

OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);

2833

}

2834

}

2835

2836

// If we are generating a new induction variable then we also need to

2837

// generate the code that calculates the exit value. This value is not

2838

// simply the end of the counter because we may skip the vectorized body

2839

// in case of a runtime check.

2840

if (!OldInduction){

2841

assert(!ResumeIndex && "Unexpected resume value found")((!ResumeIndex && "Unexpected resume value found") ? static_cast
<void> (0) : __assert_fail ("!ResumeIndex && \"Unexpected resume value found\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2841, __PRETTY_FUNCTION__));

2842

ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",

2843

MiddleBlock->getTerminator());

2844

for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)

2845

ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);

2846

ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);

2847

}

2848

2849

// Make sure that we found the index where scalar loop needs to continue.

2850

assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&((ResumeIndex && ResumeIndex->getType()->isIntegerTy
() && "Invalid resume Index") ? static_cast<void>
(0) : __assert_fail ("ResumeIndex && ResumeIndex->getType()->isIntegerTy() && \"Invalid resume Index\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2851, __PRETTY_FUNCTION__))

2851

"Invalid resume Index")((ResumeIndex && ResumeIndex->getType()->isIntegerTy
() && "Invalid resume Index") ? static_cast<void>
(0) : __assert_fail ("ResumeIndex && ResumeIndex->getType()->isIntegerTy() && \"Invalid resume Index\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2851, __PRETTY_FUNCTION__));

2852

2853

// Add a check in the middle block to see if we have completed

2854

// all of the iterations in the first vector loop.

2855

// If (N - N%VF) == N, then we *don't* need to run the remainder.

2856

Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,

2857

ResumeIndex, "cmp.n",

2858

MiddleBlock->getTerminator());

2859

2860

BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());

2861

// Remove the old terminator.

2862

MiddleBlock->getTerminator()->eraseFromParent();

2863

2864

// Create i+1 and fill the PHINode.

2865

Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");

2866

Induction->addIncoming(StartIdx, VectorPH);

2867

Induction->addIncoming(NextIdx, VecBody);

2868

// Create the compare.

2869

Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);

2870

Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);

2871

2872

// Now we have two terminators. Remove the old one from the block.

2873

VecBody->getTerminator()->eraseFromParent();

2874

2875

// Get ready to start creating new instructions into the vectorized body.

2876

Builder.SetInsertPoint(VecBody->getFirstInsertionPt());

2877

2878

// Save the state.

2879

LoopVectorPreHeader = VectorPH;

2880

LoopScalarPreHeader = ScalarPH;

2881

LoopMiddleBlock = MiddleBlock;

2882

LoopExitBlock = ExitBlock;

2883

LoopVectorBody.push_back(VecBody);

2884

LoopScalarBody = OldBasicBlock;

2885

2886

LoopVectorizeHints Hints(Lp, true);

2887

Hints.setAlreadyVectorized();

2888

}

2889

2890

namespace {

2891

struct CSEDenseMapInfo {

2892

static bool canHandle(Instruction *I) {

2893

return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||

2894

isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);

2895

}

2896

static inline Instruction *getEmptyKey() {

2897

return DenseMapInfo<Instruction *>::getEmptyKey();

2898

}

2899

static inline Instruction *getTombstoneKey() {

2900

return DenseMapInfo<Instruction *>::getTombstoneKey();

2901

}

2902

static unsigned getHashValue(Instruction *I) {

2903

assert(canHandle(I) && "Unknown instruction!")((canHandle(I) && "Unknown instruction!") ? static_cast
<void> (0) : __assert_fail ("canHandle(I) && \"Unknown instruction!\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2903, __PRETTY_FUNCTION__));

2904

return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),

2905

I->value_op_end()));

2906

}

2907

static bool isEqual(Instruction *LHS, Instruction *RHS) {

2908

if (LHS == getEmptyKey() || RHS == getEmptyKey() ||

2909

LHS == getTombstoneKey() || RHS == getTombstoneKey())

2910

return LHS == RHS;

2911

return LHS->isIdenticalTo(RHS);

2912

}

2913

};

2914

}

2915

2916

/// \brief Tell whether the block with index \p BlockNum is a predicated block.
///
/// If-predication of stores creates a sequence of "if (pred) a[i] = ...;"
/// blocks. We start with one vectorized basic block and split it once for
/// every conditional block, so the blocks at odd positions are the predicated
/// ones.
static bool isPredicatedBlock(unsigned BlockNum) {
  // Odd index <=> lowest bit set.
  return (BlockNum & 1u) != 0;
}

2924

2925

///\brief Perform cse of induction variable instructions.

2926

static void cse(SmallVector<BasicBlock *, 4> &BBs) {

2927

// Perform simple cse.

2928

SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;

2929

for (unsigned i = 0, e = BBs.size(); i != e; ++i) {

2930

BasicBlock *BB = BBs[i];

2931

for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {

2932

Instruction *In = I++;

2933

2934

if (!CSEDenseMapInfo::canHandle(In))

2935

continue;

2936

2937

// Check if we can replace this instruction with any of the

2938

// visited instructions.

2939

if (Instruction *V = CSEMap.lookup(In)) {

2940

In->replaceAllUsesWith(V);

2941

In->eraseFromParent();

2942

continue;

2943

}

2944

// Ignore instructions in conditional blocks. We create "if (pred) a[i] =

2945

// ...;" blocks for predicated stores. Every second block is a predicated

2946

// block.

2947

if (isPredicatedBlock(i))

2948

continue;

2949

2950

CSEMap[In] = In;

2951

}

2952

}

2953

}

2954

2955

/// \brief Adds a 'fast' flag to floating point operations.

2956

static Value *addFastMathFlag(Value *V) {

2957

if (isa<FPMathOperator>(V)){

2958

FastMathFlags Flags;

2959

Flags.setUnsafeAlgebra();

2960

cast<Instruction>(V)->setFastMathFlags(Flags);

2961

}

2962

return V;

2963

}

2964

2965

/// Estimate the overhead of scalarizing a value. Insert and Extract are set if

2966

/// the result needs to be inserted and/or extracted from vectors.

2967

static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,

2968

const TargetTransformInfo &TTI) {

2969

if (Ty->isVoidTy())

2970

return 0;

2971

2972

assert(Ty->isVectorTy() && "Can only scalarize vectors")((Ty->isVectorTy() && "Can only scalarize vectors"
) ? static_cast<void> (0) : __assert_fail ("Ty->isVectorTy() && \"Can only scalarize vectors\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 2972, __PRETTY_FUNCTION__));

2973

unsigned Cost = 0;

2974

2975

for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {

2976

if (Insert)

2977

Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, i);

2978

if (Extract)

2979

Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, i);

2980

}

2981

2982

return Cost;

2983

}

2984

2985

// Estimate cost of a call instruction CI if it were vectorized with factor VF.

2986

// Return the cost of the instruction, including scalarization overhead if it's

2987

// needed. The flag NeedToScalarize shows if the call needs to be scalarized -

2988

// i.e. either vector version isn't available, or is too expensive.

2989

static unsigned getVectorCallCost(CallInst *CI, unsigned VF,

2990

const TargetTransformInfo &TTI,

2991

const TargetLibraryInfo *TLI,

2992

bool &NeedToScalarize) {

2993

Function *F = CI->getCalledFunction();

2994

StringRef FnName = CI->getCalledFunction()->getName();

2995

Type *ScalarRetTy = CI->getType();

2996

SmallVector<Type *, 4> Tys, ScalarTys;

2997

for (auto &ArgOp : CI->arg_operands())

2998

ScalarTys.push_back(ArgOp->getType());

2999

3000

// Estimate cost of scalarized vector call. The source operands are assumed

3001

// to be vectors, so we need to extract individual elements from there,

3002

// execute VF scalar calls, and then gather the result into the vector return

3003

// value.

3004

unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);

3005

if (VF == 1)

3006

return ScalarCallCost;

3007

3008

// Compute corresponding vector type for return value and arguments.

3009

Type *RetTy = ToVectorTy(ScalarRetTy, VF);

3010

for (unsigned i = 0, ie = ScalarTys.size(); i != ie; ++i)

3011

Tys.push_back(ToVectorTy(ScalarTys[i], VF));

3012

3013

// Compute costs of unpacking argument values for the scalar calls and

3014

// packing the return values to a vector.

3015

unsigned ScalarizationCost =

3016

getScalarizationOverhead(RetTy, true, false, TTI);

3017

for (unsigned i = 0, ie = Tys.size(); i != ie; ++i)

3018

ScalarizationCost += getScalarizationOverhead(Tys[i], false, true, TTI);

3019

3020

unsigned Cost = ScalarCallCost * VF + ScalarizationCost;

3021

3022

// If we can't emit a vector call for this function, then the currently found

3023

// cost is the cost we need to return.

3024

NeedToScalarize = true;

3025

if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())

3026

return Cost;

3027

3028

// If the corresponding vector cost is cheaper, return its cost.

3029

unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);

3030

if (VectorCallCost < Cost) {

3031

NeedToScalarize = false;

3032

return VectorCallCost;

3033

}

3034

return Cost;

3035

}

3036

3037

// Estimate cost of an intrinsic call instruction CI if it were vectorized with

3038

// factor VF. Return the cost of the instruction, including scalarization

3039

// overhead if it's needed.

3040

static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,

3041

const TargetTransformInfo &TTI,

3042

const TargetLibraryInfo *TLI) {

3043

Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);

3044

assert(ID && "Expected intrinsic call!")((ID && "Expected intrinsic call!") ? static_cast<
void> (0) : __assert_fail ("ID && \"Expected intrinsic call!\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3044, __PRETTY_FUNCTION__));

3045

3046

Type *RetTy = ToVectorTy(CI->getType(), VF);

3047

SmallVector<Type *, 4> Tys;

3048

for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)

3049

Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));

3050

3051

return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);

3052

}

3053

3054

void InnerLoopVectorizer::vectorizeLoop() {

3055

//===------------------------------------------------===//

3056

3057

// Notice: any optimization or new instruction that goes

3058

// into the code below should also be implemented in

3059

// the cost-model.

3060

3061

//===------------------------------------------------===//

3062

Constant *Zero = Builder.getInt32(0);

3063

3064

// In order to support reduction variables we need to be able to vectorize

3065

// Phi nodes. Phi nodes have cycles, so we need to vectorize them in two

3066

// stages. First, we create a new vector PHI node with no incoming edges.

3067

// We use this value when we vectorize all of the instructions that use the

3068

// PHI. Next, after all of the instructions in the block are complete we

3069

// add the new incoming edges to the PHI. At this point all of the

3070

// instructions in the basic block are vectorized, so we can use them to

3071

// construct the PHI.

3072

PhiVector RdxPHIsToFix;

3073

3074

// Scan the loop in a topological order to ensure that defs are vectorized

3075

// before users.

3076

LoopBlocksDFS DFS(OrigLoop);

3077

DFS.perform(LI);

3078

3079

// Vectorize all of the blocks in the original loop.

3080

for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),

3081

be = DFS.endRPO(); bb != be; ++bb)

3082

vectorizeBlockInLoop(*bb, &RdxPHIsToFix);

3083

3084

// At this point every instruction in the original loop is widened to

3085

// a vector form. We are almost done. Now, we need to fix the PHI nodes

3086

// that we vectorized. The PHI nodes are currently empty because we did

3087

// not want to introduce cycles. Notice that the remaining PHI nodes

3088

// that we need to fix are reduction variables.

3089

3090

// Create the 'reduced' values for each of the induction vars.

3091

// The reduced values are the vector values that we scalarize and combine

3092

// after the loop is finished.

3093

for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();

3094

it != e; ++it) {

3095

PHINode *RdxPhi = *it;

3096

assert(RdxPhi && "Unable to recover vectorized PHI")((RdxPhi && "Unable to recover vectorized PHI") ? static_cast
<void> (0) : __assert_fail ("RdxPhi && \"Unable to recover vectorized PHI\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3096, __PRETTY_FUNCTION__));

3097

3098

// Find the reduction variable descriptor.

3099

assert(Legal->getReductionVars()->count(RdxPhi) &&((Legal->getReductionVars()->count(RdxPhi) && "Unable to find the reduction variable"
) ? static_cast<void> (0) : __assert_fail ("Legal->getReductionVars()->count(RdxPhi) && \"Unable to find the reduction variable\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3100, __PRETTY_FUNCTION__))

3100

"Unable to find the reduction variable")((Legal->getReductionVars()->count(RdxPhi) && "Unable to find the reduction variable"
) ? static_cast<void> (0) : __assert_fail ("Legal->getReductionVars()->count(RdxPhi) && \"Unable to find the reduction variable\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3100, __PRETTY_FUNCTION__));

3101

RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi];

3102

3103

RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();

3104

TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();

3105

Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();

3106

RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =

3107

RdxDesc.getMinMaxRecurrenceKind();

3108

setDebugLocFromInst(Builder, ReductionStartValue);

3109

3110

// We need to generate a reduction vector from the incoming scalar.

3111

// To do so, we need to generate the 'identity' vector and override

3112

// one of the elements with the incoming scalar reduction. We need

3113

// to do it in the vector-loop preheader.

3114

Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());

3115

3116

// This is the vector-clone of the value that leaves the loop.

3117

VectorParts &VectorExit = getVectorValue(LoopExitInst);

3118

Type *VecTy = VectorExit[0]->getType();

3119

3120

// Find the reduction identity variable. Zero for addition, or, xor,

3121

// one for multiplication, -1 for And.

3122

Value *Identity;

3123

Value *VectorStart;

3124

if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||

3125

RK == RecurrenceDescriptor::RK_FloatMinMax) {

3126

// MinMax reductions have the start value as their identity.

3127

if (VF == 1) {

3128

VectorStart = Identity = ReductionStartValue;

3129

} else {

3130

VectorStart = Identity =

3131

Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");

3132

}

3133

} else {

3134

// Handle other reduction kinds:

3135

Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(

3136

RK, VecTy->getScalarType());

3137

if (VF == 1) {

3138

Identity = Iden;

3139

// This vector is the Identity vector where the first element is the

3140

// incoming scalar reduction.

3141

VectorStart = ReductionStartValue;

3142

} else {

3143

Identity = ConstantVector::getSplat(VF, Iden);

3144

3145

// This vector is the Identity vector where the first element is the

3146

// incoming scalar reduction.

3147

VectorStart =

3148

Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);

3149

}

3150

}

3151

3152

// Fix the vector-loop phi.

3153

3154

// Reductions do not have to start at zero. They can start with

3155

// any loop invariant values.

3156

VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);

3157

BasicBlock *Latch = OrigLoop->getLoopLatch();

3158

Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);

3159

VectorParts &Val = getVectorValue(LoopVal);

3160

for (unsigned part = 0; part < UF; ++part) {

3161

// Make sure to add the reduction start value only to the

3162

// first unroll part.

3163

Value *StartVal = (part == 0) ? VectorStart : Identity;

3164

cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal,

3165

LoopVectorPreHeader);

3166

cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],

3167

LoopVectorBody.back());

3168

}

3169

3170

// Before each round, move the insertion point right between

3171

// the PHIs and the values we are going to write.

3172

// This allows us to write both PHINodes and the extractelement

3173

// instructions.

3174

Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());

3175

3176

VectorParts RdxParts;

3177

setDebugLocFromInst(Builder, LoopExitInst);

3178

for (unsigned part = 0; part < UF; ++part) {

3179

// This PHINode contains the vectorized reduction variable, or

3180

// the initial value vector, if we bypass the vector loop.

3181

VectorParts &RdxExitVal = getVectorValue(LoopExitInst);

3182

PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");

3183

Value *StartVal = (part == 0) ? VectorStart : Identity;

3184

for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)

3185

NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);

3186

NewPhi->addIncoming(RdxExitVal[part],

3187

LoopVectorBody.back());

3188

RdxParts.push_back(NewPhi);

3189

}

3190

3191

// Reduce all of the unrolled parts into a single vector.

3192

Value *ReducedPartRdx = RdxParts[0];

3193

unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);

3194

setDebugLocFromInst(Builder, ReducedPartRdx);

3195

for (unsigned part = 1; part < UF; ++part) {

3196

if (Op != Instruction::ICmp && Op != Instruction::FCmp)

3197

// Floating point operations had to be 'fast' to enable the reduction.

3198

ReducedPartRdx = addFastMathFlag(

3199

Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],

3200

ReducedPartRdx, "bin.rdx"));

3201

else

3202

ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(

3203

Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]);

3204

}

3205

3206

if (VF > 1) {

3207

// VF is a power of 2 so we can emit the reduction using log2(VF) shuffles

3208

// and vector ops, reducing the set of values being computed by half each

3209

// round.

3210

assert(isPowerOf2_32(VF) &&((isPowerOf2_32(VF) && "Reduction emission only supported for pow2 vectors!"
) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(VF) && \"Reduction emission only supported for pow2 vectors!\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3211, __PRETTY_FUNCTION__))

3211

"Reduction emission only supported for pow2 vectors!")((isPowerOf2_32(VF) && "Reduction emission only supported for pow2 vectors!"
) ? static_cast<void> (0) : __assert_fail ("isPowerOf2_32(VF) && \"Reduction emission only supported for pow2 vectors!\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3211, __PRETTY_FUNCTION__));

3212

Value *TmpVec = ReducedPartRdx;

3213

SmallVector<Constant*, 32> ShuffleMask(VF, nullptr);

3214

for (unsigned i = VF; i != 1; i >>= 1) {

3215

// Move the upper half of the vector to the lower half.

3216

for (unsigned j = 0; j != i/2; ++j)

3217

ShuffleMask[j] = Builder.getInt32(i/2 + j);

3218

3219

// Fill the rest of the mask with undef.

3220

std::fill(&ShuffleMask[i/2], ShuffleMask.end(),

3221

UndefValue::get(Builder.getInt32Ty()));

3222

3223

Value *Shuf =

3224

Builder.CreateShuffleVector(TmpVec,

3225

UndefValue::get(TmpVec->getType()),

3226

ConstantVector::get(ShuffleMask),

3227

"rdx.shuf");

3228

3229

if (Op != Instruction::ICmp && Op != Instruction::FCmp)

3230

// Floating point operations had to be 'fast' to enable the reduction.

3231

TmpVec = addFastMathFlag(Builder.CreateBinOp(

3232

(Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));

3233

else

3234

TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,

3235

TmpVec, Shuf);

3236

}

3237

3238

// The result is in the first element of the vector.

3239

ReducedPartRdx = Builder.CreateExtractElement(TmpVec,

3240

Builder.getInt32(0));

3241

}

3242

3243

// Create a phi node that merges control-flow from the backedge-taken check

3244

// block and the middle block.

3245

PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",

3246

LoopScalarPreHeader->getTerminator());

3247

BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[0]);

3248

BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

3249

3250

// Now, we need to fix the users of the reduction variable

3251

// inside and outside of the scalar remainder loop.

3252

// We know that the loop is in LCSSA form. We need to update the

3253

// PHI nodes in the exit blocks.

3254

for (BasicBlock::iterator LEI = LoopExitBlock->begin(),

3255

LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {

3256

PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);

3257

if (!LCSSAPhi) break;

3258

3259

// All PHINodes need to have a single entry edge, or two if

3260

// we already fixed them.

3261

assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI")((LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"
) ? static_cast<void> (0) : __assert_fail ("LCSSAPhi->getNumIncomingValues() < 3 && \"Invalid LCSSA PHI\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3261, __PRETTY_FUNCTION__));

3262

3263

// We found our reduction value exit-PHI. Update it with the

3264

// incoming bypass edge.

3265

if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) {

3266

// Add an edge coming from the bypass.

3267

LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

3268

break;

3269

}

3270

}// end of the LCSSA phi scan.

3271

3272

// Fix the scalar loop reduction variable with the incoming reduction sum

3273

// from the vector body and from the backedge value.

3274

int IncomingEdgeBlockIdx =

3275

(RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());

3276

assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index")((IncomingEdgeBlockIdx >= 0 && "Invalid block index"
) ? static_cast<void> (0) : __assert_fail ("IncomingEdgeBlockIdx >= 0 && \"Invalid block index\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3276, __PRETTY_FUNCTION__));

3277

// Pick the other block.

3278

int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);

3279

(RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);

3280

(RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);

3281

}// end of for each redux variable.

3282

3283

fixLCSSAPHIs();

3284

3285

// Remove redundant induction instructions.

3286

cse(LoopVectorBody);

3287

}

3288

3289

void InnerLoopVectorizer::fixLCSSAPHIs() {

3290

for (BasicBlock::iterator LEI = LoopExitBlock->begin(),

3291

LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {

3292

PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);

3293

if (!LCSSAPhi) break;

3294

if (LCSSAPhi->getNumIncomingValues() == 1)

3295

LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),

3296

LoopMiddleBlock);

3297

}

3298

}

3299

3300

/// Build (or fetch from MaskCache) the per-unroll-part mask for the
/// control-flow edge \p Src -> \p Dst.
///
/// For a conditional branch the mask is the branch condition (negated when
/// \p Dst is not the first successor) AND-ed with the mask of \p Src. For an
/// unconditional branch it is simply the mask of \p Src.
InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
  assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
         "Invalid edge");

  // Reuse a previously computed mask for this edge when one exists.
  std::pair<BasicBlock*, BasicBlock*> Key(Src, Dst);
  EdgeMaskCache::iterator Cached = MaskCache.find(Key);
  if (Cached != MaskCache.end())
    return Cached->second;

  // Mask under which the source block itself executes.
  VectorParts SrcMask = createBlockInMask(Src);

  // Only branch terminators are expected here.
  BranchInst *Branch = dyn_cast<BranchInst>(Src->getTerminator());
  assert(Branch && "Unexpected terminator found");

  if (!Branch->isConditional()) {
    // Unconditional edge: the edge mask is the source block mask.
    MaskCache[Key] = SrcMask;
    return SrcMask;
  }

  VectorParts EdgeMask = getVectorValue(Branch->getCondition());

  // Dst is not the first successor, so the condition has to be inverted.
  if (Branch->getSuccessor(0) != Dst) {
    for (unsigned Part = 0; Part < UF; ++Part)
      EdgeMask[Part] = Builder.CreateNot(EdgeMask[Part]);
  }

  // Combine the (possibly inverted) condition with the source block mask.
  for (unsigned Part = 0; Part < UF; ++Part)
    EdgeMask[Part] = Builder.CreateAnd(EdgeMask[Part], SrcMask[Part]);

  MaskCache[Key] = EdgeMask;
  return EdgeMask;
}

3334

3335

/// Compute the per-unroll-part mask for block \p BB of the original loop.
///
/// The loop header gets the vector value of the i1 constant 1. Any other
/// block starts from the vector value of the i1 constant 0 and ORs in the
/// edge mask of every predecessor edge.
InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  IntegerType *BoolTy = IntegerType::getInt1Ty(BB->getContext());

  // The mask entering the loop header is all-one.
  if (BB == OrigLoop->getHeader()) {
    Value *TrueBit = ConstantInt::get(BoolTy, 1);
    return getVectorValue(TrueBit);
  }

  // Start from an all-zero mask and OR in every incoming edge mask.
  Value *FalseBit = ConstantInt::get(BoolTy, 0);
  VectorParts BlockMask = getVectorValue(FalseBit);

  pred_iterator PredIt = pred_begin(BB);
  pred_iterator PredEnd = pred_end(BB);
  while (PredIt != PredEnd) {
    VectorParts IncomingMask = createEdgeMask(*PredIt, BB);
    for (unsigned Part = 0; Part < UF; ++Part)
      BlockMask[Part] = Builder.CreateOr(BlockMask[Part], IncomingMask[Part]);
    ++PredIt;
  }

  return BlockMask;
}

3358

3359

void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,

3360

InnerLoopVectorizer::VectorParts &Entry,

3361

unsigned UF, unsigned VF, PhiVector *PV) {

3362

PHINode* P = cast<PHINode>(PN);

3363

// Handle reduction variables:

3364

if (Legal->getReductionVars()->count(P)) {

3365

for (unsigned part = 0; part < UF; ++part) {

3366

// This is phase one of vectorizing PHIs.

3367

Type *VecTy = (VF == 1) ? PN->getType() :

3368

VectorType::get(PN->getType(), VF);

3369

Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",

3370

LoopVectorBody.back()-> getFirstInsertionPt());

3371

}

3372

PV->push_back(P);

3373

return;

3374

}

3375

3376

setDebugLocFromInst(Builder, P);

3377

// Check for PHI nodes that are lowered to vector selects.

3378

if (P->getParent() != OrigLoop->getHeader()) {

3379

// We know that all PHIs in non-header blocks are converted into

3380

// selects, so we don't have to worry about the insertion order and we

3381

// can just use the builder.

3382

// At this point we generate the predication tree. There may be

3383

// duplications since this is a simple recursive scan, but future

3384

// optimizations will clean it up.

3385

3386

unsigned NumIncoming = P->getNumIncomingValues();

3387

3388

// Generate a sequence of selects of the form:

3389

// SELECT(Mask3, In3,

3390

// SELECT(Mask2, In2,

3391

// ( ...)))

3392

for (unsigned In = 0; In < NumIncoming; In++) {

3393

VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),

3394

P->getParent());

3395

VectorParts &In0 = getVectorValue(P->getIncomingValue(In));

3396

3397

for (unsigned part = 0; part < UF; ++part) {

3398

// We might have single edge PHIs (blocks) - use an identity

3399

// 'select' for the first PHI operand.

3400

if (In == 0)

3401

Entry[part] = Builder.CreateSelect(Cond[part], In0[part],

3402

In0[part]);

3403

else

3404

// Select between the current value and the previous incoming edge

3405

// based on the incoming mask.

3406

Entry[part] = Builder.CreateSelect(Cond[part], In0[part],

3407

Entry[part], "predphi");

3408

}

3409

}

3410

return;

3411

}

3412

3413

// This PHINode must be an induction variable.

3414

// Make sure that we know about it.

3415

assert(Legal->getInductionVars()->count(P) &&((Legal->getInductionVars()->count(P) && "Not an induction variable"
) ? static_cast<void> (0) : __assert_fail ("Legal->getInductionVars()->count(P) && \"Not an induction variable\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3416, __PRETTY_FUNCTION__))

3416

"Not an induction variable")((Legal->getInductionVars()->count(P) && "Not an induction variable"
) ? static_cast<void> (0) : __assert_fail ("Legal->getInductionVars()->count(P) && \"Not an induction variable\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3416, __PRETTY_FUNCTION__));

3417

3418

LoopVectorizationLegality::InductionInfo II =

3419

Legal->getInductionVars()->lookup(P);

3420

3421

// FIXME: The newly created binary instructions should contain nsw/nuw flags,

3422

// which can be found from the original scalar operations.

3423

switch (II.IK) {

3424

case LoopVectorizationLegality::IK_NoInduction:

3425

llvm_unreachable("Unknown induction")::llvm::llvm_unreachable_internal("Unknown induction", "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3425);

3426

case LoopVectorizationLegality::IK_IntInduction: {

3427

assert(P->getType() == II.StartValue->getType() && "Types must match")((P->getType() == II.StartValue->getType() && "Types must match"
) ? static_cast<void> (0) : __assert_fail ("P->getType() == II.StartValue->getType() && \"Types must match\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3427, __PRETTY_FUNCTION__));

3428

Type *PhiTy = P->getType();

3429

Value *Broadcasted;

3430

if (P == OldInduction) {

3431

// Handle the canonical induction variable. We might have had to

3432

// extend the type.

3433

Broadcasted = Builder.CreateTrunc(Induction, PhiTy);

3434

} else {

3435

// Handle other induction variables that are now based on the

3436

// canonical one.

3437

Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,

3438

"normalized.idx");

3439

NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);

3440

Broadcasted = II.transform(Builder, NormalizedIdx);

3441

Broadcasted->setName("offset.idx");

3442

}

3443

Broadcasted = getBroadcastInstrs(Broadcasted);

3444

// After broadcasting the induction variable we need to make the vector

3445

// consecutive by adding 0, 1, 2, etc.

3446

for (unsigned part = 0; part < UF; ++part)

3447

Entry[part] = getStepVector(Broadcasted, VF * part, II.StepValue);

3448

return;

3449

}

3450

case LoopVectorizationLegality::IK_PtrInduction:

3451

// Handle the pointer induction variable case.

3452

assert(P->getType()->isPointerTy() && "Unexpected type.")((P->getType()->isPointerTy() && "Unexpected type."
) ? static_cast<void> (0) : __assert_fail ("P->getType()->isPointerTy() && \"Unexpected type.\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3452, __PRETTY_FUNCTION__));

3453

// This is the normalized GEP that starts counting at zero.

3454

Value *NormalizedIdx =

3455

Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx");

3456

NormalizedIdx =

3457

Builder.CreateSExtOrTrunc(NormalizedIdx, II.StepValue->getType());

3458

// This is the vector of results. Notice that we don't generate

3459

// vector geps because scalar geps result in better code.

3460

for (unsigned part = 0; part < UF; ++part) {

3461

if (VF == 1) {

3462

int EltIndex = part;

3463

Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex);

3464

Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx);

3465

Value *SclrGep = II.transform(Builder, GlobalIdx);

3466

SclrGep->setName("next.gep");

3467

Entry[part] = SclrGep;

3468

continue;

3469

}

3470

3471

Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));

3472

for (unsigned int i = 0; i < VF; ++i) {

3473

int EltIndex = i + part * VF;

3474

Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex);

3475

Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx);

3476

Value *SclrGep = II.transform(Builder, GlobalIdx);

3477

SclrGep->setName("next.gep");

3478

VecVal = Builder.CreateInsertElement(VecVal, SclrGep,

3479

Builder.getInt32(i),

3480

"insert.gep");

3481

}

3482

Entry[part] = VecVal;

3483

}

3484

return;

3485

}

3486

}

3487

3488

/// Widen every instruction of \p BB, a block of the original loop.
///
/// Results that are produced directly here are stored, one value per unroll
/// part, in the instruction's WidenMap entry. PHIs are delegated to
/// widenPHIInstruction, loads and stores to vectorizeMemoryInstruction, and
/// everything that cannot be widened to scalarizeInstruction.
///
/// \param BB The original-loop basic block to process.
/// \param PV Forwarded to widenPHIInstruction for every PHI in \p BB.
void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
  // For each instruction in the old loop.
  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
    VectorParts &Entry = WidenMap.get(it);
    switch (it->getOpcode()) {
    case Instruction::Br:
      // Nothing to do for branches, since we already took care of the
      // loop control flow instructions.
      continue;
    case Instruction::PHI: {
      // Vectorize PHINodes.
      widenPHIInstruction(it, Entry, UF, VF, PV);
      continue;
    }// End of PHI.

    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      // Just widen binops. The opcode guarantees a BinaryOperator, so use the
      // asserting cast<> instead of an unchecked dyn_cast<>.
      BinaryOperator *BinOp = cast<BinaryOperator>(it);
      setDebugLocFromInst(Builder, BinOp);
      VectorParts &A = getVectorValue(it->getOperand(0));
      VectorParts &B = getVectorValue(it->getOperand(1));

      // Use this vector value for all users of the original instruction.
      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);

        // The builder's result is not necessarily a BinaryOperator, so only
        // copy the IR flags when it is one.
        if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
          VecOp->copyIRFlags(BinOp);

        Entry[Part] = V;
      }

      propagateMetadata(Entry, it);
      break;
    }
    case Instruction::Select: {
      // Widen selects.
      // If the selector is loop invariant we can create a select
      // instruction with a scalar condition. Otherwise, use vector-select.
      bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
                                               OrigLoop);
      setDebugLocFromInst(Builder, it);

      // The condition can be loop invariant but still defined inside the
      // loop. This means that we can't just use the original 'cond' value.
      // We have to take the 'vectorized' value and pick the first lane.
      // Instcombine will make this a no-op.
      VectorParts &Cond = getVectorValue(it->getOperand(0));
      VectorParts &Op0  = getVectorValue(it->getOperand(1));
      VectorParts &Op1  = getVectorValue(it->getOperand(2));

      Value *ScalarCond = (VF == 1) ? Cond[0] :
        Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));

      for (unsigned Part = 0; Part < UF; ++Part) {
        Entry[Part] = Builder.CreateSelect(
          InvariantCond ? ScalarCond : Cond[Part],
          Op0[Part],
          Op1[Part]);
      }

      propagateMetadata(Entry, it);
      break;
    }

    case Instruction::ICmp:
    case Instruction::FCmp: {
      // Widen compares. Generate vector compares.
      bool FCmp = (it->getOpcode() == Instruction::FCmp);
      // The opcode guarantees a CmpInst; cast<> asserts that.
      CmpInst *Cmp = cast<CmpInst>(it);
      setDebugLocFromInst(Builder, it);
      VectorParts &A = getVectorValue(it->getOperand(0));
      VectorParts &B = getVectorValue(it->getOperand(1));
      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *C = nullptr;
        if (FCmp)
          C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
        else
          C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
        Entry[Part] = C;
      }

      propagateMetadata(Entry, it);
      break;
    }

    case Instruction::Store:
    case Instruction::Load:
      vectorizeMemoryInstruction(it);
      break;
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      // The opcode guarantees a CastInst; cast<> asserts that.
      CastInst *CI = cast<CastInst>(it);
      setDebugLocFromInst(Builder, it);
      /// Optimize the special case where the source is the induction
      /// variable. Notice that we can only optimize the 'trunc' case
      /// because: a. FP conversions lose precision, b. sext/zext may wrap,
      /// c. other casts depend on pointer size.
      if (CI->getOperand(0) == OldInduction &&
          it->getOpcode() == Instruction::Trunc) {
        Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
                                               CI->getType());
        Value *Broadcasted = getBroadcastInstrs(ScalarCast);
        LoopVectorizationLegality::InductionInfo II =
            Legal->getInductionVars()->lookup(OldInduction);
        Constant *Step =
            ConstantInt::getSigned(CI->getType(), II.StepValue->getSExtValue());
        for (unsigned Part = 0; Part < UF; ++Part)
          Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
        propagateMetadata(Entry, it);
        break;
      }
      /// Vectorize casts.
      Type *DestTy = (VF == 1) ? CI->getType() :
                                 VectorType::get(CI->getType(), VF);

      VectorParts &A = getVectorValue(it->getOperand(0));
      for (unsigned Part = 0; Part < UF; ++Part)
        Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
      propagateMetadata(Entry, it);
      break;
    }

    case Instruction::Call: {
      // Ignore dbg intrinsics.
      if (isa<DbgInfoIntrinsic>(it))
        break;
      setDebugLocFromInst(Builder, it);

      Module *M = BB->getParent()->getParent();
      CallInst *CI = cast<CallInst>(it);

      // Fetch the callee once and reuse it for the name and the attributes.
      Function *F = CI->getCalledFunction();
      StringRef FnName = F->getName();
      Type *RetTy = ToVectorTy(CI->getType(), VF);
      SmallVector<Type *, 4> Tys;
      for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
        Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));

      Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
      // These intrinsics are always scalarized.
      if (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
          ID == Intrinsic::lifetime_start) {
        scalarizeInstruction(it);
        break;
      }
      // The flag shows whether we use Intrinsic or a usual Call for vectorized
      // version of the instruction.
      // Is it beneficial to perform intrinsic call compared to lib call?
      bool NeedToScalarize;
      unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
      bool UseVectorIntrinsic =
          ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
      if (!UseVectorIntrinsic && NeedToScalarize) {
        scalarizeInstruction(it);
        break;
      }

      for (unsigned Part = 0; Part < UF; ++Part) {
        SmallVector<Value *, 4> Args;
        for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
          Value *Arg = CI->getArgOperand(i);
          // Some intrinsics have a scalar argument - don't replace it with a
          // vector.
          if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
            VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
            Arg = VectorArg[Part];
          }
          Args.push_back(Arg);
        }

        Function *VectorF;
        if (UseVectorIntrinsic) {
          // Use vector version of the intrinsic.
          Type *TysForDecl[] = {CI->getType()};
          if (VF > 1)
            TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
          VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
        } else {
          // Use vector version of the library call.
          StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
          assert(!VFnName.empty() && "Vector function name is empty.");
          VectorF = M->getFunction(VFnName);
          if (!VectorF) {
            // Generate a declaration
            FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
            VectorF =
                Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
            VectorF->copyAttributesFrom(F);
          }
        }
        assert(VectorF && "Can't create vector function.");
        Entry[Part] = Builder.CreateCall(VectorF, Args);
      }

      propagateMetadata(Entry, it);
      break;
    }

    default:
      // All other instructions are unsupported. Scalarize them.
      scalarizeInstruction(it);
      break;
    }// end of switch.
  }// end of for_each instr.
}

3721

3722

void InnerLoopVectorizer::updateAnalysis() {

3723

// Forget the original basic block.

3724

SE->forgetLoop(OrigLoop);

3725

3726

// Update the dominator tree information.

3727

assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&((DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock
) && "Entry does not dominate exit.") ? static_cast<
void> (0) : __assert_fail ("DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && \"Entry does not dominate exit.\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3728, __PRETTY_FUNCTION__))

3728

"Entry does not dominate exit.")((DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock
) && "Entry does not dominate exit.") ? static_cast<
void> (0) : __assert_fail ("DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && \"Entry does not dominate exit.\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3728, __PRETTY_FUNCTION__));

3729

3730

for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)

3731

DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);

3732

DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());

3733

3734

// Due to if predication of stores we might create a sequence of "if(pred)

3735

// a[i] = ...; " blocks.

3736

for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) {

3737

if (i == 0)

3738

DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);

3739

else if (isPredicatedBlock(i)) {

3740

DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]);

3741

} else {

3742

DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]);

3743

}

3744

}

3745

3746

DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);

3747

DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);

3748

DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);

3749

DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);

3750

3751

DEBUG(DT->verifyDomTree())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { DT->verifyDomTree(); } } while (0);

3752

}

3753

3754

/// \brief Check whether it is safe to if-convert this phi node.

3755

///

3756

/// Phi nodes with constant expressions that can trap are not safe to if

3757

/// convert.

3758

static bool canIfConvertPHINodes(BasicBlock *BB) {

3759

for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {

3760

PHINode *Phi = dyn_cast<PHINode>(I);

3761

if (!Phi)

3762

return true;

3763

for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p)

3764

if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p)))

3765

if (C->canTrap())

3766

return false;

3767

}

3768

return true;

3769

}

3770

3771

bool LoopVectorizationLegality::canVectorizeWithIfConvert() {

3772

if (!EnableIfConversion) {

3773

emitAnalysis(VectorizationReport() << "if-conversion is disabled");

3774

return false;

3775

}

3776

3777

assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable")((TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"
) ? static_cast<void> (0) : __assert_fail ("TheLoop->getNumBlocks() > 1 && \"Single block loops are vectorizable\""
, "/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 3777, __PRETTY_FUNCTION__));

3778

3779

// A list of pointers that we can safely read and write to.

3780

SmallPtrSet<Value *, 8> SafePointes;

3781

3782

// Collect safe addresses.

3783

for (Loop::block_iterator BI = TheLoop->block_begin(),

3784

BE = TheLoop->block_end(); BI != BE; ++BI) {

3785

BasicBlock *BB = *BI;

3786

3787

if (blockNeedsPredication(BB))

3788

continue;

3789

3790

for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {

3791

if (LoadInst *LI = dyn_cast<LoadInst>(I))

3792

SafePointes.insert(LI->getPointerOperand());

3793

else if (StoreInst *SI = dyn_cast<StoreInst>(I))

3794

SafePointes.insert(SI->getPointerOperand());

3795

}

3796

}

3797

3798

// Collect the blocks that need predication.

3799

BasicBlock *Header = TheLoop->getHeader();

3800

for (Loop::block_iterator BI = TheLoop->block_begin(),

3801

BE = TheLoop->block_end(); BI != BE; ++BI) {

3802

BasicBlock *BB = *BI;

3803

3804

// We don't support switch statements inside loops.

3805

if (!isa<BranchInst>(BB->getTerminator())) {

3806

emitAnalysis(VectorizationReport(BB->getTerminator())

3807

<< "loop contains a switch statement");

3808

return false;

3809

}

3810

3811

// We must be able to predicate all blocks that need to be predicated.

3812

if (blockNeedsPredication(BB)) {

3813

if (!blockCanBePredicated(BB, SafePointes)) {

3814

emitAnalysis(VectorizationReport(BB->getTerminator())

3815

<< "control flow cannot be substituted for a select");

3816

return false;

3817

}

3818

} else if (BB != Header && !canIfConvertPHINodes(BB)) {

3819

emitAnalysis(VectorizationReport(BB->getTerminator())

3820

<< "control flow cannot be substituted for a select");

3821

return false;

3822

}

3823

}

3824

3825

// We can if-convert this loop.

3826

return true;

3827

}

3828

3829

bool LoopVectorizationLegality::canVectorize() {

3830

// We must have a loop in canonical form. Loops with indirectbr in them cannot

3831

// be canonicalized.

3832

if (!TheLoop->getLoopPreheader()) {

3833

emitAnalysis(

3834

VectorizationReport() <<

3835

"loop control flow is not understood by vectorizer");

3836

return false;

3837

}

3838

3839

// We can only vectorize innermost loops.

3840

if (!TheLoop->getSubLoopsVector().empty()) {

3841

emitAnalysis(VectorizationReport() << "loop is not the innermost loop");

3842

return false;

3843

}

3844

3845

// We must have a single backedge.

3846

if (TheLoop->getNumBackEdges() != 1) {

3847

emitAnalysis(

3848

VectorizationReport() <<

3849

"loop control flow is not understood by vectorizer");

3850

return false;

3851

}

3852

3853

// We must have a single exiting block.

3854

if (!TheLoop->getExitingBlock()) {

3855

emitAnalysis(

3856

VectorizationReport() <<

3857

"loop control flow is not understood by vectorizer");

3858

return false;

3859

}

3860

3861

// We only handle bottom-tested loops, i.e. loop in which the condition is

3862

// checked at the end of each iteration. With that we can assume that all

3863

// instructions in the loop are executed the same number of times.

3864

if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {

3865

emitAnalysis(

3866

VectorizationReport() <<

3867

"loop control flow is not understood by vectorizer");

3868

return false;

3869

}

3870

3871

// We need to have a loop header.

3872

DEBUG(dbgs() << "LV: Found a loop: " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found a loop: " <<
TheLoop->getHeader()->getName() << '\n'; } } while
(0)

3873

TheLoop->getHeader()->getName() << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found a loop: " <<
TheLoop->getHeader()->getName() << '\n'; } } while
(0);

3874

3875

// Check if we can if-convert non-single-bb loops.

3876

unsigned NumBlocks = TheLoop->getNumBlocks();

3877

if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {

3878

DEBUG(dbgs() << "LV: Can't if-convert the loop.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Can't if-convert the loop.\n"
; } } while (0);

3879

return false;

3880

}

3881

3882

// ScalarEvolution needs to be able to find the exit count.

3883

const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);

3884

if (ExitCount == SE->getCouldNotCompute()) {

3885

emitAnalysis(VectorizationReport() <<

3886

"could not determine number of loop iterations");

3887

DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: SCEV could not compute the loop exit count.\n"
; } } while (0);

3888

return false;

3889

}

3890

3891

// Check if we can vectorize the instructions and CFG in this loop.

3892

if (!canVectorizeInstrs()) {

3893

DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Can't vectorize the instructions or CFG\n"
; } } while (0);

3894

return false;

3895

}

3896

3897

// Go over each instruction and look at memory deps.

3898

if (!canVectorizeMemory()) {

3899

DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Can't vectorize due to memory conflicts\n"
; } } while (0);

3900

return false;

3901

}

3902

3903

// Collect all of the variables that remain uniform after vectorization.

3904

collectLoopUniforms();

3905

3906

DEBUG(dbgs() << "LV: We can vectorize this loop" <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: We can vectorize this loop"
<< (LAI->getRuntimePointerCheck()->Need ? " (with a runtime bound check)"
: "") <<"!\n"; } } while (0)

3907

(LAI->getRuntimePointerCheck()->Need ? " (with a runtime bound check)" :do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: We can vectorize this loop"
<< (LAI->getRuntimePointerCheck()->Need ? " (with a runtime bound check)"
: "") <<"!\n"; } } while (0)

3908

"")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: We can vectorize this loop"
<< (LAI->getRuntimePointerCheck()->Need ? " (with a runtime bound check)"
: "") <<"!\n"; } } while (0)

3909

<<"!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: We can vectorize this loop"
<< (LAI->getRuntimePointerCheck()->Need ? " (with a runtime bound check)"
: "") <<"!\n"; } } while (0);

3910

3911

// Analyze interleaved memory accesses.

3912

if (EnableInterleavedMemAccesses)

3913

InterleaveInfo.analyzeInterleaving(Strides);

3914

3915

// Okay! We can vectorize. At this point we don't have any other mem analysis

3916

// which may limit our maximum vectorization factor, so just return true with

3917

// no restrictions.

3918

return true;

3919

}

3920

3921

/// \brief Map \p Ty to the integer type used for induction bookkeeping.
///
/// Pointer types become the DataLayout's pointer-sized integer type. Types
/// whose scalar size is below 32 bits become i32, because chars or shorts may
/// overflow when the loop's trip count is requested. Any other type is
/// returned unchanged.
static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
  if (Ty->isPointerTy())
    return DL.getIntPtrType(Ty);

  const bool IsNarrow = Ty->getScalarSizeInBits() < 32;
  if (IsNarrow)
    return Type::getInt32Ty(Ty->getContext());
  return Ty;
}

3932

3933

/// \brief Return whichever of \p Ty0 and \p Ty1 has more scalar bits, after
/// both have been passed through convertPointerToIntegerType().
///
/// When both have the same scalar size, the converted \p Ty1 is returned.
static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
  Type *First = convertPointerToIntegerType(DL, Ty0);
  Type *Second = convertPointerToIntegerType(DL, Ty1);
  const bool FirstIsWider =
      First->getScalarSizeInBits() > Second->getScalarSizeInBits();
  return FirstIsWider ? First : Second;
}

3940

3941

/// \brief Check that the instruction has outside loop users and is not an

3942

/// identified reduction variable.

3943

static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,

3944

SmallPtrSetImpl<Value *> &Reductions) {

3945

// Reduction instructions are allowed to have exit users. All other

3946

// instructions must not have external users.

3947

if (!Reductions.count(Inst))

3948

//Check that all of the users of the loop are inside the BB.

3949

for (User *U : Inst->users()) {

3950

Instruction *UI = cast<Instruction>(U);

3951

// This user may be a reduction exit value.

3952

if (!TheLoop->contains(UI)) {

3953

DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found an outside user for : "
<< *UI << '\n'; } } while (0);

3954

return true;

3955

}

3956

}

3957

return false;

3958

}

3959

3960

bool LoopVectorizationLegality::canVectorizeInstrs() {

3961

BasicBlock *PreHeader = TheLoop->getLoopPreheader();

3962

BasicBlock *Header = TheLoop->getHeader();

3963

3964

// Look for the attribute signaling the absence of NaNs.

3965

Function &F = *Header->getParent();

3966

const DataLayout &DL = F.getParent()->getDataLayout();

3967

if (F.hasFnAttribute("no-nans-fp-math"))

3968

HasFunNoNaNAttr =

3969

F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";

3970

3971

// For each block in the loop.

3972

for (Loop::block_iterator bb = TheLoop->block_begin(),

3973

be = TheLoop->block_end(); bb != be; ++bb) {

3974

3975

// Scan the instructions in the block and look for hazards.

3976

for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;

3977

++it) {

3978

3979

if (PHINode *Phi = dyn_cast<PHINode>(it)) {

3980

Type *PhiTy = Phi->getType();

3981

// Check that this PHI type is allowed.

3982

if (!PhiTy->isIntegerTy() &&

3983

!PhiTy->isFloatingPointTy() &&

3984

!PhiTy->isPointerTy()) {

3985

emitAnalysis(VectorizationReport(it)

3986

<< "loop control flow is not understood by vectorizer");

3987

DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found an non-int non-pointer PHI.\n"
; } } while (0);

3988

return false;

3989

}

3990

3991

// If this PHINode is not in the header block, then we know that we

3992

// can convert it to select during if-conversion. No need to check if

3993

// the PHIs in this block are induction or reduction variables.

3994

if (*bb != Header) {

3995

// Check that this instruction has no outside users or is an

3996

// identified reduction value with an outside user.

3997

if (!hasOutsideLoopUser(TheLoop, it, AllowedExit))

3998

continue;

3999

emitAnalysis(VectorizationReport(it) <<

4000

"value could not be identified as "

4001

"an induction or reduction variable");

4002

return false;

4003

}

4004

4005

// We only allow if-converted PHIs with exactly two incoming values.

4006

if (Phi->getNumIncomingValues() != 2) {

4007

emitAnalysis(VectorizationReport(it)

4008

<< "control flow not understood by vectorizer");

4009

DEBUG(dbgs() << "LV: Found an invalid PHI.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found an invalid PHI.\n"
; } } while (0);

4010

return false;

4011

}

4012

4013

// This is the value coming from the preheader.

4014

Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);

4015

ConstantInt *StepValue = nullptr;

4016

// Check if this is an induction variable.

4017

InductionKind IK = isInductionVariable(Phi, StepValue);

4018

4019

if (IK_NoInduction != IK) {

4020

// Get the widest type.

4021

if (!WidestIndTy)

4022

WidestIndTy = convertPointerToIntegerType(DL, PhiTy);

4023

else

4024

WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);

4025

4026

// Int inductions are special because we only allow one IV.

4027

if (IK == IK_IntInduction && StepValue->isOne()) {

4028

// Use the phi node with the widest type as induction. Use the last

4029

// one if there are multiple (no good reason for doing this other

4030

// than it is expedient).

4031

if (!Induction || PhiTy == WidestIndTy)

4032

Induction = Phi;

4033

}

4034

4035

DEBUG(dbgs() << "LV: Found an induction variable.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found an induction variable.\n"
; } } while (0);

4036

Inductions[Phi] = InductionInfo(StartValue, IK, StepValue);

4037

4038

// Until we explicitly handle the case of an induction variable with

4039

// an outside loop user we have to give up vectorizing this loop.

4040

if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {

4041

emitAnalysis(VectorizationReport(it) <<

4042

"use of induction value outside of the "

4043

"loop is not handled by vectorizer");

4044

return false;

4045

}

4046

4047

continue;

4048

}

4049

4050

if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop,

4051

Reductions[Phi])) {

4052

AllowedExit.insert(Reductions[Phi].getLoopExitInstr());

4053

continue;

4054

}

4055

4056

emitAnalysis(VectorizationReport(it) <<

4057

"value that could not be identified as "

4058

"reduction is used outside the loop");

4059

DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found an unidentified PHI."
<< *Phi <<"\n"; } } while (0);

4060

return false;

4061

}// end of PHI handling

4062

4063

// We handle calls that:

4064

// * Are debug info intrinsics.

4065

// * Have a mapping to an IR intrinsic.

4066

// * Have a vector version available.

4067

CallInst *CI = dyn_cast<CallInst>(it);

4068

if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) &&

4069

!(CI->getCalledFunction() && TLI &&

4070

TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {

4071

emitAnalysis(VectorizationReport(it) <<

4072

"call instruction cannot be vectorized");

4073

DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n"
; } } while (0);

4074

return false;

4075

}

4076

4077

// Intrinsics such as powi,cttz and ctlz are legal to vectorize if the

4078

// second argument is the same (i.e. loop invariant)

4079

if (CI &&

4080

hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {

4081

if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) {

4082

emitAnalysis(VectorizationReport(it)

4083

<< "intrinsic instruction cannot be vectorized");

4084

DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found unvectorizable intrinsic "
<< *CI << "\n"; } } while (0);

4085

return false;

4086

}

4087

}

4088

4089

// Check that the instruction return type is vectorizable.

4090

// Also, we can't vectorize extractelement instructions.

4091

if ((!VectorType::isValidElementType(it->getType()) &&

4092

!it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {

4093

emitAnalysis(VectorizationReport(it)

4094

<< "instruction return type cannot be vectorized");

4095

DEBUG(dbgs() << "LV: Found unvectorizable type.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found unvectorizable type.\n"
; } } while (0);

4096

return false;

4097

}

4098

4099

// Check that the stored type is vectorizable.

4100

if (StoreInst *ST = dyn_cast<StoreInst>(it)) {

4101

Type *T = ST->getValueOperand()->getType();

4102

if (!VectorType::isValidElementType(T)) {

4103

emitAnalysis(VectorizationReport(ST) <<

4104

"store instruction cannot be vectorized");

4105

return false;

4106

}

4107

if (EnableMemAccessVersioning)

4108

collectStridedAccess(ST);

4109

}

4110

4111

if (EnableMemAccessVersioning)

4112

if (LoadInst *LI = dyn_cast<LoadInst>(it))

4113

collectStridedAccess(LI);

4114

4115

// Reduction instructions are allowed to have exit users.

4116

// All other instructions must not have external users.

4117

if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {

4118

emitAnalysis(VectorizationReport(it) <<

4119

"value cannot be used outside the loop");

4120

return false;

4121

}

4122

4123

} // next instr.

4124

4125

}

4126

4127

if (!Induction) {

4128

DEBUG(dbgs() << "LV: Did not find one integer induction var.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Did not find one integer induction var.\n"
; } } while (0);

4129

if (Inductions.empty()) {

4130

emitAnalysis(VectorizationReport()

4131

<< "loop induction variable could not be identified");

4132

return false;

4133

}

4134

}

4135

4136

return true;

4137

}

4138

4139

///\brief Remove GEPs whose indices but the last one are loop invariant and

4140

/// return the induction operand of the gep pointer.

4141

static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {

4142

GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);

4143

if (!GEP)

4144

return Ptr;

4145

4146

unsigned InductionOperand = getGEPInductionOperand(GEP);

4147

4148

// Check that all of the gep indices are uniform except for our induction

4149

// operand.

4150

for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)

4151

if (i != InductionOperand &&

4152

!SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))

4153

return Ptr;

4154

return GEP->getOperand(InductionOperand);

4155

}

4156

4157

///\brief Look for a cast use of the passed value.

4158

static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {

4159

Value *UniqueCast = nullptr;

4160

for (User *U : Ptr->users()) {

4161

CastInst *CI = dyn_cast<CastInst>(U);

4162

if (CI && CI->getType() == Ty) {

4163

if (!UniqueCast)

4164

UniqueCast = CI;

4165

else

4166

return nullptr;

4167

}

4168

}

4169

return UniqueCast;

4170

}

4171

4172

///\brief Get the stride of a pointer access in a loop.

4173

/// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a

4174

/// pointer to the Value, or null otherwise.

4175

static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {

4176

const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());

4177

if (!PtrTy || PtrTy->isAggregateType())

4178

return nullptr;

4179

4180

// Try to remove a gep instruction to make the pointer (actually index at this

4181

// point) easier analyzable. If OrigPtr is equal to Ptr we are analzying the

4182

// pointer, otherwise, we are analyzing the index.

4183

Value *OrigPtr = Ptr;

4184

4185

// The size of the pointer access.

4186

int64_t PtrAccessSize = 1;

4187

4188

Ptr = stripGetElementPtr(Ptr, SE, Lp);

4189

const SCEV *V = SE->getSCEV(Ptr);

4190

4191

if (Ptr != OrigPtr)

4192

// Strip off casts.

4193

while (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V))

4194

V = C->getOperand();

4195

4196

const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);

4197

if (!S)

4198

return nullptr;

4199

4200

V = S->getStepRecurrence(*SE);

4201

if (!V)

4202

return nullptr;

4203

4204

// Strip off the size of access multiplication if we are still analyzing the

4205

// pointer.

4206

if (OrigPtr == Ptr) {

4207

const DataLayout &DL = Lp->getHeader()->getModule()->getDataLayout();

4208

DL.getTypeAllocSize(PtrTy->getElementType());

4209

if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {

4210

if (M->getOperand(0)->getSCEVType() != scConstant)

4211

return nullptr;

4212

4213

const APInt &APStepVal =

4214

cast<SCEVConstant>(M->getOperand(0))->getValue()->getValue();

4215

4216

// Huge step value - give up.

4217

if (APStepVal.getBitWidth() > 64)

4218

return nullptr;

4219

4220

int64_t StepVal = APStepVal.getSExtValue();

4221

if (PtrAccessSize != StepVal)

4222

return nullptr;

4223

V = M->getOperand(1);

4224

}

4225

}

4226

4227

// Strip off casts.

4228

Type *StripedOffRecurrenceCast = nullptr;

4229

if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) {

4230

StripedOffRecurrenceCast = C->getType();

4231

V = C->getOperand();

4232

}

4233

4234

// Look for the loop invariant symbolic value.

4235

const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);

4236

if (!U)

4237

return nullptr;

4238

4239

Value *Stride = U->getValue();

4240

if (!Lp->isLoopInvariant(Stride))

4241

return nullptr;

4242

4243

// If we have stripped off the recurrence cast we have to make sure that we

4244

// return the value that is used in this loop so that we can replace it later.

4245

if (StripedOffRecurrenceCast)

4246

Stride = getUniqueCastUse(Stride, Lp, StripedOffRecurrenceCast);

4247

4248

return Stride;

4249

}

4250

4251

/// \brief Record the symbolic stride of a load or store, if it has one.
///
/// When getStrideFromPointer() finds a stride for the pointer operand of
/// \p MemAccess, the pointer is mapped to it in Strides and the stride is
/// added to StrideSet. Values that are neither loads nor stores are ignored.
void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
  LoadInst *Load = dyn_cast<LoadInst>(MemAccess);
  StoreInst *Store = Load ? nullptr : dyn_cast<StoreInst>(MemAccess);
  if (!Load && !Store)
    return;

  Value *Ptr = Load ? Load->getPointerOperand() : Store->getPointerOperand();

  Value *Stride = getStrideFromPointer(Ptr, SE, TheLoop);
  if (!Stride)
    return;

  DEBUG(dbgs() << "LV: Found a strided access that we can version");
  DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
  Strides[Ptr] = Stride;
  StrideSet.insert(Stride);
}

4269

4270

void LoopVectorizationLegality::collectLoopUniforms() {

4271

// We now know that the loop is vectorizable!

4272

// Collect variables that will remain uniform after vectorization.

4273

std::vector<Value*> Worklist;

4274

BasicBlock *Latch = TheLoop->getLoopLatch();

4275

4276

// Start with the conditional branch and walk up the block.

4277

Worklist.push_back(Latch->getTerminator()->getOperand(0));

4278

4279

// Also add all consecutive pointer values; these values will be uniform

4280

// after vectorization (and subsequent cleanup) and, until revectorization is

4281

// supported, all dependencies must also be uniform.

4282

for (Loop::block_iterator B = TheLoop->block_begin(),

4283

BE = TheLoop->block_end(); B != BE; ++B)

4284

for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end();

4285

I != IE; ++I)

4286

if (I->getType()->isPointerTy() && isConsecutivePtr(I))

4287

Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());

4288

4289

while (!Worklist.empty()) {

4290

Instruction *I = dyn_cast<Instruction>(Worklist.back());

4291

Worklist.pop_back();

4292

4293

// Look at instructions inside this loop.

4294

// Stop when reaching PHI nodes.

4295

// TODO: we need to follow values all over the loop, not only in this block.

4296

if (!I || !TheLoop->contains(I) || isa<PHINode>(I))

4297

continue;

4298

4299

// This is a known uniform.

4300

Uniforms.insert(I);

4301

4302

// Insert all operands.

4303

Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());

4304

}

4305

}

4306

4307

bool LoopVectorizationLegality::canVectorizeMemory() {

4308

LAI = &LAA->getInfo(TheLoop, Strides);

4309

auto &OptionalReport = LAI->getReport();

4310

if (OptionalReport)

4311

emitAnalysis(VectorizationReport(*OptionalReport));

4312

if (!LAI->canVectorizeMemory())

4313

return false;

4314

4315

if (LAI->hasStoreToLoopInvariantAddress()) {

4316

emitAnalysis(

4317

VectorizationReport()

4318

<< "write to a loop invariant address could not be vectorized");

4319

DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: We don't allow storing to uniform addresses\n"
; } } while (0);

4320

return false;

4321

}

4322

4323

if (LAI->getNumRuntimePointerChecks() >

4324

VectorizerParams::RuntimeMemoryCheckThreshold) {

4325

emitAnalysis(VectorizationReport()

4326

<< LAI->getNumRuntimePointerChecks() << " exceeds limit of "

4327

<< VectorizerParams::RuntimeMemoryCheckThreshold

4328

<< " dependent memory operations checked at runtime");

4329

DEBUG(dbgs() << "LV: Too many memory checks needed.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Too many memory checks needed.\n"
; } } while (0);

4330

return false;

4331

}

4332

return true;

4333

}

4334

4335

/// \brief Classify \p Phi as an induction variable.
///
/// \param StepValue passed through to isInductionPHI().
/// \returns IK_NoInduction when isInductionPHI() rejects \p Phi,
/// IK_IntInduction when the phi has integer type, and IK_PtrInduction
/// otherwise.
LoopVectorizationLegality::InductionKind
LoopVectorizationLegality::isInductionVariable(PHINode *Phi,
                                               ConstantInt *&StepValue) {
  if (!isInductionPHI(Phi, SE, StepValue))
    return IK_NoInduction;

  // An induction is either an integer induction or a pointer induction.
  const bool IsInteger = Phi->getType()->isIntegerTy();
  return IsInteger ? IK_IntInduction : IK_PtrInduction;
}

4348

4349

bool LoopVectorizationLegality::isInductionVariable(const Value *V) {

4350

Value *In0 = const_cast<Value*>(V);

4351

PHINode *PN = dyn_cast_or_null<PHINode>(In0);

4352

if (!PN)

4353

return false;

4354

4355

return Inductions.count(PN);

4356

}

4357

4358

/// \brief Forward to LoopAccessInfo::blockNeedsPredication() for \p BB, using
/// this object's loop and dominator tree.
bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
  const bool NeedsPredication =
      LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
  return NeedsPredication;
}

4361

4362

bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,

4363

SmallPtrSetImpl<Value *> &SafePtrs) {

4364

4365

for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {

4366

// Check that we don't have a constant expression that can trap as operand.

4367

for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();

4368

OI != OE; ++OI) {

4369

if (Constant *C = dyn_cast<Constant>(*OI))

4370

if (C->canTrap())

4371

return false;

4372

}

4373

// We might be able to hoist the load.

4374

if (it->mayReadFromMemory()) {

4375

LoadInst *LI = dyn_cast<LoadInst>(it);

4376

if (!LI)

4377

return false;

4378

if (!SafePtrs.count(LI->getPointerOperand())) {

4379

if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) {

4380

MaskedOp.insert(LI);

4381

continue;

4382

}

4383

return false;

4384

}

4385

}

4386

4387

// We don't predicate stores at the moment.

4388

if (it->mayWriteToMemory()) {

4389

StoreInst *SI = dyn_cast<StoreInst>(it);

4390

// We only support predication of stores in basic blocks with one

4391

// predecessor.

4392

if (!SI)

4393

return false;

4394

4395

bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);

4396

bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();

4397

4398

if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||

4399

!isSinglePredecessor) {

4400

// Build a masked store if it is legal for the target, otherwise scalarize

4401

// the block.

4402

bool isLegalMaskedOp =

4403

isLegalMaskedStore(SI->getValueOperand()->getType(),

4404

SI->getPointerOperand());

4405

if (isLegalMaskedOp) {

4406

--NumPredStores;

4407

MaskedOp.insert(SI);

4408

continue;

4409

}

4410

return false;

4411

}

4412

}

4413

if (it->mayThrow())

4414

return false;

4415

4416

// The instructions below can trap.

4417

switch (it->getOpcode()) {

4418

default: continue;

4419

case Instruction::UDiv:

4420

case Instruction::SDiv:

4421

case Instruction::URem:

4422

case Instruction::SRem:

4423

return false;

4424

}

4425

}

4426

4427

return true;

4428

}

4429

4430

void InterleavedAccessInfo::collectConstStridedAccesses(

4431

MapVector<Instruction *, StrideDescriptor> &StrideAccesses,

4432

const ValueToValueMap &Strides) {

4433

// Holds load/store instructions in program order.

4434

SmallVector<Instruction *, 16> AccessList;

4435

4436

for (auto *BB : TheLoop->getBlocks()) {

4437

bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);

4438

4439

for (auto &I : *BB) {

4440

if (!isa<LoadInst>(&I) && !isa<StoreInst>(&I))

4441

continue;

4442

// FIXME: Currently we can't handle mixed accesses and predicated accesses

4443

if (IsPred)

4444

return;

4445

4446

AccessList.push_back(&I);

4447

}

4448

}

4449

4450

if (AccessList.empty())

4451

return;

4452

4453

auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();

4454

for (auto I : AccessList) {

4455

LoadInst *LI = dyn_cast<LoadInst>(I);

4456

StoreInst *SI = dyn_cast<StoreInst>(I);

4457

4458

Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();

4459

int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides);

4460

4461

// The factor of the corresponding interleave group.

4462

unsigned Factor = std::abs(Stride);

4463

4464

// Ignore the access if the factor is too small or too large.

4465

if (Factor < 2 || Factor > MaxInterleaveGroupFactor)

4466

continue;

4467

4468

const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Ptr);

4469

PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());

4470

unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType());

4471

4472

// An alignment of 0 means target ABI alignment.

4473

unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();

4474

if (!Align)

4475

Align = DL.getABITypeAlignment(PtrTy->getElementType());

4476

4477

StrideAccesses[I] = StrideDescriptor(Stride, Scev, Size, Align);

4478

}

4479

}

4480

4481

// Analyze interleaved accesses and collect them into interleave groups.

4482

4483

// Notice that the vectorization on interleaved groups will change instruction

4484

// orders and may break dependences. But the memory dependence check guarantees

4485

// that there is no overlap between two pointers of different strides, element

4486

// sizes or underlying bases.

4487

4488

// For pointers sharing the same stride, element size and underlying base, no

4489

// need to worry about Read-After-Write dependences and Write-After-Read

4490

// dependences.

4491

4492

// E.g. The RAW dependence: A[i] = a;

4493

// b = A[i];

4494

// This won't exist as it is a store-load forwarding conflict, which has

4495

// already been checked and forbidden in the dependence check.

4496

4497

// E.g. The WAR dependence: a = A[i]; // (1)

4498

// A[i] = b; // (2)

4499

// The store group of (2) is always inserted at or below (2), and the load group

4500

// of (1) is always inserted at or above (1). The dependence is safe.

4501

void InterleavedAccessInfo::analyzeInterleaving(

4502

const ValueToValueMap &Strides) {

4503

DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Analyzing interleaved accesses...\n"
; } } while (0);

4504

4505

// Holds all the stride accesses.

4506

MapVector<Instruction *, StrideDescriptor> StrideAccesses;

4507

collectConstStridedAccesses(StrideAccesses, Strides);

4508

4509

if (StrideAccesses.empty())

4510

return;

4511

4512

// Holds all interleaved store groups temporarily.

4513

SmallSetVector<InterleaveGroup *, 4> StoreGroups;

4514

4515

// Search the load-load/write-write pair B-A in bottom-up order and try to

4516

// insert B into the interleave group of A according to 3 rules:

4517

// 1. A and B have the same stride.

4518

// 2. A and B have the same memory object size.

4519

// 3. B belongs to the group according to the distance.

4520

4521

// The bottom-up order can avoid breaking the Write-After-Write dependences

4522

// between two pointers of the same base.

4523

// E.g. A[i] = a; (1)

4524

// A[i] = b; (2)

4525

// A[i+1] = c (3)

4526

// We form the group (2)+(3) in front, so (1) has to form groups with accesses

4527

// above (1), which guarantees that (1) is always above (2).

4528

for (auto I = StrideAccesses.rbegin(), E = StrideAccesses.rend(); I != E;

4529

++I) {

4530

Instruction *A = I->first;

4531

StrideDescriptor DesA = I->second;

4532

4533

InterleaveGroup *Group = getInterleaveGroup(A);

4534

if (!Group) {

4535

DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Creating an interleave group with:"
<< *A << '\n'; } } while (0);

4536

Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);

4537

}

4538

4539

if (A->mayWriteToMemory())

4540

StoreGroups.insert(Group);

4541

4542

for (auto II = std::next(I); II != E; ++II) {

4543

Instruction *B = II->first;

4544

StrideDescriptor DesB = II->second;

4545

4546

// Ignore if B is already in a group or B is a different memory operation.

4547

if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory())

4548

continue;

4549

4550

// Check the rule 1 and 2.

4551

if (DesB.Stride != DesA.Stride || DesB.Size != DesA.Size)

4552

continue;

4553

4554

// Calculate the distance and prepare for the rule 3.

4555

const SCEVConstant *DistToA =

4556

dyn_cast<SCEVConstant>(SE->getMinusSCEV(DesB.Scev, DesA.Scev));

4557

if (!DistToA)

4558

continue;

4559

4560

int DistanceToA = DistToA->getValue()->getValue().getSExtValue();

4561

4562

// Skip if the distance is not multiple of size as they are not in the

4563

// same group.

4564

if (DistanceToA % static_cast<int>(DesA.Size))

4565

continue;

4566

4567

// The index of B is the index of A plus the related index to A.

4568

int IndexB =

4569

Group->getIndex(A) + DistanceToA / static_cast<int>(DesA.Size);

4570

4571

// Try to insert B into the group.

4572

if (Group->insertMember(B, IndexB, DesB.Align)) {

4573

DEBUG(dbgs() << "LV: Inserted:" << *B << '\n'do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Inserted:" <<
*B << '\n' << " into the interleave group with"
<< *A << '\n'; } } while (0)

4574

<< " into the interleave group with" << *A << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Inserted:" <<
*B << '\n' << " into the interleave group with"
<< *A << '\n'; } } while (0);

4575

InterleaveGroupMap[B] = Group;

4576

4577

// Set the first load in program order as the insert position.

4578

if (B->mayReadFromMemory())

4579

Group->setInsertPos(B);

4580

}

4581

} // Iteration on instruction B

4582

} // Iteration on instruction A

4583

4584

// Remove interleaved store groups with gaps.

4585

for (InterleaveGroup *Group : StoreGroups)

4586

if (Group->getNumMembers() != Group->getFactor())

4587

releaseGroup(Group);

4588

}

4589

4590

/// Choose the vectorization width for the loop and the cost associated with
/// it.
///
/// \param OptForSize when true, the loop is only vectorized if no runtime
///        pointer checks and no tail loop are needed.
/// \returns a VectorizationFactor; a width of 1 means "do not vectorize".
///          When the width comes from the user hint the cost field is left
///          at 0.
LoopVectorizationCostModel::VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
  // Width 1 means no vectorize
  VectorizationFactor Selected = { 1U, 0U };

  // Runtime pointer checks are not acceptable when optimizing for size.
  if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
    emitAnalysis(VectorizationReport() <<
                 "runtime pointer checks needed. Enable vectorization of this "
                 "loop with '#pragma clang loop vectorize(enable)' when "
                 "compiling with -Os");
    DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
    return Selected;
  }

  // Conditional stores block vectorization unless explicitly enabled.
  if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
    emitAnalysis(VectorizationReport() <<
                 "store that is conditionally executed prevents vectorization");
    DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
    return Selected;
  }

  // Find the trip count.
  unsigned TripCount = SE->getSmallConstantTripCount(TheLoop);
  DEBUG(dbgs() << "LV: Found trip count: " << TripCount << '\n');

  // The usable register width is the narrower of the target's register width
  // and the maximum safe dependence distance (converted from bytes to bits).
  unsigned WidestType = getWidestType();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
  const unsigned MaxSafeDepDist =
      (Legal->getMaxSafeDepDistBytes() != -1U)
          ? Legal->getMaxSafeDepDistBytes() * 8
          : -1U;
  WidestRegister = std::min(WidestRegister, MaxSafeDepDist);

  unsigned MaxVectorSize = WidestRegister / WidestType;
  DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
  DEBUG(dbgs() << "LV: The Widest register is: "
               << WidestRegister << " bits.\n");

  if (MaxVectorSize == 0) {
    DEBUG(dbgs() << "LV: The target has no vector registers.\n");
    MaxVectorSize = 1;
  }

  assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
         " into one vector!");

  // Upper bound for the widths tried by the cost comparison below.
  unsigned MaxVF = MaxVectorSize;

  // If we optimize the program for size, avoid creating the tail loop.
  if (OptForSize) {
    // If we are unable to calculate the trip count then don't try to vectorize.
    if (TripCount < 2) {
      emitAnalysis
        (VectorizationReport() <<
         "unable to calculate the loop count due to complex control flow");
      DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
      return Selected;
    }

    // A trip count that is not a multiple of the maximum vector size would
    // require a tail loop, so give up. Otherwise MaxVF stays at MaxVectorSize.
    if (TripCount % MaxVectorSize != 0) {
      emitAnalysis(VectorizationReport() <<
                   "cannot optimize for size and vectorize at the "
                   "same time. Enable vectorization of this loop "
                   "with '#pragma clang loop vectorize(enable)' "
                   "when compiling with -Os");
      return Selected;
    }
  }

  // A width requested through the hints wins over the cost model.
  int UserVF = Hints->getWidth();
  if (UserVF != 0) {
    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
    DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");

    Selected.Width = UserVF;
    return Selected;
  }

  // Start from the scalar loop as the best candidate so far.
  float BestCost = expectedCost(1);
#ifndef NDEBUG
  const float ScalarCost = BestCost;
#endif /* NDEBUG */
  unsigned BestWidth = 1;
  DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  // Ignore scalar width, because the user explicitly wants vectorization.
  if (ForceVectorization && MaxVF > 1) {
    BestWidth = 2;
    BestCost = expectedCost(BestWidth) / (float)BestWidth;
  }

  // Try every power-of-two width up to MaxVF and keep the cheapest one.
  for (unsigned CandVF = 2; CandVF <= MaxVF; CandVF *= 2) {
    // The vector loop executes fewer iterations, so its cost is divided by
    // the candidate width to make it comparable with the scalar cost.
    float VectorCost = expectedCost(CandVF) / (float)CandVF;
    DEBUG(dbgs() << "LV: Vector loop of width " << CandVF << " costs: " <<
          (int)VectorCost << ".\n");
    if (VectorCost < BestCost) {
      BestCost = VectorCost;
      BestWidth = CandVF;
    }
  }

  DEBUG(if (ForceVectorization && BestWidth > 1 && BestCost >= ScalarCost) dbgs()
        << "LV: Vectorization seems to be not beneficial, "
        << "but was forced by a user.\n");
  DEBUG(dbgs() << "LV: Selecting VF: "<< BestWidth << ".\n");
  Selected.Width = BestWidth;
  Selected.Cost = BestWidth * BestCost;
  return Selected;
}

4709

4710

unsigned LoopVectorizationCostModel::getWidestType() {

4711

unsigned MaxWidth = 8;

4712

const DataLayout &DL = TheFunction->getParent()->getDataLayout();

4713

4714

// For each block.

4715

for (Loop::block_iterator bb = TheLoop->block_begin(),

4716

be = TheLoop->block_end(); bb != be; ++bb) {

4717

BasicBlock *BB = *bb;

4718

4719

// For each instruction in the loop.

4720

for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {

4721

Type *T = it->getType();

4722

4723

// Ignore ephemeral values.

4724

if (EphValues.count(it))

4725

continue;

4726

4727

// Only examine Loads, Stores and PHINodes.

4728

if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))

4729

continue;

4730

4731

// Examine PHI nodes that are reduction variables.

4732

if (PHINode *PN = dyn_cast<PHINode>(it))

4733

if (!Legal->getReductionVars()->count(PN))

4734

continue;

4735

4736

// Examine the stored values.

4737

if (StoreInst *ST = dyn_cast<StoreInst>(it))

4738

T = ST->getValueOperand()->getType();

4739

4740

// Ignore loaded pointer types and stored pointer types that are not

4741

// consecutive. However, we do want to take consecutive stores/loads of

4742

// pointer vectors into account.

4743

if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))

4744

continue;

4745

4746

MaxWidth = std::max(MaxWidth,

4747

(unsigned)DL.getTypeSizeInBits(T->getScalarType()));

4748

}

4749

}

4750

4751

return MaxWidth;

4752

}

4753

4754

unsigned

4755

LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,

4756

unsigned VF,

4757

unsigned LoopCost) {

4758

4759

// -- The unroll heuristics --

4760

// We unroll the loop in order to expose ILP and reduce the loop overhead.

4761

// There are many micro-architectural considerations that we can't predict

4762

// at this level. For example, frontend pressure (on decode or fetch) due to

4763

// code size, or the number and capabilities of the execution ports.

4764

4765

// We use the following heuristics to select the unroll factor:

4766

// 1. If the code has reductions, then we unroll in order to break the cross

4767

// iteration dependency.

4768

// 2. If the loop is really small, then we unroll in order to reduce the loop

4769

// overhead.

4770

// 3. We don't unroll if we think that we will spill registers to memory due

4771

// to the increased register pressure.

4772

4773

// Use the user preference, unless 'auto' is selected.

4774

int UserUF = Hints->getInterleave();

4775

if (UserUF != 0)

Assuming 'UserUF' is equal to 0

→

←

Taking false branch

→

4776

return UserUF;

4777

4778

// When we optimize for size, we don't unroll.

4779

if (OptForSize)

←

Assuming 'OptForSize' is 0

→

←

Taking false branch

→

4780

return 1;

4781

4782

// We used the distance for the unroll factor.

4783

if (Legal->getMaxSafeDepDistBytes() != -1U)

←

Taking false branch

→

4784

return 1;

4785

4786

// Do not unroll loops with a relatively small trip count.

4787

unsigned TC = SE->getSmallConstantTripCount(TheLoop);

4788

if (TC > 1 && TC < TinyTripCountUnrollThreshold)

←

Assuming 'TC' is <= 1

→

4789

return 1;

4790

4791

unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);

←

Assuming 'VF' is <= 1

→

4792

DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The target has " <<
TargetNumRegisters << " registers\n"; } } while (0)

4793

" registers\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: The target has " <<
TargetNumRegisters << " registers\n"; } } while (0);

4794

4795

if (VF == 1) {

←

Assuming 'VF' is not equal to 1

→

←

Taking false branch

→

4796

if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)

4797

TargetNumRegisters = ForceTargetNumScalarRegs;

4798

} else {

4799

if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)

←

Taking false branch

→

4800

TargetNumRegisters = ForceTargetNumVectorRegs;

4801

}

4802

4803

LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();

4804

// We divide by these constants so assume that we have at least one

4805

// instruction that uses at least one register.

4806

R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);

4807

R.NumInstructions = std::max(R.NumInstructions, 1U);

4808

4809

// We calculate the unroll factor using the following formula.

4810

// Subtract the number of loop invariants from the number of available

4811

// registers. These registers are used by all of the unrolled instances.

4812

// Next, divide the remaining registers by the number of registers that is

4813

// required by the loop, in order to estimate how many parallel instances

4814

// fit without causing spills. All of this is rounded down if necessary to be

4815

// a power of two. We want power of two unroll factors to simplify any

4816

// addressing operations or alignment considerations.

4817

unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /

4818

R.MaxLocalUsers);

4819

4820

// Don't count the induction variable as unrolled.

4821

if (EnableIndVarRegisterHeur)

←

Taking false branch

→

4822

UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /

4823

std::max(1U, (R.MaxLocalUsers - 1)));

4824

4825

// Clamp the unroll factor ranges to reasonable factors.

4826

unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor(VF);

4827

4828

// Check if the user has overridden the unroll max.

4829

if (VF == 1) {

←

Taking false branch

→

4830

if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)

4831

MaxInterleaveSize = ForceTargetMaxScalarInterleaveFactor;

4832

} else {

4833

if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)

←

Taking false branch

→

4834

MaxInterleaveSize = ForceTargetMaxVectorInterleaveFactor;

4835

}

4836

4837

// If we did not calculate the cost for VF (because the user selected the VF)

4838

// then we calculate the cost of VF here.

4839

if (LoopCost == 0)

←

Assuming 'LoopCost' is equal to 0

→

←

Taking true branch

→

4840

LoopCost = expectedCost(VF);

←

Calling 'LoopVectorizationCostModel::expectedCost'

→

←

Returning from 'LoopVectorizationCostModel::expectedCost'

→

←

The value 0 is assigned to 'LoopCost'

→

4841

4842

// Clamp the calculated UF to be between the 1 and the max unroll factor

4843

// that the target allows.

4844

if (UF > MaxInterleaveSize)

←

Taking false branch

→

4845

UF = MaxInterleaveSize;

4846

else if (UF < 1)

←

Assuming 'UF' is >= 1

→

←

Taking false branch

→

4847

UF = 1;

4848

4849

// Unroll if we vectorized this loop and there is a reduction that could

4850

// benefit from unrolling.

4851

if (VF > 1 && Legal->getReductionVars()->size()) {

4852

DEBUG(dbgs() << "LV: Unrolling because of reductions.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Unrolling because of reductions.\n"
; } } while (0);

4853

return UF;

4854

}

4855

4856

// Note that if we've already vectorized the loop we will have done the

4857

// runtime check and so unrolling won't require further checks.

4858

bool UnrollingRequiresRuntimePointerCheck =

4859

(VF == 1 && Legal->getRuntimePointerCheck()->Need);

4860

4861

// We want to unroll small loops in order to reduce the loop overhead and

4862

// potentially expose ILP opportunities.

4863

DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Loop cost is " <<
LoopCost << '\n'; } } while (0);

4864

if (!UnrollingRequiresRuntimePointerCheck &&

←

Taking true branch

→

4865

LoopCost < SmallLoopCost) {

4866

// We assume that the cost overhead is 1 and we use the cost model

4867

// to estimate the cost of the loop and unroll until the cost of the

4868

// loop overhead is about 5% of the cost of the loop.

4869

unsigned SmallUF = std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

←

Division by zero

4870

4871

// Unroll until store/load ports (estimated by max unroll factor) are

4872

// saturated.

4873

unsigned NumStores = Legal->getNumStores();

4874

unsigned NumLoads = Legal->getNumLoads();

4875

unsigned StoresUF = UF / (NumStores ? NumStores : 1);

4876

unsigned LoadsUF = UF / (NumLoads ? NumLoads : 1);

4877

4878

// If we have a scalar reduction (vector reductions are already dealt with

4879

// by this point), we can increase the critical path length if the loop

4880

// we're unrolling is inside another loop. Limit, by default to 2, so the

4881

// critical path only gets increased by one reduction operation.

4882

if (Legal->getReductionVars()->size() &&

4883

TheLoop->getLoopDepth() > 1) {

4884

unsigned F = static_cast<unsigned>(MaxNestedScalarReductionUF);

4885

SmallUF = std::min(SmallUF, F);

4886

StoresUF = std::min(StoresUF, F);

4887

LoadsUF = std::min(LoadsUF, F);

4888

}

4889

4890

if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) {

4891

DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Unrolling to saturate store or load ports.\n"
; } } while (0);

4892

return std::max(StoresUF, LoadsUF);

4893

}

4894

4895

DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Unrolling to reduce branch cost.\n"
; } } while (0);

4896

return SmallUF;

4897

}

4898

4899

// Unroll if this is a large loop (small loops are already dealt with by this

4900

// point) that could benefit from interleaved unrolling.

4901

bool HasReductions = (Legal->getReductionVars()->size() > 0);

4902

if (TTI.enableAggressiveInterleaving(HasReductions)) {

4903

DEBUG(dbgs() << "LV: Unrolling to expose ILP.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Unrolling to expose ILP.\n"
; } } while (0);

4904

return UF;

4905

}

4906

4907

DEBUG(dbgs() << "LV: Not Unrolling.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Not Unrolling.\n";
} } while (0);

4908

return 1;

4909

}

4910

4911

/// Estimate the register pressure of the loop.
///
/// \returns a RegisterUsage holding the number of instructions in the loop,
///          the maximum number of simultaneously live in-loop values, and the
///          number of values defined outside the loop but used inside it.
LoopVectorizationCostModel::RegisterUsage
LoopVectorizationCostModel::calculateRegisterUsage() {
  // The register usage is estimated by measuring the highest number of values
  // that are alive at a single location. This is a very rough estimation.
  //
  // The loop is scanned in reverse post-order, so that defs are met before
  // their users, and every instruction receives a number. Each instruction
  // with in-loop users is assumed to start an interval, and every in-loop use
  // is recorded, which yields the last use of each instruction. That map is
  // then transposed into "intervals that *end* at index N" so that a single
  // linear scan suffices: an interval is opened by putting the instruction in
  // a set, and closed by removing it when its end index is reached. The
  // maximum size of that set is the maximum register usage.
  //
  // Values defined outside the loop but used inside are counted separately,
  // because loop-invariant values do not need more registers when unrolling.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage R;
  R.NumInstructions = 0;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  typedef DenseMap<Instruction*, unsigned> IntervalMap;
  // Maps an index to the instruction that received it.
  DenseMap<unsigned, Instruction*> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // The instructions that are used inside the loop.
  SmallSet<Instruction*, 8> Ends;
  // The values that are used in the loop but defined outside of it.
  SmallPtrSet<Value*, 8> LoopInvariants;

  // Pass 1: number the instructions and record the last use of each one.
  unsigned Index = 0;
  for (LoopBlocksDFS::RPOIterator BlockIt = DFS.beginRPO(),
                                  BlockEnd = DFS.endRPO();
       BlockIt != BlockEnd; ++BlockIt) {
    R.NumInstructions += (*BlockIt)->size();
    for (BasicBlock::iterator InstIt = (*BlockIt)->begin(),
                              InstEnd = (*BlockIt)->end();
         InstIt != InstEnd; ++InstIt) {
      Instruction *User = InstIt;
      IdxToInstr[Index++] = User;

      // Save the end location of each USE.
      for (unsigned OpIdx = 0; OpIdx < User->getNumOperands(); ++OpIdx) {
        Instruction *Def = dyn_cast<Instruction>(User->getOperand(OpIdx));

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Def)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Def)) {
          LoopInvariants.insert(Def);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Def] = Index;
        Ends.insert(Def);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  typedef SmallVector<Instruction*, 2> InstrList;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (IntervalMap::iterator EndIt = EndPoint.begin(),
                             EndItEnd = EndPoint.end();
       EndIt != EndItEnd; ++EndIt)
    TransposeEnds[EndIt->second].push_back(EndIt->first);

  SmallSet<Instruction*, 8> OpenIntervals;
  unsigned MaxUsage = 0;

  // Pass 2: sweep over the numbered instructions, tracking open intervals.
  DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
  for (unsigned int Pos = 0; Pos < Index; ++Pos) {
    Instruction *I = IdxToInstr[Pos];

    // Ignore instructions that are never used within the loop.
    if (!Ends.count(I))
      continue;

    // Ignore ephemeral values.
    if (EphValues.count(I))
      continue;

    // Remove all of the instructions that end at this location.
    InstrList &Closing = TransposeEnds[Pos];
    for (unsigned int K = 0, NumClosing = Closing.size(); K < NumClosing; ++K)
      OpenIntervals.erase(Closing[K]);

    // Count the number of live intervals.
    MaxUsage = std::max(MaxUsage, OpenIntervals.size());

    DEBUG(dbgs() << "LV(REG): At #" << Pos << " Interval # " <<
          OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  unsigned Invariant = LoopInvariants.size();
  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n');
  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');

  R.LoopInvariantRegs = Invariant;
  R.MaxLocalUsers = MaxUsage;
  return R;
}

5027

5028

unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {

5029

unsigned Cost = 0;

←

'Cost' initialized to 0

→

5030

5031

// For each block.

5032

for (Loop::block_iterator bb = TheLoop->block_begin(),

←

Loop condition is false. Execution continues on line 5067

→

5033

be = TheLoop->block_end(); bb != be; ++bb) {

←

Calling 'operator!='

→

←

Returning from 'operator!='

→

5034

unsigned BlockCost = 0;

5035

BasicBlock *BB = *bb;

5036

5037

// For each instruction in the old loop.

5038

for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {

5039

// Skip dbg intrinsics.

5040

if (isa<DbgInfoIntrinsic>(it))

5041

continue;

5042

5043

// Ignore ephemeral values.

5044

if (EphValues.count(it))

5045

continue;

5046

5047

unsigned C = getInstructionCost(it, VF);

5048

5049

// Check if we should override the cost.

5050

if (ForceTargetInstructionCost.getNumOccurrences() > 0)

5051

C = ForceTargetInstructionCost;

5052

5053

BlockCost += C;

5054

DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found an estimated cost of "
<< C << " for VF " << VF << " For instruction: "
<< *it << '\n'; } } while (0)

5055

VF << " For instruction: " << *it << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("loop-vectorize")) { dbgs() << "LV: Found an estimated cost of "
<< C << " for VF " << VF << " For instruction: "
<< *it << '\n'; } } while (0);

5056

}

5057

5058

// We assume that if-converted blocks have a 50% chance of being executed.

5059

// When the code is scalar then some of the blocks are avoided due to CF.

5060

// When the code is vectorized we execute all code paths.

5061

if (VF == 1 && Legal->blockNeedsPredication(*bb))

5062

BlockCost /= 2;

5063

5064

Cost += BlockCost;

5065

}

5066

5067

return Cost;

←

Returning zero (loaded from 'Cost')

→

5068

}

5069

5070

/// \brief Check whether the address computation for a non-consecutive memory

5071

/// access looks like an unlikely candidate for being merged into the indexing

5072

/// mode.

5073

///

5074

/// We look for a GEP which has one index that is an induction variable and all

5075

/// other indices are loop invariant. If the stride of this access is also

5076

/// within a small bound we decide that this address computation can likely be

5077

/// merged into the addressing mode.

5078

/// In all other cases, we identify the address computation as complex.

5079

static bool isLikelyComplexAddressComputation(Value *Ptr,

5080

LoopVectorizationLegality *Legal,

5081

ScalarEvolution *SE,

5082

const Loop *TheLoop) {

5083

GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);

5084

if (!Gep)

5085

return true;

5086

5087

// We are looking for a gep with all loop invariant indices except for one

5088

// which should be an induction variable.

5089

unsigned NumOperands = Gep->getNumOperands();

5090

for (unsigned i = 1; i < NumOperands; ++i) {

5091

Value *Opd = Gep->getOperand(i);

5092

if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&

5093

!Legal->isInductionVariable(Opd))

5094

return true;

5095

}

5096

5097

// Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step

5098

// can likely be merged into the address computation.

5099

unsigned MaxMergeDistance = 64;

5100

5101

const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));

5102

if (!AddRec)

5103

return true;

5104

5105

// Check the step is constant.

5106

const SCEV *Step = AddRec->getStepRecurrence(*SE);

5107

// Calculate the pointer stride and check if it is consecutive.

5108

const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);

5109

if (!C)

5110

return true;

5111

5112

const APInt &APStepVal = C->getValue()->getValue();

5113

5114

// Huge step value - give up.

5115

if (APStepVal.getBitWidth() > 64)

5116

return true;

5117

5118

int64_t StepVal = APStepVal.getSExtValue();

5119

5120

return StepVal > MaxMergeDistance;

5121

}

5122

5123

/// Returns true if operand 0 or operand 1 of \p I is reported by
/// \p Legal as having a stride.
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
  return Legal->hasStride(I->getOperand(0)) ||
         Legal->hasStride(I->getOperand(1));
}

5128

5129

unsigned

5130

LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {

5131

// If we know that this instruction will remain uniform, check the cost of

5132

// the scalar version.

5133

if (Legal->isUniformAfterVectorization(I))

5134

VF = 1;

5135

5136

Type *RetTy = I->getType();

5137

Type *VectorTy = ToVectorTy(RetTy, VF);

5138

5139

// TODO: We need to estimate the cost of intrinsic calls.

5140

switch (I->getOpcode()) {

5141

case Instruction::GetElementPtr:

5142

// We mark this instruction as zero-cost because the cost of GEPs in

5143

// vectorized code depends on whether the corresponding memory instruction

5144

// is scalarized or not. Therefore, we handle GEPs with the memory

5145

// instruction cost.

5146

return 0;

5147

case Instruction::Br: {

5148

return TTI.getCFInstrCost(I->getOpcode());

5149

}

5150

case Instruction::PHI:

5151

//TODO: IF-converted IFs become selects.

5152

return 0;

5153

case Instruction::Add:

5154

case Instruction::FAdd:

5155

case Instruction::Sub:

5156

case Instruction::FSub:

5157

case Instruction::Mul:

5158

case Instruction::FMul:

5159

case Instruction::UDiv:

5160

case Instruction::SDiv:

5161

case Instruction::FDiv:

5162

case Instruction::URem:

5163

case Instruction::SRem:

5164

case Instruction::FRem:

5165

case Instruction::Shl:

5166

case Instruction::LShr:

5167

case Instruction::AShr:

5168

case Instruction::And:

5169

case Instruction::Or:

5170

case Instruction::Xor: {

5171

// Since we will replace the stride by 1 the multiplication should go away.

5172

if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))

5173

return 0;

5174

// Certain instructions can be cheaper to vectorize if they have a constant

5175

// second vector operand. One example of this are shifts on x86.

5176

TargetTransformInfo::OperandValueKind Op1VK =

5177

TargetTransformInfo::OK_AnyValue;

5178

TargetTransformInfo::OperandValueKind Op2VK =

5179

TargetTransformInfo::OK_AnyValue;

5180

TargetTransformInfo::OperandValueProperties Op1VP =

5181

TargetTransformInfo::OP_None;

5182

TargetTransformInfo::OperandValueProperties Op2VP =

5183

TargetTransformInfo::OP_None;

5184

Value *Op2 = I->getOperand(1);

5185

5186

// Check for a splat of a constant or for a non uniform vector of constants.

5187

if (isa<ConstantInt>(Op2)) {

5188

ConstantInt *CInt = cast<ConstantInt>(Op2);

5189

if (CInt && CInt->getValue().isPowerOf2())

5190

Op2VP = TargetTransformInfo::OP_PowerOf2;

5191

Op2VK = TargetTransformInfo::OK_UniformConstantValue;

5192

} else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {

5193

Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;

5194

Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();

5195

if (SplatValue) {

5196

ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);

5197

if (CInt && CInt->getValue().isPowerOf2())

5198

Op2VP = TargetTransformInfo::OP_PowerOf2;

5199

Op2VK = TargetTransformInfo::OK_UniformConstantValue;

5200

}

5201

}

5202

5203

return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,

5204

Op1VP, Op2VP);

5205

}

5206

case Instruction::Select: {

5207

SelectInst *SI = cast<SelectInst>(I);

5208

const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());

5209

bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));

5210

Type *CondTy = SI->getCondition()->getType();

5211

if (!ScalarCond)

5212

CondTy = VectorType::get(CondTy, VF);

5213

5214

return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);

5215

}

5216

case Instruction::ICmp:

5217

case Instruction::FCmp: {

5218

Type *ValTy = I->getOperand(0)->getType();

5219

VectorTy = ToVectorTy(ValTy, VF);

5220

return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);

5221

}

5222

case Instruction::Store:

5223

case Instruction::Load: {

5224

StoreInst *SI = dyn_cast<StoreInst>(I);

5225

LoadInst *LI = dyn_cast<LoadInst>(I);

5226

Type *ValTy = (SI ? SI->getValueOperand()->getType() :

5227

LI->getType());

5228

VectorTy = ToVectorTy(ValTy, VF);

5229

5230

unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();

5231

unsigned AS = SI ? SI->getPointerAddressSpace() :

5232

LI->getPointerAddressSpace();

5233

Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();

5234

// We add the cost of address computation here instead of with the gep

5235

// instruction because only here we know whether the operation is

5236

// scalarized.

5237

if (VF == 1)

5238

return TTI.getAddressComputationCost(VectorTy) +

5239

TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);

5240

5241

// For an interleaved access, calculate the total cost of the whole

5242

// interleave group.

5243

if (Legal->isAccessInterleaved(I)) {

5244

auto Group = Legal->getInterleavedAccessGroup(I);

5245

5246

5247

// Only calculate the cost once at the insert position.

5248

if (Group->getInsertPos() != I)

5249

return 0;

5250

5251

unsigned InterleaveFactor = Group->getFactor();

5252

Type *WideVecTy =

5253

VectorType::get(VectorTy->getVectorElementType(),

5254

VectorTy->getVectorNumElements() * InterleaveFactor);

5255

5256

// Holds the indices of existing members in an interleaved load group.

5257

// An interleaved store group doesn't need this as it dones't allow gaps.

5258

SmallVector<unsigned, 4> Indices;

5259

if (LI) {

5260

for (unsigned i = 0; i < InterleaveFactor; i++)

5261

if (Group->getMember(i))

5262

Indices.push_back(i);

5263

}

5264

5265

// Calculate the cost of the whole interleaved group.

5266

unsigned Cost = TTI.getInterleavedMemoryOpCost(

5267

I->getOpcode(), WideVecTy, Group->getFactor(), Indices,

5268

Group->getAlignment(), AS);

5269

5270

if (Group->isReverse())

5271

Cost +=

5272

Group->getNumMembers() *

5273

TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);

5274

5275

// FIXME: The interleaved load group with a huge gap could be even more

5276

// expensive than scalar operations. Then we could ignore such group and

5277

// use scalar operations instead.

5278

return Cost;

5279

}

5280

5281

// Scalarized loads/stores.

5282

int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);

5283

bool Reverse = ConsecutiveStride < 0;

5284

const DataLayout &DL = I->getModule()->getDataLayout();

5285

unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy);

5286

unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF;

5287

if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {

5288

bool IsComplexComputation =

5289

isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);

5290

unsigned Cost = 0;

5291

// The cost of extracting from the value vector and pointer vector.

5292

Type *PtrTy = ToVectorTy(Ptr->getType(), VF);

5293

for (unsigned i = 0; i < VF; ++i) {

5294

// The cost of extracting the pointer operand.

5295

Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);

5296

// In case of STORE, the cost of ExtractElement from the vector.

5297

// In case of LOAD, the cost of InsertElement into the returned

5298

// vector.

5299

Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :

5300

Instruction::InsertElement,

5301

VectorTy, i);

5302

}

5303

5304

// The cost of the scalar loads/stores.

5305

Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);

5306

Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),

5307

Alignment, AS);

5308

return Cost;

5309

}

5310

5311

// Wide load/stores.

5312

unsigned Cost = TTI.getAddressComputationCost(VectorTy);

5313

if (Legal->isMaskRequired(I))

5314

Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment,

5315

AS);

5316

else

5317

Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);

5318

5319

if (Reverse)

5320

Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,

5321

VectorTy, 0);

5322

return Cost;

5323

}

5324

case Instruction::ZExt:

5325

case Instruction::SExt:

5326

case Instruction::FPToUI:

5327

case Instruction::FPToSI:

5328

case Instruction::FPExt:

5329

case Instruction::PtrToInt:

5330

case Instruction::IntToPtr:

5331

case Instruction::SIToFP:

5332

case Instruction::UIToFP:

5333

case Instruction::Trunc:

5334

case Instruction::FPTrunc:

5335

case Instruction::BitCast: {

5336

// We optimize the truncation of induction variable.

5337

// The cost of these is the same as the scalar operation.

5338

if (I->getOpcode() == Instruction::Trunc &&

5339

Legal->isInductionVariable(I->getOperand(0)))

5340

return TTI.getCastInstrCost(I->getOpcode(), I->getType(),

5341

I->getOperand(0)->getType());

5342

5343

Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);

5344

return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);

5345

}

5346

case Instruction::Call: {

5347

bool NeedToScalarize;

5348

CallInst *CI = cast<CallInst>(I);

5349

unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);

5350

if (getIntrinsicIDForCall(CI, TLI))

5351

return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));

5352

return CallCost;

5353

}

5354

default: {

5355

// We are scalarizing the instruction. Return the cost of the scalar

5356

// instruction, plus the cost of insert and extract into vector

5357

// elements, times the vector width.

5358

unsigned Cost = 0;

5359

5360

if (!RetTy->isVoidTy() && VF != 1) {

5361

unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,

5362

VectorTy);

5363

unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,

5364

VectorTy);

5365

5366

// The cost of inserting the results plus extracting each one of the

5367

// operands.

5368

Cost += VF * (InsCost + ExtCost * I->getNumOperands());

5369

}

5370

5371

// The cost of executing VF copies of the scalar instruction. This opcode

5372

// is unknown. Assume that it is the same as 'mul'.

5373

Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);

5374

return Cost;

5375

}

5376

}// end of switch.

5377

}

5378

5379

// Pass identification object. In the registration code below it is the
// address of ID (not its value) that is handed to PassInfo.
char LoopVectorize::ID = 0;

5380

// Human-readable pass name given to the registration macros below.
static const char lv_name[] = "Loop Vectorization";

5381

// Pass registration. In this listing every macro invocation is immediately
// followed by what appears to be its preprocessor expansion. The BEGIN macro
// opens the one-time initializer initializeLoopVectorizePassOnce(Registry).
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)static void* initializeLoopVectorizePassOnce(PassRegistry &
Registry) {

5382

// Each dependency line below calls the initializer of one pass or analysis
// group with the same registry, before LoopVectorize itself is registered.
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)initializeTargetTransformInfoWrapperPassPass(Registry);

5383

INITIALIZE_AG_DEPENDENCY(AliasAnalysis)initializeAliasAnalysisAnalysisGroup(Registry);

5384

INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)initializeAssumptionCacheTrackerPass(Registry);

5385

INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)initializeBlockFrequencyInfoPass(Registry);

5386

INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)initializeDominatorTreeWrapperPassPass(Registry);

5387

INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)initializeScalarEvolutionPass(Registry);

5388

INITIALIZE_PASS_DEPENDENCY(LCSSA)initializeLCSSAPass(Registry);

5389

INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)initializeLoopInfoWrapperPassPass(Registry);

5390

INITIALIZE_PASS_DEPENDENCY(LoopSimplify)initializeLoopSimplifyPass(Registry);

5391

INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)initializeLoopAccessAnalysisPass(Registry);

5392

// The END macro creates the PassInfo (argument "loop-vectorize"), registers
// it and closes the one-time initializer. It then defines
// llvm::initializeLoopVectorizePass, which uses a compare-and-swap on a static
// flag so that one caller runs the initializer and sets the flag to 2, while
// any other caller loops until the flag reads 2.
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)PassInfo *PI = new PassInfo(lv_name, "loop-vectorize", & LoopVectorize
::ID, PassInfo::NormalCtor_t(callDefaultCtor< LoopVectorize
>), false, false); Registry.registerPass(*PI, true); return
PI; } void llvm::initializeLoopVectorizePass(PassRegistry &
Registry) { static volatile sys::cas_flag initialized = 0; sys
::cas_flag old_val = sys::CompareAndSwap(&initialized, 1,
0); if (old_val == 0) { initializeLoopVectorizePassOnce(Registry
); sys::MemoryFence(); AnnotateIgnoreWritesBegin("/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5392); AnnotateHappensBefore("/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5392, &initialized); initialized = 2; AnnotateIgnoreWritesEnd
("/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5392); } else { sys::cas_flag tmp = initialized; sys::MemoryFence
(); while (tmp != 2) { tmp = initialized; sys::MemoryFence();
} } AnnotateHappensAfter("/tmp/buildd/llvm-toolchain-snapshot-3.7~svn240924/lib/Transforms/Vectorize/LoopVectorize.cpp"
, 5392, &initialized); }

5393

5394

namespace llvm {

5395

Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {

5396

return new LoopVectorize(NoUnrolling, AlwaysVectorize);

5397

}

5398

}

5399

5400

/// Returns true if \p Inst is a store or a load whose pointer operand is
/// reported by the legality analysis as consecutive (non-zero result from
/// isConsecutivePtr). Any other instruction yields false.
bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
  Value *Ptr;
  if (auto *Store = dyn_cast<StoreInst>(Inst))
    Ptr = Store->getPointerOperand();
  else if (auto *Load = dyn_cast<LoadInst>(Inst))
    Ptr = Load->getPointerOperand();
  else
    return false; // Neither a store nor a load.

  return Legal->isConsecutivePtr(Ptr) != 0;
}

5411

5412

5413

void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,

5414

bool IfPredicateStore) {

5415

5416

// Holds vector parameters or scalars, in case of uniform vals.

5417

SmallVector<VectorParts, 4> Params;

5418

5419

setDebugLocFromInst(Builder, Instr);

5420

5421

// Find all of the vectorized parameters.

5422

for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {

5423

Value *SrcOp = Instr->getOperand(op);

5424

5425

// If we are accessing the old induction variable, use the new one.

5426

if (SrcOp == OldInduction) {

5427

Params.push_back(getVectorValue(SrcOp));

5428

continue;

5429

}

5430

5431

// Try using previously calculated values.

5432

Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);

5433

5434

// If the src is an instruction that appeared earlier in the basic block

5435

// then it should already be vectorized.

5436

if (SrcInst && OrigLoop->contains(SrcInst)) {

5437

5438

// The parameter is a vector value from earlier.

5439

Params.push_back(WidenMap.get(SrcInst));

5440

} else {

5441

// The parameter is a scalar from outside the loop. Maybe even a constant.

5442

VectorParts Scalars;

5443

Scalars.append(UF, SrcOp);

5444

Params.push_back(Scalars);

5445

}

5446

}

5447

5448

5449

5450

5451

// Does this instruction return a value ?

5452

bool IsVoidRetTy = Instr->getType()->isVoidTy();

5453

5454

Value *UndefVec = IsVoidRetTy ? nullptr :

5455

UndefValue::get(Instr->getType());

5456

// Create a new entry in the WidenMap and initialize it to Undef or Null.

5457

VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);

5458

5459

Instruction *InsertPt = Builder.GetInsertPoint();

5460

BasicBlock *IfBlock = Builder.GetInsertBlock();

5461

BasicBlock *CondBlock = nullptr;

5462

5463

VectorParts Cond;

5464

Loop *VectorLp = nullptr;

5465

if (IfPredicateStore) {

5466

5467

5468

Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),

5469

Instr->getParent());

5470

VectorLp = LI->getLoopFor(IfBlock);

5471

5472

}

5473

5474

// For each vector unroll 'part':

5475

for (unsigned Part = 0; Part < UF; ++Part) {

5476

// For each scalar that we create:

5477

5478

// Start an "if (pred) a[i] = ..." block.

5479

Value *Cmp = nullptr;

5480

if (IfPredicateStore) {

5481

if (Cond[Part]->getType()->isVectorTy())

5482

Cond[Part] =

5483

Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));

5484

Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],

5485

ConstantInt::get(Cond[Part]->getType(), 1));

5486

CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");

5487

LoopVectorBody.push_back(CondBlock);

5488

VectorLp->addBasicBlockToLoop(CondBlock, *LI);

5489

// Update Builder with newly created basic block.

5490

Builder.SetInsertPoint(InsertPt);

5491

}

5492

5493

Instruction *Cloned = Instr->clone();

5494

if (!IsVoidRetTy)

5495

Cloned->setName(Instr->getName() + ".cloned");

5496

// Replace the operands of the cloned instructions with extracted scalars.

5497

for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {

5498

Value *Op = Params[op][Part];

5499

Cloned->setOperand(op, Op);

5500

}

5501

5502

// Place the cloned scalar in the new loop.

5503

Builder.Insert(Cloned);

5504

5505

// If the original scalar returns a value we need to place it in a vector

5506

// so that future users will be able to use it.

5507

if (!IsVoidRetTy)

5508

VecResults[Part] = Cloned;

5509

5510

// End if-block.

5511

if (IfPredicateStore) {

5512

BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");

5513

LoopVectorBody.push_back(NewIfBlock);

5514

VectorLp->addBasicBlockToLoop(NewIfBlock, *LI);

5515

Builder.SetInsertPoint(InsertPt);

5516

Instruction *OldBr = IfBlock->getTerminator();

5517

BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);

5518

OldBr->eraseFromParent();

5519

IfBlock = NewIfBlock;

5520

}

5521

}

5522

}

5523

5524

/// Handle a memory instruction in the unroller by scalarizing it.
///
/// Predication is requested only for a store whose parent block needs
/// predication according to the legality analysis.
void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
  bool IfPredicateStore = false;
  if (StoreInst *Store = dyn_cast<StoreInst>(Instr))
    IfPredicateStore = Legal->blockNeedsPredication(Store->getParent());

  scalarizeInstruction(Instr, IfPredicateStore);
}

5530

5531

/// Unroller version of reverseVector: returns \p Vec unchanged.
Value *InnerLoopUnroller::reverseVector(Value *Vec) {

5532

return Vec;

5533

}

5534

5535

/// Unroller version of getBroadcastInstrs: returns \p V unchanged.
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) {

5536

return V;

5537

}

5538

5539

/// Unroller version of getStepVector: computes Val + StartIdx * Step.
///
/// \p Val must be a scalar (asserted). The add is emitted through Builder and
/// named "induction".
Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) {
  // When unrolling and the VF is 1, we only need to add a simple scalar.
  Type *ITy = Val->getType();
  assert(!ITy->isVectorTy() && "Val must be a scalar");

  Constant *StartC = ConstantInt::get(ITy, StartIdx);
  Value *Offset = Builder.CreateMul(StartC, Step);
  return Builder.CreateAdd(Val, Offset, "induction");
}