File: lib/Transforms/Vectorize/SLPVectorizer.cpp
Warning: line 2382, column 32: Called C++ object pointer is null
//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <memory>

using namespace llvm;
using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                   cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// \brief Predicate for the element types that the SLP vectorizer supports.
///
/// The most important things to filter here are types which are invalid in
/// LLVM vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;
  BasicBlock *BB = I0->getParent();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I)
      return false;

    if (BB != I->getParent())
      return false;
  }
  return true;
}

/// \returns True if all of the values in \p VL are constants.
static bool allConstant(ArrayRef<Value *> VL) {
  for (Value *i : VL)
    if (!isa<Constant>(i))
      return false;
  return true;
}

/// \returns True if all of the values in \p VL are identical.
static bool isSplat(ArrayRef<Value *> VL) {
  for (unsigned i = 1, e = VL.size(); i < e; ++i)
    if (VL[i] != VL[0])
      return false;
  return true;
}

/// \returns Opcode that can be clubbed with \p Op to create an alternate
/// sequence which can later be merged as a ShuffleVector instruction.
static unsigned getAltOpcode(unsigned Op) {
  switch (Op) {
  case Instruction::FAdd:
    return Instruction::FSub;
  case Instruction::FSub:
    return Instruction::FAdd;
  case Instruction::Add:
    return Instruction::Sub;
  case Instruction::Sub:
    return Instruction::Add;
  default:
    return 0;
  }
}
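
// For illustration (hypothetical IR, not from this file): given the bundle
//
//   %r0 = fadd float %a0, %b0
//   %r1 = fsub float %a1, %b1
//
// getAltOpcode(Instruction::FAdd) returns Instruction::FSub, so the two
// lanes can be emitted as one vector fadd plus one vector fsub whose
// results are combined by a single shufflevector.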

/// \returns true if Opcode \p Op can be part
/// of an alternate sequence which can later be merged as
/// a ShuffleVector instruction.
static bool canCombineAsAltInst(unsigned Op) {
  return Op == Instruction::FAdd || Op == Instruction::FSub ||
         Op == Instruction::Sub || Op == Instruction::Add;
}

/// \returns ShuffleVector's opcode if the instructions in \p VL form an
/// alternating fadd/fsub, fsub/fadd, add/sub or sub/add sequence
/// (e.g. opcodes fadd, fsub, fadd, fsub, ...), or zero otherwise.
static unsigned isAltInst(ArrayRef<Value *> VL) {
  // The caller (getSameOpcode) has already established that VL[0] is an
  // Instruction, so a checked cast avoids dereferencing a null pointer here.
  Instruction *I0 = cast<Instruction>(VL[0]);
  unsigned Opcode = I0->getOpcode();
  unsigned AltOpcode = getAltOpcode(Opcode);
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
      return 0;
  }
  return Instruction::ShuffleVector;
}
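
// For illustration: a bundle with strictly alternating opcodes such as
// {fadd, fsub, fadd, fsub} makes isAltInst return
// Instruction::ShuffleVector, while {fadd, fadd, fsub, fsub} returns 0
// because lane 1 does not carry the expected alternate opcode.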

/// \returns The opcode if all of the Instructions in \p VL have the same
/// opcode, or zero.
static unsigned getSameOpcode(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return 0;
  unsigned Opcode = I0->getOpcode();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || Opcode != I->getOpcode()) {
      if (canCombineAsAltInst(Opcode) && i == 1)
        return isAltInst(VL);
      return 0;
    }
  }
  return Opcode;
}
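
// For illustration (hypothetical bundles):
//   getSameOpcode({add, add, add, add}) == Instruction::Add
//   getSameOpcode({add, sub, add, sub}) == Instruction::ShuffleVector
//   getSameOpcode({add, mul})           == 0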

/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL[0]->getType();
  for (int i = 1, e = VL.size(); i < e; i++)
    if (VL[i]->getType() != Ty)
      return false;

  return true;
}

/// \returns True if Extract{Value,Element} instruction extracts element Idx.
static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
  assert(Opcode == Instruction::ExtractElement ||
         Opcode == Instruction::ExtractValue);
  if (Opcode == Instruction::ExtractElement) {
    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    return CI && CI->getZExtValue() == Idx;
  } else {
    ExtractValueInst *EI = cast<ExtractValueInst>(E);
    return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
  }
}
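
// For illustration (hypothetical IR):
//   %e = extractelement <4 x float> %v, i32 2    ; matches Idx == 2
//   %s = extractvalue { i32, i32 } %agg, 1       ; matches Idx == 1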

/// \returns True if in-tree use also needs extract. This refers to
/// possible scalar operand in vectorized instruction.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                    TargetLibraryInfo *TLI) {
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    if (hasVectorInstrinsicScalarOpd(ID, 1)) {
      return (CI->getArgOperand(1) == Scalar);
    }
    LLVM_FALLTHROUGH;
  }
  default:
    return false;
  }
}
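
// For illustration: if a pointer %p is itself vectorized but one of its
// in-tree users is a scalar "load %p", the load still consumes the scalar
// pointer, so the vectorizer must plan an extract for that lane.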

/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}

namespace llvm {
namespace slpvectorizer {
/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
  typedef SmallVector<Value *, 8> ValueList;
  typedef SmallVector<Instruction *, 16> InstrList;
  typedef SmallPtrSet<Value *, 16> ValueSet;
  typedef SmallVector<StoreInst *, 8> StoreList;
  typedef MapVector<Value *, SmallVector<Instruction *, 2>>
      ExtraValueToDebugLocsMap;

  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
        SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
        DL(DL), ORE(ORE), Builder(Se->getContext()) {
    CodeMetrics::collectEphemeralValues(F, AC, EphValues);
    // Use the vector register size specified by the target unless overridden
    // by a command-line option.
    // TODO: It would be better to limit the vectorization factor based on
    //       data type rather than just register size. For example, x86 AVX has
    //       256-bit registers, but it does not support integer operations
    //       at that width (that requires AVX2).
    if (MaxVectorRegSizeOption.getNumOccurrences())
      MaxVecRegSize = MaxVectorRegSizeOption;
    else
      MaxVecRegSize = TTI->getRegisterBitWidth(true);

    if (MinVectorRegSizeOption.getNumOccurrences())
      MinVecRegSize = MinVectorRegSizeOption;
    else
      MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
  }

  /// \brief Vectorize the tree that starts with the elements in \p VL.
  /// Returns the vectorized root.
  Value *vectorizeTree();
  /// Vectorize the tree but with the list of externally used values \p
  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
  /// generated extractvalue instructions.
  Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);

  /// \returns the cost incurred by unwanted spills and fills, caused by
  /// holding live values over call sites.
  int getSpillCost();

  /// \returns the vectorization cost of the subtree that starts at \p VL.
  /// A negative number means that this is profitable.
  int getTreeCost();

  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
  void buildTree(ArrayRef<Value *> Roots,
                 ArrayRef<Value *> UserIgnoreLst = None);
  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  /// the purpose of scheduling and extraction in the \p UserIgnoreLst, taking
  /// into account (and updating, if required) the list of externally used
  /// values stored in \p ExternallyUsedValues.
  void buildTree(ArrayRef<Value *> Roots,
                 ExtraValueToDebugLocsMap &ExternallyUsedValues,
                 ArrayRef<Value *> UserIgnoreLst = None);
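
  // A minimal sketch of how a driver uses this interface (simplified and
  // hypothetical; the actual drivers live later in this file, outside this
  // excerpt):
  //
  //   R.buildTree(VL);
  //   if (R.isTreeTinyAndNotFullyVectorizable())
  //     return false;
  //   R.computeMinimumValueSizes();
  //   int Cost = R.getTreeCost();     // negative means profitable
  //   if (Cost < -SLPCostThreshold) {
  //     R.vectorizeTree();
  //     return true;
  //   }
  //   return false;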

  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MustGather.clear();
    ExternalUses.clear();
    NumLoadsWantToKeepOrder = 0;
    NumLoadsWantToChangeOrder = 0;
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    MinBWs.clear();
  }

  unsigned getTreeSize() const { return VectorizableTree.size(); }

  /// \brief Perform LICM and CSE on the newly generated gather sequences.
  void optimizeGatherSequence();

  /// \returns true if it is beneficial to reverse the vector order.
  bool shouldReorder() const {
    return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
  }

  /// \return The vector element size in bits to use when vectorizing the
  /// expression tree ending at \p V. If V is a store, the size is the width of
  /// the stored value. Otherwise, the size is the width of the largest loaded
  /// value reaching V. This method is used by the vectorizer to calculate
  /// vectorization factors.
  unsigned getVectorElementSize(Value *V);

  /// Compute the minimum type sizes required to represent the entries in a
  /// vectorizable tree.
  void computeMinimumValueSizes();

  // \returns maximum vector register size as set by TTI or overridden by
  // cl::opt.
  unsigned getMaxVecRegSize() const {
    return MaxVecRegSize;
  }

  // \returns minimum vector register size as set by cl::opt.
  unsigned getMinVecRegSize() const {
    return MinVecRegSize;
  }

  /// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
  ///
  /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
  unsigned canMapToVector(Type *T, const DataLayout &DL) const;

  /// \returns True if the VectorizableTree is both tiny and not fully
  /// vectorizable. We do not vectorize such trees.
  bool isTreeTinyAndNotFullyVectorizable();

  OptimizationRemarkEmitter *getORE() { return ORE; }

private:
  struct TreeEntry;

  /// \returns the cost of the vectorizable entry.
  int getEntryCost(TreeEntry *E);

  /// This is the recursive part of buildTree.
  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);

  /// \returns True if the ExtractElement/ExtractValue instructions in VL can
  /// be vectorized to use the original vector (or aggregate "bitcast" to a
  /// vector).
  bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;

  /// Vectorize a single entry in the tree.
  Value *vectorizeTree(TreeEntry *E);

  /// Vectorize a single entry in the tree, starting in \p VL.
  Value *vectorizeTree(ArrayRef<Value *> VL);

  /// \returns the pointer to the vectorized value if \p VL is already
  /// vectorized, or NULL. This may happen in cycles.
  Value *alreadyVectorized(ArrayRef<Value *> VL) const;

  /// \returns the scalarization cost for this type. Scalarization in this
  /// context means the creation of vectors from a group of scalars.
  int getGatherCost(Type *Ty);

  /// \returns the scalarization cost for this list of values. Assuming that
  /// this subtree gets vectorized, we may need to extract the values from the
  /// roots. This method calculates the cost of extracting the values.
  int getGatherCost(ArrayRef<Value *> VL);

  /// \brief Set the Builder insert point to one after the last instruction in
  /// the bundle.
  void setInsertPointAfterBundle(ArrayRef<Value *> VL);

  /// \returns a vector from a collection of scalars in \p VL.
  Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);

  /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even if the tree height is tiny.
  bool isFullyVectorizableTinyTree();

  /// \brief Reorder commutative operands in alt shuffle if they result in
  /// vectorized code.
  void reorderAltShuffleOperands(ArrayRef<Value *> VL,
                                 SmallVectorImpl<Value *> &Left,
                                 SmallVectorImpl<Value *> &Right);
  /// \brief Reorder commutative operands to get better probability of
  /// generating vectorized code.
  void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                      SmallVectorImpl<Value *> &Left,
                                      SmallVectorImpl<Value *> &Right);
  struct TreeEntry {
    TreeEntry(std::vector<TreeEntry> &Container)
        : Scalars(), VectorizedValue(nullptr), NeedToGather(0),
          Container(Container) {}

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      assert(VL.size() == Scalars.size() && "Invalid size");
      return std::equal(VL.begin(), VL.end(), Scalars.begin());
    }

    /// A vector of scalars.
    ValueList Scalars;

    /// The Scalars are vectorized into this value. It is initialized to Null.
    Value *VectorizedValue;

    /// Do we need to gather this sequence ?
    bool NeedToGather;

    /// Points back to the VectorizableTree.
    ///
    /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
    /// to be a pointer and needs to be able to initialize the child iterator.
    /// Thus we need a reference back to the container to translate the indices
    /// to entries.
    std::vector<TreeEntry> &Container;

    /// The TreeEntry index containing the user of this entry. We can actually
    /// have multiple users so the data structure is not truly a tree.
    SmallVector<int, 1> UserTreeIndices;
  };
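
  // For illustration (hypothetical scalars): building the tree for the
  // bundle {a[0] = x0 + y0, a[1] = x1 + y1} yields one TreeEntry for the
  // two stores and one for the two adds; if {x0, x1} cannot themselves be
  // vectorized, they become a NeedToGather entry that is materialized with
  // insertelement instructions during code generation.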

  /// Create a new VectorizableTree entry.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
                          int &UserTreeIdx) {
    VectorizableTree.emplace_back(VectorizableTree);
    int idx = VectorizableTree.size() - 1;
    TreeEntry *Last = &VectorizableTree[idx];
    Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
    Last->NeedToGather = !Vectorized;
    if (Vectorized) {
      for (int i = 0, e = VL.size(); i != e; ++i) {
        assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
        ScalarToTreeEntry[VL[i]] = idx;
      }
    } else {
      MustGather.insert(VL.begin(), VL.end());
    }

    if (UserTreeIdx >= 0)
      Last->UserTreeIndices.push_back(UserTreeIdx);
    UserTreeIdx = idx;
    return Last;
  }

  /// -- Vectorization State --
  /// Holds all of the tree entries.
  std::vector<TreeEntry> VectorizableTree;

  /// Maps a specific scalar to its tree entry.
  SmallDenseMap<Value*, int> ScalarToTreeEntry;

  /// A list of scalars that we found that we need to keep as scalars.
  ValueSet MustGather;

  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, int L)
        : Scalar(S), User(U), Lane(L) {}
    // Which scalar in our function.
    Value *Scalar;
    // The user that uses the scalar.
    llvm::User *User;
    // Which lane the scalar belongs to.
    int Lane;
  };
  typedef SmallVector<ExternalUser, 16> UserList;

  /// Checks if two instructions may access the same memory.
  ///
  /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
  /// is invariant in the calling loop.
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    // First check if the result is already in the cache.
    AliasCacheKey key = std::make_pair(Inst1, Inst2);
    Optional<bool> &result = AliasCache[key];
    if (result.hasValue()) {
      return result.getValue();
    }
    MemoryLocation Loc2 = getLocation(Inst2, AA);
    bool aliased = true;
    if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
      // Do the alias check.
      aliased = AA->alias(Loc1, Loc2);
    }
    // Store the result in the cache.
    result = aliased;
    return aliased;
  }

  typedef std::pair<Instruction *, Instruction *> AliasCacheKey;

  /// Cache for alias results.
  /// TODO: consider moving this to the AliasAnalysis itself.
  DenseMap<AliasCacheKey, Optional<bool>> AliasCache;

  /// Removes an instruction from its block and eventually deletes it.
  /// It's like Instruction::eraseFromParent() except that the actual deletion
  /// is delayed until BoUpSLP is destructed.
  /// This is required to ensure that there are no incorrect collisions in the
  /// AliasCache, which can happen if a new instruction is allocated at the
  /// same address as a previously deleted instruction.
  void eraseInstruction(Instruction *I) {
    I->removeFromParent();
    I->dropAllReferences();
    DeletedInstructions.emplace_back(I);
  }

  /// Temporary store for deleted instructions. Instructions will be deleted
  /// eventually when the BoUpSLP is destructed.
  SmallVector<unique_value, 8> DeletedInstructions;

  /// A list of values that need to be extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User). External User
  /// can be nullptr, which means that this Internal Scalar will be used later,
  /// after vectorization.
  UserList ExternalUses;

  /// Values used only by @llvm.assume calls.
  SmallPtrSet<const Value *, 32> EphValues;

  /// Holds all of the instructions that we gathered.
  SetVector<Instruction *> GatherSeq;
  /// A list of blocks that we are going to CSE.
  SetVector<BasicBlock *> CSEBlocks;

  /// Contains all scheduling relevant data for an instruction.
  /// A ScheduleData either represents a single instruction or a member of an
  /// instruction bundle (= a group of instructions which is combined into a
  /// vector instruction).
  struct ScheduleData {
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData()
        : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
          NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
          Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
          UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}

    void init(int BlockSchedulingRegionID) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      UnscheduledDepsInBundle = UnscheduledDeps;
      clearDependencies();
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true for single instructions and for bundle representatives
    /// (= the head of a bundle).
    bool isSchedulingEntity() const { return FirstInBundle == this; }

    /// Returns true if it represents an instruction bundle and not only a
    /// single instruction.
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return UnscheduledDepsInBundle == 0 && !IsScheduled;
    }

    /// Modifies the number of unscheduled dependencies, also updating it for
    /// the whole bundle.
    int incrementUnscheduledDeps(int Incr) {
      UnscheduledDeps += Incr;
      return FirstInBundle->UnscheduledDepsInBundle += Incr;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() {
      incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
    }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
    }

    void dump(raw_ostream &os) const {
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        os << '[' << *Inst;
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
        os << ']';
      } else {
        os << *Inst;
      }
    }

    Instruction *Inst;

    /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
    ScheduleData *FirstInBundle;

    /// Single linked list of all instructions in a bundle. Null if it is a
    /// single instruction.
    ScheduleData *NextInBundle;

    /// Single linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore;

    /// The dependent memory instructions.
    /// This list is derived on demand in calculateDependencies().
    SmallVector<ScheduleData *, 4> MemoryDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID;

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority;

    /// The number of dependencies. Consists of the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
    /// This value is calculated on demand.
    /// If InvalidDeps, the number of dependencies is not calculated yet.
    int Dependencies;

    /// The number of dependencies minus the number of dependencies of
    /// scheduled instructions. As soon as this is zero, the instruction/bundle
    /// gets ready for scheduling.
    /// Note that this is negative as long as Dependencies is not calculated.
    int UnscheduledDeps;

    /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
    /// single instructions.
    int UnscheduledDepsInBundle;

    /// True if this instruction is scheduled (or considered as scheduled in
    /// the dry-run).
    bool IsScheduled;
  };
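
  // For illustration: in a two-instruction bundle {I0, I1}, both
  // ScheduleData nodes share FirstInBundle == &SD(I0) and are chained via
  // NextInBundle. incrementUnscheduledDeps() on either member updates the
  // shared UnscheduledDepsInBundle counter, and the bundle becomes ready
  // only once that sum drops to zero.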

#ifndef NDEBUG
  friend inline raw_ostream &operator<<(raw_ostream &os,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(os);
    return os;
  }
#endif
  friend struct GraphTraits<BoUpSLP *>;
  friend struct DOTGraphTraits<BoUpSLP *>;

  /// Contains all scheduling data for a basic block.
  ///
  struct BlockScheduling {
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
          ScheduleStart(nullptr), ScheduleEnd(nullptr),
          FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
          ScheduleRegionSize(0),
          ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
          // Make sure that the initial SchedulingRegionID is greater than the
          // initial SchedulingRegionID in ScheduleData (which is 0).
          SchedulingRegionID(1) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;

      // Reduce the maximum schedule region size by the size of the
      // previous scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
        ScheduleRegionSizeLimit = MinScheduleRegionSize;
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }

    ScheduleData *getScheduleData(Value *V) {
      ScheduleData *SD = ScheduleDataMap[V];
      if (SD && SD->SchedulingRegionID == SchedulingRegionID)
        return SD;
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }

    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");

      ScheduleData *BundleMember = SD;
      while (BundleMember) {
        // Handle the def-use chain dependencies.
        for (Use &U : BundleMember->Inst->operands()) {
          ScheduleData *OpDef = getScheduleData(U.get());
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
          }
        }
        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
          }
        }
        BundleMember = BundleMember->NextInBundle;
      }
    }
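
    // For illustration: scheduling a bundle walks every member, decrements
    // the unscheduled-dependency counters of its def-use and memory
    // successors, and moves any successor bundle whose counter hits zero
    // into the ready list for selection in a later step.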

    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD->isSchedulingEntity() && SD->isReady()) {
          ReadyList.insert(SD);
          DEBUG(dbgs() << "SLP:    initially in ready list: " << *I << "\n");
        }
      }
    }

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);

    /// Un-bundles a group of instructions.
    void cancelScheduling(ArrayRef<Value *> VL);

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all instructions/
    /// bundles which depend on the original bundle.
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();

    BasicBlock *BB;

    /// Simple memory allocation for ScheduleData.
    std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    /// The size of a ScheduleData array in ScheduleDataChunks.
    int ChunkSize;

    /// The allocator position in the current chunk, which is the last entry
    /// of ScheduleDataChunks.
    int ChunkPos;

    /// Attaches ScheduleData to Instruction.
    /// Note that the mapping survives during all vectorization iterations, i.e.
    /// ScheduleData structures are recycled.
    DenseMap<Value *, ScheduleData *> ScheduleDataMap;

    struct ReadyList : SmallVector<ScheduleData *, 8> {
      void insert(ScheduleData *SD) { push_back(SD); }
    };

    /// The ready-list for scheduling (only used for the dry-run).
    ReadyList ReadyInsts;

    /// The first instruction of the scheduling region.
    Instruction *ScheduleStart;

    /// The first instruction _after_ the scheduling region.
    Instruction *ScheduleEnd;

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion;

    /// The current size of the scheduling region.
    int ScheduleRegionSize;

    /// The maximum size allowed for the scheduling region.
    int ScheduleRegionSizeLimit;

    /// The ID of the scheduling region. For a new vectorization iteration this
    /// is incremented, which "removes" all ScheduleData from the region.
    int SchedulingRegionID;
  };

  /// Attaches the BlockScheduling structures to basic blocks.
  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);

  /// List of users to ignore during scheduling and that don't need extracting.
  ArrayRef<Value *> UserIgnoreList;

  // Number of load bundles that contain consecutive loads.
  int NumLoadsWantToKeepOrder;

  // Number of load bundles that contain consecutive loads in reversed order.
  int NumLoadsWantToChangeOrder;

  // Analysis and block reference.
  Function *F;
  ScalarEvolution *SE;
  TargetTransformInfo *TTI;
  TargetLibraryInfo *TLI;
  AliasAnalysis *AA;
  LoopInfo *LI;
  DominatorTree *DT;
  AssumptionCache *AC;
  DemandedBits *DB;
  const DataLayout *DL;
  OptimizationRemarkEmitter *ORE;

  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
  unsigned MinVecRegSize; // Set by cl::opt (default: 128).
  /// Instruction builder to construct the vectorized tree.
  IRBuilder<> Builder;

  /// A map of scalar integer values to the smallest bit width with which they
  /// can legally be represented. The values map to (width, signed) pairs,
  /// where "width" indicates the minimum bit width and "signed" is True if the
  /// value must be signed-extended, rather than zero-extended, back to its
  /// original width.
  MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
};
} // end namespace slpvectorizer

template <> struct GraphTraits<BoUpSLP *> {
  typedef BoUpSLP::TreeEntry TreeEntry;

  /// NodeRef has to be a pointer per the GraphWriter.
  typedef TreeEntry *NodeRef;

  /// \brief Add the VectorizableTree to the index iterator to be able to
  /// return TreeEntry pointers.
  struct ChildIteratorType
      : public iterator_adaptor_base<ChildIteratorType,
                                     SmallVector<int, 1>::iterator> {
    std::vector<TreeEntry> &VectorizableTree;

    ChildIteratorType(SmallVector<int, 1>::iterator W,
                      std::vector<TreeEntry> &VT)
        : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}

    NodeRef operator*() { return &VectorizableTree[*I]; }
  };

  static NodeRef getEntryNode(BoUpSLP &R) { return &R.VectorizableTree[0]; }

  static ChildIteratorType child_begin(NodeRef N) {
    return {N->UserTreeIndices.begin(), N->Container};
  }
  static ChildIteratorType child_end(NodeRef N) {
    return {N->UserTreeIndices.end(), N->Container};
  }

  /// For the node iterator we just need to turn the TreeEntry iterator into a
  /// TreeEntry* iterator so that it dereferences to NodeRef.
  typedef pointer_iterator<std::vector<TreeEntry>::iterator> nodes_iterator;

  static nodes_iterator nodes_begin(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.begin());
  }
  static nodes_iterator nodes_end(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.end());
  }

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
};

template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
  typedef BoUpSLP::TreeEntry TreeEntry;

  DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}

  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
    std::string Str;
    raw_string_ostream OS(Str);
    if (isSplat(Entry->Scalars)) {
      OS << "<splat> " << *Entry->Scalars[0];
      return Str;
    }
    for (auto V : Entry->Scalars) {
      OS << *V;
      if (std::any_of(
              R->ExternalUses.begin(), R->ExternalUses.end(),
              [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
        OS << " <extract>";
      OS << "\n";
    }
    return Str;
  }

  static std::string getNodeAttributes(const TreeEntry *Entry,
                                       const BoUpSLP *) {
    if (Entry->NeedToGather)
      return "color=red";
    return "";
  }
};
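
// These traits let the generic GraphWriter machinery render the SLP tree.
// For illustration: when -view-slp-tree is given, the pass (in code later in
// this file, outside this excerpt) hands a BoUpSLP* to ViewGraph, which walks
// the entries via GraphTraits and draws gather entries in red per
// getNodeAttributes() above.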

} // end namespace llvm

void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        ArrayRef<Value *> UserIgnoreLst) {
  ExtraValueToDebugLocsMap ExternallyUsedValues;
  buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
}

void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        ExtraValueToDebugLocsMap &ExternallyUsedValues,
                        ArrayRef<Value *> UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = UserIgnoreLst;
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, -1);

  // Collect the values that we need to extract from the tree.
  for (TreeEntry &EIdx : VectorizableTree) {
    TreeEntry *Entry = &EIdx;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      // No need to handle users of gathered values.
      if (Entry->NeedToGather)
        continue;

      // Check if the scalar is externally used as an extra arg.
      auto ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
              Lane << " from " << *Scalar << ".\n");
        ExternalUses.emplace_back(Scalar, nullptr, Lane);
        continue;
      }
      for (User *U : Scalar->users()) {
        DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst)
          continue;

        // Skip in-tree scalars that become vectors
        if (ScalarToTreeEntry.count(U)) {
          int Idx = ScalarToTreeEntry[U];
          TreeEntry *UseEntry = &VectorizableTree[Idx];
          Value *UseScalar = UseEntry->Scalars[0];
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in Lane 0 will
          // be used.
          if (UseScalar != U ||
              !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
            DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                         << ".\n");
            assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
            continue;
          }
        }

        // Ignore users in the user ignore list.
        if (is_contained(UserIgnoreList, UserInst))
          continue;

        DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
              Lane << " from " << *Scalar << ".\n");
        ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
      }
    }
  }
}

void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                            int UserTreeIdx) {
  bool isAltShuffle = false;
  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

  if (Depth == RecursionMaxDepth) {
    DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    newTreeEntry(VL, false, UserTreeIdx);
    return;
  }

  // Don't handle vectors.
  if (VL[0]->getType()->isVectorTy()) {
    DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    newTreeEntry(VL, false, UserTreeIdx);
    return;
  }

  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    if (SI->getValueOperand()->getType()->isVectorTy()) {
      DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
      newTreeEntry(VL, false, UserTreeIdx);
      return;
    }
1127 | unsigned Opcode = getSameOpcode(VL); | |||
1128 | ||||
1129 | // Check that this shuffle vector refers to the alternate | |||
1130 | // sequence of opcodes. | |||
1131 | if (Opcode == Instruction::ShuffleVector) { | |||
1132 | Instruction *I0 = cast<Instruction>(VL[0]); // cannot be null: getSameOpcode saw instructions | |||
1133 | unsigned Op = I0->getOpcode(); | |||
1134 | if (Op != Instruction::ShuffleVector) | |||
1135 | isAltShuffle = true; | |||
1136 | } | |||
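| // Example: for VL = { a0+b0, a1-b1, a2+b2, a3-b3 }, getSameOpcode() returns | |||
| // ShuffleVector to flag the alternating add/sub pattern; such a bundle can | |||
| // later be emitted as one vector add, one vector sub, and a shufflevector | |||
| // picking even lanes from one result and odd lanes from the other. | |||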
1137 | ||||
1138 | // If all of the operands are identical or constant we have a simple solution. | |||
1139 | if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) { | |||
1140 | DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); | |||
1141 | newTreeEntry(VL, false, UserTreeIdx); | |||
1142 | return; | |||
1143 | } | |||
1144 | ||||
1145 | // We now know that this is a vector of instructions of the same type from | |||
1146 | // the same block. | |||
1147 | ||||
1148 | // Don't vectorize ephemeral values. | |||
1149 | for (unsigned i = 0, e = VL.size(); i != e; ++i) { | |||
1150 | if (EphValues.count(VL[i])) { | |||
1151 | DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << | |||
1152 | ") is ephemeral.\n"); | |||
1153 | newTreeEntry(VL, false, UserTreeIdx); | |||
1154 | return; | |||
1155 | } | |||
1156 | } | |||
1157 | ||||
1158 | // Check if this is a duplicate of another entry. | |||
1159 | if (ScalarToTreeEntry.count(VL[0])) { | |||
1160 | int Idx = ScalarToTreeEntry[VL[0]]; | |||
1161 | TreeEntry *E = &VectorizableTree[Idx]; | |||
1162 | for (unsigned i = 0, e = VL.size(); i != e; ++i) { | |||
1163 | DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); | |||
1164 | if (E->Scalars[i] != VL[i]) { | |||
1165 | DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); | |||
1166 | newTreeEntry(VL, false, UserTreeIdx); | |||
1167 | return; | |||
1168 | } | |||
1169 | } | |||
1170 | // Record the reuse of the tree node. FIXME, currently this is only used to | |||
1171 | // properly draw the graph rather than for the actual vectorization. | |||
1172 | E->UserTreeIndices.push_back(UserTreeIdx); | |||
1173 | DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n"); | |||
1174 | return; | |||
1175 | } | |||
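| // Example: for the bundles (a+b, c+d) and (a*b, c*d), both requests for the | |||
| // operand bundle {a, c} (and likewise {b, d}) resolve to the same tree | |||
| // entry; the second visit reuses it instead of building a duplicate node. | |||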
1176 | ||||
1177 | // Check that none of the instructions in the bundle are already in the tree. | |||
1178 | for (unsigned i = 0, e = VL.size(); i != e; ++i) { | |||
1179 | if (ScalarToTreeEntry.count(VL[i])) { | |||
1180 | DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << | |||
1181 | ") is already in tree.\n"); | |||
1182 | newTreeEntry(VL, false, UserTreeIdx); | |||
1183 | return; | |||
1184 | } | |||
1185 | } | |||
1186 | ||||
1187 | // If any of the scalars is marked as a value that needs to stay scalar then | |||
1188 | // we need to gather the scalars. | |||
1189 | for (unsigned i = 0, e = VL.size(); i != e; ++i) { | |||
1190 | if (MustGather.count(VL[i])) { | |||
1191 | DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); | |||
1192 | newTreeEntry(VL, false, UserTreeIdx); | |||
1193 | return; | |||
1194 | } | |||
1195 | } | |||
1196 | ||||
1197 | // Check that all of the users of the scalars that we want to vectorize are | |||
1198 | // schedulable. | |||
1199 | Instruction *VL0 = cast<Instruction>(VL[0]); | |||
1200 | BasicBlock *BB = VL0->getParent(); // VL0 is already an Instruction | |||
1201 | ||||
1202 | if (!DT->isReachableFromEntry(BB)) { | |||
1203 | // Don't go into unreachable blocks. They may contain instructions with | |||
1204 | // dependency cycles which confuse the final scheduling. | |||
1205 | DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); | |||
1206 | newTreeEntry(VL, false, UserTreeIdx); | |||
1207 | return; | |||
1208 | } | |||
1209 | ||||
1210 | // Check that every instruction appears once in this bundle. | |||
1211 | for (unsigned i = 0, e = VL.size(); i < e; ++i) | |||
1212 | for (unsigned j = i+1; j < e; ++j) | |||
1213 | if (VL[i] == VL[j]) { | |||
1214 | DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); | |||
1215 | newTreeEntry(VL, false, UserTreeIdx); | |||
1216 | return; | |||
1217 | } | |||
1218 | ||||
1219 | auto &BSRef = BlocksSchedules[BB]; | |||
1220 | if (!BSRef) { | |||
1221 | BSRef = llvm::make_unique<BlockScheduling>(BB); | |||
1222 | } | |||
1223 | BlockScheduling &BS = *BSRef.get(); | |||
1224 | ||||
1225 | if (!BS.tryScheduleBundle(VL, this)) { | |||
1226 | DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); | |||
1227 | assert((!BS.getScheduleData(VL[0]) || | |||
1228 | !BS.getScheduleData(VL[0])->isPartOfBundle()) && | |||
1229 | "tryScheduleBundle should cancelScheduling on failure"); | |||
1230 | newTreeEntry(VL, false, UserTreeIdx); | |||
1231 | return; | |||
1232 | } | |||
1233 | DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); | |||
1234 | ||||
1235 | switch (Opcode) { | |||
1236 | case Instruction::PHI: { | |||
1237 | PHINode *PH = cast<PHINode>(VL0); // the opcode check guarantees a PHI | |||
1238 | ||||
1239 | // Check for terminator values (e.g. invoke). | |||
1240 | for (unsigned j = 0; j < VL.size(); ++j) | |||
1241 | for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { | |||
1242 | TerminatorInst *Term = dyn_cast<TerminatorInst>( | |||
1243 | cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i))); | |||
1244 | if (Term) { | |||
1245 | DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); | |||
1246 | BS.cancelScheduling(VL); | |||
1247 | newTreeEntry(VL, false, UserTreeIdx); | |||
1248 | return; | |||
1249 | } | |||
1250 | } | |||
1251 | ||||
1252 | newTreeEntry(VL, true, UserTreeIdx); | |||
1253 | DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); | |||
1254 | ||||
1255 | for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { | |||
1256 | ValueList Operands; | |||
1257 | // Prepare the operand vector. | |||
1258 | for (Value *j : VL) | |||
1259 | Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock( | |||
1260 | PH->getIncomingBlock(i))); | |||
1261 | ||||
1262 | buildTree_rec(Operands, Depth + 1, UserTreeIdx); | |||
1263 | } | |||
1264 | return; | |||
1265 | } | |||
1266 | case Instruction::ExtractValue: | |||
1267 | case Instruction::ExtractElement: { | |||
1268 | bool Reuse = canReuseExtract(VL, Opcode); | |||
1269 | if (Reuse) { | |||
1270 | DEBUG(dbgs() << "SLP: Reusing extract sequence.\n"); | |||
1271 | } else { | |||
1272 | BS.cancelScheduling(VL); | |||
1273 | } | |||
1274 | newTreeEntry(VL, Reuse, UserTreeIdx); | |||
1275 | return; | |||
1276 | } | |||
1277 | case Instruction::Load: { | |||
1278 | // Check that a vectorized load would load the same memory as a scalar | |||
1279 | // load. | |||
1280 | // For example, we don't want to vectorize loads that are smaller than | |||
1281 | // 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM | |||
1282 | // treats loading/storing it as an i8 struct. If we vectorize | |||
1283 | // loads/stores from such a struct, we read/write packed bits that | |||
1284 | // disagree with the unvectorized version. | |||
1285 | Type *ScalarTy = VL[0]->getType(); | |||
1286 | ||||
1287 | if (DL->getTypeSizeInBits(ScalarTy) != | |||
1288 | DL->getTypeAllocSizeInBits(ScalarTy)) { | |||
1289 | BS.cancelScheduling(VL); | |||
1290 | newTreeEntry(VL, false, UserTreeIdx); | |||
1291 | DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); | |||
1292 | return; | |||
1293 | } | |||
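| // Example: i2 has a size of 2 bits but an alloc size of 8 bits, so four | |||
| // scalar i2 loads touch four separate bytes while a <4 x i2> load would | |||
| // read one packed byte; since the sizes disagree, we gather instead. | |||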
1294 | ||||
1295 | // Make sure all loads in the bundle are simple - we can't vectorize | |||
1296 | // atomic or volatile loads. | |||
1297 | for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { | |||
1298 | LoadInst *L = cast<LoadInst>(VL[i]); | |||
1299 | if (!L->isSimple()) { | |||
1300 | BS.cancelScheduling(VL); | |||
1301 | newTreeEntry(VL, false, UserTreeIdx); | |||
1302 | DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); | |||
1303 | return; | |||
1304 | } | |||
1305 | } | |||
1306 | ||||
1307 | // Check if the loads are consecutive, reversed, or neither. | |||
1308 | // TODO: What we really want is to sort the loads, but for now, check | |||
1309 | // the two likely directions. | |||
1310 | bool Consecutive = true; | |||
1311 | bool ReverseConsecutive = true; | |||
1312 | for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { | |||
1313 | if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { | |||
1314 | Consecutive = false; | |||
1315 | break; | |||
1316 | } else { | |||
1317 | ReverseConsecutive = false; | |||
1318 | } | |||
1319 | } | |||
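| // Example: VL = { a[3], a[2], a[1], a[0] } fails the first forward check, | |||
| // so Consecutive becomes false while ReverseConsecutive stays true; the | |||
| // reverse-order scan below then confirms the reversed pattern. | |||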
1320 | ||||
1321 | if (Consecutive) { | |||
1322 | ++NumLoadsWantToKeepOrder; | |||
1323 | newTreeEntry(VL, true, UserTreeIdx); | |||
1324 | DEBUG(dbgs() << "SLP: added a vector of loads.\n"); | |||
1325 | return; | |||
1326 | } | |||
1327 | ||||
1328 | // If none of the load pairs were consecutive when checked in order, | |||
1329 | // check the reverse order. | |||
1330 | if (ReverseConsecutive) | |||
1331 | for (unsigned i = VL.size() - 1; i > 0; --i) | |||
1332 | if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) { | |||
1333 | ReverseConsecutive = false; | |||
1334 | break; | |||
1335 | } | |||
1336 | ||||
1337 | BS.cancelScheduling(VL); | |||
1338 | newTreeEntry(VL, false, UserTreeIdx); | |||
1339 | ||||
1340 | if (ReverseConsecutive) { | |||
1341 | ++NumLoadsWantToChangeOrder; | |||
1342 | DEBUG(dbgs() << "SLP: Gathering reversed loads.\n"); | |||
1343 | } else { | |||
1344 | DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); | |||
1345 | } | |||
1346 | return; | |||
1347 | } | |||
1348 | case Instruction::ZExt: | |||
1349 | case Instruction::SExt: | |||
1350 | case Instruction::FPToUI: | |||
1351 | case Instruction::FPToSI: | |||
1352 | case Instruction::FPExt: | |||
1353 | case Instruction::PtrToInt: | |||
1354 | case Instruction::IntToPtr: | |||
1355 | case Instruction::SIToFP: | |||
1356 | case Instruction::UIToFP: | |||
1357 | case Instruction::Trunc: | |||
1358 | case Instruction::FPTrunc: | |||
1359 | case Instruction::BitCast: { | |||
1360 | Type *SrcTy = VL0->getOperand(0)->getType(); | |||
1361 | for (unsigned i = 0; i < VL.size(); ++i) { | |||
1362 | Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); | |||
1363 | if (Ty != SrcTy || !isValidElementType(Ty)) { | |||
1364 | BS.cancelScheduling(VL); | |||
1365 | newTreeEntry(VL, false, UserTreeIdx); | |||
1366 | DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); | |||
1367 | return; | |||
1368 | } | |||
1369 | } | |||
1370 | newTreeEntry(VL, true, UserTreeIdx); | |||
1371 | DEBUG(dbgs() << "SLP: added a vector of casts.\n"); | |||
1372 | ||||
1373 | for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { | |||
1374 | ValueList Operands; | |||
1375 | // Prepare the operand vector. | |||
1376 | for (Value *j : VL) | |||
1377 | Operands.push_back(cast<Instruction>(j)->getOperand(i)); | |||
1378 | ||||
1379 | buildTree_rec(Operands, Depth + 1, UserTreeIdx); | |||
1380 | } | |||
1381 | return; | |||
1382 | } | |||
1383 | case Instruction::ICmp: | |||
1384 | case Instruction::FCmp: { | |||
1385 | // Check that all of the compares have the same predicate. | |||
1386 | CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); | |||
1387 | Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType(); | |||
1388 | for (unsigned i = 1, e = VL.size(); i < e; ++i) { | |||
1389 | CmpInst *Cmp = cast<CmpInst>(VL[i]); | |||
1390 | if (Cmp->getPredicate() != P0 || | |||
1391 | Cmp->getOperand(0)->getType() != ComparedTy) { | |||
1392 | BS.cancelScheduling(VL); | |||
1393 | newTreeEntry(VL, false, UserTreeIdx); | |||
1394 | DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); | |||
1395 | return; | |||
1396 | } | |||
1397 | } | |||
1398 | ||||
1399 | newTreeEntry(VL, true, UserTreeIdx); | |||
1400 | DEBUG(dbgs() << "SLP: added a vector of compares.\n"); | |||
1401 | ||||
1402 | for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { | |||
1403 | ValueList Operands; | |||
1404 | // Prepare the operand vector. | |||
1405 | for (Value *j : VL) | |||
1406 | Operands.push_back(cast<Instruction>(j)->getOperand(i)); | |||
1407 | ||||
1408 | buildTree_rec(Operands, Depth + 1, UserTreeIdx); | |||
1409 | } | |||
1410 | return; | |||
1411 | } | |||
1412 | case Instruction::Select: | |||
1413 | case Instruction::Add: | |||
1414 | case Instruction::FAdd: | |||
1415 | case Instruction::Sub: | |||
1416 | case Instruction::FSub: | |||
1417 | case Instruction::Mul: | |||
1418 | case Instruction::FMul: | |||
1419 | case Instruction::UDiv: | |||
1420 | case Instruction::SDiv: | |||
1421 | case Instruction::FDiv: | |||
1422 | case Instruction::URem: | |||
1423 | case Instruction::SRem: | |||
1424 | case Instruction::FRem: | |||
1425 | case Instruction::Shl: | |||
1426 | case Instruction::LShr: | |||
1427 | case Instruction::AShr: | |||
1428 | case Instruction::And: | |||
1429 | case Instruction::Or: | |||
1430 | case Instruction::Xor: { | |||
1431 | newTreeEntry(VL, true, UserTreeIdx); | |||
1432 | DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); | |||
1433 | ||||
1434 | // Sort operands of the instructions so that each side is more likely to | |||
1435 | // have the same opcode. | |||
1436 | if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { | |||
1437 | ValueList Left, Right; | |||
1438 | reorderInputsAccordingToOpcode(VL, Left, Right); | |||
1439 | buildTree_rec(Left, Depth + 1, UserTreeIdx); | |||
1440 | buildTree_rec(Right, Depth + 1, UserTreeIdx); | |||
1441 | return; | |||
1442 | } | |||
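| // Example: for VL = { x+a[0], a[1]+x, x+a[2], a[3]+x }, reordering yields | |||
| // Left = { x, x, x, x } (a splat) and Right = { a[0], a[1], a[2], a[3] } | |||
| // (consecutive loads), so both operand bundles vectorize cheaply. | |||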
1443 | ||||
1444 | for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { | |||
1445 | ValueList Operands; | |||
1446 | // Prepare the operand vector. | |||
1447 | for (Value *j : VL) | |||
1448 | Operands.push_back(cast<Instruction>(j)->getOperand(i)); | |||
1449 | ||||
1450 | buildTree_rec(Operands, Depth + 1, UserTreeIdx); | |||
1451 | } | |||
1452 | return; | |||
1453 | } | |||
1454 | case Instruction::GetElementPtr: { | |||
1455 | // We don't combine GEPs with complicated (nested) indexing. | |||
1456 | for (unsigned j = 0; j < VL.size(); ++j) { | |||
1457 | if (cast<Instruction>(VL[j])->getNumOperands() != 2) { | |||
1458 | DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); | |||
1459 | BS.cancelScheduling(VL); | |||
1460 | newTreeEntry(VL, false, UserTreeIdx); | |||
1461 | return; | |||
1462 | } | |||
1463 | } | |||
1464 | ||||
1465 | // We can't combine several GEPs into one vector if they operate on | |||
1466 | // different types. | |||
1467 | Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType(); | |||
1468 | for (unsigned j = 0; j < VL.size(); ++j) { | |||
1469 | Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType(); | |||
1470 | if (Ty0 != CurTy) { | |||
1471 | DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); | |||
1472 | BS.cancelScheduling(VL); | |||
1473 | newTreeEntry(VL, false, UserTreeIdx); | |||
1474 | return; | |||
1475 | } | |||
1476 | } | |||
1477 | ||||
1478 | // We don't combine GEPs with non-constant indexes. | |||
1479 | for (unsigned j = 0; j < VL.size(); ++j) { | |||
1480 | auto Op = cast<Instruction>(VL[j])->getOperand(1); | |||
1481 | if (!isa<ConstantInt>(Op)) { | |||
1482 | DEBUG( | |||
1483 | dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); | |||
1484 | BS.cancelScheduling(VL); | |||
1485 | newTreeEntry(VL, false, UserTreeIdx); | |||
1486 | return; | |||
1487 | } | |||
1488 | } | |||
1489 | ||||
1490 | newTreeEntry(VL, true, UserTreeIdx); | |||
1491 | DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); | |||
1492 | for (unsigned i = 0, e = 2; i < e; ++i) { | |||
1493 | ValueList Operands; | |||
1494 | // Prepare the operand vector. | |||
1495 | for (Value *j : VL) | |||
1496 | Operands.push_back(cast<Instruction>(j)->getOperand(i)); | |||
1497 | ||||
1498 | buildTree_rec(Operands, Depth + 1, UserTreeIdx); | |||
1499 | } | |||
1500 | return; | |||
1501 | } | |||
1502 | case Instruction::Store: { | |||
1503 | // Check if the stores are consecutive or if we need to swizzle them. | |||
1504 | for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) | |||
1505 | if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { | |||
1506 | BS.cancelScheduling(VL); | |||
1507 | newTreeEntry(VL, false, UserTreeIdx); | |||
1508 | DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); | |||
1509 | return; | |||
1510 | } | |||
1511 | ||||
1512 | newTreeEntry(VL, true, UserTreeIdx); | |||
1513 | DEBUG(dbgs() << "SLP: added a vector of stores.\n"); | |||
1514 | ||||
1515 | ValueList Operands; | |||
1516 | for (Value *j : VL) | |||
1517 | Operands.push_back(cast<Instruction>(j)->getOperand(0)); | |||
1518 | ||||
1519 | buildTree_rec(Operands, Depth + 1, UserTreeIdx); | |||
1520 | return; | |||
1521 | } | |||
1522 | case Instruction::Call: { | |||
1523 | // Check if the calls are all to the same vectorizable intrinsic. | |||
1524 | CallInst *CI = cast<CallInst>(VL[0]); | |||
1525 | // Check if this is an Intrinsic call or something that can be | |||
1526 | // represented by an intrinsic call. | |||
1527 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); | |||
1528 | if (!isTriviallyVectorizable(ID)) { | |||
1529 | BS.cancelScheduling(VL); | |||
1530 | newTreeEntry(VL, false, UserTreeIdx); | |||
1531 | DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); | |||
1532 | return; | |||
1533 | } | |||
1534 | Function *Int = CI->getCalledFunction(); | |||
1535 | Value *A1I = nullptr; | |||
1536 | if (hasVectorInstrinsicScalarOpd(ID, 1)) | |||
1537 | A1I = CI->getArgOperand(1); | |||
1538 | for (unsigned i = 1, e = VL.size(); i != e; ++i) { | |||
1539 | CallInst *CI2 = dyn_cast<CallInst>(VL[i]); | |||
1540 | if (!CI2 || CI2->getCalledFunction() != Int || | |||
1541 | getVectorIntrinsicIDForCall(CI2, TLI) != ID || | |||
1542 | !CI->hasIdenticalOperandBundleSchema(*CI2)) { | |||
1543 | BS.cancelScheduling(VL); | |||
1544 | newTreeEntry(VL, false, UserTreeIdx); | |||
1545 | DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] | |||
1546 | << "\n"); | |||
1547 | return; | |||
1548 | } | |||
1549 | // ctlz, cttz, and powi are special intrinsics whose second argument | |||
1550 | // must be the same for them to be vectorized. | |||
1551 | if (hasVectorInstrinsicScalarOpd(ID, 1)) { | |||
1552 | Value *A1J = CI2->getArgOperand(1); | |||
1553 | if (A1I != A1J) { | |||
1554 | BS.cancelScheduling(VL); | |||
1555 | newTreeEntry(VL, false, UserTreeIdx); | |||
1556 | DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI | |||
1557 | << " argument " << A1I << "!=" << A1J | |||
1558 | << "\n"); | |||
1559 | return; | |||
1560 | } | |||
1561 | } | |||
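| // Example: { powi(a, 3), powi(b, 3) } can be vectorized because the scalar | |||
| // exponent matches, but { powi(a, 3), powi(b, 4) } cannot: the vector | |||
| // intrinsic still takes a single scalar second operand. | |||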
1562 | // Verify that the bundle operands are identical between the two calls. | |||
1563 | if (CI->hasOperandBundles() && | |||
1564 | !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(), | |||
1565 | CI->op_begin() + CI->getBundleOperandsEndIndex(), | |||
1566 | CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { | |||
1567 | BS.cancelScheduling(VL); | |||
1568 | newTreeEntry(VL, false, UserTreeIdx); | |||
1569 | DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" | |||
1570 | << *VL[i] << '\n'); | |||
1571 | return; | |||
1572 | } | |||
1573 | } | |||
1574 | ||||
1575 | newTreeEntry(VL, true, UserTreeIdx); | |||
1576 | for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { | |||
1577 | ValueList Operands; | |||
1578 | // Prepare the operand vector. | |||
1579 | for (Value *j : VL) { | |||
1580 | CallInst *CI2 = cast<CallInst>(j); // every lane was verified to be a call above | |||
1581 | Operands.push_back(CI2->getArgOperand(i)); | |||
1582 | } | |||
1583 | buildTree_rec(Operands, Depth + 1, UserTreeIdx); | |||
1584 | } | |||
1585 | return; | |||
1586 | } | |||
1587 | case Instruction::ShuffleVector: { | |||
1588 | // If this is not an alternating sequence of opcodes like add-sub, | |||
1589 | // then do not vectorize this instruction. | |||
1590 | if (!isAltShuffle) { | |||
1591 | BS.cancelScheduling(VL); | |||
1592 | newTreeEntry(VL, false, UserTreeIdx); | |||
1593 | DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); | |||
1594 | return; | |||
1595 | } | |||
1596 | newTreeEntry(VL, true, UserTreeIdx); | |||
1597 | DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); | |||
1598 | ||||
1599 | // Reorder operands if reordering would enable vectorization. | |||
1600 | if (isa<BinaryOperator>(VL0)) { | |||
1601 | ValueList Left, Right; | |||
1602 | reorderAltShuffleOperands(VL, Left, Right); | |||
1603 | buildTree_rec(Left, Depth + 1, UserTreeIdx); | |||
1604 | buildTree_rec(Right, Depth + 1, UserTreeIdx); | |||
1605 | return; | |||
1606 | } | |||
1607 | ||||
1608 | for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { | |||
1609 | ValueList Operands; | |||
1610 | // Prepare the operand vector. | |||
1611 | for (Value *j : VL) | |||
1612 | Operands.push_back(cast<Instruction>(j)->getOperand(i)); | |||
1613 | ||||
1614 | buildTree_rec(Operands, Depth + 1, UserTreeIdx); | |||
1615 | } | |||
1616 | return; | |||
1617 | } | |||
1618 | default: | |||
1619 | BS.cancelScheduling(VL); | |||
1620 | newTreeEntry(VL, false, UserTreeIdx); | |||
1621 | DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); | |||
1622 | return; | |||
1623 | } | |||
1624 | } | |||
1625 | ||||
1626 | unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { | |||
1627 | unsigned N; | |||
1628 | Type *EltTy; | |||
1629 | auto *ST = dyn_cast<StructType>(T); | |||
1630 | if (ST) { | |||
1631 | N = ST->getNumElements(); | |||
1632 | EltTy = *ST->element_begin(); | |||
1633 | } else { | |||
1634 | N = cast<ArrayType>(T)->getNumElements(); | |||
1635 | EltTy = cast<ArrayType>(T)->getElementType(); | |||
1636 | } | |||
1637 | if (!isValidElementType(EltTy)) | |||
1638 | return 0; | |||
1639 | uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N)); | |||
1640 | if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T)) | |||
1641 | return 0; | |||
1642 | if (ST) { | |||
1643 | // Check that struct is homogeneous. | |||
1644 | for (const auto *Ty : ST->elements()) | |||
1645 | if (Ty != EltTy) | |||
1646 | return 0; | |||
1647 | } | |||
1648 | return N; | |||
1649 | } | |||
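| // Example: { float, float, float, float } maps to <4 x float> and returns | |||
| // N = 4; a mixed struct such as { float, i32 }, or one whose store size | |||
| // differs from the vector's because of padding, returns 0. | |||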
1650 | ||||
1651 | bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const { | |||
1652 | assert(Opcode == Instruction::ExtractElement || | |||
1653 | Opcode == Instruction::ExtractValue); | |||
1654 | assert(Opcode == getSameOpcode(VL) && "Invalid opcode"); | |||
1655 | // Check if all of the extracts come from the same vector and from the | |||
1656 | // correct offset. | |||
1657 | Value *VL0 = VL[0]; | |||
1658 | Instruction *E0 = cast<Instruction>(VL0); | |||
1659 | Value *Vec = E0->getOperand(0); | |||
1660 | ||||
1661 | // We have to extract from a vector/aggregate with the same number of elements. | |||
1662 | unsigned NElts; | |||
1663 | if (Opcode == Instruction::ExtractValue) { | |||
1664 | const DataLayout &DL = E0->getModule()->getDataLayout(); | |||
1665 | NElts = canMapToVector(Vec->getType(), DL); | |||
1666 | if (!NElts) | |||
1667 | return false; | |||
1668 | // Check if load can be rewritten as load of vector. | |||
1669 | LoadInst *LI = dyn_cast<LoadInst>(Vec); | |||
1670 | if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size())) | |||
1671 | return false; | |||
1672 | } else { | |||
1673 | NElts = Vec->getType()->getVectorNumElements(); | |||
1674 | } | |||
1675 | ||||
1676 | if (NElts != VL.size()) | |||
1677 | return false; | |||
1678 | ||||
1679 | // Check that all of the indices extract from the correct offset. | |||
1680 | if (!matchExtractIndex(E0, 0, Opcode)) | |||
1681 | return false; | |||
1682 | ||||
1683 | for (unsigned i = 1, e = VL.size(); i < e; ++i) { | |||
1684 | Instruction *E = cast<Instruction>(VL[i]); | |||
1685 | if (!matchExtractIndex(E, i, Opcode)) | |||
1686 | return false; | |||
1687 | if (E->getOperand(0) != Vec) | |||
1688 | return false; | |||
1689 | } | |||
1690 | ||||
1691 | return true; | |||
1692 | } | |||
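| // Example of a reusable sequence: extractelement <4 x i32> %v at indices | |||
| // 0, 1, 2, 3, all from the same %v and in lane order; the bundle can then | |||
| // forward %v directly instead of extracting and re-inserting each lane. | |||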
1693 | ||||
1694 | int BoUpSLP::getEntryCost(TreeEntry *E) { | |||
1695 | ArrayRef<Value*> VL = E->Scalars; | |||
1696 | ||||
1697 | Type *ScalarTy = VL[0]->getType(); | |||
1698 | if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) | |||
1699 | ScalarTy = SI->getValueOperand()->getType(); | |||
1700 | else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0])) | |||
1701 | ScalarTy = CI->getOperand(0)->getType(); | |||
1702 | VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); | |||
1703 | ||||
1704 | // If we have computed a smaller type for the expression, update VecTy so | |||
1705 | // that the costs will be accurate. | |||
1706 | if (MinBWs.count(VL[0])) | |||
1707 | VecTy = VectorType::get( | |||
1708 | IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); | |||
1709 | ||||
1710 | if (E->NeedToGather) { | |||
1711 | if (allConstant(VL)) | |||
1712 | return 0; | |||
1713 | if (isSplat(VL)) { | |||
1714 | return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); | |||
1715 | } | |||
1716 | return getGatherCost(E->Scalars); | |||
1717 | } | |||
1718 | unsigned Opcode = getSameOpcode(VL); | |||
1719 | assert(Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); | |||
1720 | Instruction *VL0 = cast<Instruction>(VL[0]); | |||
1721 | switch (Opcode) { | |||
1722 | case Instruction::PHI: { | |||
1723 | return 0; | |||
1724 | } | |||
1725 | case Instruction::ExtractValue: | |||
1726 | case Instruction::ExtractElement: { | |||
1727 | if (canReuseExtract(VL, Opcode)) { | |||
1728 | int DeadCost = 0; | |||
1729 | for (unsigned i = 0, e = VL.size(); i < e; ++i) { | |||
1730 | Instruction *E = cast<Instruction>(VL[i]); | |||
1731 | // If all users are going to be vectorized, the instruction can be | |||
1732 | // considered dead. The same holds if it has only one user, since | |||
1733 | // that use will be vectorized for sure. | |||
1734 | if (E->hasOneUse() || | |||
1735 | std::all_of(E->user_begin(), E->user_end(), [this](User *U) { | |||
1736 | return ScalarToTreeEntry.count(U) > 0; | |||
1737 | })) | |||
1738 | // Take credit for instruction that will become dead. | |||
1739 | DeadCost += | |||
1740 | TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); | |||
1741 | } | |||
1742 | return -DeadCost; | |||
1743 | } | |||
1744 | return getGatherCost(VecTy); | |||
1745 | } | |||
1746 | case Instruction::ZExt: | |||
1747 | case Instruction::SExt: | |||
1748 | case Instruction::FPToUI: | |||
1749 | case Instruction::FPToSI: | |||
1750 | case Instruction::FPExt: | |||
1751 | case Instruction::PtrToInt: | |||
1752 | case Instruction::IntToPtr: | |||
1753 | case Instruction::SIToFP: | |||
1754 | case Instruction::UIToFP: | |||
1755 | case Instruction::Trunc: | |||
1756 | case Instruction::FPTrunc: | |||
1757 | case Instruction::BitCast: { | |||
1758 | Type *SrcTy = VL0->getOperand(0)->getType(); | |||
1759 | ||||
1760 | // Calculate the cost of this instruction. | |||
1761 | int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), | |||
1762 | VL0->getType(), SrcTy, VL0); | |||
1763 | ||||
1764 | VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); | |||
1765 | int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0); | |||
1766 | return VecCost - ScalarCost; | |||
1767 | } | |||
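| // Example: for four sext i8 -> i32 instructions, this returns | |||
| // cost(sext <4 x i8> to <4 x i32>) - 4 * cost(sext i8 to i32); a negative | |||
| // value means the vector form is the cheaper one. | |||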
1768 | case Instruction::FCmp: | |||
1769 | case Instruction::ICmp: | |||
1770 | case Instruction::Select: { | |||
1771 | // Calculate the cost of this instruction. | |||
1772 | VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); | |||
1773 | int ScalarCost = VecTy->getNumElements() * | |||
1774 | TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty(), VL0); | |||
1775 | int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy, VL0); | |||
1776 | return VecCost - ScalarCost; | |||
1777 | } | |||
1778 | case Instruction::Add: | |||
1779 | case Instruction::FAdd: | |||
1780 | case Instruction::Sub: | |||
1781 | case Instruction::FSub: | |||
1782 | case Instruction::Mul: | |||
1783 | case Instruction::FMul: | |||
1784 | case Instruction::UDiv: | |||
1785 | case Instruction::SDiv: | |||
1786 | case Instruction::FDiv: | |||
1787 | case Instruction::URem: | |||
1788 | case Instruction::SRem: | |||
1789 | case Instruction::FRem: | |||
1790 | case Instruction::Shl: | |||
1791 | case Instruction::LShr: | |||
1792 | case Instruction::AShr: | |||
1793 | case Instruction::And: | |||
1794 | case Instruction::Or: | |||
1795 | case Instruction::Xor: { | |||
1796 | // Certain instructions can be cheaper to vectorize if they have a | |||
1797 | // constant second vector operand. | |||
1798 | TargetTransformInfo::OperandValueKind Op1VK = | |||
1799 | TargetTransformInfo::OK_AnyValue; | |||
1800 | TargetTransformInfo::OperandValueKind Op2VK = | |||
1801 | TargetTransformInfo::OK_UniformConstantValue; | |||
1802 | TargetTransformInfo::OperandValueProperties Op1VP = | |||
1803 | TargetTransformInfo::OP_None; | |||
1804 | TargetTransformInfo::OperandValueProperties Op2VP = | |||
1805 | TargetTransformInfo::OP_None; | |||
1806 | ||||
1807 | // If all operands are exactly the same ConstantInt then set the | |||
1808 | // operand kind to OK_UniformConstantValue. | |||
1809 | // If instead not all operands are constants, then set the operand kind | |||
1810 | // to OK_AnyValue. If all operands are constants but not the same, | |||
1811 | // then set the operand kind to OK_NonUniformConstantValue. | |||
1812 | ConstantInt *CInt = nullptr; | |||
1813 | for (unsigned i = 0; i < VL.size(); ++i) { | |||
1814 | const Instruction *I = cast<Instruction>(VL[i]); | |||
1815 | if (!isa<ConstantInt>(I->getOperand(1))) { | |||
1816 | Op2VK = TargetTransformInfo::OK_AnyValue; | |||
1817 | break; | |||
1818 | } | |||
1819 | if (i == 0) { | |||
1820 | CInt = cast<ConstantInt>(I->getOperand(1)); | |||
1821 | continue; | |||
1822 | } | |||
1823 | if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && | |||
1824 | CInt != cast<ConstantInt>(I->getOperand(1))) | |||
1825 | Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; | |||
1826 | } | |||
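| // Example: { x << 2, y << 2 } classifies as OK_UniformConstantValue, | |||
| // { x << 1, y << 2 } as OK_NonUniformConstantValue, and { x << n, y << 2 } | |||
| // as OK_AnyValue. | |||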
1827 | // FIXME: Currently cost of model modification for division by power of | |||
1828 | // 2 is handled for X86 and AArch64. Add support for other targets. | |||
1829 | if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt && | |||
1830 | CInt->getValue().isPowerOf2()) | |||
1831 | Op2VP = TargetTransformInfo::OP_PowerOf2; | |||
1832 | ||||
1833 | SmallVector<const Value *, 4> Operands(VL0->operand_values()); | |||
1834 | int ScalarCost = | |||
1835 | VecTy->getNumElements() * | |||
1836 | TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, | |||
1837 | Op2VP, Operands); | |||
1838 | int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, | |||
1839 | Op1VP, Op2VP, Operands); | |||
1840 | return VecCost - ScalarCost; | |||
1841 | } | |||
1842 | case Instruction::GetElementPtr: { | |||
1843 | TargetTransformInfo::OperandValueKind Op1VK = | |||
1844 | TargetTransformInfo::OK_AnyValue; | |||
1845 | TargetTransformInfo::OperandValueKind Op2VK = | |||
1846 | TargetTransformInfo::OK_UniformConstantValue; | |||
1847 | ||||
1848 | int ScalarCost = | |||
1849 | VecTy->getNumElements() * | |||
1850 | TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); | |||
1851 | int VecCost = | |||
1852 | TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK); | |||
1853 | ||||
1854 | return VecCost - ScalarCost; | |||
1855 | } | |||
1856 | case Instruction::Load: { | |||
1857 | // Cost of wide load - cost of scalar loads. | |||
1858 | unsigned alignment = cast<LoadInst>(VL0)->getAlignment(); // VL0 is known to be a load | |||
1859 | int ScalarLdCost = VecTy->getNumElements() * | |||
1860 | TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); | |||
1861 | int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, | |||
1862 | VecTy, alignment, 0, VL0); | |||
1863 | return VecLdCost - ScalarLdCost; | |||
1864 | } | |||
1865 | case Instruction::Store: { | |||
1866 | // We know that we can merge the stores. Calculate the cost. | |||
1867 | unsigned alignment = cast<StoreInst>(VL0)->getAlignment(); // VL0 is known to be a store | |||
1868 | int ScalarStCost = VecTy->getNumElements() * | |||
1869 | TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); | |||
1870 | int VecStCost = TTI->getMemoryOpCost(Instruction::Store, | |||
1871 | VecTy, alignment, 0, VL0); | |||
1872 | return VecStCost - ScalarStCost; | |||
1873 | } | |||
1874 | case Instruction::Call: { | |||
1875 | CallInst *CI = cast<CallInst>(VL0); | |||
1876 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); | |||
1877 | ||||
1878 | // Calculate the cost of the scalar and vector calls. | |||
1879 | SmallVector<Type*, 4> ScalarTys; | |||
1880 | for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) | |||
1881 | ScalarTys.push_back(CI->getArgOperand(op)->getType()); | |||
1882 | ||||
1883 | FastMathFlags FMF; | |||
1884 | if (auto *FPMO = dyn_cast<FPMathOperator>(CI)) | |||
1885 | FMF = FPMO->getFastMathFlags(); | |||
1886 | ||||
1887 | int ScalarCallCost = VecTy->getNumElements() * | |||
1888 | TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); | |||
1889 | ||||
1890 | SmallVector<Value *, 4> Args(CI->arg_operands()); | |||
1891 | int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF, | |||
1892 | VecTy->getNumElements()); | |||
1893 | ||||
1894 | DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost | |||
1895 | << " (" << VecCallCost << "-" << ScalarCallCost << ")" | |||
1896 | << " for " << *CI << "\n"); | |||
1897 | ||||
1898 | return VecCallCost - ScalarCallCost; | |||
1899 | } | |||
1900 | case Instruction::ShuffleVector: { | |||
1901 | TargetTransformInfo::OperandValueKind Op1VK = | |||
1902 | TargetTransformInfo::OK_AnyValue; | |||
1903 | TargetTransformInfo::OperandValueKind Op2VK = | |||
1904 | TargetTransformInfo::OK_AnyValue; | |||
1905 | int ScalarCost = 0; | |||
1906 | int VecCost = 0; | |||
1907 | for (Value *i : VL) { | |||
1908 | Instruction *I = dyn_cast<Instruction>(i); | |||
1909 | if (!I) // with cast<> this check could never fire | |||
1910 | break; | |||
1911 | ScalarCost += | |||
1912 | TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK); | |||
1913 | } | |||
1914 | // VecCost is equal to sum of the cost of creating 2 vectors | |||
1915 | // and the cost of creating shuffle. | |||
1916 | Instruction *I0 = cast<Instruction>(VL[0]); | |||
1917 | VecCost = | |||
1918 | TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK); | |||
1919 | Instruction *I1 = cast<Instruction>(VL[1]); | |||
1920 | VecCost += | |||
1921 | TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK); | |||
1922 | VecCost += | |||
1923 | TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0); | |||
1924 | return VecCost - ScalarCost; | |||
1925 | } | |||
1926 | default: | |||
1927 | llvm_unreachable("Unknown instruction"); | |||
1928 | } | |||
1929 | } | |||
1930 | ||||
1931 | bool BoUpSLP::isFullyVectorizableTinyTree() { | |||
1932 | DEBUG(dbgs() << "SLP: Check whether the tree with height " << | |||
1933 | VectorizableTree.size() << " is fully vectorizable .\n"); | |||
1934 | ||||
1935 | // We only handle trees of heights 1 and 2. | |||
1936 | if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather) | |||
1937 | return true; | |||
1938 | ||||
1939 | if (VectorizableTree.size() != 2) | |||
1940 | return false; | |||
1941 | ||||
1942 | // Handle splat and all-constants stores. | |||
1943 | if (!VectorizableTree[0].NeedToGather && | |||
1944 | (allConstant(VectorizableTree[1].Scalars) || | |||
1945 | isSplat(VectorizableTree[1].Scalars))) | |||
1946 | return true; | |||
1947 | ||||
1948 | // Gathering cost would be too much for tiny trees. | |||
1949 | if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) | |||
1950 | return false; | |||
1951 | ||||
1952 | return true; | |||
1953 | } | |||
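| // Example: the four stores a[0..3] = x form a two-node tree (a vector | |||
| // store whose operand is a gathered splat) and are accepted; a two-node | |||
| // tree whose non-splat, non-constant operand must also be gathered is | |||
| // rejected as too expensive. | |||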
1954 | ||||
1955 | bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() { | |||
1956 | ||||
1957 | // We can vectorize the tree if its size is greater than or equal to the | |||
1958 | // minimum size specified by the MinTreeSize command line option. | |||
1959 | if (VectorizableTree.size() >= MinTreeSize) | |||
1960 | return false; | |||
1961 | ||||
1962 | // If we have a tiny tree (a tree whose size is less than MinTreeSize), we | |||
1963 | // can vectorize it if we can prove it fully vectorizable. | |||
1964 | if (isFullyVectorizableTinyTree()) | |||
1965 | return false; | |||
1966 | ||||
1967 | assert(VectorizableTree.empty() | |||
1968 | ? ExternalUses.empty() | |||
1969 | : true && "We shouldn't have any external users"); | |||
1970 | ||||
1971 | // Otherwise, we can't vectorize the tree. It is both tiny and not fully | |||
1972 | // vectorizable. | |||
1973 | return true; | |||
1974 | } | |||
1975 | ||||
1976 | int BoUpSLP::getSpillCost() { | |||
1977 | // Walk from the bottom of the tree to the top, tracking which values are | |||
1978 | // live. When we see a call instruction that is not part of our tree, | |||
1979 | // query TTI to see if there is a cost to keeping values live over it | |||
1980 | // (for example, if spills and fills are required). | |||
1981 | unsigned BundleWidth = VectorizableTree.front().Scalars.size(); | |||
1982 | int Cost = 0; | |||
1983 | ||||
1984 | SmallPtrSet<Instruction*, 4> LiveValues; | |||
1985 | Instruction *PrevInst = nullptr; | |||
1986 | ||||
1987 | for (const auto &N : VectorizableTree) { | |||
1988 | Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]); | |||
1989 | if (!Inst) | |||
1990 | continue; | |||
1991 | ||||
1992 | if (!PrevInst) { | |||
1993 | PrevInst = Inst; | |||
1994 | continue; | |||
1995 | } | |||
1996 | ||||
1997 | // Update LiveValues. | |||
1998 | LiveValues.erase(PrevInst); | |||
1999 | for (auto &J : PrevInst->operands()) { | |||
2000 | if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J)) | |||
2001 | LiveValues.insert(cast<Instruction>(&*J)); | |||
2002 | } | |||
2003 | ||||
2004 | DEBUG( | |||
2005 | dbgs() << "SLP: #LV: " << LiveValues.size(); | |||
2006 | for (auto *X : LiveValues) | |||
2007 | dbgs() << " " << X->getName(); | |||
2008 | dbgs() << ", Looking at "; | |||
2009 | Inst->dump(); | |||
2010 | ); | |||
2011 | ||||
2012 | // Now find the sequence of instructions between PrevInst and Inst. | |||
2013 | BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(), | |||
2014 | PrevInstIt = | |||
2015 | PrevInst->getIterator().getReverse(); | |||
2016 | while (InstIt != PrevInstIt) { | |||
2017 | if (PrevInstIt == PrevInst->getParent()->rend()) { | |||
2018 | PrevInstIt = Inst->getParent()->rbegin(); | |||
2019 | continue; | |||
2020 | } | |||
2021 | ||||
2022 | if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) { | |||
2023 | SmallVector<Type*, 4> V; | |||
2024 | for (auto *II : LiveValues) | |||
2025 | V.push_back(VectorType::get(II->getType(), BundleWidth)); | |||
2026 | Cost += TTI->getCostOfKeepingLiveOverCall(V); | |||
2027 | } | |||
2028 | ||||
2029 | ++PrevInstIt; | |||
2030 | } | |||
2031 | ||||
2032 | PrevInst = Inst; | |||
2033 | } | |||
2034 | ||||
2035 | return Cost; | |||
2036 | } | |||
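| // Example: if bundle values %a and %b are live across an unrelated call, | |||
| // the target may have to spill their vector registers around that call; | |||
| // getCostOfKeepingLiveOverCall() reports that penalty here. | |||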
2037 | ||||
2038 | int BoUpSLP::getTreeCost() { | |||
2039 | int Cost = 0; | |||
2040 | DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << | |||
2041 | VectorizableTree.size() << ".\n"); | |||
2042 | ||||
2043 | unsigned BundleWidth = VectorizableTree[0].Scalars.size(); | |||
2044 | ||||
2045 | for (TreeEntry &TE : VectorizableTree) { | |||
2046 | int C = getEntryCost(&TE); | |||
2047 | DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " | |||
2048 | << *TE.Scalars[0] << ".\n"); | |||
2049 | Cost += C; | |||
2050 | } | |||
2051 | ||||
2052 | SmallSet<Value *, 16> ExtractCostCalculated; | |||
2053 | int ExtractCost = 0; | |||
2054 | for (ExternalUser &EU : ExternalUses) { | |||
2055 | // We only add extract cost once for the same scalar. | |||
2056 | if (!ExtractCostCalculated.insert(EU.Scalar).second) | |||
2057 | continue; | |||
2058 | ||||
2059 | // Uses by ephemeral values are free (because the ephemeral value will be | |||
2060 | // removed prior to code generation, and so the extraction will be | |||
2061 | // removed as well). | |||
2062 | if (EphValues.count(EU.User)) | |||
2063 | continue; | |||
2064 | ||||
2065 | // If we plan to rewrite the tree in a smaller type, we will need to sign | |||
2066 | // extend the extracted value back to the original type. Here, we account | |||
2067 | // for the extract and the added cost of the sign extend if needed. | |||
2068 | auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); | |||
2069 | auto *ScalarRoot = VectorizableTree[0].Scalars[0]; | |||
2070 | if (MinBWs.count(ScalarRoot)) { | |||
2071 | auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); | |||
2072 | auto Extend = | |||
2073 | MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt; | |||
2074 | VecTy = VectorType::get(MinTy, BundleWidth); | |||
2075 | ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(), | |||
2076 | VecTy, EU.Lane); | |||
2077 | } else { | |||
2078 | ExtractCost += | |||
2079 | TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); | |||
2080 | } | |||
2081 | } | |||
2082 | ||||
2083 | int SpillCost = getSpillCost(); | |||
2084 | Cost += SpillCost + ExtractCost; | |||
2085 | ||||
2086 | std::string Str; | |||
2087 | { | |||
2088 | raw_string_ostream OS(Str); | |||
2089 | OS << "SLP: Spill Cost = " << SpillCost << ".\n" | |||
2090 | << "SLP: Extract Cost = " << ExtractCost << ".\n" | |||
2091 | << "SLP: Total Cost = " << Cost << ".\n"; | |||
2092 | } | |||
2093 | DEBUG(dbgs() << Str); | |||
2094 | ||||
2095 | if (ViewSLPTree) | |||
2096 | ViewGraph(this, "SLP" + F->getName(), false, Str); | |||
2097 | ||||
2098 | return Cost; | |||
2099 | } | |||
2100 | ||||
2101 | int BoUpSLP::getGatherCost(Type *Ty) { | |||
2102 | int Cost = 0; | |||
2103 | for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) | |||
2104 | Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); | |||
2105 | return Cost; | |||
2106 | } | |||
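| // Example: gathering four scalars into a <4 x float> is costed as four | |||
| // insertelement operations, one per lane. | |||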
2107 | ||||
2108 | int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) { | |||
2109 | // Find the type of the operands in VL. | |||
2110 | Type *ScalarTy = VL[0]->getType(); | |||
2111 | if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) | |||
2112 | ScalarTy = SI->getValueOperand()->getType(); | |||
2113 | VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); | |||
2114 | // Find the cost of inserting/extracting values from the vector. | |||
2115 | return getGatherCost(VecTy); | |||
2116 | } | |||
2117 | ||||
2118 | // Reorder commutative operations in alternate shuffle if the resulting vectors | |||
2119 | // are consecutive loads. This would allow us to vectorize the tree. | |||
2120 | // If we have something like: | |||
2121 | //   load a[0] - load b[0] | |||
2122 | //   load b[1] + load a[1] | |||
2123 | //   load a[2] - load b[2] | |||
2124 | //   load a[3] + load b[3] | |||
2125 | // then reordering the operands of the second line (to load a[1] + load b[1]) | |||
2126 | // would allow us to vectorize this code. | |||
2127 | void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL, | |||
2128 | SmallVectorImpl<Value *> &Left, | |||
2129 | SmallVectorImpl<Value *> &Right) { | |||
2130 | // Push left and right operands of binary operation into Left and Right | |||
2131 | for (Value *i : VL) { | |||
2132 | Left.push_back(cast<Instruction>(i)->getOperand(0)); | |||
2133 | Right.push_back(cast<Instruction>(i)->getOperand(1)); | |||
2134 | } | |||
2135 | ||||
2136 | // Reorder if we have a commutative operation and consecutive access | |||
2137 | // are on either side of the alternate instructions. | |||
2138 | for (unsigned j = 0; j < VL.size() - 1; ++j) { | |||
2139 | if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { | |||
2140 | if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { | |||
2141 | Instruction *VL1 = cast<Instruction>(VL[j]); | |||
2142 | Instruction *VL2 = cast<Instruction>(VL[j + 1]); | |||
2143 | if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { | |||
2144 | std::swap(Left[j], Right[j]); | |||
2145 | continue; | |||
2146 | } else if (VL2->isCommutative() && | |||
2147 | isConsecutiveAccess(L, L1, *DL, *SE)) { | |||
2148 | std::swap(Left[j + 1], Right[j + 1]); | |||
2149 | continue; | |||
2150 | } | |||
2151 | // else unchanged | |||
2152 | } | |||
2153 | } | |||
2154 | if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) { | |||
2155 | if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) { | |||
2156 | Instruction *VL1 = cast<Instruction>(VL[j]); | |||
2157 | Instruction *VL2 = cast<Instruction>(VL[j + 1]); | |||
2158 | if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { | |||
2159 | std::swap(Left[j], Right[j]); | |||
2160 | continue; | |||
2161 | } else if (VL2->isCommutative() && | |||
2162 | isConsecutiveAccess(L, L1, *DL, *SE)) { | |||
2163 | std::swap(Left[j + 1], Right[j + 1]); | |||
2164 | continue; | |||
2165 | } | |||
2166 | // else unchanged | |||
2167 | } | |||
2168 | } | |||
2169 | } | |||
2170 | } | |||
2171 | ||||
2172 | // Return true if I should be commuted before adding its left and right | |||
2173 | // operands to the arrays Left and Right. | |||
2174 | // | |||
2175 | // The vectorizer is trying to either have all elements on one side be | |||
2176 | // instructions with the same opcode to enable further vectorization, or to | |||
2177 | // have a splat to lower the vectorization cost. | |||
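| // Illustrative sketch (hypothetical values): for VL = {x + y, x + z} and | |||
| // i == 1, VLeft == x matches Left[0] == x, so SplatLeft is preserved and we | |||
| // return false; for VL = {x + y, z + x}, VRight == x matches Left[0], so we | |||
| // return true and x is commuted back onto the splat (left) side. | |||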
2178 | static bool shouldReorderOperands(int i, Instruction &I, | |||
2179 | SmallVectorImpl<Value *> &Left, | |||
2180 | SmallVectorImpl<Value *> &Right, | |||
2181 | bool AllSameOpcodeLeft, | |||
2182 | bool AllSameOpcodeRight, bool SplatLeft, | |||
2183 | bool SplatRight) { | |||
2184 | Value *VLeft = I.getOperand(0); | |||
2185 | Value *VRight = I.getOperand(1); | |||
2186 | // If we have "SplatRight", try to see if commuting is needed to preserve it. | |||
2187 | if (SplatRight) { | |||
2188 | if (VRight == Right[i - 1]) | |||
2189 | // Preserve SplatRight | |||
2190 | return false; | |||
2191 | if (VLeft == Right[i - 1]) { | |||
2192 | // Commuting would preserve SplatRight, but we don't want to break | |||
2193 | // SplatLeft either, i.e. preserve the original order if possible. | |||
2194 | // (FIXME: why do we care?) | |||
2195 | if (SplatLeft && VLeft == Left[i - 1]) | |||
2196 | return false; | |||
2197 | return true; | |||
2198 | } | |||
2199 | } | |||
2200 | // Symmetrically handle the left side (SplatLeft). | |||
2201 | if (SplatLeft) { | |||
2202 | if (VLeft == Left[i - 1]) | |||
2203 | // Preserve SplatLeft | |||
2204 | return false; | |||
2205 | if (VRight == Left[i - 1]) | |||
2206 | return true; | |||
2207 | } | |||
2208 | ||||
2209 | Instruction *ILeft = dyn_cast<Instruction>(VLeft); | |||
2210 | Instruction *IRight = dyn_cast<Instruction>(VRight); | |||
2211 | ||||
2212 | // If we have "AllSameOpcodeRight", try to see if the left operands preserves | |||
2213 | // it and not the right, in this case we want to commute. | |||
2214 | if (AllSameOpcodeRight) { | |||
2215 | unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode(); | |||
2216 | if (IRight && RightPrevOpcode == IRight->getOpcode()) | |||
2217 | // Do not commute, a match on the right preserves AllSameOpcodeRight | |||
2218 | return false; | |||
2219 | if (ILeft && RightPrevOpcode == ILeft->getOpcode()) { | |||
2220 | // We have a match and may want to commute, but first check if there is | |||
2221 | // not also a match on the existing operands on the Left to preserve | |||
2222 | // AllSameOpcodeLeft, i.e. preserve the original order if possible. | |||
2223 | // (FIXME: why do we care?) | |||
2224 | if (AllSameOpcodeLeft && ILeft && | |||
2225 | cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode()) | |||
2226 | return false; | |||
2227 | return true; | |||
2228 | } | |||
2229 | } | |||
2230 | // Symmetrically handle the left side (AllSameOpcodeLeft). | |||
2231 | if (AllSameOpcodeLeft) { | |||
2232 | unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode(); | |||
2233 | if (ILeft && LeftPrevOpcode == ILeft->getOpcode()) | |||
2234 | return false; | |||
2235 | if (IRight && LeftPrevOpcode == IRight->getOpcode()) | |||
2236 | return true; | |||
2237 | } | |||
2238 | return false; | |||
2239 | } | |||
2240 | ||||
2241 | void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, | |||
2242 | SmallVectorImpl<Value *> &Left, | |||
2243 | SmallVectorImpl<Value *> &Right) { | |||
2244 | ||||
2245 | if (VL.size()) { | |||
2246 | // Peel the first iteration out of the loop since there's nothing | |||
2247 | // interesting to do anyway and it simplifies the checks in the loop. | |||
2248 | auto VLeft = cast<Instruction>(VL[0])->getOperand(0); | |||
2249 | auto VRight = cast<Instruction>(VL[0])->getOperand(1); | |||
2250 | if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft)) | |||
2251 | // Favor having an instruction on the right. FIXME: why? | |||
2252 | std::swap(VLeft, VRight); | |||
2253 | Left.push_back(VLeft); | |||
2254 | Right.push_back(VRight); | |||
2255 | } | |||
2256 | ||||
2257 | // Keep track if we have instructions with all the same opcode on one side. | |||
2258 | bool AllSameOpcodeLeft = isa<Instruction>(Left[0]); | |||
2259 | bool AllSameOpcodeRight = isa<Instruction>(Right[0]); | |||
2260 | // Keep track if we have one side with all the same value (broadcast). | |||
2261 | bool SplatLeft = true; | |||
2262 | bool SplatRight = true; | |||
2263 | ||||
2264 | for (unsigned i = 1, e = VL.size(); i != e; ++i) { | |||
2265 | Instruction *I = cast<Instruction>(VL[i]); | |||
2266 | assert(I->isCommutative() && "Can only process commutative instruction"); | |||
2267 | // Commute to favor either a splat or maximizing having the same opcodes on | |||
2268 | // one side. | |||
2269 | if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft, | |||
2270 | AllSameOpcodeRight, SplatLeft, SplatRight)) { | |||
2271 | Left.push_back(I->getOperand(1)); | |||
2272 | Right.push_back(I->getOperand(0)); | |||
2273 | } else { | |||
2274 | Left.push_back(I->getOperand(0)); | |||
2275 | Right.push_back(I->getOperand(1)); | |||
2276 | } | |||
2277 | // Update Splat* and AllSameOpcode* after the insertion. | |||
2278 | SplatRight = SplatRight && (Right[i - 1] == Right[i]); | |||
2279 | SplatLeft = SplatLeft && (Left[i - 1] == Left[i]); | |||
2280 | AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) && | |||
2281 | (cast<Instruction>(Left[i - 1])->getOpcode() == | |||
2282 | cast<Instruction>(Left[i])->getOpcode()); | |||
2283 | AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) && | |||
2284 | (cast<Instruction>(Right[i - 1])->getOpcode() == | |||
2285 | cast<Instruction>(Right[i])->getOpcode()); | |||
2286 | } | |||
2287 | ||||
2288 | // If one operand ends up being a broadcast, return this operand order. | |||
2289 | if (SplatRight || SplatLeft) | |||
2290 | return; | |||
2291 | ||||
2292 | // Finally check if we can get a longer vectorizable chain by reordering | |||
2293 | // without breaking the good operand order detected above. | |||
2294 | // E.g., if we have something like: | |||
2295 | // load a[0] load b[0] | |||
2296 | // load b[1] load a[1] | |||
2297 | // load a[2] load b[2] | |||
2298 | // load a[3] load b[3] | |||
2299 | // Reordering the second pair (load b[1], load a[1]) would allow us to | |||
2300 | // vectorize this code while still retaining the AllSameOpcode property. | |||
2301 | // FIXME: This load reordering might break AllSameOpcode in some rare cases | |||
2302 | // such as: | |||
2303 | // add a[0],c[0] load b[0] | |||
2304 | // add a[1],c[2] load b[1] | |||
2305 | // b[2] load b[2] | |||
2306 | // add a[3],c[3] load b[3] | |||
2307 | for (unsigned j = 0; j < VL.size() - 1; ++j) { | |||
2308 | if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) { | |||
2309 | if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) { | |||
2310 | if (isConsecutiveAccess(L, L1, *DL, *SE)) { | |||
2311 | std::swap(Left[j + 1], Right[j + 1]); | |||
2312 | continue; | |||
2313 | } | |||
2314 | } | |||
2315 | } | |||
2316 | if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) { | |||
2317 | if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) { | |||
2318 | if (isConsecutiveAccess(L, L1, *DL, *SE)) { | |||
2319 | std::swap(Left[j + 1], Right[j + 1]); | |||
2320 | continue; | |||
2321 | } | |||
2322 | } | |||
2323 | } | |||
2324 | // else unchanged | |||
2325 | } | |||
2326 | } | |||
2327 | ||||
2328 | void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) { | |||
2329 | ||||
2330 | // Get the basic block this bundle is in. All instructions in the bundle | |||
2331 | // should be in this block. | |||
2332 | auto *Front = cast<Instruction>(VL.front()); | |||
2333 | auto *BB = Front->getParent(); | |||
2334 | assert(all_of(make_range(VL.begin(), VL.end()), [&](Value *V) -> bool { | |||
2335 | return cast<Instruction>(V)->getParent() == BB; | |||
2336 | })); | |||
2337 | ||||
2338 | // The last instruction in the bundle in program order. | |||
2339 | Instruction *LastInst = nullptr; | |||
2340 | ||||
2341 | // Find the last instruction. The common case should be that BB has been | |||
2342 | // scheduled, and the last instruction is VL.back(). So we start with | |||
2343 | // VL.back() and iterate over schedule data until we reach the end of the | |||
2344 | // bundle. The end of the bundle is marked by null ScheduleData. | |||
2345 | if (BlocksSchedules.count(BB)) { | |||
2346 | auto *Bundle = BlocksSchedules[BB]->getScheduleData(VL.back()); | |||
2347 | if (Bundle && Bundle->isPartOfBundle()) | |||
2348 | for (; Bundle; Bundle = Bundle->NextInBundle) | |||
2349 | LastInst = Bundle->Inst; | |||
2350 | } | |||
2351 | ||||
2352 | // LastInst can still be null at this point if there's either not an entry | |||
2353 | // for BB in BlocksSchedules or there's no ScheduleData available for | |||
2354 | // VL.back(). This can be the case if buildTree_rec aborts for various | |||
2355 | // reasons (e.g., the maximum recursion depth is reached, the maximum region | |||
2356 | // size is reached, etc.). ScheduleData is initialized in the scheduling | |||
2357 | // "dry-run". | |||
2358 | // | |||
2359 | // If this happens, we can still find the last instruction by brute force. We | |||
2360 | // iterate forwards from Front (inclusive) until we either see all | |||
2361 | // instructions in the bundle or reach the end of the block. If Front is the | |||
2362 | // last instruction in program order, LastInst will be set to Front, and we | |||
2363 | // will visit all the remaining instructions in the block. | |||
2364 | // | |||
2365 | // One of the reasons we exit early from buildTree_rec is to place an upper | |||
2366 | // bound on compile-time. Thus, taking an additional compile-time hit here is | |||
2367 | // not ideal. However, this should be exceedingly rare since it requires that | |||
2368 | // we both exit early from buildTree_rec and that the bundle be out-of-order | |||
2369 | // (causing us to iterate all the way to the end of the block). | |||
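| // A concrete sketch (hypothetical instructions): with program order | |||
| // I1, I2, I3 in BB and VL = {I1, I3, I2}, the scan below starts at | |||
| // Front == I1, erases each bundle member it passes, and stops once the set | |||
| // is empty, leaving LastInst == I3, the member that appears last in BB. | |||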
2370 | if (!LastInst) { | |||
2371 | SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end()); | |||
2372 | for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { | |||
2373 | if (Bundle.erase(&I)) | |||
2374 | LastInst = &I; | |||
2375 | if (Bundle.empty()) | |||
2376 | break; | |||
2377 | } | |||
2378 | } | |||
2379 | ||||
2380 | // Set the insertion point after the last instruction in the bundle. Set the | |||
2381 | // debug location to Front. | |||
| assert(LastInst && "Failed to find last instruction in bundle"); | |||
2382 | Builder.SetInsertPoint(BB, ++LastInst->getIterator()); | |||
2383 | Builder.SetCurrentDebugLocation(Front->getDebugLoc()); | |||
2384 | } | |||
2385 | ||||
2386 | Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { | |||
2387 | Value *Vec = UndefValue::get(Ty); | |||
2388 | // Generate the 'InsertElement' instruction. | |||
2389 | for (unsigned i = 0; i < Ty->getNumElements(); ++i) { | |||
2390 | Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); | |||
2391 | if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) { | |||
2392 | GatherSeq.insert(Insrt); | |||
2393 | CSEBlocks.insert(Insrt->getParent()); | |||
2394 | ||||
2395 | // Add to our 'need-to-extract' list. | |||
2396 | if (ScalarToTreeEntry.count(VL[i])) { | |||
2397 | int Idx = ScalarToTreeEntry[VL[i]]; | |||
2398 | TreeEntry *E = &VectorizableTree[Idx]; | |||
2399 | // Find which lane we need to extract. | |||
2400 | int FoundLane = -1; | |||
2401 | for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) { | |||
2402 | // Is this the lane of the scalar that we are looking for ? | |||
2403 | if (E->Scalars[Lane] == VL[i]) { | |||
2404 | FoundLane = Lane; | |||
2405 | break; | |||
2406 | } | |||
2407 | } | |||
2408 | assert(FoundLane >= 0 && "Could not find the correct lane"); | |||
2409 | ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane)); | |||
2410 | } | |||
2411 | } | |||
2412 | } | |||
2413 | ||||
2414 | return Vec; | |||
2415 | } | |||
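| // For reference, a two-element gather of {%a, %b} into <2 x i32> emits | |||
| // (illustrative SSA names): | |||
| //   %v0 = insertelement <2 x i32> undef, i32 %a, i32 0 | |||
| //   %v1 = insertelement <2 x i32> %v0, i32 %b, i32 1 | |||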
2416 | ||||
2417 | Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const { | |||
2418 | SmallDenseMap<Value*, int>::const_iterator Entry | |||
2419 | = ScalarToTreeEntry.find(VL[0]); | |||
2420 | if (Entry != ScalarToTreeEntry.end()) { | |||
2421 | int Idx = Entry->second; | |||
2422 | const TreeEntry *En = &VectorizableTree[Idx]; | |||
2423 | if (En->isSame(VL) && En->VectorizedValue) | |||
2424 | return En->VectorizedValue; | |||
2425 | } | |||
2426 | return nullptr; | |||
2427 | } | |||
2428 | ||||
2429 | Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { | |||
2430 | if (ScalarToTreeEntry.count(VL[0])) { | |||
2431 | int Idx = ScalarToTreeEntry[VL[0]]; | |||
2432 | TreeEntry *E = &VectorizableTree[Idx]; | |||
2433 | if (E->isSame(VL)) | |||
2434 | return vectorizeTree(E); | |||
2435 | } | |||
2436 | ||||
2437 | Type *ScalarTy = VL[0]->getType(); | |||
2438 | if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) | |||
2439 | ScalarTy = SI->getValueOperand()->getType(); | |||
2440 | VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); | |||
2441 | ||||
2442 | return Gather(VL, VecTy); | |||
2443 | } | |||
2444 | ||||
2445 | Value *BoUpSLP::vectorizeTree(TreeEntry *E) { | |||
2446 | IRBuilder<>::InsertPointGuard Guard(Builder); | |||
2447 | ||||
2448 | if (E->VectorizedValue) { | |||
| ||||
2449 | DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("SLP")) { dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"; } } while (false); | |||
2450 | return E->VectorizedValue; | |||
2451 | } | |||
2452 | ||||
2453 | Instruction *VL0 = cast<Instruction>(E->Scalars[0]); | |||
2454 | Type *ScalarTy = VL0->getType(); | |||
2455 | if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) | |||
2456 | ScalarTy = SI->getValueOperand()->getType(); | |||
2457 | VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); | |||
2458 | ||||
2459 | if (E->NeedToGather) { | |||
2460 | setInsertPointAfterBundle(E->Scalars); | |||
2461 | auto *V = Gather(E->Scalars, VecTy); | |||
2462 | E->VectorizedValue = V; | |||
2463 | return V; | |||
2464 | } | |||
2465 | ||||
2466 | unsigned Opcode = getSameOpcode(E->Scalars); | |||
2467 | ||||
2468 | switch (Opcode) { | |||
2469 | case Instruction::PHI: { | |||
2470 | PHINode *PH = cast<PHINode>(VL0); | |||
2471 | Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); | |||
2472 | Builder.SetCurrentDebugLocation(PH->getDebugLoc()); | |||
2473 | PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); | |||
2474 | E->VectorizedValue = NewPhi; | |||
2475 | ||||
2476 | // PHINodes may have multiple entries from the same block. We want to | |||
2477 | // visit every block once. | |||
2478 | SmallSet<BasicBlock*, 4> VisitedBBs; | |||
2479 | ||||
2480 | for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { | |||
2481 | ValueList Operands; | |||
2482 | BasicBlock *IBB = PH->getIncomingBlock(i); | |||
2483 | ||||
2484 | if (!VisitedBBs.insert(IBB).second) { | |||
2485 | NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB); | |||
2486 | continue; | |||
2487 | } | |||
2488 | ||||
2489 | // Prepare the operand vector. | |||
2490 | for (Value *V : E->Scalars) | |||
2491 | Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB)); | |||
2492 | ||||
2493 | Builder.SetInsertPoint(IBB->getTerminator()); | |||
2494 | Builder.SetCurrentDebugLocation(PH->getDebugLoc()); | |||
2495 | Value *Vec = vectorizeTree(Operands); | |||
2496 | NewPhi->addIncoming(Vec, IBB); | |||
2497 | } | |||
2498 | ||||
2499 | assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && | |||
2500 | "Invalid number of incoming values"); | |||
2501 | return NewPhi; | |||
2502 | } | |||
2503 | ||||
2504 | case Instruction::ExtractElement: { | |||
2505 | if (canReuseExtract(E->Scalars, Instruction::ExtractElement)) { | |||
2506 | Value *V = VL0->getOperand(0); | |||
2507 | E->VectorizedValue = V; | |||
2508 | return V; | |||
2509 | } | |||
2510 | setInsertPointAfterBundle(E->Scalars); | |||
2511 | auto *V = Gather(E->Scalars, VecTy); | |||
2512 | E->VectorizedValue = V; | |||
2513 | return V; | |||
2514 | } | |||
2515 | case Instruction::ExtractValue: { | |||
2516 | if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) { | |||
2517 | LoadInst *LI = cast<LoadInst>(VL0->getOperand(0)); | |||
2518 | Builder.SetInsertPoint(LI); | |||
2519 | PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); | |||
2520 | Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); | |||
2521 | LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment()); | |||
2522 | E->VectorizedValue = V; | |||
2523 | return propagateMetadata(V, E->Scalars); | |||
2524 | } | |||
2525 | setInsertPointAfterBundle(E->Scalars); | |||
2526 | auto *V = Gather(E->Scalars, VecTy); | |||
2527 | E->VectorizedValue = V; | |||
2528 | return V; | |||
2529 | } | |||
2530 | case Instruction::ZExt: | |||
2531 | case Instruction::SExt: | |||
2532 | case Instruction::FPToUI: | |||
2533 | case Instruction::FPToSI: | |||
2534 | case Instruction::FPExt: | |||
2535 | case Instruction::PtrToInt: | |||
2536 | case Instruction::IntToPtr: | |||
2537 | case Instruction::SIToFP: | |||
2538 | case Instruction::UIToFP: | |||
2539 | case Instruction::Trunc: | |||
2540 | case Instruction::FPTrunc: | |||
2541 | case Instruction::BitCast: { | |||
2542 | ValueList INVL; | |||
2543 | for (Value *V : E->Scalars) | |||
2544 | INVL.push_back(cast<Instruction>(V)->getOperand(0)); | |||
2545 | ||||
2546 | setInsertPointAfterBundle(E->Scalars); | |||
2547 | ||||
2548 | Value *InVec = vectorizeTree(INVL); | |||
2549 | ||||
2550 | if (Value *V = alreadyVectorized(E->Scalars)) | |||
2551 | return V; | |||
2552 | ||||
2553 | CastInst *CI = cast<CastInst>(VL0); | |||
2554 | Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); | |||
2555 | E->VectorizedValue = V; | |||
2556 | ++NumVectorInstructions; | |||
2557 | return V; | |||
2558 | } | |||
2559 | case Instruction::FCmp: | |||
2560 | case Instruction::ICmp: { | |||
2561 | ValueList LHSV, RHSV; | |||
2562 | for (Value *V : E->Scalars) { | |||
2563 | LHSV.push_back(cast<Instruction>(V)->getOperand(0)); | |||
2564 | RHSV.push_back(cast<Instruction>(V)->getOperand(1)); | |||
2565 | } | |||
2566 | ||||
2567 | setInsertPointAfterBundle(E->Scalars); | |||
2568 | ||||
2569 | Value *L = vectorizeTree(LHSV); | |||
2570 | Value *R = vectorizeTree(RHSV); | |||
2571 | ||||
2572 | if (Value *V = alreadyVectorized(E->Scalars)) | |||
2573 | return V; | |||
2574 | ||||
2575 | CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); | |||
2576 | Value *V; | |||
2577 | if (Opcode == Instruction::FCmp) | |||
2578 | V = Builder.CreateFCmp(P0, L, R); | |||
2579 | else | |||
2580 | V = Builder.CreateICmp(P0, L, R); | |||
2581 | ||||
2582 | E->VectorizedValue = V; | |||
2583 | propagateIRFlags(E->VectorizedValue, E->Scalars); | |||
2584 | ++NumVectorInstructions; | |||
2585 | return V; | |||
2586 | } | |||
2587 | case Instruction::Select: { | |||
2588 | ValueList TrueVec, FalseVec, CondVec; | |||
2589 | for (Value *V : E->Scalars) { | |||
2590 | CondVec.push_back(cast<Instruction>(V)->getOperand(0)); | |||
2591 | TrueVec.push_back(cast<Instruction>(V)->getOperand(1)); | |||
2592 | FalseVec.push_back(cast<Instruction>(V)->getOperand(2)); | |||
2593 | } | |||
2594 | ||||
2595 | setInsertPointAfterBundle(E->Scalars); | |||
2596 | ||||
2597 | Value *Cond = vectorizeTree(CondVec); | |||
2598 | Value *True = vectorizeTree(TrueVec); | |||
2599 | Value *False = vectorizeTree(FalseVec); | |||
2600 | ||||
2601 | if (Value *V = alreadyVectorized(E->Scalars)) | |||
2602 | return V; | |||
2603 | ||||
2604 | Value *V = Builder.CreateSelect(Cond, True, False); | |||
2605 | E->VectorizedValue = V; | |||
2606 | ++NumVectorInstructions; | |||
2607 | return V; | |||
2608 | } | |||
2609 | case Instruction::Add: | |||
2610 | case Instruction::FAdd: | |||
2611 | case Instruction::Sub: | |||
2612 | case Instruction::FSub: | |||
2613 | case Instruction::Mul: | |||
2614 | case Instruction::FMul: | |||
2615 | case Instruction::UDiv: | |||
2616 | case Instruction::SDiv: | |||
2617 | case Instruction::FDiv: | |||
2618 | case Instruction::URem: | |||
2619 | case Instruction::SRem: | |||
2620 | case Instruction::FRem: | |||
2621 | case Instruction::Shl: | |||
2622 | case Instruction::LShr: | |||
2623 | case Instruction::AShr: | |||
2624 | case Instruction::And: | |||
2625 | case Instruction::Or: | |||
2626 | case Instruction::Xor: { | |||
2627 | ValueList LHSVL, RHSVL; | |||
2628 | if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) | |||
2629 | reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL); | |||
2630 | else | |||
2631 | for (Value *V : E->Scalars) { | |||
2632 | LHSVL.push_back(cast<Instruction>(V)->getOperand(0)); | |||
2633 | RHSVL.push_back(cast<Instruction>(V)->getOperand(1)); | |||
2634 | } | |||
2635 | ||||
2636 | setInsertPointAfterBundle(E->Scalars); | |||
2637 | ||||
2638 | Value *LHS = vectorizeTree(LHSVL); | |||
2639 | Value *RHS = vectorizeTree(RHSVL); | |||
2640 | ||||
2641 | if (Value *V = alreadyVectorized(E->Scalars)) | |||
2642 | return V; | |||
2643 | ||||
2644 | BinaryOperator *BinOp = cast<BinaryOperator>(VL0); | |||
2645 | Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); | |||
2646 | E->VectorizedValue = V; | |||
2647 | propagateIRFlags(E->VectorizedValue, E->Scalars); | |||
2648 | ++NumVectorInstructions; | |||
2649 | ||||
2650 | if (Instruction *I = dyn_cast<Instruction>(V)) | |||
2651 | return propagateMetadata(I, E->Scalars); | |||
2652 | ||||
2653 | return V; | |||
2654 | } | |||
2655 | case Instruction::Load: { | |||
2656 | // Loads are inserted at the head of the tree because we don't want to | |||
2657 | // sink them all the way down past store instructions. | |||
2658 | setInsertPointAfterBundle(E->Scalars); | |||
2659 | ||||
2660 | LoadInst *LI = cast<LoadInst>(VL0); | |||
2661 | Type *ScalarLoadTy = LI->getType(); | |||
2662 | unsigned AS = LI->getPointerAddressSpace(); | |||
2663 | ||||
2664 | Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), | |||
2665 | VecTy->getPointerTo(AS)); | |||
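| // In IR terms this emits, e.g. (illustrative types and names): | |||
| //   %vp = bitcast i32* %p to <4 x i32>* | |||
| //   %v  = load <4 x i32>, <4 x i32>* %vp | |||
| // with the alignment fixed up below if the scalar load had none. | |||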
2666 | ||||
2667 | // The pointer operand uses an in-tree scalar so we add the new BitCast to | |||
2668 | // ExternalUses list to make sure that an extract will be generated in the | |||
2669 | // future. | |||
2670 | if (ScalarToTreeEntry.count(LI->getPointerOperand())) | |||
2671 | ExternalUses.push_back( | |||
2672 | ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0)); | |||
2673 | ||||
2674 | unsigned Alignment = LI->getAlignment(); | |||
2675 | LI = Builder.CreateLoad(VecPtr); | |||
2676 | if (!Alignment) { | |||
2677 | Alignment = DL->getABITypeAlignment(ScalarLoadTy); | |||
2678 | } | |||
2679 | LI->setAlignment(Alignment); | |||
2680 | E->VectorizedValue = LI; | |||
2681 | ++NumVectorInstructions; | |||
2682 | return propagateMetadata(LI, E->Scalars); | |||
2683 | } | |||
2684 | case Instruction::Store: { | |||
2685 | StoreInst *SI = cast<StoreInst>(VL0); | |||
2686 | unsigned Alignment = SI->getAlignment(); | |||
2687 | unsigned AS = SI->getPointerAddressSpace(); | |||
2688 | ||||
2689 | ValueList ValueOp; | |||
2690 | for (Value *V : E->Scalars) | |||
2691 | ValueOp.push_back(cast<StoreInst>(V)->getValueOperand()); | |||
2692 | ||||
2693 | setInsertPointAfterBundle(E->Scalars); | |||
2694 | ||||
2695 | Value *VecValue = vectorizeTree(ValueOp); | |||
2696 | Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), | |||
2697 | VecTy->getPointerTo(AS)); | |||
2698 | StoreInst *S = Builder.CreateStore(VecValue, VecPtr); | |||
2699 | ||||
2700 | // The pointer operand uses an in-tree scalar so we add the new BitCast to | |||
2701 | // ExternalUses list to make sure that an extract will be generated in the | |||
2702 | // future. | |||
2703 | if (ScalarToTreeEntry.count(SI->getPointerOperand())) | |||
2704 | ExternalUses.push_back( | |||
2705 | ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0)); | |||
2706 | ||||
2707 | if (!Alignment) { | |||
2708 | Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); | |||
2709 | } | |||
2710 | S->setAlignment(Alignment); | |||
2711 | E->VectorizedValue = S; | |||
2712 | ++NumVectorInstructions; | |||
2713 | return propagateMetadata(S, E->Scalars); | |||
2714 | } | |||
2715 | case Instruction::GetElementPtr: { | |||
2716 | setInsertPointAfterBundle(E->Scalars); | |||
2717 | ||||
2718 | ValueList Op0VL; | |||
2719 | for (Value *V : E->Scalars) | |||
2720 | Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0)); | |||
2721 | ||||
2722 | Value *Op0 = vectorizeTree(Op0VL); | |||
2723 | ||||
2724 | std::vector<Value *> OpVecs; | |||
2725 | for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; | |||
2726 | ++j) { | |||
2727 | ValueList OpVL; | |||
2728 | for (Value *V : E->Scalars) | |||
2729 | OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j)); | |||
2730 | ||||
2731 | Value *OpVec = vectorizeTree(OpVL); | |||
2732 | OpVecs.push_back(OpVec); | |||
2733 | } | |||
2734 | ||||
2735 | Value *V = Builder.CreateGEP( | |||
2736 | cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs); | |||
2737 | E->VectorizedValue = V; | |||
2738 | ++NumVectorInstructions; | |||
2739 | ||||
2740 | if (Instruction *I = dyn_cast<Instruction>(V)) | |||
2741 | return propagateMetadata(I, E->Scalars); | |||
2742 | ||||
2743 | return V; | |||
2744 | } | |||
2745 | case Instruction::Call: { | |||
2746 | CallInst *CI = cast<CallInst>(VL0); | |||
2747 | setInsertPointAfterBundle(E->Scalars); | |||
2748 | Function *FI; | |||
2749 | Intrinsic::ID IID = Intrinsic::not_intrinsic; | |||
2750 | Value *ScalarArg = nullptr; | |||
2751 | if (CI && (FI = CI->getCalledFunction())) { | |||
2752 | IID = FI->getIntrinsicID(); | |||
2753 | } | |||
2754 | std::vector<Value *> OpVecs; | |||
2755 | for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { | |||
2756 | ValueList OpVL; | |||
2757 | // ctlz,cttz and powi are special intrinsics whose second argument is | |||
2758 | // a scalar. This argument should not be vectorized. | |||
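| // E.g. for a bundle of @llvm.powi.f32(float %x_i, i32 3) calls, only the | |||
| // float operands are widened to a vector; the i32 exponent is passed | |||
| // through unchanged (illustrative intrinsic signature). | |||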
2759 | if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) { | |||
2760 | CallInst *CEI = cast<CallInst>(E->Scalars[0]); | |||
2761 | ScalarArg = CEI->getArgOperand(j); | |||
2762 | OpVecs.push_back(CEI->getArgOperand(j)); | |||
2763 | continue; | |||
2764 | } | |||
2765 | for (Value *V : E->Scalars) { | |||
2766 | CallInst *CEI = cast<CallInst>(V); | |||
2767 | OpVL.push_back(CEI->getArgOperand(j)); | |||
2768 | } | |||
2769 | ||||
2770 | Value *OpVec = vectorizeTree(OpVL); | |||
2771 | DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("SLP")) { dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"; } } while (false); | |||
2772 | OpVecs.push_back(OpVec); | |||
2773 | } | |||
2774 | ||||
2775 | Module *M = F->getParent(); | |||
2776 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); | |||
2777 | Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) }; | |||
2778 | Function *CF = Intrinsic::getDeclaration(M, ID, Tys); | |||
2779 | SmallVector<OperandBundleDef, 1> OpBundles; | |||
2780 | CI->getOperandBundlesAsDefs(OpBundles); | |||
2781 | Value *V = Builder.CreateCall(CF, OpVecs, OpBundles); | |||
2782 | ||||
2783 | // The scalar argument uses an in-tree scalar so we add the new vectorized | |||
2784 | // call to ExternalUses list to make sure that an extract will be | |||
2785 | // generated in the future. | |||
2786 | if (ScalarArg && ScalarToTreeEntry.count(ScalarArg)) | |||
2787 | ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); | |||
2788 | ||||
2789 | E->VectorizedValue = V; | |||
2790 | propagateIRFlags(E->VectorizedValue, E->Scalars); | |||
2791 | ++NumVectorInstructions; | |||
2792 | return V; | |||
2793 | } | |||
2794 | case Instruction::ShuffleVector: { | |||
2795 | ValueList LHSVL, RHSVL; | |||
2796 | assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand"); | |||
2797 | reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL); | |||
2798 | setInsertPointAfterBundle(E->Scalars); | |||
2799 | ||||
2800 | Value *LHS = vectorizeTree(LHSVL); | |||
2801 | Value *RHS = vectorizeTree(RHSVL); | |||
2802 | ||||
2803 | if (Value *V = alreadyVectorized(E->Scalars)) | |||
2804 | return V; | |||
2805 | ||||
2806 | // Create a vector of LHS op1 RHS | |||
2807 | BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0); | |||
2808 | Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS); | |||
2809 | ||||
2810 | // Create a vector of LHS op2 RHS | |||
2811 | Instruction *VL1 = cast<Instruction>(E->Scalars[1]); | |||
2812 | BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1); | |||
2813 | Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS); | |||
2814 | ||||
2815 | // Create shuffle to take alternate operations from the vector. | |||
2816 | // Also, gather up odd and even scalar ops to propagate IR flags to | |||
2817 | // each vector operation. | |||
2818 | ValueList OddScalars, EvenScalars; | |||
2819 | unsigned e = E->Scalars.size(); | |||
2820 | SmallVector<Constant *, 8> Mask(e); | |||
2821 | for (unsigned i = 0; i < e; ++i) { | |||
2822 | if (i & 1) { | |||
2823 | Mask[i] = Builder.getInt32(e + i); | |||
2824 | OddScalars.push_back(E->Scalars[i]); | |||
2825 | } else { | |||
2826 | Mask[i] = Builder.getInt32(i); | |||
2827 | EvenScalars.push_back(E->Scalars[i]); | |||
2828 | } | |||
2829 | } | |||
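| // For a 4-element bundle (e == 4) this builds Mask = <0, 5, 2, 7>: even | |||
| // lanes select from V0 (the op1 vector) and odd lanes from V1 (the op2 | |||
| // vector). | |||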
2830 | ||||
2831 | Value *ShuffleMask = ConstantVector::get(Mask); | |||
2832 | propagateIRFlags(V0, EvenScalars); | |||
2833 | propagateIRFlags(V1, OddScalars); | |||
2834 | ||||
2835 | Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); | |||
2836 | E->VectorizedValue = V; | |||
2837 | ++NumVectorInstructions; | |||
2838 | if (Instruction *I = dyn_cast<Instruction>(V)) | |||
2839 | return propagateMetadata(I, E->Scalars); | |||
2840 | ||||
2841 | return V; | |||
2842 | } | |||
2843 | default: | |||
2844 | llvm_unreachable("unknown inst")::llvm::llvm_unreachable_internal("unknown inst", "/tmp/buildd/llvm-toolchain-snapshot-5.0~svn306458/lib/Transforms/Vectorize/SLPVectorizer.cpp" , 2844); | |||
2845 | } | |||
2846 | return nullptr; | |||
2847 | } | |||
2848 | ||||
2849 | Value *BoUpSLP::vectorizeTree() { | |||
2850 | ExtraValueToDebugLocsMap ExternallyUsedValues; | |||
2851 | return vectorizeTree(ExternallyUsedValues); | |||
2852 | } | |||
2853 | ||||
2854 | Value * | |||
2855 | BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { | |||
2856 | ||||
2857 | // All blocks must be scheduled before any instructions are inserted. | |||
2858 | for (auto &BSIter : BlocksSchedules) { | |||
2859 | scheduleBlock(BSIter.second.get()); | |||
2860 | } | |||
2861 | ||||
2862 | Builder.SetInsertPoint(&F->getEntryBlock().front()); | |||
2863 | auto *VectorRoot = vectorizeTree(&VectorizableTree[0]); | |||
2864 | ||||
2865 | // If the vectorized tree can be rewritten in a smaller type, we truncate the | |||
2866 | // vectorized root. InstCombine will then rewrite the entire expression. We | |||
2867 | // sign extend the extracted values below. | |||
2868 | auto *ScalarRoot = VectorizableTree[0].Scalars[0]; | |||
2869 | if (MinBWs.count(ScalarRoot)) { | |||
2870 | if (auto *I = dyn_cast<Instruction>(VectorRoot)) | |||
2871 | Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); | |||
2872 | auto BundleWidth = VectorizableTree[0].Scalars.size(); | |||
2873 | auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); | |||
2874 | auto *VecTy = VectorType::get(MinTy, BundleWidth); | |||
2875 | auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy); | |||
2876 | VectorizableTree[0].VectorizedValue = Trunc; | |||
2877 | } | |||
2878 | ||||
2879 | DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("SLP")) { dbgs() << "SLP: Extracting " << ExternalUses .size() << " values .\n"; } } while (false); | |||
2880 | ||||
2881 | // If necessary, sign-extend or zero-extend ScalarRoot to the larger type | |||
2882 | // specified by ScalarType. | |||
2883 | auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) { | |||
2884 | if (!MinBWs.count(ScalarRoot)) | |||
2885 | return Ex; | |||
2886 | if (MinBWs[ScalarRoot].second) | |||
2887 | return Builder.CreateSExt(Ex, ScalarType); | |||
2888 | return Builder.CreateZExt(Ex, ScalarType); | |||
2889 | }; | |||
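| // E.g. (hypothetical widths): if the root was demoted to i8 and the scalar | |||
| // type is i32, a signed entry in MinBWs yields "sext i8 %ex to i32", an | |||
| // unsigned one "zext i8 %ex to i32"; otherwise Ex is returned unchanged. | |||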
2890 | ||||
2891 | // Extract all of the elements with the external uses. | |||
2892 | for (const auto &ExternalUse : ExternalUses) { | |||
2893 | Value *Scalar = ExternalUse.Scalar; | |||
2894 | llvm::User *User = ExternalUse.User; | |||
2895 | ||||
2896 | // Skip users that we already RAUW'd. This happens when one instruction | |||
2897 | // has multiple uses of the same value. | |||
2898 | if (User && !is_contained(Scalar->users(), User)) | |||
2899 | continue; | |||
2900 | assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar"); | |||
2901 | ||||
2902 | int Idx = ScalarToTreeEntry[Scalar]; | |||
2903 | TreeEntry *E = &VectorizableTree[Idx]; | |||
2904 | assert(!E->NeedToGather && "Extracting from a gather list"); | |||
2905 | ||||
2906 | Value *Vec = E->VectorizedValue; | |||
2907 | assert(Vec && "Can't find vectorizable value"); | |||
2908 | ||||
2909 | Value *Lane = Builder.getInt32(ExternalUse.Lane); | |||
2910 | // If User == nullptr, the Scalar is used as extra arg. Generate | |||
2911 | // ExtractElement instruction and update the record for this scalar in | |||
2912 | // ExternallyUsedValues. | |||
2913 | if (!User) { | |||
2914 | assert(ExternallyUsedValues.count(Scalar) && | |||
2915 | "Scalar with nullptr as an external user must be registered in " | |||
2916 | "ExternallyUsedValues map"); | |||
2917 | if (auto *VecI = dyn_cast<Instruction>(Vec)) { | |||
2918 | Builder.SetInsertPoint(VecI->getParent(), | |||
2919 | std::next(VecI->getIterator())); | |||
2920 | } else { | |||
2921 | Builder.SetInsertPoint(&F->getEntryBlock().front()); | |||
2922 | } | |||
2923 | Value *Ex = Builder.CreateExtractElement(Vec, Lane); | |||
2924 | Ex = extend(ScalarRoot, Ex, Scalar->getType()); | |||
2925 | CSEBlocks.insert(cast<Instruction>(Scalar)->getParent()); | |||
2926 | auto &Locs = ExternallyUsedValues[Scalar]; | |||
2927 | ExternallyUsedValues.insert({Ex, Locs}); | |||
2928 | ExternallyUsedValues.erase(Scalar); | |||
2929 | continue; | |||
2930 | } | |||
2931 | ||||
2932 | // Generate extracts for out-of-tree users. | |||
2933 | // Find the insertion point for the extractelement lane. | |||
2934 | if (auto *VecI = dyn_cast<Instruction>(Vec)) { | |||
2935 | if (PHINode *PH = dyn_cast<PHINode>(User)) { | |||
2936 | for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { | |||
2937 | if (PH->getIncomingValue(i) == Scalar) { | |||
2938 | TerminatorInst *IncomingTerminator = | |||
2939 | PH->getIncomingBlock(i)->getTerminator(); | |||
2940 | if (isa<CatchSwitchInst>(IncomingTerminator)) { | |||
2941 | Builder.SetInsertPoint(VecI->getParent(), | |||
2942 | std::next(VecI->getIterator())); | |||
2943 | } else { | |||
2944 | Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); | |||
2945 | } | |||
2946 | Value *Ex = Builder.CreateExtractElement(Vec, Lane); | |||
2947 | Ex = extend(ScalarRoot, Ex, Scalar->getType()); | |||
2948 | CSEBlocks.insert(PH->getIncomingBlock(i)); | |||
2949 | PH->setOperand(i, Ex); | |||
2950 | } | |||
2951 | } | |||
2952 | } else { | |||
2953 | Builder.SetInsertPoint(cast<Instruction>(User)); | |||
2954 | Value *Ex = Builder.CreateExtractElement(Vec, Lane); | |||
2955 | Ex = extend(ScalarRoot, Ex, Scalar->getType()); | |||
2956 | CSEBlocks.insert(cast<Instruction>(User)->getParent()); | |||
2957 | User->replaceUsesOfWith(Scalar, Ex); | |||
2958 | } | |||
2959 | } else { | |||
2960 | Builder.SetInsertPoint(&F->getEntryBlock().front()); | |||
2961 | Value *Ex = Builder.CreateExtractElement(Vec, Lane); | |||
2962 | Ex = extend(ScalarRoot, Ex, Scalar->getType()); | |||
2963 | CSEBlocks.insert(&F->getEntryBlock()); | |||
2964 | User->replaceUsesOfWith(Scalar, Ex); | |||
2965 | } | |||
2966 | ||||
2967 | DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("SLP")) { dbgs() << "SLP: Replaced:" << *User << ".\n"; } } while (false); | |||
2968 | } | |||
2969 | ||||
2970 | // For each vectorized value: | |||
2971 | for (TreeEntry &EIdx : VectorizableTree) { | |||
2972 | TreeEntry *Entry = &EIdx; | |||
2973 | ||||
2974 | // For each lane: | |||
2975 | for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { | |||
2976 | Value *Scalar = Entry->Scalars[Lane]; | |||
2977 | // No need to handle users of gathered values. | |||
2978 | if (Entry->NeedToGather) | |||
2979 | continue; | |||
2980 | ||||
2981 | assert(Entry->VectorizedValue && "Can't find vectorizable value"); | |||
2982 | ||||
2983 | Type *Ty = Scalar->getType(); | |||
2984 | if (!Ty->isVoidTy()) { | |||
2985 | #ifndef NDEBUG | |||
2986 | for (User *U : Scalar->users()) { | |||
2987 | DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("SLP")) { dbgs() << "SLP: \tvalidating user:" << *U << ".\n"; } } while (false); | |||
2988 | ||||
2989 | assert((ScalarToTreeEntry.count(U) || | |||
2990 | // It is legal to replace users in the ignorelist by undef. | |||
2991 | is_contained(UserIgnoreList, U)) && | |||
2992 | "Replacing out-of-tree value with undef"); | |||
2993 | } | |||
2994 | #endif | |||
2995 | Value *Undef = UndefValue::get(Ty); | |||
2996 | Scalar->replaceAllUsesWith(Undef); | |||
2997 | } | |||
2998 | DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("SLP")) { dbgs() << "SLP: \tErasing scalar:" << * Scalar << ".\n"; } } while (false); | |||
2999 | eraseInstruction(cast<Instruction>(Scalar)); | |||
3000 | } | |||
3001 | } | |||
3002 | ||||
3003 | Builder.ClearInsertionPoint(); | |||
3004 | ||||
3005 | return VectorizableTree[0].VectorizedValue; | |||
3006 | } | |||
3007 | ||||
3008 | void BoUpSLP::optimizeGatherSequence() { | |||
3009 | DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("SLP")) { dbgs() << "SLP: Optimizing " << GatherSeq .size() << " gather sequences instructions.\n"; } } while (false) | |||
3010 | << " gather sequences instructions.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("SLP")) { dbgs() << "SLP: Optimizing " << GatherSeq .size() << " gather sequences instructions.\n"; } } while (false); | |||
3011 | // LICM InsertElementInst sequences. | |||
3012 | for (Instruction *it : GatherSeq) { | |||
3013 | InsertElementInst *Insert = dyn_cast<InsertElementInst>(it); | |||
3014 | ||||
3015 | if (!Insert) | |||
3016 | continue; | |||
3017 | ||||
3018 | // Check if this block is inside a loop. | |||
3019 | Loop *L = LI->getLoopFor(Insert->getParent()); | |||
3020 | if (!L) | |||
3021 | continue; | |||
3022 | ||||
3023 | // Check if it has a preheader. | |||
3024 | BasicBlock *PreHeader = L->getLoopPreheader(); | |||
3025 | if (!PreHeader) | |||
3026 | continue; | |||
3027 | ||||
3028 | // If the vector or the element that we insert into it are | |||
3029 | // instructions that are defined in this basic block then we can't | |||
3030 | // hoist this instruction. | |||
3031 | Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0)); | |||
3032 | Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1)); | |||
3033 | if (CurrVec && L->contains(CurrVec)) | |||
3034 | continue; | |||
3035 | if (NewElem && L->contains(NewElem)) | |||
3036 | continue; | |||
3037 | ||||
3038 | // We can hoist this instruction. Move it to the pre-header. | |||
3039 | Insert->moveBefore(PreHeader->getTerminator()); | |||
3040 | } | |||
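| // Sketch of the effect (hypothetical loop): an insertelement built in a | |||
| // loop body from loop-invariant operands is moved to the preheader, so the | |||
| // gather is materialized once instead of on every iteration. | |||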
3041 | ||||
3042 | // Make a list of all reachable blocks in our CSE queue. | |||
3043 | SmallVector<const DomTreeNode *, 8> CSEWorkList; | |||
3044 | CSEWorkList.reserve(CSEBlocks.size()); | |||
3045 | for (BasicBlock *BB : CSEBlocks) | |||
3046 | if (DomTreeNode *N = DT->getNode(BB)) { | |||
3047 | assert(DT->isReachableFromEntry(N)); | |||
3048 | CSEWorkList.push_back(N); | |||
3049 | } | |||
3050 | ||||
3051 | // Sort blocks by domination. This ensures we visit a block after all blocks | |||
3052 | // dominating it are visited. | |||
3053 | std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), | |||
3054 | [this](const DomTreeNode *A, const DomTreeNode *B) { | |||
3055 | return DT->properlyDominates(A, B); | |||
3056 | }); | |||
3057 | ||||
3058 | // Perform O(N^2) search over the gather sequences and merge identical | |||
3059 | // instructions. TODO: We can further optimize this scan if we split the | |||
3060 | // instructions into different buckets based on the insert lane. | |||
3061 | SmallVector<Instruction *, 16> Visited; | |||
3062 | for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { | |||
3063 | assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && | |||
3064 | "Worklist not sorted properly!"); | |||
3065 | BasicBlock *BB = (*I)->getBlock(); | |||
3066 | // For all instructions in blocks containing gather sequences: | |||
3067 | for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { | |||
3068 | Instruction *In = &*it++; | |||
3069 | if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) | |||
3070 | continue; | |||
3071 | ||||
3072 | // Check if we can replace this instruction with any of the | |||
3073 | // visited instructions. | |||
3074 | for (Instruction *v : Visited) { | |||
3075 | if (In->isIdenticalTo(v) && | |||
3076 | DT->dominates(v->getParent(), In->getParent())) { | |||
3077 | In->replaceAllUsesWith(v); | |||
3078 | eraseInstruction(In); | |||
3079 | In = nullptr; | |||
3080 | break; | |||
3081 | } | |||
3082 | } | |||
3083 | if (In) { | |||
3084 | assert(!is_contained(Visited, In)); | |||
3085 | Visited.push_back(In); | |||
3086 | } | |||
3087 | } | |||
3088 | } | |||
3089 | CSEBlocks.clear(); | |||
3090 | GatherSeq.clear(); | |||
3091 | } | |||
3092 | ||||
3093 | // Groups the instructions into a bundle (which is then a single scheduling entity) | |||
3094 | // and schedules instructions until the bundle gets ready. | |||
3095 | bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, | |||
3096 | BoUpSLP *SLP) { | |||
3097 | if (isa<PHINode>(VL[0])) | |||
3098 | return true; | |||
3099 | ||||
3100 | // Initialize the instruction bundle. | |||
3101 | Instruction *OldScheduleEnd = ScheduleEnd; | |||
3102 | ScheduleData *PrevInBundle = nullptr; | |||
3103 | ScheduleData *Bundle = nullptr; | |||
3104 | bool ReSchedule = false; | |||
3105 | DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("SLP")) { dbgs() << "SLP: bundle: " << *VL[0] << "\n"; } } while (false); | |||
3106 | ||||
3107 | // Make sure that the scheduling region contains all | |||
3108 | // instructions of the bundle. | |||
3109 | for (Value *V : VL) { | |||
3110 | if (!extendSchedulingRegion(V)) | |||
3111 | return false; | |||
3112 | } | |||
3113 | ||||
3114 | for (Value *V : VL) { | |||
3115 | ScheduleData *BundleMember = getScheduleData(V); | |||
3116 | assert(BundleMember && | |||
3117 | "no ScheduleData for bundle member (maybe not in same basic block)"); | |||
3118 | if (BundleMember->IsScheduled) { | |||
3119 | // A bundle member was scheduled as single instruction before and now | |||
3120 | // needs to be scheduled as part of the bundle. We just get rid of the | |||
3121 | // existing schedule. | |||
3122 | DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMemberdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("SLP")) { dbgs() << "SLP: reset schedule because " << *BundleMember << " was already scheduled\n"; } } while (false) | |||
3123 | << " was already scheduled\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("SLP")) { dbgs() << "SLP: reset schedule because " << *BundleMember << " was already scheduled\n"; } } while (false); | |||
3124 | ReSchedule = true; | |||
3125 | } | |||
3126 | assert(BundleMember->isSchedulingEntity() && | |||
3127 | "bundle member already part of other bundle"); | |||
3128 | if (PrevInBundle) { | |||
3129 | PrevInBundle->NextInBundle = BundleMember; | |||
3130 | } else { | |||
3131 | Bundle = BundleMember; | |||
3132 | } | |||
3133 | BundleMember->UnscheduledDepsInBundle = 0; | |||
3134 | Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; | |||
3135 | ||||
3136 | // Group the instructions to a bundle. | |||
3137 | BundleMember->FirstInBundle = Bundle; | |||
3138 | PrevInBundle = BundleMember; | |||
3139 | } | |||
3140 | if (ScheduleEnd != OldScheduleEnd) { | |||
3141 | // The scheduling region got new instructions at the lower end (or it is a | |||
3142 | // new region for the first bundle). This makes it necessary to | |||
3143 | // recalculate all dependencies. | |||
3144 | // It is seldom that this needs to be done a second time after adding the | |||
3145 | // initial bundle to the region. | |||
3146 | for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { | |||
3147 | ScheduleData *SD = getScheduleData(I); | |||
3148 | SD->clearDependencies(); | |||
3149 | } | |||
3150 | ReSchedule = true; | |||
3151 | } | |||
3152 | if (ReSchedule) { | |||
3153 | resetSchedule(); | |||
3154 | initialFillReadyList(ReadyInsts); | |||
3155 | } | |||
3156 | ||||
3157 | DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " | |||
3158 | << BB->getName() << "\n"); | |||
3159 | ||||
3160 | calculateDependencies(Bundle, true, SLP); | |||
3161 | ||||
3162 | // Now try to schedule the new bundle. As soon as the bundle is "ready" it | |||
3163 | // means that there are no cyclic dependencies and we can schedule it. | |||
3164 | // Note that it's important that we don't "schedule" the bundle yet (see | |||
3165 | // cancelScheduling). | |||
3166 | while (!Bundle->isReady() && !ReadyInsts.empty()) { | |||
3167 | ||||
3168 | ScheduleData *pickedSD = ReadyInsts.back(); | |||
3169 | ReadyInsts.pop_back(); | |||
3170 | ||||
3171 | if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) { | |||
3172 | schedule(pickedSD, ReadyInsts); | |||
3173 | } | |||
3174 | } | |||
3175 | if (!Bundle->isReady()) { | |||
3176 | cancelScheduling(VL); | |||
3177 | return false; | |||
3178 | } | |||
3179 | return true; | |||
3180 | } | |||
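     | // Illustration (hypothetical bundle; the names are invented): after the | |||
     | // loop above bundles four consecutive stores S0..S3, the links look like | |||
     | // | |||
     | //   S0 --NextInBundle--> S1 --> S2 --> S3 --> null | |||
     | //   FirstInBundle == S0 for every member, | |||
     | // | |||
     | // and only S0, the scheduling entity, carries the aggregated | |||
     | // UnscheduledDepsInBundle count; each member's own count is zeroed first. | |||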
3181 | ||||
3182 | void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) { | |||
3183 | if (isa<PHINode>(VL[0])) | |||
3184 | return; | |||
3185 | ||||
3186 | ScheduleData *Bundle = getScheduleData(VL[0]); | |||
3187 | DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); | |||
3188 | assert(!Bundle->IsScheduled && | |||
3189 | "Can't cancel bundle which is already scheduled"); | |||
3190 | assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() && | |||
3191 | "tried to unbundle something which is not a bundle"); | |||
3192 | ||||
3193 | // Un-bundle: make single instructions out of the bundle. | |||
3194 | ScheduleData *BundleMember = Bundle; | |||
3195 | while (BundleMember) { | |||
3196 | assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links"); | |||
3197 | BundleMember->FirstInBundle = BundleMember; | |||
3198 | ScheduleData *Next = BundleMember->NextInBundle; | |||
3199 | BundleMember->NextInBundle = nullptr; | |||
3200 | BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; | |||
3201 | if (BundleMember->UnscheduledDepsInBundle == 0) { | |||
3202 | ReadyInsts.insert(BundleMember); | |||
3203 | } | |||
3204 | BundleMember = Next; | |||
3205 | } | |||
3206 | } | |||
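     | // Illustration (same hypothetical bundle): cancelScheduling turns | |||
     | // S0 -> S1 -> S2 -> S3 back into four singleton bundles, each with | |||
     | // FirstInBundle pointing at itself, NextInBundle cleared, and its own | |||
     | // UnscheduledDeps restored; members whose dependencies are all scheduled | |||
     | // re-enter the ready list immediately. | |||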
3207 | ||||
3208 | bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { | |||
3209 | if (getScheduleData(V)) | |||
3210 | return true; | |||
3211 | Instruction *I = dyn_cast<Instruction>(V); | |||
3212 | assert(I && "bundle member must be an instruction"); | |||
3213 | assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled"); | |||
3214 | if (!ScheduleStart) { | |||
3215 | // It's the first instruction in the new region. | |||
3216 | initScheduleData(I, I->getNextNode(), nullptr, nullptr); | |||
3217 | ScheduleStart = I; | |||
3218 | ScheduleEnd = I->getNextNode(); | |||
3219 | assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); | |||
3220 | DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); | |||
3221 | return true; | |||
3222 | } | |||
3223 | // Search up and down at the same time, because we don't know if the new | |||
3224 | // instruction is above or below the existing scheduling region. | |||
3225 | BasicBlock::reverse_iterator UpIter = | |||
3226 | ++ScheduleStart->getIterator().getReverse(); | |||
3227 | BasicBlock::reverse_iterator UpperEnd = BB->rend(); | |||
3228 | BasicBlock::iterator DownIter = ScheduleEnd->getIterator(); | |||
3229 | BasicBlock::iterator LowerEnd = BB->end(); | |||
3230 | for (;;) { | |||
3231 | if (++ScheduleRegionSize > ScheduleRegionSizeLimit) { | |||
3232 | DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n"); | |||
3233 | return false; | |||
3234 | } | |||
3235 | ||||
3236 | if (UpIter != UpperEnd) { | |||
3237 | if (&*UpIter == I) { | |||
3238 | initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); | |||
3239 | ScheduleStart = I; | |||
3240 | DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); | |||
3241 | return true; | |||
3242 | } | |||
3243 | UpIter++; | |||
3244 | } | |||
3245 | if (DownIter != LowerEnd) { | |||
3246 | if (&*DownIter == I) { | |||
3247 | initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, | |||
3248 | nullptr); | |||
3249 | ScheduleEnd = I->getNextNode(); | |||
3250 | assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); | |||
3251 | DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); | |||
3252 | return true; | |||
3253 | } | |||
3254 | DownIter++; | |||
3255 | } | |||
3256 | assert((UpIter != UpperEnd || DownIter != LowerEnd) && | |||
3257 | "instruction not found in block"); | |||
3258 | } | |||
3259 | return true; | |||
3260 | } | |||
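     | // Illustration (hypothetical block layout): if the region currently covers | |||
     | // [ScheduleStart, ScheduleEnd) and the new instruction I lies below it, | |||
     | // | |||
     | //   ... <-UpIter | ScheduleStart ... ScheduleEnd | DownIter-> ... I ... | |||
     | // | |||
     | // the loop above reaches I with DownIter and extends ScheduleEnd past I. | |||
     | // Walking both directions at once keeps the cost proportional to the | |||
     | // distance from I to the nearer region boundary, and the size limit bounds | |||
     | // the region even in very large blocks. | |||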
3261 | ||||
3262 | void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, | |||
3263 | Instruction *ToI, | |||
3264 | ScheduleData *PrevLoadStore, | |||
3265 | ScheduleData *NextLoadStore) { | |||
3266 | ScheduleData *CurrentLoadStore = PrevLoadStore; | |||
3267 | for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { | |||
3268 | ScheduleData *SD = ScheduleDataMap[I]; | |||
3269 | if (!SD) { | |||
3270 | // Allocate a new ScheduleData for the instruction. | |||
3271 | if (ChunkPos >= ChunkSize) { | |||
3272 | ScheduleDataChunks.push_back( | |||
3273 | llvm::make_unique<ScheduleData[]>(ChunkSize)); | |||
3274 | ChunkPos = 0; | |||
3275 | } | |||
3276 | SD = &(ScheduleDataChunks.back()[ChunkPos++]); | |||
3277 | ScheduleDataMap[I] = SD; | |||
3278 | SD->Inst = I; | |||
3279 | } | |||
3280 | assert(!isInSchedulingRegion(SD) && | |||
3281 | "new ScheduleData already in scheduling region"); | |||
3282 | SD->init(SchedulingRegionID); | |||
3283 | ||||
3284 | if (I->mayReadOrWriteMemory()) { | |||
3285 | // Update the linked list of memory accessing instructions. | |||
3286 | if (CurrentLoadStore) { | |||
3287 | CurrentLoadStore->NextLoadStore = SD; | |||
3288 | } else { | |||
3289 | FirstLoadStoreInRegion = SD; | |||
3290 | } | |||
3291 | CurrentLoadStore = SD; | |||
3292 | } | |||
3293 | } | |||
3294 | if (NextLoadStore) { | |||
3295 | if (CurrentLoadStore) | |||
3296 | CurrentLoadStore->NextLoadStore = NextLoadStore; | |||
3297 | } else { | |||
3298 | LastLoadStoreInRegion = CurrentLoadStore; | |||
3299 | } | |||
3300 | } | |||
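     | // Illustration (hypothetical sequence, region built from scratch): for | |||
     | // "load0; add; store0; load1", only the memory-accessing instructions are | |||
     | // chained, load0 --NextLoadStore--> store0 --> load1, so the alias checks | |||
     | // in calculateDependencies can skip the add entirely. | |||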
3301 | ||||
3302 | void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, | |||
3303 | bool InsertInReadyList, | |||
3304 | BoUpSLP *SLP) { | |||
3305 | assert(SD->isSchedulingEntity()); | |||
3306 | ||||
3307 | SmallVector<ScheduleData *, 10> WorkList; | |||
3308 | WorkList.push_back(SD); | |||
3309 | ||||
3310 | while (!WorkList.empty()) { | |||
3311 | ScheduleData *SD = WorkList.back(); | |||
3312 | WorkList.pop_back(); | |||
3313 | ||||
3314 | ScheduleData *BundleMember = SD; | |||
3315 | while (BundleMember) { | |||
3316 | assert(isInSchedulingRegion(BundleMember)); | |||
3317 | if (!BundleMember->hasValidDependencies()) { | |||
3318 | ||||
3319 | DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n"); | |||
3320 | BundleMember->Dependencies = 0; | |||
3321 | BundleMember->resetUnscheduledDeps(); | |||
3322 | ||||
3323 | // Handle def-use chain dependencies. | |||
3324 | for (User *U : BundleMember->Inst->users()) { | |||
3325 | if (isa<Instruction>(U)) { | |||
3326 | ScheduleData *UseSD = getScheduleData(U); | |||
3327 | if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { | |||
3328 | BundleMember->Dependencies++; | |||
3329 | ScheduleData *DestBundle = UseSD->FirstInBundle; | |||
3330 | if (!DestBundle->IsScheduled) | |||
3331 | BundleMember->incrementUnscheduledDeps(1); | |||
3332 | if (!DestBundle->hasValidDependencies()) | |||
3333 | WorkList.push_back(DestBundle); | |||
3334 | } | |||
3335 | } else { | |||
3336 | // It is not clear whether this can ever happen, but we need to be safe. | |||
3337 | // This ensures the instruction/bundle is never scheduled, which | |||
3338 | // eventually disables vectorization of the tree. | |||
3339 | BundleMember->Dependencies++; | |||
3340 | BundleMember->incrementUnscheduledDeps(1); | |||
3341 | } | |||
3342 | } | |||
3343 | ||||
3344 | // Handle the memory dependencies. | |||
3345 | ScheduleData *DepDest = BundleMember->NextLoadStore; | |||
3346 | if (DepDest) { | |||
3347 | Instruction *SrcInst = BundleMember->Inst; | |||
3348 | MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA); | |||
3349 | bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); | |||
3350 | unsigned numAliased = 0; | |||
3351 | unsigned DistToSrc = 1; | |||
3352 | ||||
3353 | while (DepDest) { | |||
3354 | assert(isInSchedulingRegion(DepDest)); | |||
3355 | ||||
3356 | // We have two limits to reduce the complexity: | |||
3357 | // 1) AliasedCheckLimit: It's a small limit to reduce calls to | |||
3358 | // SLP->isAliased (which is the expensive part in this loop). | |||
3359 | // 2) MaxMemDepDistance: It's for very large blocks and it aborts | |||
3360 | // the whole loop (even if the loop is fast, it's quadratic). | |||
3361 | // It's important for the loop break condition (see below) to | |||
3362 | // check this limit even between two read-only instructions. | |||
3363 | if (DistToSrc >= MaxMemDepDistance || | |||
3364 | ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) && | |||
3365 | (numAliased >= AliasedCheckLimit || | |||
3366 | SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) { | |||
3367 | ||||
3368 | // We increment the counter only if the locations are aliased | |||
3369 | // (instead of counting all alias checks). This gives a better | |||
3370 | // balance between reduced runtime and accurate dependencies. | |||
3371 | numAliased++; | |||
3372 | ||||
3373 | DepDest->MemoryDependencies.push_back(BundleMember); | |||
3374 | BundleMember->Dependencies++; | |||
3375 | ScheduleData *DestBundle = DepDest->FirstInBundle; | |||
3376 | if (!DestBundle->IsScheduled) { | |||
3377 | BundleMember->incrementUnscheduledDeps(1); | |||
3378 | } | |||
3379 | if (!DestBundle->hasValidDependencies()) { | |||
3380 | WorkList.push_back(DestBundle); | |||
3381 | } | |||
3382 | } | |||
3383 | DepDest = DepDest->NextLoadStore; | |||
3384 | ||||
3385 | // Example, explaining the loop break condition: Let's assume our | |||
3386 | // starting instruction is i0 and MaxMemDepDistance = 3. | |||
3387 | // | |||
3388 | // +--------v--v--v | |||
3389 | // i0,i1,i2,i3,i4,i5,i6,i7,i8 | |||
3390 | // +--------^--^--^ | |||
3391 | // | |||
3392 | // MaxMemDepDistance lets us stop alias-checking at i3 and we add | |||
3393 | // dependencies from i0 to i3,i4,.. (even if they are not aliased). | |||
3394 | // Previously we already added dependencies from i3 to i6,i7,i8 | |||
3395 | // (because of MaxMemDepDistance). As we added a dependency from | |||
3396 | // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 | |||
3397 | // and we can abort this loop at i6. | |||
3398 | if (DistToSrc >= 2 * MaxMemDepDistance) | |||
3399 | break; | |||
3400 | DistToSrc++; | |||
3401 | } | |||
3402 | } | |||
3403 | } | |||
3404 | BundleMember = BundleMember->NextInBundle; | |||
3405 | } | |||
3406 | if (InsertInReadyList && SD->isReady()) { | |||
3407 | ReadyInsts.push_back(SD); | |||
3408 | DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n"); | |||
3409 | } | |||
3410 | } | |||
3411 | } | |||
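     | // Illustration of the def-use half above (hypothetical IR, names invented): | |||
     | // for "%a = load; %s = add %a, 1; store %s", the load's bundle gets one | |||
     | // dependency for its user %s; if %s belongs to a not-yet-scheduled bundle, | |||
     | // the load's UnscheduledDeps is bumped and %s's bundle is queued on the | |||
     | // worklist so its own dependencies are (re)computed as well. | |||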
3412 | ||||
3413 | void BoUpSLP::BlockScheduling::resetSchedule() { | |||
3414 | assert(ScheduleStart && | |||
3415 | "tried to reset schedule on block which has not been scheduled"); | |||
3416 | for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { | |||
3417 | ScheduleData *SD = getScheduleData(I); | |||
3418 | assert(isInSchedulingRegion(SD)); | |||
3419 | SD->IsScheduled = false; | |||
3420 | SD->resetUnscheduledDeps(); | |||
3421 | } | |||
3422 | ReadyInsts.clear(); | |||
3423 | } | |||
3424 | ||||
3425 | void BoUpSLP::scheduleBlock(BlockScheduling *BS) { | |||
3426 | ||||
3427 | if (!BS->ScheduleStart) | |||
3428 | return; | |||
3429 | ||||
3430 | DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); | |||
3431 | ||||
3432 | BS->resetSchedule(); | |||
3433 | ||||
3434 | // For the real scheduling we use a more sophisticated ready-list: it is | |||
3435 | // sorted by the original instruction location. This lets the final schedule | |||
3436 | // be as close as possible to the original instruction order. | |||
3437 | struct ScheduleDataCompare { | |||
3438 | bool operator()(ScheduleData *SD1, ScheduleData *SD2) const { | |||
3439 | return SD2->SchedulingPriority < SD1->SchedulingPriority; | |||
3440 | } | |||
3441 | }; | |||
3442 | std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts; | |||
3443 | ||||
3444 | // Ensure that all dependency data is updated and fill the ready-list with | |||
3445 | // initial instructions. | |||
3446 | int Idx = 0; | |||
3447 | int NumToSchedule = 0; | |||
3448 | for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; | |||
3449 | I = I->getNextNode()) { | |||
3450 | ScheduleData *SD = BS->getScheduleData(I); | |||
3451 | assert( | |||
3452 | SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) && | |||
3453 | "scheduler and vectorizer have different opinion on what is a bundle"); | |||
3454 | SD->FirstInBundle->SchedulingPriority = Idx++; | |||
3455 | if (SD->isSchedulingEntity()) { | |||
3456 | BS->calculateDependencies(SD, false, this); | |||
3457 | NumToSchedule++; | |||
3458 | } | |||
3459 | } | |||
3460 | BS->initialFillReadyList(ReadyInsts); | |||
3461 | ||||
3462 | Instruction *LastScheduledInst = BS->ScheduleEnd; | |||
3463 | ||||
3464 | // Do the "real" scheduling. | |||
3465 | while (!ReadyInsts.empty()) { | |||
3466 | ScheduleData *picked = *ReadyInsts.begin(); | |||
3467 | ReadyInsts.erase(ReadyInsts.begin()); | |||
3468 | ||||
3469 | // Move the scheduled instruction(s) to their dedicated places, if not | |||
3470 | // there yet. | |||
3471 | ScheduleData *BundleMember = picked; | |||
3472 | while (BundleMember) { | |||
3473 | Instruction *pickedInst = BundleMember->Inst; | |||
3474 | if (LastScheduledInst->getNextNode() != pickedInst) { | |||
3475 | BS->BB->getInstList().remove(pickedInst); | |||
3476 | BS->BB->getInstList().insert(LastScheduledInst->getIterator(), | |||
3477 | pickedInst); | |||
3478 | } | |||
3479 | LastScheduledInst = pickedInst; | |||
3480 | BundleMember = BundleMember->NextInBundle; | |||
3481 | } | |||
3482 | ||||
3483 | BS->schedule(picked, ReadyInsts); | |||
3484 | NumToSchedule--; | |||
3485 | } | |||
3486 | assert(NumToSchedule == 0 && "could not schedule all instructions"); | |||
3487 | ||||
3488 | // Avoid duplicate scheduling of the block. | |||
3489 | BS->ScheduleStart = nullptr; | |||
3490 | } | |||
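     | // Note on the comparator above: comp(SD1, SD2) puts SD1 first when its | |||
     | // SchedulingPriority (the original instruction index) is larger, so | |||
     | // ReadyInsts.begin() yields the ready entry that appeared last in the | |||
     | // original order. Because every picked instruction is inserted just above | |||
     | // the previously placed one, the region is rebuilt bottom-up and the final | |||
     | // schedule stays close to the original instruction order. | |||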
3491 | ||||
3492 | unsigned BoUpSLP::getVectorElementSize(Value *V) { | |||
3493 | // If V is a store, just return the width of the stored value without | |||
3494 | // traversing the expression tree. This is the common case. | |||
3495 | if (auto *Store = dyn_cast<StoreInst>(V)) | |||
3496 | return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); | |||
3497 | ||||
3498 | // If V is not a store, we can traverse the expression tree to find loads | |||
3499 | // that feed it. The type of the loaded value may indicate a more suitable | |||
3500 | // width than V's type. We want to base the vector element size on the width | |||
3501 | // of memory operations where possible. | |||
3502 | SmallVector<Instruction *, 16> Worklist; | |||
3503 | SmallPtrSet<Instruction *, 16> Visited; | |||
3504 | if (auto *I = dyn_cast<Instruction>(V)) | |||
3505 | Worklist.push_back(I); | |||
3506 | ||||
3507 | // Traverse the expression tree in bottom-up order looking for loads. If we | |||
3508 | // encounter an instruction we don't yet handle, we give up. | |||
3509 | auto MaxWidth = 0u; | |||
3510 | auto FoundUnknownInst = false; | |||
3511 | while (!Worklist.empty() && !FoundUnknownInst) { | |||
3512 | auto *I = Worklist.pop_back_val(); | |||
3513 | Visited.insert(I); | |||
3514 | ||||
3515 | // We should only be looking at scalar instructions here. If the current | |||
3516 | // instruction has a vector type, give up. | |||
3517 | auto *Ty = I->getType(); | |||
3518 | if (isa<VectorType>(Ty)) | |||
3519 | FoundUnknownInst = true; | |||
3520 | ||||
3521 | // If the current instruction is a load, update MaxWidth to reflect the | |||
3522 | // width of the loaded value. | |||
3523 | else if (isa<LoadInst>(I)) | |||
3524 | MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty)); | |||
3525 | ||||
3526 | // Otherwise, we need to visit the operands of the instruction. We only | |||
3527 | // handle the interesting cases from buildTree here. If an operand is an | |||
3528 | // instruction we haven't yet visited, we add it to the worklist. | |||
3529 | else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) || | |||
3530 | isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) { | |||
3531 | for (Use &U : I->operands()) | |||
3532 | if (auto *J = dyn_cast<Instruction>(U.get())) | |||
3533 | if (!Visited.count(J)) | |||
3534 | Worklist.push_back(J); | |||
3535 | } | |||
3536 | ||||
3537 | // If we don't yet handle the instruction, give up. | |||
3538 | else | |||
3539 | FoundUnknownInst = true; | |||
3540 | } | |||
3541 | ||||
3542 | // If we didn't encounter a memory access in the expression tree, or if we | |||
3543 | // gave up for some reason, just return the width of V. | |||
3544 | if (!MaxWidth || FoundUnknownInst) | |||
3545 | return DL->getTypeSizeInBits(V->getType()); | |||
3546 | ||||
3547 | // Otherwise, return the maximum width we found. | |||
3548 | return MaxWidth; | |||
3549 | } | |||
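     | // Worked example (hypothetical IR): for "%s = add i32 %x, %y" where %x and | |||
     | // %y are zext'd from i16 loads, the walk reaches both loads and returns 16 | |||
     | // rather than 32, so callers can size the vector factor by the 16-bit | |||
     | // memory accesses. If the walk hits, say, a call instead, FoundUnknownInst | |||
     | // is set and the width of V's own type is returned. | |||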
3550 | ||||
3551 | // Determine if a value V in a vectorizable expression Expr can be demoted to a | |||
3552 | // smaller type with a truncation. We collect the values that will be demoted | |||
3553 | // in ToDemote and additional roots that require investigating in Roots. | |||
3554 | static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr, | |||
3555 | SmallVectorImpl<Value *> &ToDemote, | |||
3556 | SmallVectorImpl<Value *> &Roots) { | |||
3557 | ||||
3558 | // We can always demote constants. | |||
3559 | if (isa<Constant>(V)) { | |||
3560 | ToDemote.push_back(V); | |||
3561 | return true; | |||
3562 | } | |||
3563 | ||||
3564 | // If the value is not an instruction in the expression with only one use, it | |||
3565 | // cannot be demoted. | |||
3566 | auto *I = dyn_cast<Instruction>(V); | |||
3567 | if (!I || !I->hasOneUse() || !Expr.count(I)) | |||
3568 | return false; | |||
3569 | ||||
3570 | switch (I->getOpcode()) { | |||
3571 | ||||
3572 | // We can always demote truncations and extensions. Since truncations can | |||
3573 | // seed additional demotion, we save the truncated value. | |||
3574 | case Instruction::Trunc: | |||
3575 | Roots.push_back(I->getOperand(0)); | |||
3576 | case Instruction::ZExt: | |||
3577 | case Instruction::SExt: | |||
3578 | break; | |||
3579 | ||||
3580 | // We can demote certain binary operations if we can demote both of their | |||
3581 | // operands. | |||
3582 | case Instruction::Add: | |||
3583 | case Instruction::Sub: | |||
3584 | case Instruction::Mul: | |||
3585 | case Instruction::And: | |||
3586 | case Instruction::Or: | |||
3587 | case Instruction::Xor: | |||
3588 | if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) || | |||
3589 | !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots)) | |||
3590 | return false; | |||
3591 | break; | |||
3592 | ||||
3593 | // We can demote selects if we can demote their true and false values. | |||
3594 | case Instruction::Select: { | |||
3595 | SelectInst *SI = cast<SelectInst>(I); | |||
3596 | if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) || | |||
3597 | !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots)) | |||
3598 | return false; | |||
3599 | break; | |||
3600 | } | |||
3601 | ||||
3602 | // We can demote phis if we can demote all their incoming operands. Note that | |||
3603 | // we don't need to worry about cycles since we ensure single use above. | |||
3604 | case Instruction::PHI: { | |||
3605 | PHINode *PN = cast<PHINode>(I); | |||
3606 | for (Value *IncValue : PN->incoming_values()) | |||
3607 | if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots)) | |||
3608 | return false; | |||
3609 | break; | |||
3610 | } | |||
3611 | ||||
3612 | // Otherwise, conservatively give up. | |||
3613 | default: | |||
3614 | return false; | |||
3615 | } | |||
3616 | ||||
3617 | // Record the value that we can demote. | |||
3618 | ToDemote.push_back(V); | |||
3619 | return true; | |||
3620 | } | |||
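     | // Worked example (hypothetical IR): for a root "%r = add i32 %x, %c" where | |||
     | // %x = zext i16 %w to i32 has a single use inside the expression and %c is | |||
     | // a constant, the add recurses into both operands; the zext and the | |||
     | // constant are demotable, so %x, %c, and %r all land in ToDemote. A trunc | |||
     | // operand would additionally push its source value onto Roots to seed | |||
     | // further demotion. | |||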
3621 | ||||
3622 | void BoUpSLP::computeMinimumValueSizes() { | |||
3623 | // If there are no external uses, the expression tree must be rooted by a | |||
3624 | // store. We can't demote in-memory values, so there is nothing to do here. | |||
3625 | if (ExternalUses.empty()) | |||
3626 | return; | |||
3627 | ||||
3628 | // We only attempt to truncate integer expressions. | |||
3629 | auto &TreeRoot = VectorizableTree[0].Scalars; | |||
3630 | auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType()); | |||
3631 | if (!TreeRootIT) | |||
3632 | return; | |||
3633 | ||||
3634 | // If the expression is not rooted by a store, these roots should have | |||
3635 | // external uses. We will rely on InstCombine to rewrite the expression in | |||
3636 | // the narrower type. However, InstCombine only rewrites single-use values. | |||
3637 | // This means that if a tree entry other than a root is used externally, it | |||
3638 | // must have multiple uses and InstCombine will not rewrite it. The code | |||
3639 | // below ensures that only the roots are used externally. | |||
3640 | SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end()); | |||
3641 | for (auto &EU : ExternalUses) | |||
3642 | if (!Expr.erase(EU.Scalar)) | |||
3643 | return; | |||
3644 | if (!Expr.empty()) | |||
3645 | return; | |||
3646 | ||||
3647 | // Collect the scalar values of the vectorizable expression. We will use this | |||
3648 | // context to determine which values can be demoted. If we see a truncation, | |||
3649 | // we mark it as seeding another demotion. | |||
3650 | for (auto &Entry : VectorizableTree) | |||
3651 | Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end()); | |||
3652 | ||||
3653 | // Ensure the roots of the vectorizable tree don't form a cycle. They must | |||
3654 | // have a single external user that is not in the vectorizable tree. | |||
3655 | for (auto *Root : TreeRoot) | |||
3656 | if (!Root->hasOneUse() || Expr.count(*Root->user_begin())) | |||
3657 | return; | |||
3658 | ||||
3659 | // Conservatively determine if we can actually truncate the roots of the | |||
3660 | // expression. Collect the values that can be demoted in ToDemote and | |||
3661 | // additional roots that require investigating in Roots. | |||
3662 | SmallVector<Value *, 32> ToDemote; | |||
3663 | SmallVector<Value *, 4> Roots; | |||
3664 | for (auto *Root : TreeRoot) | |||
3665 | if (!collectValuesToDemote(Root, Expr, ToDemote, Roots)) | |||
3666 | return; | |||
3667 | ||||
3668 | // The maximum bit width required to represent all the values that can be | |||
3669 | // demoted without loss of precision. It would be safe to truncate the roots | |||
3670 | // of the expression to this width. | |||
3671 | auto MaxBitWidth = 8u; | |||
3672 | ||||
3673 | // We first check if all the bits of the roots are demanded. If they're not, | |||
3674 | // we can truncate the roots to this narrower type. | |||
3675 | for (auto *Root : TreeRoot) { | |||
3676 | auto Mask = DB->getDemandedBits(cast<Instruction>(Root)); | |||
3677 | MaxBitWidth = std::max<unsigned>( | |||
3678 | Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth); | |||
3679 | } | |||
3680 | ||||
3681 | // True if the roots can be zero-extended back to their original type, rather | |||
3682 | // than sign-extended. We know that if the leading bits are not demanded, we | |||
3683 | // can safely zero-extend. So we initialize IsKnownPositive to True. | |||
3684 | bool IsKnownPositive = true; | |||
3685 | ||||
3686 | // If all the bits of the roots are demanded, we can try a little harder to | |||
3687 | // compute a narrower type. This can happen, for example, if the roots are | |||
3688 | // getelementptr indices. InstCombine promotes these indices to the pointer | |||
3689 | // width. Thus, all their bits are technically demanded even though the | |||
3690 | // address computation might be vectorized in a smaller type. | |||
3691 | // | |||
3692 | // We start by looking at each entry that can be demoted. We compute the | |||
3693 | // maximum bit width required to store the scalar by using ValueTracking to | |||
3694 | // compute the number of high-order bits we can truncate. | |||
3695 | if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) { | |||
3696 | MaxBitWidth = 8u; | |||
3697 | ||||
3698 | // Determine if the sign bit of all the roots is known to be zero. If not, | |||
3699 | // IsKnownPositive is set to False. | |||
3700 | IsKnownPositive = all_of(TreeRoot, [&](Value *R) { | |||
3701 | KnownBits Known = computeKnownBits(R, *DL); | |||
3702 | return Known.isNonNegative(); | |||
3703 | }); | |||
3704 | ||||
3705 | // Determine the maximum number of bits required to store the scalar | |||
3706 | // values. | |||
3707 | for (auto *Scalar : ToDemote) { | |||
3708 | auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT); | |||
3709 | auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType()); | |||
3710 | MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth); | |||
3711 | } | |||
3712 | ||||
3713 | // If we can't prove that the sign bit is zero, we must add one to the | |||
3714 | // maximum bit width to account for the unknown sign bit. This preserves | |||
3715 | // the existing sign bit so we can safely sign-extend the root back to the | |||
3716 | // original type. Otherwise, if we know the sign bit is zero, we will | |||
3717 | // zero-extend the root instead. | |||
3718 | // | |||
3719 | // FIXME: This is somewhat suboptimal, as there will be cases where adding | |||
3720 | // one to the maximum bit width will yield a larger-than-necessary | |||
3721 | // type. In general, we need to add an extra bit only if we can't | |||
3722 | // prove that the upper bit of the original type is equal to the | |||
3723 | // upper bit of the proposed smaller type. If these two bits are the | |||
3724 | // same (either zero or one) we know that sign-extending from the | |||
3725 | // smaller type will result in the same value. Here, since we can't | |||
3726 | // yet prove this, we are just making the proposed smaller type | |||
3727 | // larger to ensure correctness. | |||
3728 | if (!IsKnownPositive) | |||
3729 | ++MaxBitWidth; | |||
3730 | } | |||
3731 | ||||
3732 | // Round MaxBitWidth up to the next power-of-two. | |||
3733 | if (!isPowerOf2_64(MaxBitWidth)) | |||
3734 | MaxBitWidth = NextPowerOf2(MaxBitWidth); | |||
3735 | ||||
3736 | // If the maximum bit width we compute is less than the width of the roots' | |||
3737 | // type, we can proceed with the narrowing. Otherwise, do nothing. | |||
3738 | if (MaxBitWidth >= TreeRootIT->getBitWidth()) | |||
3739 | return; | |||
3740 | ||||
3741 | // If we can truncate the root, we must collect additional values that might | |||
3742 | // be demoted as a result. That is, those seeded by truncations we will | |||
3743 | // modify. | |||
3744 | while (!Roots.empty()) | |||
3745 | collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots); | |||
3746 | ||||
3747 | // Finally, map the values we can demote to the maximum bit width we computed. | |||
3748 | for (auto *Scalar : ToDemote) | |||
3749 | MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive); | |||
3750 | } | |||
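     | // Worked example (invented numbers): for an i32-rooted tree where every | |||
     | // demoted scalar has at least 20 known sign bits, the second pass computes | |||
     | // MaxBitWidth = max(32 - 20, 8) = 12; if the sign bit is not provably zero | |||
     | // this becomes 13, and rounding up to a power of two gives 16. Every value | |||
     | // in ToDemote is then mapped to a 16-bit type that is sign-extended back. | |||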
3751 | ||||
3752 | namespace { | |||
3753 | /// The SLPVectorizer Pass. | |||
3754 | struct SLPVectorizer : public FunctionPass { | |||
3755 | SLPVectorizerPass Impl; | |||
3756 | ||||
3757 | /// Pass identification, replacement for typeid | |||
3758 | static char ID; | |||
3759 | ||||
3760 | explicit SLPVectorizer() : FunctionPass(ID) { | |||
3761 | initializeSLPVectorizerPass(*PassRegistry::getPassRegistry()); | |||
3762 | } | |||
3763 | ||||
3764 | ||||
3765 | bool doInitialization(Module &M) override { | |||
3766 | return false; | |||
3767 | } | |||
3768 | ||||
3769 | bool runOnFunction(Function &F) override { | |||
3770 | if (skipFunction(F)) | |||
3771 | return false; | |||
3772 | ||||
3773 | auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); | |||
3774 | auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); | |||
3775 | auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); | |||
3776 | auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; | |||
3777 | auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); | |||
3778 | auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); | |||
3779 | auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); | |||
3780 | auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); | |||
3781 | auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits(); | |||
3782 | auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); | |||
3783 | ||||
3784 | return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); | |||
3785 | } | |||
3786 | ||||
3787 | void getAnalysisUsage(AnalysisUsage &AU) const override { | |||
3788 | FunctionPass::getAnalysisUsage(AU); | |||
3789 | AU.addRequired<AssumptionCacheTracker>(); | |||
3790 | AU.addRequired<ScalarEvolutionWrapperPass>(); | |||
3791 | AU.addRequired<AAResultsWrapperPass>(); | |||
3792 | AU.addRequired<TargetTransformInfoWrapperPass>(); | |||
3793 | AU.addRequired<LoopInfoWrapperPass>(); | |||
3794 | AU.addRequired<DominatorTreeWrapperPass>(); | |||
3795 | AU.addRequired<DemandedBitsWrapperPass>(); | |||
3796 | AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); | |||
3797 | AU.addPreserved<LoopInfoWrapperPass>(); | |||
3798 | AU.addPreserved<DominatorTreeWrapperPass>(); | |||
3799 | AU.addPreserved<AAResultsWrapperPass>(); | |||
3800 | AU.addPreserved<GlobalsAAWrapperPass>(); | |||
3801 | AU.setPreservesCFG(); | |||
3802 | } | |||
3803 | }; | |||
3804 | } // end anonymous namespace | |||
3805 | ||||
3806 | PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { | |||
3807 | auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F); | |||
3808 | auto *TTI = &AM.getResult<TargetIRAnalysis>(F); | |||
3809 | auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F); | |||
3810 | auto *AA = &AM.getResult<AAManager>(F); | |||
3811 | auto *LI = &AM.getResult<LoopAnalysis>(F); | |||
3812 | auto *DT = &AM.getResult<DominatorTreeAnalysis>(F); | |||
3813 | auto *AC = &AM.getResult<AssumptionAnalysis>(F); | |||
3814 | auto *DB = &AM.getResult<DemandedBitsAnalysis>(F); | |||
3815 | auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F); | |||
3816 | ||||
3817 | bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); | |||
3818 | if (!Changed) | |||
3819 | return PreservedAnalyses::all(); | |||
3820 | ||||
3821 | PreservedAnalyses PA; | |||
3822 | PA.preserveSet<CFGAnalyses>(); | |||
3823 | PA.preserve<AAManager>(); | |||
3824 | PA.preserve<GlobalsAA>(); | |||
3825 | return PA; | |||
3826 | } | |||
3827 | ||||
3828 | bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, | |||
3829 | TargetTransformInfo *TTI_, | |||
3830 | TargetLibraryInfo *TLI_, AliasAnalysis *AA_, | |||
3831 | LoopInfo *LI_, DominatorTree *DT_, | |||
3832 | AssumptionCache *AC_, DemandedBits *DB_, | |||
3833 | OptimizationRemarkEmitter *ORE_) { | |||
3834 | SE = SE_; | |||
3835 | TTI = TTI_; | |||
3836 | TLI = TLI_; | |||
3837 | AA = AA_; | |||
3838 | LI = LI_; | |||
3839 | DT = DT_; | |||
3840 | AC = AC_; | |||
3841 | DB = DB_; | |||
3842 | DL = &F.getParent()->getDataLayout(); | |||
3843 | ||||
3844 | Stores.clear(); | |||
3845 | GEPs.clear(); | |||
3846 | bool Changed = false; | |||
3847 | ||||
3848 | // If the target claims to have no vector registers, don't attempt | |||
3849 | // vectorization. | |||
3850 | if (!TTI->getNumberOfRegisters(true)) | |||
3851 | return false; | |||
3852 | ||||
3853 | // Don't vectorize when the attribute NoImplicitFloat is used. | |||
3854 | if (F.hasFnAttribute(Attribute::NoImplicitFloat)) | |||
3855 | return false; | |||
3856 | ||||
3857 | DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); | |||
3858 | ||||
3859 | // Use the bottom up slp vectorizer to construct chains that start with | |||
3860 | // store instructions. | |||
3861 | BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_); | |||
3862 | ||||
3863 | // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to | |||
3864 | // delete instructions. | |||
3865 | ||||
3866 | // Scan the blocks in the function in post order. | |||
3867 | for (auto BB : post_order(&F.getEntryBlock())) { | |||
3868 | collectSeedInstructions(BB); | |||
3869 | ||||
3870 | // Vectorize trees that end at stores. | |||
3871 | if (!Stores.empty()) { | |||
3872 | DEBUG(dbgs() << "SLP: Found stores for " << Stores.size() | |||
3873 | << " underlying objects.\n"); | |||
3874 | Changed |= vectorizeStoreChains(R); | |||
3875 | } | |||
3876 | ||||
3877 | // Vectorize trees that end at reductions. | |||
3878 | Changed |= vectorizeChainsInBlock(BB, R); | |||
3879 | ||||
3880 | // Vectorize the index computations of getelementptr instructions. This | |||
3881 | // is primarily intended to catch gather-like idioms ending at | |||
3882 | // non-consecutive loads. | |||
3883 | if (!GEPs.empty()) { | |||
3884 | DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size() | |||
3885 | << " underlying objects.\n"); | |||
3886 | Changed |= vectorizeGEPIndices(BB, R); | |||
3887 | } | |||
3888 | } | |||
3889 | ||||
3890 | if (Changed) { | |||
3891 | R.optimizeGatherSequence(); | |||
3892 | DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); | |||
3893 | DEBUG(verifyFunction(F)); | |||
3894 | } | |||
3895 | return Changed; | |||
3896 | } | |||
3897 | ||||
3898 | /// \brief Check that the Values in the slice in VL array are still existent in | |||
3899 | /// the WeakTrackingVH array. | |||
3900 | /// Vectorization of part of the VL array may cause later values in the VL array | |||
3901 | /// to become invalid. We track when this has happened in the WeakTrackingVH | |||
3902 | /// array. | |||
3903 | static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, | |||
3904 | ArrayRef<WeakTrackingVH> VH, unsigned SliceBegin, | |||
3905 | unsigned SliceSize) { | |||
3906 | VL = VL.slice(SliceBegin, SliceSize); | |||
3907 | VH = VH.slice(SliceBegin, SliceSize); | |||
3908 | return !std::equal(VL.begin(), VL.end(), VH.begin()); | |||
3909 | } | |||
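     | // Usage sketch: callers snapshot their candidate list into WeakTrackingVH | |||
     | // handles before vectorizing; when a later slice no longer compares equal | |||
     | // to its snapshot, some value in it was RAUW'd or deleted, and the slice | |||
     | // is skipped rather than revisited. | |||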
3910 | ||||
3911 | bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, | |||
3912 | unsigned VecRegSize) { | |||
3913 | unsigned ChainLen = Chain.size(); | |||
3914 | DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen | |||
3915 | << "\n"); | |||
3916 | unsigned Sz = R.getVectorElementSize(Chain[0]); | |||
3917 | unsigned VF = VecRegSize / Sz; | |||
3918 | ||||
3919 | if (!isPowerOf2_32(Sz) || VF < 2) | |||
3920 | return false; | |||
3921 | ||||
3922 | // Keep track of values that were deleted by vectorizing in the loop below. | |||
3923 | SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end()); | |||
3924 | ||||
3925 | bool Changed = false; | |||
3926 | // Look for profitable vectorizable trees at all offsets, starting at zero. | |||
3927 | for (unsigned i = 0, e = ChainLen; i < e; ++i) { | |||
3928 | if (i + VF > e) | |||
3929 | break; | |||
3930 | ||||
3931 | // Check that a previous iteration of this loop did not delete the Value. | |||
3932 | if (hasValueBeenRAUWed(Chain, TrackValues, i, VF)) | |||
3933 | continue; | |||
3934 | ||||
3935 | DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i | |||
3936 | << "\n"); | |||
3937 | ArrayRef<Value *> Operands = Chain.slice(i, VF); | |||
3938 | ||||
3939 | R.buildTree(Operands); | |||
3940 | if (R.isTreeTinyAndNotFullyVectorizable()) | |||
3941 | continue; | |||
3942 | ||||
3943 | R.computeMinimumValueSizes(); | |||
3944 | ||||
3945 | int Cost = R.getTreeCost(); | |||
3946 | ||||
3947 | DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); | |||
3948 | if (Cost < -SLPCostThreshold) { | |||
3949 | DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); | |||
3950 | using namespace ore; | |||
3951 | R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", | |||
3952 | cast<StoreInst>(Chain[i])) | |||
3953 | << "Stores SLP vectorized with cost " << NV("Cost", Cost) | |||
3954 | << " and with tree size " | |||
3955 | << NV("TreeSize", R.getTreeSize())); | |||
3956 | ||||
3957 | R.vectorizeTree(); | |||
3958 | ||||
3959 | // Move to the next bundle. | |||
3960 | i += VF - 1; | |||
3961 | Changed = true; | |||
3962 | } | |||
3963 | } | |||
3964 | ||||
3965 | return Changed; | |||
3966 | } | |||
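     | // Worked example (invented sizes): with VecRegSize = 128 and 32-bit stored | |||
     | // values, Sz = 32 and VF = 4, so a chain of ten stores is examined at | |||
     | // offsets 0..6; a profitable tree at offset 0 vectorizes stores 0..3 and | |||
     | // the "i += VF - 1" bump resumes the scan at offset 4. | |||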
3967 | ||||
3968 | bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, | |||
3969 | BoUpSLP &R) { | |||
3970 | SetVector<StoreInst *> Heads, Tails; | |||
3971 | SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain; | |||
3972 | ||||
3973 | // We may run into multiple chains that merge into a single chain. We mark the | |||
3974 | // stores that we vectorized so that we don't visit the same store twice. | |||
3975 | BoUpSLP::ValueSet VectorizedStores; | |||
3976 | bool Changed = false; | |||
3977 | ||||
3978 | // Do a quadratic search on all of the given stores and find | |||
3979 | // all of the pairs of stores that follow each other. | |||
3980 | SmallVector<unsigned, 16> IndexQueue; | |||
3981 | for (unsigned i = 0, e = Stores.size(); i < e; ++i) { | |||
3982 | IndexQueue.clear(); | |||
3983 | // If a store has multiple consecutive store candidates, search Stores | |||
3984 | // array according to the sequence: from i+1 to e, then from i-1 to 0. | |||
3985 | // This is because pairing with the immediately succeeding or preceding | |||
3986 | // candidate usually creates the best chance to find an SLP vectorization opportunity. | |||
3987 | unsigned j = 0; | |||
3988 | for (j = i + 1; j < e; ++j) | |||
3989 | IndexQueue.push_back(j); | |||
3990 | for (j = i; j > 0; --j) | |||
3991 | IndexQueue.push_back(j - 1); | |||
3992 | ||||
3993 | for (auto &k : IndexQueue) { | |||
3994 | if (isConsecutiveAccess(Stores[i], Stores[k], *DL, *SE)) { | |||
3995 | Tails.insert(Stores[k]); | |||
3996 | Heads.insert(Stores[i]); | |||
3997 | ConsecutiveChain[Stores[i]] = Stores[k]; | |||
3998 | break; | |||
3999 | } | |||
4000 | } | |||
4001 | } | |||
4002 | ||||
4003 | // For stores that start but don't end a link in the chain: | |||
4004 | for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end(); | |||
4005 | it != e; ++it) { | |||
4006 | if (Tails.count(*it)) | |||
4007 | continue; | |||
4008 | ||||
4009 | // We found a store instr that starts a chain. Now follow the chain and try | |||
4010 | // to vectorize it. | |||
4011 | BoUpSLP::ValueList Operands; | |||
4012 | StoreInst *I = *it; | |||
4013 | // Collect the chain into a list. | |||
4014 | while (Tails.count(I) || Heads.count(I)) { | |||
4015 | if (VectorizedStores.count(I)) | |||
4016 | break; | |||
4017 | Operands.push_back(I); | |||
4018 | // Move to the next value in the chain. | |||
4019 | I = ConsecutiveChain[I]; | |||
4020 | } | |||
4021 | ||||
4022 | // FIXME: Is division-by-2 the correct step? Should we assert that the | |||
4023 | // register size is a power-of-2? | |||
4024 | for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize(); | |||
4025 | Size /= 2) { | |||
4026 | if (vectorizeStoreChain(Operands, R, Size)) { | |||
4027 | // Mark the vectorized stores so that we don't vectorize them again. | |||
4028 | VectorizedStores.insert(Operands.begin(), Operands.end()); | |||
4029 | Changed = true; | |||
4030 | break; | |||
4031 | } | |||
4032 | } | |||
4033 | } | |||
4034 | ||||
4035 | return Changed; | |||
4036 | } | |||
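     | // Illustration (hypothetical stores): for stores to a[0], a[1], a[2] seen | |||
     | // in that order, the quadratic search records a[0]->a[1] and a[1]->a[2] in | |||
     | // ConsecutiveChain and marks a[1], a[2] as Tails; the follow-up walk then | |||
     | // starts from the untailed head a[0], collects the whole chain, and hands | |||
     | // it to vectorizeStoreChain at decreasing register sizes. | |||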
4037 | ||||
4038 | void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) { | |||
4039 | ||||
4040 | // Initialize the collections. We will make a single pass over the block. | |||
4041 | Stores.clear(); | |||
4042 | GEPs.clear(); | |||
4043 | ||||
4044 | // Visit the store and getelementptr instructions in BB and organize them in | |||
4045 | // Stores and GEPs according to the underlying objects of their pointer | |||
4046 | // operands. | |||
4047 | for (Instruction &I : *BB) { | |||
4048 | ||||
4049 | // Ignore store instructions that are volatile or have a pointer operand | |||
4050 | // that doesn't point to a scalar type. | |||
4051 | if (auto *SI = dyn_cast<StoreInst>(&I)) { | |||
4052 | if (!SI->isSimple()) | |||
4053 | continue; | |||
4054 | if (!isValidElementType(SI->getValueOperand()->getType())) | |||
4055 | continue; | |||
4056 | Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI); | |||
4057 | } | |||
4058 | ||||
4059 | // Ignore getelementptr instructions that have more than one index, a | |||
4060 | // constant index, or a pointer operand that doesn't point to a scalar | |||
4061 | // type. | |||
4062 | else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { | |||
4063 | auto Idx = GEP->idx_begin()->get(); | |||
4064 | if (GEP->getNumIndices() > 1 || isa<Constant>(Idx)) | |||
4065 | continue; | |||
4066 | if (!isValidElementType(Idx->getType())) | |||
4067 | continue; | |||
4068 | if (GEP->getType()->isVectorTy()) | |||
4069 | continue; | |||
4070 | GEPs[GetUnderlyingObject(GEP->getPointerOperand(), *DL)].push_back(GEP); | |||
4071 | } | |||
4072 | } | |||
4073 | } | |||
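     | // Illustration (hypothetical block): stores to a[i] and a[i+1] share the | |||
     | // underlying object "a" and land in the same Stores group, while a store | |||
     | // through an unrelated pointer "b" seeds its own group; the later chain | |||
     | // searches therefore only compare accesses that could plausibly be | |||
     | // consecutive. | |||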
4074 | ||||
4075 | bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { | |||
4076 | if (!A || !B) | |||
4077 | return false; | |||
4078 | Value *VL[] = { A, B }; | |||
4079 | return tryToVectorizeList(VL, R, None, true); | |||
4080 | } | |||
4081 | ||||
4082 | bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, | |||
4083 | ArrayRef<Value *> BuildVector, | |||
4084 | bool AllowReorder) { | |||
4085 | if (VL.size() < 2) | |||
4086 | return false; | |||
4087 | ||||
4088 | DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size() | |||
4089 | << ".\n"); | |||
4090 | ||||
4091 | // Check that all of the parts are scalar instructions of the same type. | |||
4092 | Instruction *I0 = dyn_cast<Instruction>(VL[0]); | |||
4093 | if (!I0) | |||
4094 | return false; | |||
4095 | ||||
4096 | unsigned Opcode0 = I0->getOpcode(); | |||
4097 | ||||
4098 | unsigned Sz = R.getVectorElementSize(I0); | |||
4099 | unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); | |||
4100 | unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF); | |||
4101 | if (MaxVF < 2) | |||
4102 | return false; | |||
4103 | ||||
4104 | for (Value *V : VL) { | |||
4105 | Type *Ty = V->getType(); | |||
4106 | if (!isValidElementType(Ty)) | |||
4107 | return false; | |||
4108 | Instruction *Inst = dyn_cast<Instruction>(V); | |||
4109 | if (!Inst || Inst->getOpcode() != Opcode0) | |||
4110 | return false; | |||
4111 | } | |||
4112 | ||||
4113 | bool Changed = false; | |||
4114 | ||||
4115 | // Keep track of values that were deleted by vectorizing in the loop below. | |||
4116 | SmallVector<WeakTrackingVH, 8> TrackValues(VL.begin(), VL.end()); | |||
4117 | ||||
4118 | unsigned NextInst = 0, MaxInst = VL.size(); | |||
4119 | for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; | |||
4120 | VF /= 2) { | |||
4121 | // No actual vectorization should happen if the number of parts is the same | |||
4122 | // as the provided vectorization factor (i.e. the scalar type is used for | |||
4123 | // vector code during codegen). | |||
4124 | auto *VecTy = VectorType::get(VL[0]->getType(), VF); | |||
4125 | if (TTI->getNumberOfParts(VecTy) == VF) | |||
4126 | continue; | |||
4127 | for (unsigned I = NextInst; I < MaxInst; ++I) { | |||
4128 | unsigned OpsWidth = 0; | |||
4129 | ||||
4130 | if (I + VF > MaxInst) | |||
4131 | OpsWidth = MaxInst - I; | |||
4132 | else | |||
4133 | OpsWidth = VF; | |||
4134 | ||||
4135 | if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) | |||
4136 | break; | |||
4137 | ||||
4138 | // Check that a previous iteration of this loop did not delete the Value. | |||
4139 | if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth)) | |||
4140 | continue; | |||
4141 | ||||
4142 | DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " | |||
4143 | << "\n"); | |||
4144 | ArrayRef<Value *> Ops = VL.slice(I, OpsWidth); | |||
4145 | ||||
4146 | ArrayRef<Value *> BuildVectorSlice; | |||
4147 | if (!BuildVector.empty()) | |||
4148 | BuildVectorSlice = BuildVector.slice(I, OpsWidth); | |||
4149 | ||||
4150 | R.buildTree(Ops, BuildVectorSlice); | |||
4151 | // TODO: check if we can allow reordering for more cases. | |||
4152 | if (AllowReorder && R.shouldReorder()) { | |||
4153 | // Conceptually, there is nothing actually preventing us from trying to | |||
4154 | // reorder a larger list. In fact, we do exactly this when vectorizing | |||
4155 | // reductions. However, at this point, we only expect to get here when | |||
4156 | // there are exactly two operations. | |||
4157 | assert(Ops.size() == 2); | |||
4158 | assert(BuildVectorSlice.empty()); | |||
4159 | Value *ReorderedOps[] = {Ops[1], Ops[0]}; | |||
4160 | R.buildTree(ReorderedOps, None); | |||
4161 | } | |||
4162 | if (R.isTreeTinyAndNotFullyVectorizable()) | |||
4163 | continue; | |||
4164 | ||||
4165 | R.computeMinimumValueSizes(); | |||
4166 | int Cost = R.getTreeCost(); | |||
4167 | ||||
4168 | if (Cost < -SLPCostThreshold) { | |||
4169 | DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); | |||
4170 | R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList", | |||
4171 | cast<Instruction>(Ops[0])) | |||
4172 | << "SLP vectorized with cost " << ore::NV("Cost", Cost) | |||
4173 | << " and with tree size " | |||
4174 | << ore::NV("TreeSize", R.getTreeSize())); | |||
4175 | ||||
4176 | Value *VectorizedRoot = R.vectorizeTree(); | |||
4177 | ||||
4178 | // Reconstruct the build vector by extracting the vectorized root. This | |||
4179 | // way we handle the case where some elements of the vector are | |||
4180 | // undefined. | |||
4181 | // (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2)) | |||
4182 | if (!BuildVectorSlice.empty()) { | |||
4183 | // The insert point is the last build vector instruction. The | |||
4184 | // vectorized root will precede it. This guarantees that we get an | |||
4185 | // instruction, since the vectorized tree could have been constant folded. | |||
4186 | Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back()); | |||
4187 | unsigned VecIdx = 0; | |||
4188 | for (auto &V : BuildVectorSlice) { | |||
4189 | IRBuilder<NoFolder> Builder(InsertAfter->getParent(), | |||
4190 | ++BasicBlock::iterator(InsertAfter)); | |||
4191 | Instruction *I = cast<Instruction>(V); | |||
4192 | assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I)); | |||
4193 | Instruction *Extract = | |||
4194 | cast<Instruction>(Builder.CreateExtractElement( | |||
4195 | VectorizedRoot, Builder.getInt32(VecIdx++))); | |||
4196 | I->setOperand(1, Extract); | |||
4197 | I->removeFromParent(); | |||
4198 | I->insertAfter(Extract); | |||
4199 | InsertAfter = I; | |||
4200 | } | |||
4201 | } | |||
4202 | // Move to the next bundle. | |||
4203 | I += VF - 1; | |||
4204 | NextInst = I + 1; | |||
4205 | Changed = true; | |||
4206 | } | |||
4207 | } | |||
4208 | } | |||
4209 | ||||
4210 | return Changed; | |||
4211 | } | |||
4212 | ||||
4213 | bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { | |||
4214 | if (!V) | |||
4215 | return false; | |||
4216 | ||||
4217 | Value *P = V->getParent(); | |||
4218 | ||||
4219 | // Vectorize in current basic block only. | |||
4220 | auto *Op0 = dyn_cast<Instruction>(V->getOperand(0)); | |||
4221 | auto *Op1 = dyn_cast<Instruction>(V->getOperand(1)); | |||
4222 | if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P) | |||
4223 | return false; | |||
4224 | ||||
4225 | // Try to vectorize V. | |||
4226 | if (tryToVectorizePair(Op0, Op1, R)) | |||
4227 | return true; | |||
4228 | ||||
4229 | auto *A = dyn_cast<BinaryOperator>(Op0); | |||
4230 | auto *B = dyn_cast<BinaryOperator>(Op1); | |||
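// An illustrative shape for the probing below (invented operand names): for
// V == A + (B0 + B1), where A, B0 and B1 are binary operators in this block,
// the pair (A, B) may fail to vectorize, so we look through the single-use
// operand B and try the pairs (A, B0) and (A, B1) instead; the following
// block does the symmetric probing through A.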
4231 | // Try to skip B. | |||
4232 | if (B && B->hasOneUse()) { | |||
4233 | auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0)); | |||
4234 | auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1)); | |||
4235 | if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R)) | |||
4236 | return true; | |||
4237 | if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R)) | |||
4238 | return true; | |||
4239 | } | |||
4240 | ||||
4241 | // Try to skip A. | |||
4242 | if (A && A->hasOneUse()) { | |||
4243 | auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0)); | |||
4244 | auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1)); | |||
4245 | if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R)) | |||
4246 | return true; | |||
4247 | if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R)) | |||
4248 | return true; | |||
4249 | } | |||
4250 | return false; | |||
4251 | } | |||
4252 | ||||
4253 | /// \brief Generate a shuffle mask to be used in a reduction tree. | |||
4254 | /// | |||
4255 | /// \param VecLen The length of the vector to be reduced. | |||
4256 | /// \param NumEltsToRdx The number of elements that should be reduced in the | |||
4257 | /// vector. | |||
4258 | /// \param IsPairwise Whether the reduction is a pairwise or splitting | |||
4259 | /// reduction. A pairwise reduction will generate a mask of | |||
4260 | /// <0,2,...> or <1,3,..> while a splitting reduction will generate | |||
4261 | /// <2,3, undef,undef> for a vector of 4 and NumElts = 2. | |||
4262 | /// \param IsLeft True will generate a mask of even elements, odd otherwise. | |||
4263 | static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx, | |||
4264 | bool IsPairwise, bool IsLeft, | |||
4265 | IRBuilder<> &Builder) { | |||
4266 | assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask"); | |||
4267 | ||||
4268 | SmallVector<Constant *, 32> ShuffleMask( | |||
4269 | VecLen, UndefValue::get(Builder.getInt32Ty())); | |||
4270 | ||||
4271 | if (IsPairwise) | |||
4272 | // Build a mask of 0, 2, ... (left) or 1, 3, ... (right). | |||
4273 | for (unsigned i = 0; i != NumEltsToRdx; ++i) | |||
4274 | ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft); | |||
4275 | else | |||
4276 | // Move the upper half of the vector to the lower half. | |||
4277 | for (unsigned i = 0; i != NumEltsToRdx; ++i) | |||
4278 | ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i); | |||
4279 | ||||
4280 | return ConstantVector::get(ShuffleMask); | |||
4281 | } | |||
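// A sketch of the masks the helper above produces, assuming VecLen == 4 and
// NumEltsToRdx == 2 (values chosen purely for illustration):
//   pairwise, IsLeft:   <0, 2, undef, undef>
//   pairwise, !IsLeft:  <1, 3, undef, undef>
//   splitting:          <2, 3, undef, undef>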
4282 | ||||
4283 | namespace { | |||
4284 | /// Model horizontal reductions. | |||
4285 | /// | |||
4286 | /// A horizontal reduction is a tree of reduction operations (currently add and | |||
4287 | /// fadd) that has operations that can be put into a vector as its leaf. | |||
4288 | /// For example, this tree: | |||
4289 | /// | |||
4290 | /// mul mul mul mul | |||
4291 | /// \ / \ / | |||
4292 | /// + + | |||
4293 | /// \ / | |||
4294 | /// + | |||
4295 | /// This tree has "mul" as its reduced values and "+" as its reduction | |||
4296 | /// operations. A reduction might be feeding into a store or a binary operation | |||
4297 | /// feeding a phi. | |||
4298 | /// ... | |||
4299 | /// \ / | |||
4300 | /// + | |||
4301 | /// | | |||
4302 | /// phi += | |||
4303 | /// | |||
4304 | /// Or: | |||
4305 | /// ... | |||
4306 | /// \ / | |||
4307 | /// + | |||
4308 | /// | | |||
4309 | /// *p = | |||
4310 | /// | |||
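// As a concrete illustration (hand-written IR, not taken from the source),
// the first tree above could look like:
//   %m0 = mul i32 %a0, %b0
//   %m1 = mul i32 %a1, %b1
//   %m2 = mul i32 %a2, %b2
//   %m3 = mul i32 %a3, %b3
//   %s0 = add i32 %m0, %m1
//   %s1 = add i32 %m2, %m3
//   %r  = add i32 %s0, %s1
// Here the muls are the reduced values and the adds the reduction operations.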
4311 | class HorizontalReduction { | |||
4312 | SmallVector<Value *, 16> ReductionOps; | |||
4313 | SmallVector<Value *, 32> ReducedVals; | |||
4314 | // Use map vector to make stable output. | |||
4315 | MapVector<Instruction *, Value *> ExtraArgs; | |||
4316 | ||||
4317 | BinaryOperator *ReductionRoot = nullptr; | |||
4318 | ||||
4319 | /// The opcode of the reduction. | |||
4320 | Instruction::BinaryOps ReductionOpcode = Instruction::BinaryOpsEnd; | |||
4321 | /// The opcode of the values we perform a reduction on. | |||
4322 | unsigned ReducedValueOpcode = 0; | |||
4323 | /// Should we model this reduction as a pairwise reduction tree or a tree that | |||
4324 | /// splits the vector in halves and adds those halves. | |||
4325 | bool IsPairwiseReduction = false; | |||
4326 | ||||
4327 | /// Checks if the ParentStackElem.first should be marked as a reduction | |||
4328 | /// operation with an extra argument or as extra argument itself. | |||
4329 | void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem, | |||
4330 | Value *ExtraArg) { | |||
4331 | if (ExtraArgs.count(ParentStackElem.first)) { | |||
4332 | ExtraArgs[ParentStackElem.first] = nullptr; | |||
4333 | // We ran into something like: | |||
4334 | // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg. | |||
4335 | // The whole ParentStackElem.first should be considered as an extra value | |||
4336 | // in this case. | |||
4337 | // Do not perform analysis of remaining operands of ParentStackElem.first | |||
4338 | // instruction, this whole instruction is an extra argument. | |||
4339 | ParentStackElem.second = ParentStackElem.first->getNumOperands(); | |||
4340 | } else { | |||
4341 | // We ran into something like: | |||
4342 | // ParentStackElem.first += ... + ExtraArg + ... | |||
4343 | ExtraArgs[ParentStackElem.first] = ExtraArg; | |||
4344 | } | |||
4345 | } | |||
4346 | ||||
4347 | public: | |||
4348 | HorizontalReduction() = default; | |||
4349 | ||||
4350 | /// \brief Try to find a reduction tree. | |||
4351 | bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) { | |||
4352 | assert((!Phi || is_contained(Phi->operands(), B)) && | |||
4353 | "This phi needs to use the binary operator"); | |||
4354 | ||||
4355 | // We could have an initial reduction that is not an add. | |||
4356 | // r *= v1 + v2 + v3 + v4 | |||
4357 | // In such a case start looking for a tree rooted in the first '+'. | |||
4358 | if (Phi) { | |||
4359 | if (B->getOperand(0) == Phi) { | |||
4360 | Phi = nullptr; | |||
4361 | B = dyn_cast<BinaryOperator>(B->getOperand(1)); | |||
4362 | } else if (B->getOperand(1) == Phi) { | |||
4363 | Phi = nullptr; | |||
4364 | B = dyn_cast<BinaryOperator>(B->getOperand(0)); | |||
4365 | } | |||
4366 | } | |||
4367 | ||||
4368 | if (!B) | |||
4369 | return false; | |||
4370 | ||||
4371 | Type *Ty = B->getType(); | |||
4372 | if (!isValidElementType(Ty)) | |||
4373 | return false; | |||
4374 | ||||
4375 | ReductionOpcode = B->getOpcode(); | |||
4376 | ReducedValueOpcode = 0; | |||
4377 | ReductionRoot = B; | |||
4378 | ||||
4379 | // We currently only support adds. | |||
4380 | if ((ReductionOpcode != Instruction::Add && | |||
4381 | ReductionOpcode != Instruction::FAdd) || | |||
4382 | !B->isAssociative()) | |||
4383 | return false; | |||
4384 | ||||
4385 | // Post order traverse the reduction tree starting at B. We only handle true | |||
4386 | // trees containing only binary operators or selects. | |||
4387 | SmallVector<std::pair<Instruction *, unsigned>, 32> Stack; | |||
4388 | Stack.push_back(std::make_pair(B, 0)); | |||
4389 | while (!Stack.empty()) { | |||
4390 | Instruction *TreeN = Stack.back().first; | |||
4391 | unsigned EdgeToVisit = Stack.back().second++; | |||
4392 | bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode; | |||
4393 | ||||
4394 | // Postorder visit. | |||
4395 | if (EdgeToVisit == 2 || IsReducedValue) { | |||
4396 | if (IsReducedValue) | |||
4397 | ReducedVals.push_back(TreeN); | |||
4398 | else { | |||
4399 | auto I = ExtraArgs.find(TreeN); | |||
4400 | if (I != ExtraArgs.end() && !I->second) { | |||
4401 | // Check if TreeN is an extra argument of its parent operation. | |||
4402 | if (Stack.size() <= 1) { | |||
4403 | // TreeN can't be an extra argument as it is a root reduction | |||
4404 | // operation. | |||
4405 | return false; | |||
4406 | } | |||
4407 | // Yes, TreeN is an extra argument, do not add it to a list of | |||
4408 | // reduction operations. | |||
4409 | // Stack[Stack.size() - 2] always points to the parent operation. | |||
4410 | markExtraArg(Stack[Stack.size() - 2], TreeN); | |||
4411 | ExtraArgs.erase(TreeN); | |||
4412 | } else | |||
4413 | ReductionOps.push_back(TreeN); | |||
4414 | } | |||
4415 | // Retract. | |||
4416 | Stack.pop_back(); | |||
4417 | continue; | |||
4418 | } | |||
4419 | ||||
4420 | // Visit left or right. | |||
4421 | Value *NextV = TreeN->getOperand(EdgeToVisit); | |||
4422 | if (NextV != Phi) { | |||
4423 | auto *I = dyn_cast<Instruction>(NextV); | |||
4424 | // Continue analysis if the next operand is a reduction operation or | |||
4425 | // (possibly) a reduced value. If the reduced value opcode is not set yet, | |||
4426 | // the first operation encountered that is not the reduction operation | |||
4427 | // determines the reduced value class. | |||
4428 | if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode || | |||
4429 | I->getOpcode() == ReductionOpcode)) { | |||
4430 | // Only handle trees in the current basic block. | |||
4431 | if (I->getParent() != B->getParent()) { | |||
4432 | // I is an extra argument for TreeN (its parent operation). | |||
4433 | markExtraArg(Stack.back(), I); | |||
4434 | continue; | |||
4435 | } | |||
4436 | ||||
4437 | // Each tree node needs to have one user except for the ultimate | |||
4438 | // reduction. | |||
4439 | if (!I->hasOneUse() && I != B) { | |||
4440 | // I is an extra argument for TreeN (its parent operation). | |||
4441 | markExtraArg(Stack.back(), I); | |||
4442 | continue; | |||
4443 | } | |||
4444 | ||||
4445 | if (I->getOpcode() == ReductionOpcode) { | |||
4446 | // We need to be able to reassociate the reduction operations. | |||
4447 | if (!I->isAssociative()) { | |||
4448 | // I is an extra argument for TreeN (its parent operation). | |||
4449 | markExtraArg(Stack.back(), I); | |||
4450 | continue; | |||
4451 | } | |||
4452 | } else if (ReducedValueOpcode && | |||
4453 | ReducedValueOpcode != I->getOpcode()) { | |||
4454 | // Make sure that the opcodes of the operations that we are going to | |||
4455 | // reduce match. | |||
4456 | // I is an extra argument for TreeN (its parent operation). | |||
4457 | markExtraArg(Stack.back(), I); | |||
4458 | continue; | |||
4459 | } else if (!ReducedValueOpcode) | |||
4460 | ReducedValueOpcode = I->getOpcode(); | |||
4461 | ||||
4462 | Stack.push_back(std::make_pair(I, 0)); | |||
4463 | continue; | |||
4464 | } | |||
4465 | } | |||
4466 | // NextV is an extra argument for TreeN (its parent operation). | |||
4467 | markExtraArg(Stack.back(), NextV); | |||
4468 | } | |||
4469 | return true; | |||
4470 | } | |||
4471 | ||||
4472 | /// \brief Attempt to vectorize the tree found by | |||
4473 | /// matchAssociativeReduction. | |||
4474 | bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { | |||
4475 | if (ReducedVals.empty()) | |||
4476 | return false; | |||
4477 | ||||
4478 | // If there is a sufficient number of reduction values, reduce | |||
4479 | // to a nearby power-of-2. We can safely generate oversized | |||
4480 | // vectors and rely on the backend to split them to legal sizes. | |||
4481 | unsigned NumReducedVals = ReducedVals.size(); | |||
4482 | if (NumReducedVals < 4) | |||
4483 | return false; | |||
4484 | ||||
4485 | unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); | |||
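// A worked example of the widths (illustrative only): for NumReducedVals == 7,
// ReduxWidth starts at PowerOf2Floor(7) == 4. The loop below vectorizes the
// first four reduced values, then recomputes ReduxWidth == PowerOf2Floor(3)
// == 2, which fails the ReduxWidth > 2 check, so the remaining three values
// are folded in by the scalar "finish the reduction" loop further down.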
4486 | ||||
4487 | Value *VectorizedTree = nullptr; | |||
4488 | IRBuilder<> Builder(ReductionRoot); | |||
4489 | FastMathFlags Unsafe; | |||
4490 | Unsafe.setUnsafeAlgebra(); | |||
4491 | Builder.setFastMathFlags(Unsafe); | |||
4492 | unsigned i = 0; | |||
4493 | ||||
4494 | BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; | |||
4495 | // The same extra argument may be used several times, so log each attempt | |||
4496 | // to use it. | |||
4497 | for (auto &Pair : ExtraArgs) | |||
4498 | ExternallyUsedValues[Pair.second].push_back(Pair.first); | |||
4499 | while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { | |||
4500 | auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth); | |||
4501 | V.buildTree(VL, ExternallyUsedValues, ReductionOps); | |||
4502 | if (V.shouldReorder()) { | |||
4503 | SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend()); | |||
4504 | V.buildTree(Reversed, ExternallyUsedValues, ReductionOps); | |||
4505 | } | |||
4506 | if (V.isTreeTinyAndNotFullyVectorizable()) | |||
4507 | break; | |||
4508 | ||||
4509 | V.computeMinimumValueSizes(); | |||
4510 | ||||
4511 | // Estimate cost. | |||
4512 | int Cost = | |||
4513 | V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth); | |||
4514 | if (Cost >= -SLPCostThreshold) | |||
4515 | break; | |||
4516 | ||||
4517 | DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost | |||
4518 | << ". (HorRdx)\n"); | |||
4519 | auto *I0 = cast<Instruction>(VL[0]); | |||
4520 | V.getORE()->emit( | |||
4521 | OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0) | |||
4522 | << "Vectorized horizontal reduction with cost " | |||
4523 | << ore::NV("Cost", Cost) << " and with tree size " | |||
4524 | << ore::NV("TreeSize", V.getTreeSize())); | |||
4525 | ||||
4526 | // Vectorize a tree. | |||
4527 | DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); | |||
4528 | Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues); | |||
4529 | ||||
4530 | // Emit a reduction. | |||
4531 | Value *ReducedSubTree = | |||
4532 | emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI); | |||
4533 | if (VectorizedTree) { | |||
4534 | Builder.SetCurrentDebugLocation(Loc); | |||
4535 | VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree, | |||
4536 | ReducedSubTree, "bin.rdx"); | |||
4537 | propagateIRFlags(VectorizedTree, ReductionOps); | |||
4538 | } else | |||
4539 | VectorizedTree = ReducedSubTree; | |||
4540 | i += ReduxWidth; | |||
4541 | ReduxWidth = PowerOf2Floor(NumReducedVals - i); | |||
4542 | } | |||
4543 | ||||
4544 | if (VectorizedTree) { | |||
4545 | // Finish the reduction. | |||
4546 | for (; i < NumReducedVals; ++i) { | |||
4547 | auto *I = cast<Instruction>(ReducedVals[i]); | |||
4548 | Builder.SetCurrentDebugLocation(I->getDebugLoc()); | |||
4549 | VectorizedTree = | |||
4550 | Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I); | |||
4551 | propagateIRFlags(VectorizedTree, ReductionOps); | |||
4552 | } | |||
4553 | for (auto &Pair : ExternallyUsedValues) { | |||
4554 | assert(!Pair.second.empty() && | |||
4555 | "At least one DebugLoc must be inserted"); | |||
4556 | // Add each externally used value to the final reduction. | |||
4557 | for (auto *I : Pair.second) { | |||
4558 | Builder.SetCurrentDebugLocation(I->getDebugLoc()); | |||
4559 | VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree, | |||
4560 | Pair.first, "bin.extra"); | |||
4561 | propagateIRFlags(VectorizedTree, I); | |||
4562 | } | |||
4563 | } | |||
4564 | // Update users. | |||
4565 | ReductionRoot->replaceAllUsesWith(VectorizedTree); | |||
4566 | } | |||
4567 | return VectorizedTree != nullptr; | |||
4568 | } | |||
4569 | ||||
4570 | unsigned numReductionValues() const { | |||
4571 | return ReducedVals.size(); | |||
4572 | } | |||
4573 | ||||
4574 | private: | |||
4575 | /// \brief Calculate the cost of a reduction. | |||
4576 | int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal, | |||
4577 | unsigned ReduxWidth) { | |||
4578 | Type *ScalarTy = FirstReducedVal->getType(); | |||
4579 | Type *VecTy = VectorType::get(ScalarTy, ReduxWidth); | |||
4580 | ||||
4581 | int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true); | |||
4582 | int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false); | |||
4583 | ||||
4584 | IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost; | |||
4585 | int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost; | |||
4586 | ||||
4587 | int ScalarReduxCost = | |||
4588 | (ReduxWidth - 1) * | |||
4589 | TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy); | |||
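// For instance, with hypothetical TTI costs: reducing 8 floats with a scalar
// fadd cost of 1 needs 7 chained adds, so ScalarReduxCost == 7, and the
// vector form is reported as profitable only if VecReduxCost stays below it.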
4590 | ||||
4591 | DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost | |||
4592 | << " for reduction that starts with " << *FirstReducedVal | |||
4593 | << " (It is a " | |||
4594 | << (IsPairwiseReduction ? "pairwise" : "splitting") | |||
4595 | << " reduction)\n"); | |||
4596 | ||||
4597 | return VecReduxCost - ScalarReduxCost; | |||
4598 | } | |||
4599 | ||||
4600 | /// \brief Emit a horizontal reduction of the vectorized value. | |||
4601 | Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder, | |||
4602 | unsigned ReduxWidth, ArrayRef<Value *> RedOps, | |||
4603 | const TargetTransformInfo *TTI) { | |||
4604 | assert(VectorizedValue && "Need to have a vectorized tree node"); | |||
4605 | assert(isPowerOf2_32(ReduxWidth) && | |||
4606 | "We only handle power-of-two reductions for now"); | |||
4607 | ||||
4608 | if (!IsPairwiseReduction) | |||
4609 | return createSimpleTargetReduction( | |||
4610 | Builder, TTI, ReductionOpcode, VectorizedValue, | |||
4611 | TargetTransformInfo::ReductionFlags(), RedOps); | |||
4612 | ||||
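// A sketch of the pairwise lowering below for an assumed ReduxWidth == 4:
// iteration i == 2 shuffles TmpVec with masks <0,2,undef,undef> and
// <1,3,undef,undef> and adds the two halves; iteration i == 1 shuffles with
// <0,undef,undef,undef> and <1,undef,undef,undef> and adds again, leaving
// the full sum in element 0, which the extractelement below reads.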
4613 | Value *TmpVec = VectorizedValue; | |||
4614 | for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { | |||
4615 | Value *LeftMask = | |||
4616 | createRdxShuffleMask(ReduxWidth, i, true, true, Builder); | |||
4617 | Value *RightMask = | |||
4618 | createRdxShuffleMask(ReduxWidth, i, true, false, Builder); | |||
4619 | ||||
4620 | Value *LeftShuf = Builder.CreateShuffleVector( | |||
4621 | TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l"); | |||
4622 | Value *RightShuf = Builder.CreateShuffleVector( | |||
4623 | TmpVec, UndefValue::get(TmpVec->getType()), RightMask, | |||
4624 | "rdx.shuf.r"); | |||
4625 | TmpVec = | |||
4626 | Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx"); | |||
4627 | propagateIRFlags(TmpVec, RedOps); | |||
4628 | } | |||
4629 | ||||
4630 | // The result is in the first element of the vector. | |||
4631 | return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); | |||
4632 | } | |||
4633 | }; | |||
4634 | } // end anonymous namespace | |||
4635 | ||||
4636 | /// \brief Recognize construction of vectors like | |||
4637 | /// %ra = insertelement <4 x float> undef, float %s0, i32 0 | |||
4638 | /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 | |||
4639 | /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 | |||
4640 | /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 | |||
4641 | /// | |||
4642 | /// \return true if it matches. | |||
4643 | /// | |||
4644 | static bool findBuildVector(InsertElementInst *FirstInsertElem, | |||
4645 | SmallVectorImpl<Value *> &BuildVector, | |||
4646 | SmallVectorImpl<Value *> &BuildVectorOpds) { | |||
4647 | if (!isa<UndefValue>(FirstInsertElem->getOperand(0))) | |||
4648 | return false; | |||
4649 | ||||
4650 | InsertElementInst *IE = FirstInsertElem; | |||
4651 | while (true) { | |||
4652 | BuildVector.push_back(IE); | |||
4653 | BuildVectorOpds.push_back(IE->getOperand(1)); | |||
4654 | ||||
4655 | if (IE->use_empty()) | |||
4656 | return false; | |||
4657 | ||||
4658 | InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back()); | |||
4659 | if (!NextUse) | |||
4660 | return true; | |||
4661 | ||||
4662 | // If this isn't the final use, make sure the next insertelement is the only | |||
4663 | // use. It's OK if the final constructed vector is used multiple times. | |||
4664 | if (!IE->hasOneUse()) | |||
4665 | return false; | |||
4666 | ||||
4667 | IE = NextUse; | |||
4668 | } | |||
4669 | ||||
4670 | return false; | |||
4671 | } | |||
4672 | ||||
4673 | /// \brief Like findBuildVector, but looks backwards for construction of an aggregate. | |||
4674 | /// | |||
4675 | /// \return true if it matches. | |||
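/// For example (illustrative IR), a chain this function matches:
///   %a0 = insertvalue { float, float } undef, float %s0, 0
///   %a1 = insertvalue { float, float } %a0, float %s1, 1
/// Starting from the last insertvalue it walks back to the undef aggregate,
/// then reverses the collected instructions and operands into source order.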
4676 | static bool findBuildAggregate(InsertValueInst *IV, | |||
4677 | SmallVectorImpl<Value *> &BuildVector, | |||
4678 | SmallVectorImpl<Value *> &BuildVectorOpds) { | |||
4679 | Value *V; | |||
4680 | do { | |||
4681 | BuildVector.push_back(IV); | |||
4682 | BuildVectorOpds.push_back(IV->getInsertedValueOperand()); | |||
4683 | V = IV->getAggregateOperand(); | |||
4684 | if (isa<UndefValue>(V)) | |||
4685 | break; | |||
4686 | IV = dyn_cast<InsertValueInst>(V); | |||
4687 | if (!IV || !IV->hasOneUse()) | |||
4688 | return false; | |||
4689 | } while (true); | |||
4690 | std::reverse(BuildVector.begin(), BuildVector.end()); | |||
4691 | std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); | |||
4692 | return true; | |||
4693 | } | |||
4694 | ||||
4695 | static bool PhiTypeSorterFunc(Value *V, Value *V2) { | |||
4696 | return V->getType() < V2->getType(); | |||
4697 | } | |||
4698 | ||||
4699 | /// \brief Try and get a reduction value from a phi node. | |||
4700 | /// | |||
4701 | /// Given a phi node \p P in a block \p ParentBB, consider possible reductions | |||
4702 | /// if they come from either \p ParentBB or a containing loop latch. | |||
4703 | /// | |||
4704 | /// \returns A candidate reduction value if possible, or \code nullptr \endcode | |||
4705 | /// if not possible. | |||
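/// As a sketch (names invented for illustration), for a loop reduction like
///   %sum = phi float [ 0.0, %entry ], [ %add, %loop.latch ]
/// the candidate returned is %add, the incoming value from the latch block,
/// provided %add's block is dominated by the phi's block.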
4706 | static Value *getReductionValue(const DominatorTree *DT, PHINode *P, | |||
4707 | BasicBlock *ParentBB, LoopInfo *LI) { | |||
4708 | // There are situations where the reduction value is not dominated by the | |||
4709 | // reduction phi. Vectorizing such cases has been reported to cause | |||
4710 | // miscompiles. See PR25787. | |||
4711 | auto DominatedReduxValue = [&](Value *R) { | |||
4712 | return ( | |||
4713 | dyn_cast<Instruction>(R) && | |||
4714 | DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent())); | |||
4715 | }; | |||
4716 | ||||
4717 | Value *Rdx = nullptr; | |||
4718 | ||||
4719 | // Return the incoming value if it comes from the same BB as the phi node. | |||
4720 | if (P->getIncomingBlock(0) == ParentBB) { | |||
4721 | Rdx = P->getIncomingValue(0); | |||
4722 | } else if (P->getIncomingBlock(1) == ParentBB) { | |||
4723 | Rdx = P->getIncomingValue(1); | |||
4724 | } | |||
4725 | ||||
4726 | if (Rdx && DominatedReduxValue(Rdx)) | |||
4727 | return Rdx; | |||
4728 | ||||
4729 | // Otherwise, check whether we have a loop latch to look at. | |||
4730 | Loop *BBL = LI->getLoopFor(ParentBB); | |||
4731 | if (!BBL) | |||
4732 | return nullptr; | |||
4733 | BasicBlock *BBLatch = BBL->getLoopLatch(); | |||
4734 | if (!BBLatch) | |||
4735 | return nullptr; | |||
4736 | ||||
4737 | // There is a loop latch, return the incoming value if it comes from | |||
4738 | // that. This reduction pattern occasionally turns up. | |||
4739 | if (P->getIncomingBlock(0) == BBLatch) { | |||
4740 | Rdx = P->getIncomingValue(0); | |||
4741 | } else if (P->getIncomingBlock(1) == BBLatch) { | |||
4742 | Rdx = P->getIncomingValue(1); | |||
4743 | } | |||
4744 | ||||
4745 | if (Rdx && DominatedReduxValue(Rdx)) | |||
4746 | return Rdx; | |||
4747 | ||||
4748 | return nullptr; | |||
4749 | } | |||
4750 | ||||
4751 | /// Attempt to reduce a horizontal reduction. | |||
4752 | /// If it is legal to match a horizontal reduction feeding the phi node \a P | |||
4753 | /// with reduction operators \a Root (or one of its operands) in a basic block | |||
4754 | /// \a BB, then check if it can be done. If horizontal reduction is not found | |||
4755 | /// and root instruction is a binary operation, vectorization of the operands is | |||
4756 | /// attempted. | |||
4757 | /// \returns true if a horizontal reduction was matched and reduced or operands | |||
4758 | /// of one of the binary instruction were vectorized. | |||
4759 | /// \returns false if a horizontal reduction was not matched (or not possible) | |||
4760 | /// or no vectorization of any binary operation feeding \a Root instruction was | |||
4761 | /// performed. | |||
4762 | static bool tryToVectorizeHorReductionOrInstOperands( | |||
4763 | PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, | |||
4764 | TargetTransformInfo *TTI, | |||
4765 | const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) { | |||
4766 | if (!ShouldVectorizeHor) | |||
4767 | return false; | |||
4768 | ||||
4769 | if (!Root) | |||
4770 | return false; | |||
4771 | ||||
4772 | if (Root->getParent() != BB) | |||
4773 | return false; | |||
4774 | // Start the analysis from the Root instruction. If a horizontal reduction is | |||
4775 | // found, try to vectorize it. If it is not a horizontal reduction, or | |||
4776 | // vectorization is not possible or not effective, and the currently analyzed | |||
4777 | // instruction is a binary operation, try to vectorize the operands, using | |||
4778 | // pre-order DFS traversal order. If the operands were not vectorized, repeat | |||
4779 | // the same procedure considering each operand as a possible root of a | |||
4780 | // horizontal reduction. | |||
4781 | // Interrupt the process if the Root instruction itself was vectorized or all | |||
4782 | // sub-trees no deeper than RecursionMaxDepth were analyzed/vectorized. | |||
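// For illustration (invented shapes): with Root == (a * b) + (c * d), the
// root is first tried as a horizontal-reduction candidate; if that fails and
// it cannot be vectorized as a pair, its two mul operands are pushed with
// Level == 1 and analyzed in turn, stopping once Level reaches
// RecursionMaxDepth.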
4783 | SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0}); | |||
4784 | SmallSet<Value *, 8> VisitedInstrs; | |||
4785 | bool Res = false; | |||
4786 | while (!Stack.empty()) { | |||
4787 | Value *V; | |||
4788 | unsigned Level; | |||
4789 | std::tie(V, Level) = Stack.pop_back_val(); | |||
4790 | if (!V) | |||
4791 | continue; | |||
4792 | auto *Inst = dyn_cast<Instruction>(V); | |||
4793 | if (!Inst || isa<PHINode>(Inst)) | |||
4794 | continue; | |||
4795 | if (auto *BI = dyn_cast<BinaryOperator>(Inst)) { | |||
4796 | HorizontalReduction HorRdx; | |||
4797 | if (HorRdx.matchAssociativeReduction(P, BI)) { | |||
4798 | if (HorRdx.tryToReduce(R, TTI)) { | |||
4799 | Res = true; | |||
4800 | // Set P to nullptr to avoid re-analysis of phi node in | |||
4801 | // matchAssociativeReduction function unless this is the root node. | |||
4802 | P = nullptr; | |||
4803 | continue; | |||
4804 | } | |||
4805 | } | |||
4806 | if (P) { | |||
4807 | Inst = dyn_cast<Instruction>(BI->getOperand(0)); | |||
4808 | if (Inst == P) | |||
4809 | Inst = dyn_cast<Instruction>(BI->getOperand(1)); | |||
4810 | if (!Inst) { | |||
4811 | // Set P to nullptr to avoid re-analysis of phi node in | |||
4812 | // matchAssociativeReduction function unless this is the root node. | |||
4813 | P = nullptr; | |||
4814 | continue; | |||
4815 | } | |||
4816 | } | |||
4817 | } | |||
4818 | // Set P to nullptr to avoid re-analysis of phi node in | |||
4819 | // matchAssociativeReduction function unless this is the root node. | |||
4820 | P = nullptr; | |||
4821 | if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) { | |||
4822 | Res = true; | |||
4823 | continue; | |||
4824 | } | |||
4825 | ||||
4826 | // Try to vectorize operands. | |||
4827 | if (++Level < RecursionMaxDepth) | |||
4828 | for (auto *Op : Inst->operand_values()) | |||
4829 | Stack.emplace_back(Op, Level); | |||
4830 | } | |||
4831 | return Res; | |||
4832 | } | |||
4833 | ||||
4834 | bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, | |||
4835 | BasicBlock *BB, BoUpSLP &R, | |||
4836 | TargetTransformInfo *TTI) { | |||
4837 | if (!V) | |||
4838 | return false; | |||
4839 | auto *I = dyn_cast<Instruction>(V); | |||
4840 | if (!I) | |||
4841 | return false; | |||
4842 | ||||
4843 | if (!isa<BinaryOperator>(I)) | |||
4844 | P = nullptr; | |||
4845 | // Try to match and vectorize a horizontal reduction. | |||
4846 | return tryToVectorizeHorReductionOrInstOperands( | |||
4847 | P, I, BB, R, TTI, [this](BinaryOperator *BI, BoUpSLP &R) -> bool { | |||
4848 | return tryToVectorize(BI, R); | |||
4849 | }); | |||
4850 | } | |||
4851 | ||||
4852 | bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { | |||
4853 | bool Changed = false; | |||
4854 | SmallVector<Value *, 4> Incoming; | |||
4855 | SmallSet<Value *, 16> VisitedInstrs; | |||
4856 | ||||
4857 | bool HaveVectorizedPhiNodes = true; | |||
4858 | while (HaveVectorizedPhiNodes) { | |||
4859 | HaveVectorizedPhiNodes = false; | |||
4860 | ||||
4861 | // Collect the incoming values from the PHIs. | |||
4862 | Incoming.clear(); | |||
4863 | for (Instruction &I : *BB) { | |||
4864 | PHINode *P = dyn_cast<PHINode>(&I); | |||
4865 | if (!P) | |||
4866 | break; | |||
4867 | ||||
4868 | if (!VisitedInstrs.count(P)) | |||
4869 | Incoming.push_back(P); | |||
4870 | } | |||
4871 | ||||
4872 | // Sort by type. | |||
4873 | std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc); | |||
4874 | ||||
4875 | // Try to vectorize elements based on their type. | |||
4876 | for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(), | |||
4877 | E = Incoming.end(); | |||
4878 | IncIt != E;) { | |||
4879 | ||||
4880 | // Look for the next elements with the same type. | |||
4881 | SmallVector<Value *, 4>::iterator SameTypeIt = IncIt; | |||
4882 | while (SameTypeIt != E && | |||
4883 | (*SameTypeIt)->getType() == (*IncIt)->getType()) { | |||
4884 | VisitedInstrs.insert(*SameTypeIt); | |||
4885 | ++SameTypeIt; | |||
4886 | } | |||
4887 | ||||
4888 | // Try to vectorize them. | |||
4889 | unsigned NumElts = (SameTypeIt - IncIt); | |||
4890 | DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n"); | |||
4891 | // The order in which the phi nodes appear in the program does not matter. | |||
4892 | // So allow tryToVectorizeList to reorder them if it is beneficial. This | |||
4893 | // is done when there are exactly two elements since tryToVectorizeList | |||
4894 | // asserts that there are only two values when AllowReorder is true. | |||
4895 | bool AllowReorder = NumElts == 2; | |||
4896 | if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, | |||
4897 | None, AllowReorder)) { | |||
4898 | // Success: start over, because instructions might have been changed. | |||
4899 | HaveVectorizedPhiNodes = true; | |||
4900 | Changed = true; | |||
4901 | break; | |||
4902 | } | |||
4903 | ||||
4904 | // Start over at the next instruction of a different type (or the end). | |||
4905 | IncIt = SameTypeIt; | |||
4906 | } | |||
4907 | } | |||
4908 | ||||
4909 | VisitedInstrs.clear(); | |||
4910 | ||||
4911 | for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) { | |||
4912 | // We may go through BB multiple times so skip the one we have checked. | |||
4913 | if (!VisitedInstrs.insert(&*it).second) | |||
4914 | continue; | |||
4915 | ||||
4916 | if (isa<DbgInfoIntrinsic>(it)) | |||
4917 | continue; | |||
4918 | ||||
4919 | // Try to vectorize reductions that use PHINodes. | |||
4920 | if (PHINode *P = dyn_cast<PHINode>(it)) { | |||
4921 | // Check that the PHI is a reduction PHI. | |||
4922 | if (P->getNumIncomingValues() != 2) | |||
4923 | return Changed; | |||
4924 | ||||
4925 | // Try to match and vectorize a horizontal reduction. | |||
4926 | if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R, | |||
4927 | TTI)) { | |||
4928 | Changed = true; | |||
4929 | it = BB->begin(); | |||
4930 | e = BB->end(); | |||
4931 | continue; | |||
4932 | } | |||
4933 | continue; | |||
4934 | } | |||
4935 | ||||
4936 | if (ShouldStartVectorizeHorAtStore) { | |||
4937 | if (StoreInst *SI = dyn_cast<StoreInst>(it)) { | |||
4938 | // Try to match and vectorize a horizontal reduction. | |||
4939 | if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R, | |||
4940 | TTI)) { | |||
4941 | Changed = true; | |||
4942 | it = BB->begin(); | |||
4943 | e = BB->end(); | |||
4944 | continue; | |||
4945 | } | |||
4946 | } | |||
4947 | } | |||
4948 | ||||
4949 | // Try to vectorize horizontal reductions feeding into a return. | |||
4950 | if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) { | |||
4951 | if (RI->getNumOperands() != 0) { | |||
4952 | // Try to match and vectorize a horizontal reduction. | |||
4953 | if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) { | |||
4954 | Changed = true; | |||
4955 | it = BB->begin(); | |||
4956 | e = BB->end(); | |||
4957 | continue; | |||
4958 | } | |||
4959 | } | |||
4960 | } | |||
4961 | ||||
4962 | // Try to vectorize trees that start at compare instructions. | |||
4963 | if (CmpInst *CI = dyn_cast<CmpInst>(it)) { | |||
4964 | if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) { | |||
4965 | Changed = true; | |||
4966 | // We would like to start over since some instructions are deleted | |||
4967 | // and the iterator may become invalid. | |||
4968 | it = BB->begin(); | |||
4969 | e = BB->end(); | |||
4970 | continue; | |||
4971 | } | |||
4972 | ||||
4973 | for (int I = 0; I < 2; ++I) { | |||
4974 | if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) { | |||
4975 | Changed = true; | |||
4976 | // We would like to start over since some instructions are deleted | |||
4977 | // and the iterator may become invalid. | |||
4978 | it = BB->begin(); | |||
4979 | e = BB->end(); | |||
4980 | break; | |||
4981 | } | |||
4982 | } | |||
4983 | continue; | |||
4984 | } | |||
4985 | ||||
4986 | // Try to vectorize trees that start at insertelement instructions. | |||
4987 | if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) { | |||
4988 | SmallVector<Value *, 16> BuildVector; | |||
4989 | SmallVector<Value *, 16> BuildVectorOpds; | |||
4990 | if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds)) | |||
4991 | continue; | |||
4992 | ||||
4993 | // Vectorize starting with the build vector operands ignoring the | |||
4994 | // BuildVector instructions for the purpose of scheduling and user | |||
4995 | // extraction. | |||
4996 | if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) { | |||
4997 | Changed = true; | |||
4998 | it = BB->begin(); | |||
4999 | e = BB->end(); | |||
5000 | } | |||
5001 | ||||
5002 | continue; | |||
5003 | } | |||
5004 | ||||
5005 | // Try to vectorize trees that start at insertvalue instructions feeding into | |||
5006 | // a store. | |||
5007 | if (StoreInst *SI = dyn_cast<StoreInst>(it)) { | |||
5008 | if (InsertValueInst *LastInsertValue = dyn_cast<InsertValueInst>(SI->getValueOperand())) { | |||
5009 | const DataLayout &DL = BB->getModule()->getDataLayout(); | |||
5010 | if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) { | |||
5011 | SmallVector<Value *, 16> BuildVector; | |||
5012 | SmallVector<Value *, 16> BuildVectorOpds; | |||
5013 | if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds)) | |||
5014 | continue; | |||
5015 | ||||
5016 | DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n"); | |||
5017 | if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) { | |||
5018 | Changed = true; | |||
5019 | it = BB->begin(); | |||
5020 | e = BB->end(); | |||
5021 | } | |||
5022 | continue; | |||
5023 | } | |||
5024 | } | |||
5025 | } | |||
5026 | } | |||
5027 | ||||
5028 | return Changed; | |||
5029 | } | |||
5030 | ||||
5031 | bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { | |||
5032 | auto Changed = false; | |||
5033 | for (auto &Entry : GEPs) { | |||
5034 | ||||
5035 | // If the getelementptr list has fewer than two elements, there's nothing | |||
5036 | // to do. | |||
5037 | if (Entry.second.size() < 2) | |||
5038 | continue; | |||
5039 | ||||
5040 | DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length " | |||
5041 | << Entry.second.size() << ".\n"); | |||
5042 | ||||
5043 | // We process the getelementptr list in chunks of 16 (like we do for | |||
5044 | // stores) to minimize compile-time. | |||
5045 | for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) { | |||
5046 | auto Len = std::min<unsigned>(BE - BI, 16); | |||
5047 | auto GEPList = makeArrayRef(&Entry.second[BI], Len); | |||
5048 | ||||
5049 | // Initialize a set of candidate getelementptrs. Note that we use a | |||
5050 | // SetVector here to preserve program order. If the index computations | |||
5051 | // are vectorizable and begin with loads, we want to minimize the chance | |||
5052 | // of having to reorder them later. | |||
5053 | SetVector<Value *> Candidates(GEPList.begin(), GEPList.end()); | |||
5054 | ||||
5055 | // Some of the candidates may have already been vectorized after we | |||
5056 | // initially collected them. If so, the WeakTrackingVHs will have | |||
5057 | // nullified the values, so remove them from the set of candidates. | |||
5059 | Candidates.remove(nullptr); | |||
5060 | ||||
5061 | // Remove from the set of candidates all pairs of getelementptrs with | |||
5062 | // constant differences. Such getelementptrs are likely not good | |||
5063 | // candidates for vectorization in a bottom-up phase since one can be | |||
5064 | // computed from the other. We also ensure all candidate getelementptr | |||
5065 | // indices are unique. | |||
5066 | for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) { | |||
5067 | auto *GEPI = cast<GetElementPtrInst>(GEPList[I]); | |||
5068 | if (!Candidates.count(GEPI)) | |||
5069 | continue; | |||
5070 | auto *SCEVI = SE->getSCEV(GEPList[I]); | |||
5071 | for (int J = I + 1; J < E && Candidates.size() > 1; ++J) { | |||
5072 | auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]); | |||
5073 | auto *SCEVJ = SE->getSCEV(GEPList[J]); | |||
5074 | if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) { | |||
5075 | Candidates.remove(GEPList[I]); | |||
5076 | Candidates.remove(GEPList[J]); | |||
5077 | } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) { | |||
5078 | Candidates.remove(GEPList[J]); | |||
5079 | } | |||
5080 | } | |||
5081 | } | |||
5082 | ||||
5083 | // We break out of the above computation as soon as we know there are | |||
5084 | // fewer than two candidates remaining. | |||
5085 | if (Candidates.size() < 2) | |||
5086 | continue; | |||
5087 | ||||
5088 | // Add the single, non-constant index of each candidate to the bundle. We | |||
5089 | // ensured the indices met these constraints when we originally collected | |||
5090 | // the getelementptrs. | |||
5091 | SmallVector<Value *, 16> Bundle(Candidates.size()); | |||
5092 | auto BundleIndex = 0u; | |||
5093 | for (auto *V : Candidates) { | |||
5094 | auto *GEP = cast<GetElementPtrInst>(V); | |||
5095 | auto *GEPIdx = GEP->idx_begin()->get(); | |||
5096 | assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx)); | |||
5097 | Bundle[BundleIndex++] = GEPIdx; | |||
5098 | } | |||
5099 | ||||
5100 | // Try and vectorize the indices. We are currently only interested in | |||
5101 | // gather-like cases of the form: | |||
5102 | // | |||
5103 | // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ... | |||
5104 | // | |||
5105 | // where the loads of "a", the loads of "b", and the subtractions can be | |||
5106 | // performed in parallel. It's likely that detecting this pattern in a | |||
5107 | // bottom-up phase will be simpler and less costly than building a | |||
5108 | // full-blown top-down phase beginning at the consecutive loads. | |||
5109 | Changed |= tryToVectorizeList(Bundle, R); | |||
5110 | } | |||
5111 | } | |||
5112 | return Changed; | |||
5113 | } | |||
5114 | ||||
5115 | bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { | |||
5116 | bool Changed = false; | |||
5117 | // Attempt to sort and vectorize each of the store-groups. | |||
5118 | for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e; | |||
5119 | ++it) { | |||
5120 | if (it->second.size() < 2) | |||
5121 | continue; | |||
5122 | ||||
5123 | DEBUG(dbgs() << "SLP: Analyzing a store chain of length " | |||
5124 | << it->second.size() << ".\n"); | |||
5125 | ||||
5126 | // Process the stores in chunks of 16. | |||
5127 | // TODO: The limit of 16 inhibits greater vectorization factors. | |||
5128 | // For example, AVX2 supports v32i8. Increasing this limit, however, | |||
5129 | // may cause a significant compile-time increase. | |||
5130 | for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) { | |||
5131 | unsigned Len = std::min<unsigned>(CE - CI, 16); | |||
5132 | Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R); | |||
5133 | } | |||
5134 | } | |||
5135 | return Changed; | |||
5136 | } | |||
5137 | ||||
5138 | char SLPVectorizer::ID = 0; | |||
5139 | static const char lv_name[] = "SLP Vectorizer"; | |||
5140 | INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false) | |||
5141 | INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) | |||
5142 | INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) | |||
5143 | INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) | |||
5144 | INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) | |||
5145 | INITIALIZE_PASS_DEPENDENCY(LoopSimplify) | |||
5146 | INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) | |||
5147 | INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) | |||
5148 | INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) | |||
5149 | ||||
5150 | namespace llvm { | |||
5151 | Pass *createSLPVectorizerPass() { return new SLPVectorizer(); } | |||
5152 | } |