/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

1

//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//

2

//

3

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

4

// See https://llvm.org/LICENSE.txt for license information.

5

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

6

//

7

//===----------------------------------------------------------------------===//

8

//

9

// This pass implements the Bottom Up SLP vectorizer. It detects consecutive

10

// stores that can be put together into vector-stores. Next, it attempts to

11

// construct vectorizable tree using the use-def chains. If a profitable tree

12

// was found, the SLP vectorizer performs vectorization on the tree.

13

//

14

// The pass is inspired by the work described in the paper:

15

// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.

16

//

17

//===----------------------------------------------------------------------===//

18

19

#include "llvm/Transforms/Vectorize/SLPVectorizer.h"

20

#include "llvm/ADT/DenseMap.h"

21

#include "llvm/ADT/DenseSet.h"

22

#include "llvm/ADT/Optional.h"

23

#include "llvm/ADT/PostOrderIterator.h"

24

#include "llvm/ADT/PriorityQueue.h"

25

#include "llvm/ADT/STLExtras.h"

26

#include "llvm/ADT/SetOperations.h"

27

#include "llvm/ADT/SetVector.h"

28

#include "llvm/ADT/SmallBitVector.h"

29

#include "llvm/ADT/SmallPtrSet.h"

30

#include "llvm/ADT/SmallSet.h"

31

#include "llvm/ADT/SmallString.h"

32

#include "llvm/ADT/Statistic.h"

33

#include "llvm/ADT/iterator.h"

34

#include "llvm/ADT/iterator_range.h"

35

#include "llvm/Analysis/AliasAnalysis.h"

36

#include "llvm/Analysis/AssumptionCache.h"

37

#include "llvm/Analysis/CodeMetrics.h"

38

#include "llvm/Analysis/DemandedBits.h"

39

#include "llvm/Analysis/GlobalsModRef.h"

40

#include "llvm/Analysis/IVDescriptors.h"

41

#include "llvm/Analysis/LoopAccessAnalysis.h"

42

#include "llvm/Analysis/LoopInfo.h"

43

#include "llvm/Analysis/MemoryLocation.h"

44

#include "llvm/Analysis/OptimizationRemarkEmitter.h"

45

#include "llvm/Analysis/ScalarEvolution.h"

46

#include "llvm/Analysis/ScalarEvolutionExpressions.h"

47

#include "llvm/Analysis/TargetLibraryInfo.h"

48

#include "llvm/Analysis/TargetTransformInfo.h"

49

#include "llvm/Analysis/ValueTracking.h"

50

#include "llvm/Analysis/VectorUtils.h"

51

#include "llvm/IR/Attributes.h"

52

#include "llvm/IR/BasicBlock.h"

53

#include "llvm/IR/Constant.h"

54

#include "llvm/IR/Constants.h"

55

#include "llvm/IR/DataLayout.h"

56

#include "llvm/IR/DerivedTypes.h"

57

#include "llvm/IR/Dominators.h"

58

#include "llvm/IR/Function.h"

59

#include "llvm/IR/IRBuilder.h"

60

#include "llvm/IR/InstrTypes.h"

61

#include "llvm/IR/Instruction.h"

62

#include "llvm/IR/Instructions.h"

63

#include "llvm/IR/IntrinsicInst.h"

64

#include "llvm/IR/Intrinsics.h"

65

#include "llvm/IR/Module.h"

66

#include "llvm/IR/Operator.h"

67

#include "llvm/IR/PatternMatch.h"

68

#include "llvm/IR/Type.h"

69

#include "llvm/IR/Use.h"

70

#include "llvm/IR/User.h"

71

#include "llvm/IR/Value.h"

72

#include "llvm/IR/ValueHandle.h"

73

#ifdef EXPENSIVE_CHECKS

74

#include "llvm/IR/Verifier.h"

75

#endif

76

#include "llvm/Pass.h"

77

#include "llvm/Support/Casting.h"

78

#include "llvm/Support/CommandLine.h"

79

#include "llvm/Support/Compiler.h"

80

#include "llvm/Support/DOTGraphTraits.h"

81

#include "llvm/Support/Debug.h"

82

#include "llvm/Support/ErrorHandling.h"

83

#include "llvm/Support/GraphWriter.h"

84

#include "llvm/Support/InstructionCost.h"

85

#include "llvm/Support/KnownBits.h"

86

#include "llvm/Support/MathExtras.h"

87

#include "llvm/Support/raw_ostream.h"

88

#include "llvm/Transforms/Utils/InjectTLIMappings.h"

89

#include "llvm/Transforms/Utils/Local.h"

90

#include "llvm/Transforms/Utils/LoopUtils.h"

91

#include "llvm/Transforms/Vectorize.h"

92

#include <algorithm>

93

#include <cassert>

94

#include <cstdint>

95

#include <iterator>

96

#include <memory>

97

#include <set>

98

#include <string>

99

#include <tuple>

100

#include <utility>

101

#include <vector>

102

103

using namespace llvm;

104

using namespace llvm::PatternMatch;

105

using namespace slpvectorizer;

106

107

#define SV_NAME"slp-vectorizer" "slp-vectorizer"

108

#define DEBUG_TYPE"SLP" "SLP"

109

110

STATISTIC(NumVectorInstructions, "Number of vector instructions generated")static llvm::Statistic NumVectorInstructions = {"SLP", "NumVectorInstructions"
, "Number of vector instructions generated"};

111

112

/// "vectorize-slp" (hidden, default: true). Declared without `static`, unlike
/// the remaining options in this group.
cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                                  cl::desc("Run the SLP vectorization passes"));

/// "slp-threshold" (hidden, default: 0).
static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

/// "slp-vectorize-hor" (hidden, default: true).
static cl::opt<bool>
    ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                       cl::desc("Attempt to vectorize horizontal reductions"));

/// "slp-vectorize-hor-store" (hidden, default: false).
static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

/// "slp-max-reg-size" (hidden, default: 128); a register size in bits.
static cl::opt<int>
    MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
        cl::desc("Attempt to vectorize for this register size in bits"));

/// "slp-max-vf" (hidden, default: 0, which the description defines as
/// "unlimited").
static cl::opt<unsigned>
    MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
                cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

/// "slp-max-store-lookup" (hidden, default: 32).
static cl::opt<int>
    MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
        cl::desc("Maximum depth of the lookup for consecutive stores."));

/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
    ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
        cl::desc("Limit the size of the SLP scheduling region per block"));

/// "slp-min-reg-size" (hidden, default: 128); a register size in bits.
static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

/// "slp-recursion-max-depth" (hidden, default: 12).
static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

/// "slp-min-tree-size" (hidden, default: 3).
static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The maximum depth that the look-ahead score heuristic will explore when it
// is probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation time overhead, but unlike
// the similar limit for operand ordering this one is used less frequently,
// hence the impact of a higher value is less noticeable.
static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

/// "view-slp-tree" (hidden, no explicit initial value).
static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static constexpr unsigned AliasedCheckLimit = 10;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static constexpr unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static constexpr int MinScheduleRegionSize = 16;

192

193

/// Predicate for the element types that the SLP vectorizer supports.

194

///

195

/// The most important thing to filter here are types which are invalid in LLVM

196

/// vectors. We also filter target specific types which have absolutely no

197

/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just

198

/// avoids spending time checking the cost model and realizing that they will

199

/// be inevitably scalarized.

200

static bool isValidElementType(Type *Ty) {

201

return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&

202

!Ty->isPPC_FP128Ty();

203

}

204

205

/// \returns True if the value is a constant (but not globals/constant

206

/// expressions).

207

static bool isConstant(Value *V) {

208

return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);

209

}

210

211

/// Checks if \p V is one of vector-like instructions, i.e. undef,

212

/// insertelement/extractelement with constant indices for fixed vector type or

213

/// extractvalue instruction.

214

static bool isVectorLikeInstWithConstOps(Value *V) {

215

if (!isa<InsertElementInst, ExtractElementInst>(V) &&

216

!isa<ExtractValueInst, UndefValue>(V))

217

return false;

218

auto *I = dyn_cast<Instruction>(V);

219

if (!I || isa<ExtractValueInst>(I))

220

return true;

221

if (!isa<FixedVectorType>(I->getOperand(0)->getType()))

222

return false;

223

if (isa<ExtractElementInst>(I))

224

return isConstant(I->getOperand(1));

225

assert(isa<InsertElementInst>(V) && "Expected only insertelement.")(static_cast <bool> (isa<InsertElementInst>(V) &&
"Expected only insertelement.") ? void (0) : __assert_fail (
"isa<InsertElementInst>(V) && \"Expected only insertelement.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 225, __extension__
__PRETTY_FUNCTION__));

226

return isConstant(I->getOperand(2));

227

}

228

229

/// \returns true if all of the instructions in \p VL are in the same block or

230

/// false otherwise.

231

static bool allSameBlock(ArrayRef<Value *> VL) {

232

Instruction *I0 = dyn_cast<Instruction>(VL[0]);

233

if (!I0)

234

return false;

235

if (all_of(VL, isVectorLikeInstWithConstOps))

236

return true;

237

238

BasicBlock *BB = I0->getParent();

239

for (int I = 1, E = VL.size(); I < E; I++) {

240

auto *II = dyn_cast<Instruction>(VL[I]);

241

if (!II)

242

return false;

243

244

if (BB != II->getParent())

245

return false;

246

}

247

return true;

248

}

249

250

/// \returns True if all of the values in \p VL are constants (but not

251

/// globals/constant expressions).

252

static bool allConstant(ArrayRef<Value *> VL) {

253

// Constant expressions and globals can't be vectorized like normal integer/FP

254

// constants.

255

return all_of(VL, isConstant);

256

}

257

258

/// \returns True if all of the values in \p VL are identical or some of them

259

/// are UndefValue.

260

static bool isSplat(ArrayRef<Value *> VL) {

261

Value *FirstNonUndef = nullptr;

262

for (Value *V : VL) {

263

if (isa<UndefValue>(V))

264

continue;

265

if (!FirstNonUndef) {

266

FirstNonUndef = V;

267

continue;

268

}

269

if (V != FirstNonUndef)

270

return false;

271

}

272

return FirstNonUndef != nullptr;

273

}

274

275

/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.

276

static bool isCommutative(Instruction *I) {

277

if (auto *Cmp = dyn_cast<CmpInst>(I))

278

return Cmp->isCommutative();

279

if (auto *BO = dyn_cast<BinaryOperator>(I))

280

return BO->isCommutative();

281

// TODO: This should check for generic Instruction::isCommutative(), but

282

// we need to confirm that the caller code correctly handles Intrinsics

283

// for example (does not have 2 operands).

284

return false;

285

}

286

287

/// \returns inserting index of InsertElement or InsertValue instruction,

288

/// using Offset as base offset for index.

289

static Optional<unsigned> getInsertIndex(const Value *InsertInst,

290

unsigned Offset = 0) {

291

int Index = Offset;

292

if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {

293

if (const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {

294

auto *VT = cast<FixedVectorType>(IE->getType());

295

if (CI->getValue().uge(VT->getNumElements()))

296

return None;

297

Index *= VT->getNumElements();

298

Index += CI->getZExtValue();

299

return Index;

300

}

301

return None;

302

}

303

304

const auto *IV = cast<InsertValueInst>(InsertInst);

305

Type *CurrentType = IV->getType();

306

for (unsigned I : IV->indices()) {

307

if (const auto *ST = dyn_cast<StructType>(CurrentType)) {

308

Index *= ST->getNumElements();

309

CurrentType = ST->getElementType(I);

310

} else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {

311

Index *= AT->getNumElements();

312

CurrentType = AT->getElementType();

313

} else {

314

return None;

315

}

316

Index += I;

317

}

318

return Index;

319

}

320

321

/// Checks if the given value is actually an undefined constant vector.

322

/// Also, if the\p ShuffleMask is not empty, tries to check if the non-masked

323

/// elements actually mask the insertelement buildvector, if any.

324

static bool isUndefVector(const Value *V, ArrayRef<int> ShuffleMask = None) {

325

if (isa<UndefValue>(V))

326

return true;

327

auto *VecTy = dyn_cast<FixedVectorType>(V->getType());

328

if (!VecTy)

329

return false;

330

auto *C = dyn_cast<Constant>(V);

331

if (!C) {

332

if (!ShuffleMask.empty()) {

333

const Value *Base = V;

334

while (auto *II = dyn_cast<InsertElementInst>(Base)) {

335

Base = II->getOperand(0);

336

Optional<unsigned> Idx = getInsertIndex(II);

337

if (!Idx)

338

continue;

339

if (*Idx < ShuffleMask.size() && ShuffleMask[*Idx] == UndefMaskElem)

340

return false;

341

}

342

return V != Base && isUndefVector(Base);

343

}

344

return false;

345

}

346

for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {

347

if (Constant *Elem = C->getAggregateElement(I))

348

if (!isa<UndefValue>(Elem) &&

349

(ShuffleMask.empty() ||

350

(I < ShuffleMask.size() && ShuffleMask[I] == UndefMaskElem)))

351

return false;

352

}

353

return true;

354

}

355

356

/// Checks if the vector of instructions can be represented as a shuffle, like:

357

/// %x0 = extractelement <4 x i8> %x, i32 0

358

/// %x3 = extractelement <4 x i8> %x, i32 3

359

/// %y1 = extractelement <4 x i8> %y, i32 1

360

/// %y2 = extractelement <4 x i8> %y, i32 2

361

/// %x0x0 = mul i8 %x0, %x0

362

/// %x3x3 = mul i8 %x3, %x3

363

/// %y1y1 = mul i8 %y1, %y1

364

/// %y2y2 = mul i8 %y2, %y2

365

/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0

366

/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1

367

/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2

368

/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3

369

/// ret <4 x i8> %ins4

370

/// can be transformed into:

371

/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,

372

/// i32 6>

373

/// %2 = mul <4 x i8> %1, %1

374

/// ret <4 x i8> %2

375

/// We convert this initially to something like:

376

/// %x0 = extractelement <4 x i8> %x, i32 0

377

/// %x3 = extractelement <4 x i8> %x, i32 3

378

/// %y1 = extractelement <4 x i8> %y, i32 1

379

/// %y2 = extractelement <4 x i8> %y, i32 2

380

/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0

381

/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1

382

/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2

383

/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3

384

/// %5 = mul <4 x i8> %4, %4

385

/// %6 = extractelement <4 x i8> %5, i32 0

386

/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0

387

/// %7 = extractelement <4 x i8> %5, i32 1

388

/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1

389

/// %8 = extractelement <4 x i8> %5, i32 2

390

/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2

391

/// %9 = extractelement <4 x i8> %5, i32 3

392

/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3

393

/// ret <4 x i8> %ins4

394

/// InstCombiner transforms this into a shuffle and vector mul

395

/// Mask will return the Shuffle Mask equivalent to the extracted elements.

396

/// TODO: Can we split off and reuse the shuffle mask detection from

397

/// ShuffleVectorInst/getShuffleCost?

398

/// \param VL list of scalars; every entry is expected to be an UndefValue or
///        an extractelement instruction.
/// \param Mask output; resized to VL.size() and filled with the source lane of
///        each entry (lanes of the second source vector are offset by the
///        source vector size), UndefMaskElem where no lane is known.
/// \returns the shuffle kind, or None if \p VL cannot be expressed as a
///          shuffle of at most two fixed-size source vectors.
static Optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
  const auto *FirstExtract =
      find_if(VL, [](Value *V) { return isa<ExtractElementInst>(V); });
  if (FirstExtract == VL.end())
    return None;

  auto *SrcTy = dyn_cast<FixedVectorType>(
      cast<ExtractElementInst>(*FirstExtract)->getVectorOperandType());
  // A non-fixed (scalable) source vector cannot be handled.
  if (!SrcTy)
    return None;
  const unsigned SrcNumElts = SrcTy->getNumElements();

  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode Mode = Unknown;
  Value *FirstSrc = nullptr;
  Value *SecondSrc = nullptr;
  const unsigned NumLanes = VL.size();
  Mask.assign(NumLanes, UndefMaskElem);

  for (unsigned Lane = 0; Lane < NumLanes; ++Lane) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[Lane]))
      continue;
    auto *Extract = cast<ExtractElementInst>(VL[Lane]);
    if (isa<ScalableVectorType>(Extract->getVectorOperandType()))
      return None;
    Value *Src = Extract->getVectorOperand();
    // We can extractelement from undef or poison vector.
    if (isUndefVector(Src))
      continue;
    // All vector operands must have the same number of vector elements.
    if (cast<FixedVectorType>(Src->getType())->getNumElements() != SrcNumElts)
      return None;

    Value *IdxOp = Extract->getIndexOperand();
    if (isa<UndefValue>(IdxOp))
      continue;
    auto *ConstIdx = dyn_cast<ConstantInt>(IdxOp);
    if (!ConstIdx)
      return None;
    // Undefined behavior if the index is negative or >= SrcNumElts.
    if (ConstIdx->getValue().uge(SrcNumElts))
      continue;
    const unsigned SrcLane = ConstIdx->getValue().getZExtValue();
    Mask[Lane] = SrcLane;

    // For correct shuffling we have to have at most 2 different vector
    // operands in all extractelement instructions.
    if (!FirstSrc || FirstSrc == Src) {
      FirstSrc = Src;
    } else if (!SecondSrc || SecondSrc == Src) {
      SecondSrc = Src;
      Mask[Lane] += SrcNumElts;
    } else {
      return None;
    }

    // Once a permutation has been seen the mode never changes again.
    // Otherwise, an extract index that differs from the operation number
    // makes it a permutation.
    if (Mode != Permute)
      Mode = (SrcLane == Lane) ? Select : Permute;
  }

  // If the second source was never used, we have a permutation of a single
  // vector.
  if (!SecondSrc)
    return TargetTransformInfo::SK_PermuteSingleSrc;
  // With two sources: if we're not crossing lanes in different vectors,
  // consider it as blending, otherwise it is a permutation of 2 vectors.
  return Mode == Select ? TargetTransformInfo::SK_Select
                        : TargetTransformInfo::SK_PermuteTwoSrc;
}

466

467

namespace {

468

469

/// Main data required for vectorization of instructions.

470

struct InstructionsState {

471

/// The very first instruction in the list with the main opcode.

472

Value *OpValue = nullptr;

473

474

/// The main/alternate instruction.

475

Instruction *MainOp = nullptr;

476

Instruction *AltOp = nullptr;

477

478

/// The main/alternate opcodes for the list of instructions.

479

unsigned getOpcode() const {

480

return MainOp ? MainOp->getOpcode() : 0;

481

}

482

483

unsigned getAltOpcode() const {

484

return AltOp ? AltOp->getOpcode() : 0;

485

}

486

487

/// Some of the instructions in the list have alternate opcodes.

488

bool isAltShuffle() const { return AltOp != MainOp; }

489

490

bool isOpcodeOrAlt(Instruction *I) const {

491

unsigned CheckedOpcode = I->getOpcode();

492

return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;

493

}

494

495

InstructionsState() = delete;

496

InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)

497

: OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}

498

};

499

500

} // end anonymous namespace

501

502

/// Chooses the correct key for scheduling data. If \p Op has the same (or

503

/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p

504

/// OpValue.

505

static Value *isOneOf(const InstructionsState &S, Value *Op) {

506

auto *I = dyn_cast<Instruction>(Op);

507

if (I && S.isOpcodeOrAlt(I))

508

return Op;

509

return S.OpValue;

510

}

511

512

/// \returns true if \p Opcode is allowed as part of of the main/alternate

513

/// instruction for SLP vectorization.

514

///

515

/// Example of unsupported opcode is SDIV that can potentially cause UB if the

516

/// "shuffled out" lane would result in division by zero.

517

static bool isValidForAlternation(unsigned Opcode) {

518

if (Instruction::isIntDivRem(Opcode))

519

return false;

520

521

return true;

522

}

523

524

static InstructionsState getSameOpcode(ArrayRef<Value *> VL,

525

unsigned BaseIndex = 0);

526

527

/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.

528

/// compatible instructions or constants, or just some other regular values.

529

static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,

530

Value *Op1) {

531

return (isConstant(BaseOp0) && isConstant(Op0)) ||

532

(isConstant(BaseOp1) && isConstant(Op1)) ||

533

(!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&

534

!isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||

535

getSameOpcode({BaseOp0, Op0}).getOpcode() ||

536

getSameOpcode({BaseOp1, Op1}).getOpcode();

537

}

538

539

/// \returns true if a compare instruction \p CI has similar "look" and

540

/// same predicate as \p BaseCI, "as is" or with its operands and predicate

541

/// swapped, false otherwise.

542

static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI) {

543

assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&(static_cast <bool> (BaseCI->getOperand(0)->getType
() == CI->getOperand(0)->getType() && "Assessing comparisons of different types?"
) ? void (0) : __assert_fail ("BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() && \"Assessing comparisons of different types?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 544, __extension__
__PRETTY_FUNCTION__))

544

"Assessing comparisons of different types?")(static_cast <bool> (BaseCI->getOperand(0)->getType
() == CI->getOperand(0)->getType() && "Assessing comparisons of different types?"
) ? void (0) : __assert_fail ("BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() && \"Assessing comparisons of different types?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 544, __extension__
__PRETTY_FUNCTION__));

545

CmpInst::Predicate BasePred = BaseCI->getPredicate();

546

CmpInst::Predicate Pred = CI->getPredicate();

547

CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);

548

549

Value *BaseOp0 = BaseCI->getOperand(0);

550

Value *BaseOp1 = BaseCI->getOperand(1);

551

Value *Op0 = CI->getOperand(0);

552

Value *Op1 = CI->getOperand(1);

553

554

return (BasePred == Pred &&

555

areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1)) ||

556

(BasePred == SwappedPred &&

557

areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0));

558

}

559

560

/// \returns analysis of the Instructions in \p VL described in

561

/// InstructionsState, the Opcode that we suppose the whole list

562

/// could be vectorized even if its structure is diverse.

563

static InstructionsState getSameOpcode(ArrayRef<Value *> VL,

564

unsigned BaseIndex) {

565

// Make sure these are all Instructions.

566

if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))

567

return InstructionsState(VL[BaseIndex], nullptr, nullptr);

568

569

bool IsCastOp = isa<CastInst>(VL[BaseIndex]);

570

bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);

571

bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);

572

CmpInst::Predicate BasePred =

573

IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()

574

: CmpInst::BAD_ICMP_PREDICATE;

575

unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();

576

unsigned AltOpcode = Opcode;

577

unsigned AltIndex = BaseIndex;

578

579

// Check for one alternate opcode from another BinaryOperator.

580

// TODO - generalize to support all operators (types, calls etc.).

581

for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {

582

unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();

583

if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {

584

if (InstOpcode == Opcode || InstOpcode == AltOpcode)

585

continue;

586

if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&

587

isValidForAlternation(Opcode)) {

588

AltOpcode = InstOpcode;

589

AltIndex = Cnt;

590

continue;

591

}

592

} else if (IsCastOp && isa<CastInst>(VL[Cnt])) {

593

Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();

594

Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();

595

if (Ty0 == Ty1) {

596

if (InstOpcode == Opcode || InstOpcode == AltOpcode)

597

continue;

598

if (Opcode == AltOpcode) {

599

assert(isValidForAlternation(Opcode) &&(static_cast <bool> (isValidForAlternation(Opcode) &&
isValidForAlternation(InstOpcode) && "Cast isn't safe for alternation, logic needs to be updated!"
) ? void (0) : __assert_fail ("isValidForAlternation(Opcode) && isValidForAlternation(InstOpcode) && \"Cast isn't safe for alternation, logic needs to be updated!\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 601, __extension__
__PRETTY_FUNCTION__))

600

isValidForAlternation(InstOpcode) &&(static_cast <bool> (isValidForAlternation(Opcode) &&
isValidForAlternation(InstOpcode) && "Cast isn't safe for alternation, logic needs to be updated!"
) ? void (0) : __assert_fail ("isValidForAlternation(Opcode) && isValidForAlternation(InstOpcode) && \"Cast isn't safe for alternation, logic needs to be updated!\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 601, __extension__
__PRETTY_FUNCTION__))

601

"Cast isn't safe for alternation, logic needs to be updated!")(static_cast <bool> (isValidForAlternation(Opcode) &&
isValidForAlternation(InstOpcode) && "Cast isn't safe for alternation, logic needs to be updated!"
) ? void (0) : __assert_fail ("isValidForAlternation(Opcode) && isValidForAlternation(InstOpcode) && \"Cast isn't safe for alternation, logic needs to be updated!\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 601, __extension__
__PRETTY_FUNCTION__));

602

AltOpcode = InstOpcode;

603

AltIndex = Cnt;

604

continue;

605

}

606

}

607

} else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {

608

auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);

609

Type *Ty0 = BaseInst->getOperand(0)->getType();

610

Type *Ty1 = Inst->getOperand(0)->getType();

611

if (Ty0 == Ty1) {

612

assert(InstOpcode == Opcode && "Expected same CmpInst opcode.")(static_cast <bool> (InstOpcode == Opcode && "Expected same CmpInst opcode."
) ? void (0) : __assert_fail ("InstOpcode == Opcode && \"Expected same CmpInst opcode.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 612, __extension__
__PRETTY_FUNCTION__));

613

// Check for compatible operands. If the corresponding operands are not

614

// compatible - need to perform alternate vectorization.

615

CmpInst::Predicate CurrentPred = Inst->getPredicate();

616

CmpInst::Predicate SwappedCurrentPred =

617

CmpInst::getSwappedPredicate(CurrentPred);

618

619

if (E == 2 &&

620

(BasePred == CurrentPred || BasePred == SwappedCurrentPred))

621

continue;

622

623

if (isCmpSameOrSwapped(BaseInst, Inst))

624

continue;

625

auto *AltInst = cast<CmpInst>(VL[AltIndex]);

626

if (AltIndex != BaseIndex) {

627

if (isCmpSameOrSwapped(AltInst, Inst))

628

continue;

629

} else if (BasePred != CurrentPred) {

630

assert((static_cast <bool> (isValidForAlternation(InstOpcode) &&
"CmpInst isn't safe for alternation, logic needs to be updated!"
) ? void (0) : __assert_fail ("isValidForAlternation(InstOpcode) && \"CmpInst isn't safe for alternation, logic needs to be updated!\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 632, __extension__
__PRETTY_FUNCTION__))

631

isValidForAlternation(InstOpcode) &&(static_cast <bool> (isValidForAlternation(InstOpcode) &&
"CmpInst isn't safe for alternation, logic needs to be updated!"
) ? void (0) : __assert_fail ("isValidForAlternation(InstOpcode) && \"CmpInst isn't safe for alternation, logic needs to be updated!\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 632, __extension__
__PRETTY_FUNCTION__))

632

"CmpInst isn't safe for alternation, logic needs to be updated!")(static_cast <bool> (isValidForAlternation(InstOpcode) &&
"CmpInst isn't safe for alternation, logic needs to be updated!"
) ? void (0) : __assert_fail ("isValidForAlternation(InstOpcode) && \"CmpInst isn't safe for alternation, logic needs to be updated!\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 632, __extension__
__PRETTY_FUNCTION__));

633

AltIndex = Cnt;

634

continue;

635

}

636

CmpInst::Predicate AltPred = AltInst->getPredicate();

637

if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||

638

AltPred == CurrentPred || AltPred == SwappedCurrentPred)

639

continue;

640

}

641

} else if (InstOpcode == Opcode || InstOpcode == AltOpcode)

642

continue;

643

return InstructionsState(VL[BaseIndex], nullptr, nullptr);

644

}

645

646

return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),

647

cast<Instruction>(VL[AltIndex]));

648

}

649

650

/// \returns true if all of the values in \p VL have the same type or false

651

/// otherwise.

652

static bool allSameType(ArrayRef<Value *> VL) {

653

Type *Ty = VL[0]->getType();

654

for (int i = 1, e = VL.size(); i < e; i++)

655

if (VL[i]->getType() != Ty)

656

return false;

657

658

return true;

659

}

660

661

/// \returns True if Extract{Value,Element} instruction extracts element Idx.

662

static Optional<unsigned> getExtractIndex(Instruction *E) {

663

unsigned Opcode = E->getOpcode();

664

assert((Opcode == Instruction::ExtractElement ||(static_cast <bool> ((Opcode == Instruction::ExtractElement
|| Opcode == Instruction::ExtractValue) && "Expected extractelement or extractvalue instruction."
) ? void (0) : __assert_fail ("(Opcode == Instruction::ExtractElement || Opcode == Instruction::ExtractValue) && \"Expected extractelement or extractvalue instruction.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 666, __extension__
__PRETTY_FUNCTION__))

665

Opcode == Instruction::ExtractValue) &&(static_cast <bool> ((Opcode == Instruction::ExtractElement
|| Opcode == Instruction::ExtractValue) && "Expected extractelement or extractvalue instruction."
) ? void (0) : __assert_fail ("(Opcode == Instruction::ExtractElement || Opcode == Instruction::ExtractValue) && \"Expected extractelement or extractvalue instruction.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 666, __extension__
__PRETTY_FUNCTION__))

666

"Expected extractelement or extractvalue instruction.")(static_cast <bool> ((Opcode == Instruction::ExtractElement
|| Opcode == Instruction::ExtractValue) && "Expected extractelement or extractvalue instruction."
) ? void (0) : __assert_fail ("(Opcode == Instruction::ExtractElement || Opcode == Instruction::ExtractValue) && \"Expected extractelement or extractvalue instruction.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 666, __extension__
__PRETTY_FUNCTION__));

667

if (Opcode == Instruction::ExtractElement) {

668

auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));

669

if (!CI)

670

return None;

671

return CI->getZExtValue();

672

}

673

ExtractValueInst *EI = cast<ExtractValueInst>(E);

674

if (EI->getNumIndices() != 1)

675

return None;

676

return *EI->idx_begin();

677

}

678

679

/// \returns True if in-tree use also needs extract. This refers to

680

/// possible scalar operand in vectorized instruction.

681

static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,

682

TargetLibraryInfo *TLI) {

683

unsigned Opcode = UserInst->getOpcode();

684

switch (Opcode) {

685

case Instruction::Load: {

686

LoadInst *LI = cast<LoadInst>(UserInst);

687

return (LI->getPointerOperand() == Scalar);

688

}

689

case Instruction::Store: {

690

StoreInst *SI = cast<StoreInst>(UserInst);

691

return (SI->getPointerOperand() == Scalar);

692

}

693

case Instruction::Call: {

694

CallInst *CI = cast<CallInst>(UserInst);

695

Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

696

for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {

697

if (isVectorIntrinsicWithScalarOpAtArg(ID, i))

698

return (CI->getArgOperand(i) == Scalar);

699

}

700

[[fallthrough]];

701

}

702

default:

703

return false;

704

}

705

}

706

707

/// \returns the AA location that is being access by the instruction.

708

static MemoryLocation getLocation(Instruction *I) {

709

if (StoreInst *SI = dyn_cast<StoreInst>(I))

710

return MemoryLocation::get(SI);

711

if (LoadInst *LI = dyn_cast<LoadInst>(I))

712

return MemoryLocation::get(LI);

713

return MemoryLocation();

714

}

715

716

/// \returns True if the instruction is not a volatile or atomic load/store.

717

static bool isSimple(Instruction *I) {

718

if (LoadInst *LI = dyn_cast<LoadInst>(I))

719

return LI->isSimple();

720

if (StoreInst *SI = dyn_cast<StoreInst>(I))

721

return SI->isSimple();

722

if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))

723

return !MI->isVolatile();

724

return true;

725

}

726

727

/// Shuffles \p Mask in accordance with the given \p SubMask.

728

static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {

729

if (SubMask.empty())

730

return;

731

if (Mask.empty()) {

732

Mask.append(SubMask.begin(), SubMask.end());

733

return;

734

}

735

SmallVector<int> NewMask(SubMask.size(), UndefMaskElem);

736

int TermValue = std::min(Mask.size(), SubMask.size());

737

for (int I = 0, E = SubMask.size(); I < E; ++I) {

738

if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||

739

Mask[SubMask[I]] >= TermValue)

740

continue;

741

NewMask[I] = Mask[SubMask[I]];

742

}

743

Mask.swap(NewMask);

744

}

745

746

/// Order may have elements assigned special value (size) which is out of

747

/// bounds. Such indices only appear on places which correspond to undef values

748

/// (see canReuseExtract for details) and used in order to avoid undef values

749

/// have effect on operands ordering.

750

/// The first loop below simply finds all unused indices and then the next loop

751

/// nest assigns these indices for undef values positions.

752

/// As an example below Order has two undef positions and they have assigned

753

/// values 3 and 7 respectively:

754

/// before: 6 9 5 4 9 2 1 0

755

/// after: 6 3 5 4 7 2 1 0

756

static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) {

757

const unsigned Sz = Order.size();

758

SmallBitVector UnusedIndices(Sz, /*t=*/true);

759

SmallBitVector MaskedIndices(Sz);

760

for (unsigned I = 0; I < Sz; ++I) {

761

if (Order[I] < Sz)

762

UnusedIndices.reset(Order[I]);

763

else

764

MaskedIndices.set(I);

765

}

766

if (MaskedIndices.none())

767

return;

768

assert(UnusedIndices.count() == MaskedIndices.count() &&(static_cast <bool> (UnusedIndices.count() == MaskedIndices
.count() && "Non-synced masked/available indices.") ?
void (0) : __assert_fail ("UnusedIndices.count() == MaskedIndices.count() && \"Non-synced masked/available indices.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 769, __extension__
__PRETTY_FUNCTION__))

769

"Non-synced masked/available indices.")(static_cast <bool> (UnusedIndices.count() == MaskedIndices
.count() && "Non-synced masked/available indices.") ?
void (0) : __assert_fail ("UnusedIndices.count() == MaskedIndices.count() && \"Non-synced masked/available indices.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 769, __extension__
__PRETTY_FUNCTION__));

770

int Idx = UnusedIndices.find_first();

771

int MIdx = MaskedIndices.find_first();

772

while (MIdx >= 0) {

773

assert(Idx >= 0 && "Indices must be synced.")(static_cast <bool> (Idx >= 0 && "Indices must be synced."
) ? void (0) : __assert_fail ("Idx >= 0 && \"Indices must be synced.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 773, __extension__
__PRETTY_FUNCTION__));

774

Order[MIdx] = Idx;

775

Idx = UnusedIndices.find_next(Idx);

776

MIdx = MaskedIndices.find_next(MIdx);

777

}

778

}

779

780

namespace llvm {

781

782

static void inversePermutation(ArrayRef<unsigned> Indices,

783

SmallVectorImpl<int> &Mask) {

784

Mask.clear();

785

const unsigned E = Indices.size();

786

Mask.resize(E, UndefMaskElem);

787

for (unsigned I = 0; I < E; ++I)

788

Mask[Indices[I]] = I;

789

}

790

791

/// Reorders the list of scalars in accordance with the given \p Mask.

792

static void reorderScalars(SmallVectorImpl<Value *> &Scalars,

793

ArrayRef<int> Mask) {

794

assert(!Mask.empty() && "Expected non-empty mask.")(static_cast <bool> (!Mask.empty() && "Expected non-empty mask."
) ? void (0) : __assert_fail ("!Mask.empty() && \"Expected non-empty mask.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 794, __extension__
__PRETTY_FUNCTION__));

795

SmallVector<Value *> Prev(Scalars.size(),

796

UndefValue::get(Scalars.front()->getType()));

797

Prev.swap(Scalars);

798

for (unsigned I = 0, E = Prev.size(); I < E; ++I)

799

if (Mask[I] != UndefMaskElem)

800

Scalars[Mask[I]] = Prev[I];

801

}

802

803

/// Checks if the provided value does not require scheduling. It does not

804

/// require scheduling if this is not an instruction or it is an instruction

805

/// that does not read/write memory and all operands are either not instructions

806

/// or phi nodes or instructions from different blocks.

807

static bool areAllOperandsNonInsts(Value *V) {

808

auto *I = dyn_cast<Instruction>(V);

809

if (!I)

810

return true;

811

return !mayHaveNonDefUseDependency(*I) &&

812

all_of(I->operands(), [I](Value *V) {

813

auto *IO = dyn_cast<Instruction>(V);

814

if (!IO)

815

return true;

816

return isa<PHINode>(IO) || IO->getParent() != I->getParent();

817

});

818

}

819

820

/// Checks if the provided value does not require scheduling. It does not

821

/// require scheduling if this is not an instruction or it is an instruction

822

/// that does not read/write memory and all users are phi nodes or instructions

823

/// from the different blocks.

824

static bool isUsedOutsideBlock(Value *V) {

825

auto *I = dyn_cast<Instruction>(V);

826

if (!I)

827

return true;

828

// Limits the number of uses to save compile time.

829

constexpr int UsesLimit = 8;

830

return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&

831

all_of(I->users(), [I](User *U) {

832

auto *IU = dyn_cast<Instruction>(U);

833

if (!IU)

834

return true;

835

return IU->getParent() != I->getParent() || isa<PHINode>(IU);

836

});

837

}

838

839

/// Checks if the specified value does not require scheduling. It does not

840

/// require scheduling if all operands and all users do not need to be scheduled

841

/// in the current basic block.

842

static bool doesNotNeedToBeScheduled(Value *V) {

843

return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);

844

}

845

846

/// Checks if the specified array of instructions does not require scheduling.

847

/// It is so if all either instructions have operands that do not require

848

/// scheduling or their users do not require scheduling since they are phis or

849

/// in other basic blocks.

850

static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {

851

return !VL.empty() &&

852

(all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts));

853

}

854

855

namespace slpvectorizer {

856

857

/// Bottom Up SLP Vectorizer.

858

class BoUpSLP {

859

struct TreeEntry;

860

struct ScheduleData;

861

862

public:

863

using ValueList = SmallVector<Value *, 8>;

864

using InstrList = SmallVector<Instruction *, 16>;

865

using ValueSet = SmallPtrSet<Value *, 16>;

866

using StoreList = SmallVector<StoreInst *, 8>;

867

using ExtraValueToDebugLocsMap =

868

MapVector<Value *, SmallVector<Instruction *, 2>>;

869

using OrdersType = SmallVector<unsigned, 4>;

870

871

/// Creates the vectorizer state for \p Func from the supplied analyses,
/// collects the function's ephemeral values into EphValues and picks the
/// maximum and minimum vector register sizes.
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
        TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
        DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
        const DataLayout *DL, OptimizationRemarkEmitter *ORE)
    : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li),
      DT(Dt), AC(AC), DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
  CodeMetrics::collectEphemeralValues(F, AC, EphValues);

  // The target's fixed-width vector register size is used unless the
  // command-line option was given explicitly.
  // TODO: It would be better to limit the vectorization factor based on
  //       data type rather than just register size. For example, x86 AVX has
  //       256-bit registers, but it does not support integer operations
  //       at that width (that requires AVX2).
  if (!MaxVectorRegSizeOption.getNumOccurrences())
    MaxVecRegSize =
        TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
            .getFixedSize();
  else
    MaxVecRegSize = MaxVectorRegSizeOption;

  // Same policy for the minimum size: an explicit option wins over the target.
  if (!MinVectorRegSizeOption.getNumOccurrences())
    MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
  else
    MinVecRegSize = MinVectorRegSizeOption;
}

896

897

/// Vectorize the tree that starts with the elements in \p VL.

898

/// Returns the vectorized root.

899

Value *vectorizeTree();

900

901

/// Vectorize the tree but with the list of externally used values \p

902

/// ExternallyUsedValues. Values in this MapVector can be replaced by the

903

/// generated extractvalue instructions.

904

Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);

905

906

/// \returns the cost incurred by unwanted spills and fills, caused by

907

/// holding live values over call sites.

908

InstructionCost getSpillCost() const;

909

910

/// \returns the vectorization cost of the subtree that starts at \p VL.

911

/// A negative number means that this is profitable.

912

InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = None);

913

914

/// Construct a vectorizable tree that starts at \p Roots, ignoring users for

915

/// the purpose of scheduling and extraction in the \p UserIgnoreLst.

916

void buildTree(ArrayRef<Value *> Roots,

917

const SmallDenseSet<Value *> &UserIgnoreLst);

918

919

/// Construct a vectorizable tree that starts at \p Roots.

920

void buildTree(ArrayRef<Value *> Roots);

921

922

/// Builds external uses of the vectorized scalars, i.e. the list of

923

/// vectorized scalars to be extracted, their lanes and their scalar users. \p

924

/// ExternallyUsedValues contains additional list of external uses to handle

925

/// vectorization of reductions.

926

void

927

buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});

928

929

/// Clear the internal data structures that are created by 'buildTree'.

930

void deleteTree() {

931

VectorizableTree.clear();

932

ScalarToTreeEntry.clear();

933

MustGather.clear();

934

ExternalUses.clear();

935

for (auto &Iter : BlocksSchedules) {

936

BlockScheduling *BS = Iter.second.get();

937

BS->clear();

938

}

939

MinBWs.clear();

940

InstrElementSize.clear();

941

UserIgnoreList = nullptr;

942

}

943

944

/// \returns the number of entries currently stored in VectorizableTree.
unsigned getTreeSize() const {
  return VectorizableTree.size();
}

945

946

/// Perform LICM and CSE on the newly generated gather sequences.

947

void optimizeGatherSequence();

948

949

/// Checks if the specified gather tree entry \p TE can be represented as a

950

/// shuffled vector entry + (possibly) permutation with other gathers. It

951

/// implements the checks only for possibly ordered scalars (Loads,

952

/// ExtractElement, ExtractValue), which can be part of the graph.

953

Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);

954

955

/// Sort loads into increasing pointers offsets to allow greater clustering.

956

Optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);

957

958

/// Gets reordering data for the given tree entry. If the entry is vectorized

959

/// - just return ReorderIndices, otherwise check if the scalars can be

960

/// reordered and return the most optimal order.

961

/// \param TopToBottom If true, include the order of vectorized stores and

962

/// insertelement nodes, otherwise skip them.

963

Optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom);

964

965

/// Reorders the current graph to the most profitable order starting from the

966

/// root node to the leaf nodes. The best order is chosen only from the nodes

967

/// of the same size (vectorization factor). Smaller nodes are considered

968

/// parts of subgraph with smaller VF and they are reordered independently. We

969

/// can make it because we still need to extend smaller nodes to the wider VF

970

/// and we can merge reordering shuffles with the widening shuffles.

971

void reorderTopToBottom();

972

973

/// Reorders the current graph to the most profitable order starting from

974

/// leaves to the root. It allows to rotate small subgraphs and reduce the

975

/// number of reshuffles if the leaf nodes use the same order. In this case we

976

/// can merge the orders and just shuffle user node instead of shuffling its

977

/// operands. Plus, even the leaf nodes have different orders, it allows to

978

/// sink reordering in the graph closer to the root node and merge it later

979

/// during analysis.

980

void reorderBottomToTop(bool IgnoreReorder = false);

981

982

/// \return The vector element size in bits to use when vectorizing the

983

/// expression tree ending at \p V. If V is a store, the size is the width of

984

/// the stored value. Otherwise, the size is the width of the largest loaded

985

/// value reaching V. This method is used by the vectorizer to calculate

986

/// vectorization factors.

987

unsigned getVectorElementSize(Value *V);

988

989

/// Compute the minimum type sizes required to represent the entries in a

990

/// vectorizable tree.

991

void computeMinimumValueSizes();

992

993

// \returns maximum vector register size as set by TTI or overridden by cl::opt.

994

unsigned getMaxVecRegSize() const {

995

return MaxVecRegSize;

996

}

997

998

// \returns minimum vector register size as set by cl::opt.

999

unsigned getMinVecRegSize() const {

1000

return MinVecRegSize;

1001

}

1002

1003

unsigned getMinVF(unsigned Sz) const {

1004

return std::max(2U, getMinVecRegSize() / Sz);

1005

}

1006

1007

unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {

1008

unsigned MaxVF = MaxVFOption.getNumOccurrences() ?

1009

MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);

1010

return MaxVF ? MaxVF : UINT_MAX(2147483647 *2U +1U);

1011

}

1012

1013

/// Check if homogeneous aggregate is isomorphic to some VectorType.

1014

/// Accepts homogeneous multidimensional aggregate of scalars/vectors like

1015

/// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },

1016

/// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.

1017

///

1018

/// \returns number of elements in vector if isomorphism exists, 0 otherwise.

1019

unsigned canMapToVector(Type *T, const DataLayout &DL) const;

1020

1021

/// \returns True if the VectorizableTree is both tiny and not fully

1022

/// vectorizable. We do not vectorize such trees.

1023

bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;

1024

1025

/// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values

1026

/// can be load combined in the backend. Load combining may not be allowed in

1027

/// the IR optimizer, so we do not want to alter the pattern. For example,

1028

/// partially transforming a scalar bswap() pattern into vector code is

1029

/// effectively impossible for the backend to undo.

1030

/// TODO: If load combining is allowed in the IR optimizer, this analysis

1031

/// may not be necessary.

1032

bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;

1033

1034

/// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values

1035

/// can be load combined in the backend. Load combining may not be allowed in

1036

/// the IR optimizer, so we do not want to alter the pattern. For example,

1037

/// partially transforming a scalar bswap() pattern into vector code is

1038

/// effectively impossible for the backend to undo.

1039

/// TODO: If load combining is allowed in the IR optimizer, this analysis

1040

/// may not be necessary.

1041

bool isLoadCombineCandidate() const;

1042

1043

/// \returns the optimization remark emitter stored in this object.
OptimizationRemarkEmitter *getORE() {
  return ORE;
}

1044

1045

/// This structure holds any data we need about the edges being traversed

1046

/// during buildTree_rec(). We keep track of:

1047

/// (i) the user TreeEntry index, and

1048

/// (ii) the index of the edge.

1049

struct EdgeInfo {

1050

EdgeInfo() = default;

1051

EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)

1052

: UserTE(UserTE), EdgeIdx(EdgeIdx) {}

1053

/// The user TreeEntry.

1054

TreeEntry *UserTE = nullptr;

1055

/// The operand index of the use.

1056

unsigned EdgeIdx = UINT_MAX(2147483647 *2U +1U);

1057

#ifndef NDEBUG

1058

friend inline raw_ostream &operator<<(raw_ostream &OS,

1059

const BoUpSLP::EdgeInfo &EI) {

1060

EI.dump(OS);

1061

return OS;

1062

}

1063

/// Debug print.

1064

void dump(raw_ostream &OS) const {

1065

OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")

1066

<< " EdgeIdx:" << EdgeIdx << "}";

1067

}

1068

LLVM_DUMP_METHOD__attribute__((noinline)) __attribute__((__used__)) void dump() const { dump(dbgs()); }

1069

#endif

1070

};

1071

1072

/// A helper class used for scoring candidates for two consecutive lanes.

1073

class LookAheadHeuristics {

1074

const DataLayout &DL;

1075

ScalarEvolution &SE;

1076

const BoUpSLP &R;

1077

int NumLanes; // Total number of lanes (aka vectorization factor).

1078

int MaxLevel; // The maximum recursion depth for accumulating score.

1079

1080

public:

1081

LookAheadHeuristics(const DataLayout &DL, ScalarEvolution &SE,

1082

const BoUpSLP &R, int NumLanes, int MaxLevel)

1083

: DL(DL), SE(SE), R(R), NumLanes(NumLanes), MaxLevel(MaxLevel) {}

1084

1085

// The hard-coded scores listed here are not very important, though it shall

1086

// be higher for better matches to improve the resulting cost. When

1087

// computing the scores of matching one sub-tree with another, we are

1088

// basically counting the number of values that are matching. So even if all

1089

// scores are set to 1, we would still get a decent matching result.

1090

// However, sometimes we have to break ties. For example we may have to

1091

// choose between matching loads vs matching opcodes. This is what these

1092

// scores are helping us with: they provide the order of preference. Also,

1093

// this is important if the scalar is externally used or used in another

1094

// tree entry node in the different lane.

1095

1096

/// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).

1097

static const int ScoreConsecutiveLoads = 4;

1098

/// The same load multiple times. This should have a better score than

1099

/// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it

1100

/// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for

1101

/// a vector load and 1.0 for a broadcast.

1102

static const int ScoreSplatLoads = 3;

1103

/// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).

1104

static const int ScoreReversedLoads = 3;

1105

/// ExtractElementInst from same vector and consecutive indexes.

1106

static const int ScoreConsecutiveExtracts = 4;

1107

/// ExtractElementInst from same vector and reversed indices.

1108

static const int ScoreReversedExtracts = 3;

1109

/// Constants.

1110

static const int ScoreConstants = 2;

1111

/// Instructions with the same opcode.

1112

static const int ScoreSameOpcode = 2;

1113

/// Instructions with alt opcodes (e.g, add + sub).

1114

static const int ScoreAltOpcodes = 1;

1115

/// Identical instructions (a.k.a. splat or broadcast).

1116

static const int ScoreSplat = 1;

1117

/// Matching with an undef is preferable to failing.

1118

static const int ScoreUndef = 1;

1119

/// Score for failing to find a decent match.

1120

static const int ScoreFail = 0;

1121

/// Score if all users are vectorized.

1122

static const int ScoreAllUserVectorized = 1;

1123

1124

/// \returns the score of placing \p V1 and \p V2 in consecutive lanes.

1125

/// \p U1 and \p U2 are the users of \p V1 and \p V2.

1126

/// Also, checks if \p V1 and \p V2 are compatible with instructions in \p

1127

/// MainAltOps.

1128

int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,

1129

ArrayRef<Value *> MainAltOps) const {

1130

if (V1 == V2) {

1131

if (isa<LoadInst>(V1)) {

1132

// Retruns true if the users of V1 and V2 won't need to be extracted.

1133

auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {

1134

// Bail out if we have too many uses to save compilation time.

1135

static constexpr unsigned Limit = 8;

1136

if (V1->hasNUsesOrMore(Limit) || V2->hasNUsesOrMore(Limit))

1137

return false;

1138

1139

auto AllUsersVectorized = [U1, U2, this](Value *V) {

1140

return llvm::all_of(V->users(), [U1, U2, this](Value *U) {

1141

return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;

1142

});

1143

};

1144

return AllUsersVectorized(V1) && AllUsersVectorized(V2);

1145

};

1146

// A broadcast of a load can be cheaper on some targets.

1147

if (R.TTI->isLegalBroadcastLoad(V1->getType(),

1148

ElementCount::getFixed(NumLanes)) &&

1149

((int)V1->getNumUses() == NumLanes ||

1150

AllUsersAreInternal(V1, V2)))

1151

return LookAheadHeuristics::ScoreSplatLoads;

1152

}

1153

return LookAheadHeuristics::ScoreSplat;

1154

}

1155

1156

auto *LI1 = dyn_cast<LoadInst>(V1);

1157

auto *LI2 = dyn_cast<LoadInst>(V2);

1158

if (LI1 && LI2) {

1159

if (LI1->getParent() != LI2->getParent())

1160

return LookAheadHeuristics::ScoreFail;

1161

1162

Optional<int> Dist = getPointersDiff(

1163

LI1->getType(), LI1->getPointerOperand(), LI2->getType(),

1164

LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);

1165

if (!Dist || *Dist == 0)

1166

return LookAheadHeuristics::ScoreFail;

1167

// The distance is too large - still may be profitable to use masked

1168

// loads/gathers.

1169

if (std::abs(*Dist) > NumLanes / 2)

1170

return LookAheadHeuristics::ScoreAltOpcodes;

1171

// This still will detect consecutive loads, but we might have "holes"

1172

// in some cases. It is ok for non-power-2 vectorization and may produce

1173

// better results. It should not affect current vectorization.

1174

return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads

1175

: LookAheadHeuristics::ScoreReversedLoads;

1176

}

1177

1178

auto *C1 = dyn_cast<Constant>(V1);

1179

auto *C2 = dyn_cast<Constant>(V2);

1180

if (C1 && C2)

1181

return LookAheadHeuristics::ScoreConstants;

1182

1183

// Extracts from consecutive indexes of the same vector better score as

1184

// the extracts could be optimized away.

1185

Value *EV1;

1186

ConstantInt *Ex1Idx;

1187

if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {

1188

// Undefs are always profitable for extractelements.

1189

if (isa<UndefValue>(V2))

1190

return LookAheadHeuristics::ScoreConsecutiveExtracts;

1191

Value *EV2 = nullptr;

1192

ConstantInt *Ex2Idx = nullptr;

1193

if (match(V2,

1194

m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx),

1195

m_Undef())))) {

1196

// Undefs are always profitable for extractelements.

1197

if (!Ex2Idx)

1198

return LookAheadHeuristics::ScoreConsecutiveExtracts;

1199

if (isUndefVector(EV2) && EV2->getType() == EV1->getType())

1200

return LookAheadHeuristics::ScoreConsecutiveExtracts;

1201

if (EV2 == EV1) {

1202

int Idx1 = Ex1Idx->getZExtValue();

1203

int Idx2 = Ex2Idx->getZExtValue();

1204

int Dist = Idx2 - Idx1;

1205

// The distance is too large - still may be profitable to use

1206

// shuffles.

1207

if (std::abs(Dist) == 0)

1208

return LookAheadHeuristics::ScoreSplat;

1209

if (std::abs(Dist) > NumLanes / 2)

1210

return LookAheadHeuristics::ScoreSameOpcode;

1211

return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts

1212

: LookAheadHeuristics::ScoreReversedExtracts;

1213

}

1214

return LookAheadHeuristics::ScoreAltOpcodes;

1215

}

1216

return LookAheadHeuristics::ScoreFail;

1217

}

1218

1219

auto *I1 = dyn_cast<Instruction>(V1);

1220

auto *I2 = dyn_cast<Instruction>(V2);

1221

if (I1 && I2) {

1222

if (I1->getParent() != I2->getParent())

1223

return LookAheadHeuristics::ScoreFail;

1224

SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());

1225

Ops.push_back(I1);

1226

Ops.push_back(I2);

1227

InstructionsState S = getSameOpcode(Ops);

1228

// Note: Only consider instructions with <= 2 operands to avoid

1229

// complexity explosion.

1230

if (S.getOpcode() &&

1231

(S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||

1232

!S.isAltShuffle()) &&

1233

all_of(Ops, [&S](Value *V) {

1234

return cast<Instruction>(V)->getNumOperands() ==

1235

S.MainOp->getNumOperands();

1236

}))

1237

return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes

1238

: LookAheadHeuristics::ScoreSameOpcode;

1239

}

1240

1241

if (isa<UndefValue>(V2))

1242

return LookAheadHeuristics::ScoreUndef;

1243

1244

return LookAheadHeuristics::ScoreFail;

1245

}

1246

1247

/// Go through the operands of \p LHS and \p RHS recursively until

1248

/// MaxLevel, and return the cummulative score. \p U1 and \p U2 are

1249

/// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands

1250

/// of \p U1 and \p U2), except at the beginning of the recursion where

1251

/// these are set to nullptr.

1252

///

1253

/// For example:

1254

/// \verbatim

1255

/// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]

1256

/// \ / \ / \ / \ /

1257

/// + + + +

1258

/// G1 G2 G3 G4

1259

/// \endverbatim

1260

/// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at

1261

/// each level recursively, accumulating the score. It starts from matching

1262

/// the additions at level 0, then moves on to the loads (level 1). The

1263

/// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and

1264

/// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while

1265

/// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.

1266

/// Please note that the order of the operands does not matter, as we

1267

/// evaluate the score of all profitable combinations of operands. In

1268

/// other words the score of G1 and G4 is the same as G1 and G2. This

1269

/// heuristic is based on ideas described in:

1270

/// Look-ahead SLP: Auto-vectorization in the presence of commutative

1271

/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,

1272

/// Luís F. W. Góes

1273

int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,

1274

Instruction *U2, int CurrLevel,

1275

ArrayRef<Value *> MainAltOps) const {

1276

1277

// Get the shallow score of V1 and V2.

1278

int ShallowScoreAtThisLevel =

1279

getShallowScore(LHS, RHS, U1, U2, MainAltOps);

1280

1281

// If reached MaxLevel,

1282

// or if V1 and V2 are not instructions,

1283

// or if they are SPLAT,

1284

// or if they are not consecutive,

1285

// or if profitable to vectorize loads or extractelements, early return

1286

// the current cost.

1287

auto *I1 = dyn_cast<Instruction>(LHS);

1288

auto *I2 = dyn_cast<Instruction>(RHS);

1289

if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||

1290

ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||

1291

(((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||

1292

(I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||

1293

(isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&

1294

ShallowScoreAtThisLevel))

1295

return ShallowScoreAtThisLevel;

1296

assert(I1 && I2 && "Should have early exited.")(static_cast <bool> (I1 && I2 && "Should have early exited."
) ? void (0) : __assert_fail ("I1 && I2 && \"Should have early exited.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 1296, __extension__
__PRETTY_FUNCTION__));

1297

1298

// Contains the I2 operand indexes that got matched with I1 operands.

1299

SmallSet<unsigned, 4> Op2Used;

1300

1301

// Recursion towards the operands of I1 and I2. We are trying all possible

1302

// operand pairs, and keeping track of the best score.

1303

for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();

1304

OpIdx1 != NumOperands1; ++OpIdx1) {

1305

// Try to pair op1I with the best operand of I2.

1306

int MaxTmpScore = 0;

1307

unsigned MaxOpIdx2 = 0;

1308

bool FoundBest = false;

1309

// If I2 is commutative try all combinations.

1310

unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;

1311

unsigned ToIdx = isCommutative(I2)

1312

? I2->getNumOperands()

1313

: std::min(I2->getNumOperands(), OpIdx1 + 1);

1314

assert(FromIdx <= ToIdx && "Bad index")(static_cast <bool> (FromIdx <= ToIdx && "Bad index"
) ? void (0) : __assert_fail ("FromIdx <= ToIdx && \"Bad index\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 1314, __extension__
__PRETTY_FUNCTION__));

1315

for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {

1316

// Skip operands already paired with OpIdx1.

1317

if (Op2Used.count(OpIdx2))

1318

continue;

1319

// Recursively calculate the cost at each level

1320

int TmpScore =

1321

getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),

1322

I1, I2, CurrLevel + 1, None);

1323

// Look for the best score.

1324

if (TmpScore > LookAheadHeuristics::ScoreFail &&

1325

TmpScore > MaxTmpScore) {

1326

MaxTmpScore = TmpScore;

1327

MaxOpIdx2 = OpIdx2;

1328

FoundBest = true;

1329

}

1330

}

1331

if (FoundBest) {

1332

// Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.

1333

Op2Used.insert(MaxOpIdx2);

1334

ShallowScoreAtThisLevel += MaxTmpScore;

1335

}

1336

}

1337

return ShallowScoreAtThisLevel;

1338

}

1339

};

1340

/// A helper data structure to hold the operands of a vector of instructions.

1341

/// This supports a fixed vector length for all operand vectors.

1342

class VLOperands {

1343

/// For each operand we need (i) the value, and (ii) the opcode that it

1344

/// would be attached to if the expression was in a left-linearized form.

1345

/// This is required to avoid illegal operand reordering.

1346

/// For example:

1347

/// \verbatim

1348

///                         0 Op1
///                         |/
/// Op1 Op2   Linearized    + Op2
///   \ /     ---------->   |/
///    -                    -
///
/// Op1 - Op2            (0 + Op1) - Op2

1355

/// \endverbatim

1356

///

1357

/// Value Op1 is attached to a '+' operation, and Op2 to a '-'.

1358

///

1359

/// Another way to think of this is to track all the operations across the

1360

/// path from the operand all the way to the root of the tree and to

1361

/// calculate the operation that corresponds to this path. For example, the

1362

/// path from Op2 to the root crosses the RHS of the '-', therefore the

1363

/// corresponding operation is a '-' (which matches the one in the

1364

/// linearized tree, as shown above).

1365

///

1366

/// For lack of a better term, we refer to this operation as Accumulated

1367

/// Path Operation (APO).

1368

struct OperandData {

1369

OperandData() = default;

1370

OperandData(Value *V, bool APO, bool IsUsed)

1371

: V(V), APO(APO), IsUsed(IsUsed) {}

1372

/// The operand value.

1373

Value *V = nullptr;

1374

/// TreeEntries only allow a single opcode, or an alternate sequence of

1375

/// them (e.g, +, -). Therefore, we can safely use a boolean value for the

1376

/// APO. It is set to 'true' if 'V' is attached to an inverse operation

1377

/// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise

1378

/// (e.g., Add/Mul)

1379

bool APO = false;

1380

/// Helper data for the reordering function.

1381

bool IsUsed = false;

1382

};

1383

1384

/// Strategy used when picking, for a given lane, the operand that pairs best
/// with the operand already chosen in the neighboring lane. What counts as a
/// good match depends on the kind of value we are extending: e.g. if the
/// neighbor is a load, we want a load from the consecutive address.
enum class ReorderingMode {
  Load,     ///< Matching loads to consecutive memory addresses
  Opcode,   ///< Matching instructions based on opcode (same or alternate)
  Constant, ///< Matching constants
  Splat,    ///< Matching the same instruction multiple times (broadcast)
  Failed,   ///< We failed to create a vectorizable group
};

1397

1398

using OperandDataVec = SmallVector<OperandData, 2>;

1399

1400

/// A vector of operand vectors.

1401

SmallVector<OperandDataVec, 4> OpsVec;

1402

1403

const DataLayout &DL;

1404

ScalarEvolution &SE;

1405

const BoUpSLP &R;

1406

1407

/// \returns the operand data at \p OpIdx and \p Lane.

1408

OperandData &getData(unsigned OpIdx, unsigned Lane) {

1409

return OpsVec[OpIdx][Lane];

1410

}

1411

1412

/// \returns the operand data at \p OpIdx and \p Lane. Const version.

1413

const OperandData &getData(unsigned OpIdx, unsigned Lane) const {

1414

return OpsVec[OpIdx][Lane];

1415

}

1416

1417

/// Clears the used flag for all entries.

1418

void clearUsed() {

1419

for (unsigned OpIdx = 0, NumOperands = getNumOperands();

1420

OpIdx != NumOperands; ++OpIdx)

1421

for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;

1422

++Lane)

1423

OpsVec[OpIdx][Lane].IsUsed = false;

1424

}

1425

1426

/// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.

1427

void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {

1428

std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);

1429

}

1430

1431

/// \param Lane lane of the operands under analysis.

1432

/// \param OpIdx operand index in \p Lane lane we're looking the best

1433

/// candidate for.

1434

/// \param Idx operand index of the current candidate value.

1435

/// \returns The additional score due to possible broadcasting of the

1436

/// elements in the lane. It is more profitable to have power-of-2 unique

1437

/// elements in the lane, it will be vectorized with higher probability

1438

/// after removing duplicates. Currently the SLP vectorizer supports only

1439

/// vectorization of the power-of-2 number of unique scalars.

1440

int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {

1441

Value *IdxLaneV = getData(Idx, Lane).V;

1442

if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)

1443

return 0;

1444

SmallPtrSet<Value *, 4> Uniques;

1445

for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {

1446

if (Ln == Lane)

1447

continue;

1448

Value *OpIdxLnV = getData(OpIdx, Ln).V;

1449

if (!isa<Instruction>(OpIdxLnV))

1450

return 0;

1451

Uniques.insert(OpIdxLnV);

1452

}

1453

int UniquesCount = Uniques.size();

1454

int UniquesCntWithIdxLaneV =

1455

Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;

1456

Value *OpIdxLaneV = getData(OpIdx, Lane).V;

1457

int UniquesCntWithOpIdxLaneV =

1458

Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;

1459

if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)

1460

return 0;

1461

return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -

1462

UniquesCntWithOpIdxLaneV) -

1463

(PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);

1464

}

1465

1466

/// \param Lane lane of the operands under analysis.

1467

/// \param OpIdx operand index in \p Lane lane we're looking the best

1468

/// candidate for.

1469

/// \param Idx operand index of the current candidate value.

1470

/// \returns The additional score for the scalar which users are all

1471

/// vectorized.

1472

int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {

1473

Value *IdxLaneV = getData(Idx, Lane).V;

1474

Value *OpIdxLaneV = getData(OpIdx, Lane).V;

1475

// Do not care about number of uses for vector-like instructions

1476

// (extractelement/extractvalue with constant indices), they are extracts

1477

// themselves and already externally used. Vectorization of such

1478

// instructions does not add extra extractelement instruction, just may

1479

// remove it.

1480

if (isVectorLikeInstWithConstOps(IdxLaneV) &&

1481

isVectorLikeInstWithConstOps(OpIdxLaneV))

1482

return LookAheadHeuristics::ScoreAllUserVectorized;

1483

auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);

1484

if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))

1485

return 0;

1486

return R.areAllUsersVectorized(IdxLaneI, None)

1487

? LookAheadHeuristics::ScoreAllUserVectorized

1488

: 0;

1489

}

1490

1491

/// Multiplier applied to the look-ahead score before the external-use bonus
/// is added. It separates fully compatible instructions that differ only in
/// their number of external uses, so that the one with fewer external uses
/// can be preferred.
static const int ScoreScaleFactor = 10;

1495

1496

/// \Returns the look-ahead score, which tells us how much the sub-trees

1497

/// rooted at \p LHS and \p RHS match, the more they match the higher the

1498

/// score. This helps break ties in an informed way when we cannot decide on

1499

/// the order of the operands by just considering the immediate

1500

/// predecessors.

1501

int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,

1502

int Lane, unsigned OpIdx, unsigned Idx,

1503

bool &IsUsed) {

1504

LookAheadHeuristics LookAhead(DL, SE, R, getNumLanes(),

1505

LookAheadMaxDepth);

1506

// Keep track of the instruction stack as we recurse into the operands

1507

// during the look-ahead score exploration.

1508

int Score =

1509

LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,

1510

/*CurrLevel=*/1, MainAltOps);

1511

if (Score) {

1512

int SplatScore = getSplatScore(Lane, OpIdx, Idx);

1513

if (Score <= -SplatScore) {

1514

// Set the minimum score for splat-like sequence to avoid setting

1515

// failed state.

1516

Score = 1;

1517

} else {

1518

Score += SplatScore;

1519

// Scale score to see the difference between different operands

1520

// and similar operands but all vectorized/not all vectorized

1521

// uses. It does not affect actual selection of the best

1522

// compatible operand in general, just allows to select the

1523

// operand with all vectorized uses.

1524

Score *= ScoreScaleFactor;

1525

Score += getExternalUseScore(Lane, OpIdx, Idx);

1526

IsUsed = true;

1527

}

1528

}

1529

return Score;

1530

}

1531

1532

/// Best defined scores per lanes between the passes. Used to choose the

1533

/// best operand (with the highest score) between the passes.

1534

/// The key - {Operand Index, Lane}.

1535

/// The value - the best score between the passes for the lane and the

1536

/// operand.

1537

SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>

1538

BestScoresPerLanes;

1539

1540

// Search all operands in Ops[*][Lane] for the one that matches best

1541

// Ops[OpIdx][LastLane] and return its opreand index.

1542

// If no good match can be found, return None.

1543

Optional<unsigned> getBestOperand(unsigned OpIdx, int Lane, int LastLane,

1544

ArrayRef<ReorderingMode> ReorderingModes,

1545

ArrayRef<Value *> MainAltOps) {

1546

unsigned NumOperands = getNumOperands();

1547

1548

// The operand of the previous lane at OpIdx.

1549

Value *OpLastLane = getData(OpIdx, LastLane).V;

1550

1551

// Our strategy mode for OpIdx.

1552

ReorderingMode RMode = ReorderingModes[OpIdx];

1553

if (RMode == ReorderingMode::Failed)

1554

return None;

1555

1556

// The linearized opcode of the operand at OpIdx, Lane.

1557

bool OpIdxAPO = getData(OpIdx, Lane).APO;

1558

1559

// The best operand index and its score.

1560

// Sometimes we have more than one option (e.g., Opcode and Undefs), so we

1561

// are using the score to differentiate between the two.

1562

struct BestOpData {

1563

Optional<unsigned> Idx = None;

1564

unsigned Score = 0;

1565

} BestOp;

1566

BestOp.Score =

1567

BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)

1568

.first->second;

1569

1570

// Track if the operand must be marked as used. If the operand is set to

1571

// Score 1 explicitly (because of non power-of-2 unique scalars, we may

1572

// want to reestimate the operands again on the following iterations).

1573

bool IsUsed =

1574

RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;

1575

// Iterate through all unused operands and look for the best.

1576

for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {

1577

// Get the operand at Idx and Lane.

1578

OperandData &OpData = getData(Idx, Lane);

1579

Value *Op = OpData.V;

1580

bool OpAPO = OpData.APO;

1581

1582

// Skip already selected operands.

1583

if (OpData.IsUsed)

1584

continue;

1585

1586

// Skip if we are trying to move the operand to a position with a

1587

// different opcode in the linearized tree form. This would break the

1588

// semantics.

1589

if (OpAPO != OpIdxAPO)

1590

continue;

1591

1592

// Look for an operand that matches the current mode.

1593

switch (RMode) {

1594

case ReorderingMode::Load:

1595

case ReorderingMode::Constant:

1596

case ReorderingMode::Opcode: {

1597

bool LeftToRight = Lane > LastLane;

1598

Value *OpLeft = (LeftToRight) ? OpLastLane : Op;

1599

Value *OpRight = (LeftToRight) ? Op : OpLastLane;

1600

int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,

1601

OpIdx, Idx, IsUsed);

1602

if (Score > static_cast<int>(BestOp.Score)) {

1603

BestOp.Idx = Idx;

1604

BestOp.Score = Score;

1605

BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;

1606

}

1607

break;

1608

}

1609

case ReorderingMode::Splat:

1610

if (Op == OpLastLane)

1611

BestOp.Idx = Idx;

1612

break;

1613

case ReorderingMode::Failed:

1614

llvm_unreachable("Not expected Failed reordering mode.")::llvm::llvm_unreachable_internal("Not expected Failed reordering mode."
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 1614);

1615

}

1616

}

1617

1618

if (BestOp.Idx) {

1619

getData(*BestOp.Idx, Lane).IsUsed = IsUsed;

1620

return BestOp.Idx;

1621

}

1622

// If we could not find a good match return None.

1623

return None;

1624

}

1625

1626

/// Helper for reorderOperandVecs.

1627

/// \returns the lane that we should start reordering from. This is the one

1628

/// which has the least number of operands that can freely move about or

1629

/// less profitable because it already has the most optimal set of operands.

1630

unsigned getBestLaneToStartReordering() const {

1631

unsigned Min = UINT_MAX(2147483647 *2U +1U);

1632

unsigned SameOpNumber = 0;

1633

// std::pair<unsigned, unsigned> is used to implement a simple voting

1634

// algorithm and choose the lane with the least number of operands that

1635

// can freely move about or less profitable because it already has the

1636

// most optimal set of operands. The first unsigned is a counter for

1637

// voting, the second unsigned is the counter of lanes with instructions

1638

// with same/alternate opcodes and same parent basic block.

1639

MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;

1640

// Try to be closer to the original results, if we have multiple lanes

1641

// with same cost. If 2 lanes have the same cost, use the one with the

1642

// lowest index.

1643

for (int I = getNumLanes(); I > 0; --I) {

1644

unsigned Lane = I - 1;

1645

OperandsOrderData NumFreeOpsHash =

1646

getMaxNumOperandsThatCanBeReordered(Lane);

1647

// Compare the number of operands that can move and choose the one with

1648

// the least number.

1649

if (NumFreeOpsHash.NumOfAPOs < Min) {

1650

Min = NumFreeOpsHash.NumOfAPOs;

1651

SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;

1652

HashMap.clear();

1653

HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);

1654

} else if (NumFreeOpsHash.NumOfAPOs == Min &&

1655

NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {

1656

// Select the most optimal lane in terms of number of operands that

1657

// should be moved around.

1658

SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;

1659

HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);

1660

} else if (NumFreeOpsHash.NumOfAPOs == Min &&

1661

NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {

1662

auto It = HashMap.find(NumFreeOpsHash.Hash);

1663

if (It == HashMap.end())

1664

HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);

1665

else

1666

++It->second.first;

1667

}

1668

}

1669

// Select the lane with the minimum counter.

1670

unsigned BestLane = 0;

1671

unsigned CntMin = UINT_MAX(2147483647 *2U +1U);

1672

for (const auto &Data : reverse(HashMap)) {

1673

if (Data.second.first < CntMin) {

1674

CntMin = Data.second.first;

1675

BestLane = Data.second.second;

1676

}

1677

}

1678

return BestLane;

1679

}

1680

1681

/// Data structure that helps to reorder operands. It is the per-lane summary
/// produced by getMaxNumOperandsThatCanBeReordered().
struct OperandsOrderData {
  /// The best number of operands with the same APOs, which can be
  /// reordered.
  unsigned NumOfAPOs = UINT_MAX;
  /// Number of operands with the same/alternate instruction opcode and
  /// parent.
  unsigned NumOpsWithSameOpcodeParent = 0;
  /// Hash for the actual operands ordering.
  /// Used to count operands, actually their position id and opcode
  /// value. It is used in the voting mechanism to find the lane with the
  /// least number of operands that can freely move about or less profitable
  /// because it already has the most optimal set of operands. Can be
  /// replaced with SmallVector<unsigned> instead but hash code is faster
  /// and requires less memory.
  unsigned Hash = 0;
};

1698

/// \returns the maximum number of operands that are allowed to be reordered

1699

/// for \p Lane and the number of compatible instructions(with the same

1700

/// parent/opcode). This is used as a heuristic for selecting the first lane

1701

/// to start operand reordering.

1702

OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {

1703

unsigned CntTrue = 0;

1704

unsigned NumOperands = getNumOperands();

1705

// Operands with the same APO can be reordered. We therefore need to count

1706

// how many of them we have for each APO, like this: Cnt[APO] = x.

1707

// Since we only have two APOs, namely true and false, we can avoid using

1708

// a map. Instead we can simply count the number of operands that

1709

// correspond to one of them (in this case the 'true' APO), and calculate

1710

// the other by subtracting it from the total number of operands.

1711

// Operands with the same instruction opcode and parent are more

1712

// profitable since we don't need to move them in many cases, with a high

1713

// probability such lane already can be vectorized effectively.

1714

bool AllUndefs = true;

1715

unsigned NumOpsWithSameOpcodeParent = 0;

1716

Instruction *OpcodeI = nullptr;

1717

BasicBlock *Parent = nullptr;

1718

unsigned Hash = 0;

1719

for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {

1720

const OperandData &OpData = getData(OpIdx, Lane);

1721

if (OpData.APO)

1722

++CntTrue;

1723

// Use Boyer-Moore majority voting for finding the majority opcode and

1724

// the number of times it occurs.

1725

if (auto *I = dyn_cast<Instruction>(OpData.V)) {

1726

if (!OpcodeI || !getSameOpcode({OpcodeI, I}).getOpcode() ||

1727

I->getParent() != Parent) {

1728

if (NumOpsWithSameOpcodeParent == 0) {

1729

NumOpsWithSameOpcodeParent = 1;

1730

OpcodeI = I;

1731

Parent = I->getParent();

1732

} else {

1733

--NumOpsWithSameOpcodeParent;

1734

}

1735

} else {

1736

++NumOpsWithSameOpcodeParent;

1737

}

1738

}

1739

Hash = hash_combine(

1740

Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));

1741

AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);

1742

}

1743

if (AllUndefs)

1744

return {};

1745

OperandsOrderData Data;

1746

Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);

1747

Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;

1748

Data.Hash = Hash;

1749

return Data;

1750

}

1751

1752

/// Go through the instructions in VL and append their operands.

1753

void appendOperandsOfVL(ArrayRef<Value *> VL) {

1754

assert(!VL.empty() && "Bad VL")(static_cast <bool> (!VL.empty() && "Bad VL") ?
void (0) : __assert_fail ("!VL.empty() && \"Bad VL\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 1754, __extension__
__PRETTY_FUNCTION__));

1755

assert((empty() || VL.size() == getNumLanes()) &&(static_cast <bool> ((empty() || VL.size() == getNumLanes
()) && "Expected same number of lanes") ? void (0) : __assert_fail
("(empty() || VL.size() == getNumLanes()) && \"Expected same number of lanes\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 1756, __extension__
__PRETTY_FUNCTION__))

1756

"Expected same number of lanes")(static_cast <bool> ((empty() || VL.size() == getNumLanes
()) && "Expected same number of lanes") ? void (0) : __assert_fail
("(empty() || VL.size() == getNumLanes()) && \"Expected same number of lanes\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 1756, __extension__
__PRETTY_FUNCTION__));

1757

assert(isa<Instruction>(VL[0]) && "Expected instruction")(static_cast <bool> (isa<Instruction>(VL[0]) &&
"Expected instruction") ? void (0) : __assert_fail ("isa<Instruction>(VL[0]) && \"Expected instruction\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 1757, __extension__
__PRETTY_FUNCTION__));

1758

unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();

1759

OpsVec.resize(NumOperands);

1760

unsigned NumLanes = VL.size();

1761

for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {

1762

OpsVec[OpIdx].resize(NumLanes);

1763

for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {

1764

assert(isa<Instruction>(VL[Lane]) && "Expected instruction")(static_cast <bool> (isa<Instruction>(VL[Lane]) &&
"Expected instruction") ? void (0) : __assert_fail ("isa<Instruction>(VL[Lane]) && \"Expected instruction\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 1764, __extension__
__PRETTY_FUNCTION__));

1765

// Our tree has just 3 nodes: the root and two operands.

1766

// It is therefore trivial to get the APO. We only need to check the

1767

// opcode of VL[Lane] and whether the operand at OpIdx is the LHS or

1768

// RHS operand. The LHS operand of both add and sub is never attached

1769

// to an inversese operation in the linearized form, therefore its APO

1770

// is false. The RHS is true only if VL[Lane] is an inverse operation.

1771

1772

// Since operand reordering is performed on groups of commutative

1773

// operations or alternating sequences (e.g., +, -), we can safely

1774

// tell the inverse operations by checking commutativity.

1775

bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));

1776

bool APO = (OpIdx == 0) ? false : IsInverseOperation;

1777

OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),

1778

APO, false};

1779

}

1780

}

1781

}

1782

1783

/// \returns the number of operands.

1784

unsigned getNumOperands() const { return OpsVec.size(); }

1785

1786

/// \returns the number of lanes.

1787

unsigned getNumLanes() const { return OpsVec[0].size(); }

1788

1789

/// \returns the operand value at \p OpIdx and \p Lane.

1790

Value *getValue(unsigned OpIdx, unsigned Lane) const {

1791

return getData(OpIdx, Lane).V;

1792

}

1793

1794

/// \returns true if the data structure is empty.

1795

bool empty() const { return OpsVec.empty(); }

1796

1797

/// Clears the data.

1798

void clear() { OpsVec.clear(); }

1799

1800

/// \Returns true if there are enough operands identical to \p Op to fill

1801

/// the whole vector.

1802

/// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.

1803

bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {

1804

bool OpAPO = getData(OpIdx, Lane).APO;

1805

for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {

1806

if (Ln == Lane)

1807

continue;

1808

// This is set to true if we found a candidate for broadcast at Lane.

1809

bool FoundCandidate = false;

1810

for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {

1811

OperandData &Data = getData(OpI, Ln);

1812

if (Data.APO != OpAPO || Data.IsUsed)

1813

continue;

1814

if (Data.V == Op) {

1815

FoundCandidate = true;

1816

Data.IsUsed = true;

1817

break;

1818

}

1819

}

1820

if (!FoundCandidate)

1821

return false;

1822

}

1823

return true;

1824

}

1825

1826

public:

1827

/// Initialize with all the operands of the instruction vector \p RootVL.

1828

VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,

1829

ScalarEvolution &SE, const BoUpSLP &R)

1830

: DL(DL), SE(SE), R(R) {

1831

// Append all the operands of RootVL.

1832

appendOperandsOfVL(RootVL);

1833

}

1834

1835

/// \Returns a value vector with the operands across all lanes for the

1836

/// opearnd at \p OpIdx.

1837

ValueList getVL(unsigned OpIdx) const {

1838

ValueList OpVL(OpsVec[OpIdx].size());

1839

assert(OpsVec[OpIdx].size() == getNumLanes() &&(static_cast <bool> (OpsVec[OpIdx].size() == getNumLanes
() && "Expected same num of lanes across all operands"
) ? void (0) : __assert_fail ("OpsVec[OpIdx].size() == getNumLanes() && \"Expected same num of lanes across all operands\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 1840, __extension__
__PRETTY_FUNCTION__))

1840

"Expected same num of lanes across all operands")(static_cast <bool> (OpsVec[OpIdx].size() == getNumLanes
() && "Expected same num of lanes across all operands"
) ? void (0) : __assert_fail ("OpsVec[OpIdx].size() == getNumLanes() && \"Expected same num of lanes across all operands\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 1840, __extension__
__PRETTY_FUNCTION__));

1841

for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)

1842

OpVL[Lane] = OpsVec[OpIdx][Lane].V;

1843

return OpVL;

1844

}

1845

1846

// Performs operand reordering for 2 or more operands.

1847

// The original operands are in OrigOps[OpIdx][Lane].

1848

// The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.

1849

void reorder() {

1850

unsigned NumOperands = getNumOperands();

1851

unsigned NumLanes = getNumLanes();

1852

// Each operand has its own mode. We are using this mode to help us select

1853

// the instructions for each lane, so that they match best with the ones

1854

// we have selected so far.

1855

SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);

1856

1857

// This is a greedy single-pass algorithm. We are going over each lane

1858

// once and deciding on the best order right away with no back-tracking.

1859

// However, in order to increase its effectiveness, we start with the lane

1860

// that has operands that can move the least. For example, given the

1861

// following lanes:

1862

// Lane 0 : A[0] = B[0] + C[0] // Visited 3rd

1863

// Lane 1 : A[1] = C[1] - B[1] // Visited 1st

1864

// Lane 2 : A[2] = B[2] + C[2] // Visited 2nd

1865

// Lane 3 : A[3] = C[3] - B[3] // Visited 4th

1866

// we will start at Lane 1, since the operands of the subtraction cannot

1867

// be reordered. Then we will visit the rest of the lanes in a circular

1868

// fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.

1869

1870

// Find the first lane that we will start our search from.

1871

unsigned FirstLane = getBestLaneToStartReordering();

1872

1873

// Initialize the modes.

1874

for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {

1875

Value *OpLane0 = getValue(OpIdx, FirstLane);

1876

// Keep track if we have instructions with all the same opcode on one

1877

// side.

1878

if (isa<LoadInst>(OpLane0))

1879

ReorderingModes[OpIdx] = ReorderingMode::Load;

1880

else if (isa<Instruction>(OpLane0)) {

1881

// Check if OpLane0 should be broadcast.

1882

if (shouldBroadcast(OpLane0, OpIdx, FirstLane))

1883

ReorderingModes[OpIdx] = ReorderingMode::Splat;

1884

else

1885

ReorderingModes[OpIdx] = ReorderingMode::Opcode;

1886

}

1887

else if (isa<Constant>(OpLane0))

1888

ReorderingModes[OpIdx] = ReorderingMode::Constant;

1889

else if (isa<Argument>(OpLane0))

1890

// Our best hope is a Splat. It may save some cost in some cases.

1891

ReorderingModes[OpIdx] = ReorderingMode::Splat;

1892

else

1893

// NOTE: This should be unreachable.

1894

ReorderingModes[OpIdx] = ReorderingMode::Failed;

1895

}

1896

1897

// Check that we don't have same operands. No need to reorder if operands

1898

// are just perfect diamond or shuffled diamond match. Do not do it only

1899

// for possible broadcasts or non-power of 2 number of scalars (just for

1900

// now).

1901

auto &&SkipReordering = [this]() {

1902

SmallPtrSet<Value *, 4> UniqueValues;

1903

ArrayRef<OperandData> Op0 = OpsVec.front();

1904

for (const OperandData &Data : Op0)

1905

UniqueValues.insert(Data.V);

1906

for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {

1907

if (any_of(Op, [&UniqueValues](const OperandData &Data) {

1908

return !UniqueValues.contains(Data.V);

1909

}))

1910

return false;

1911

}

1912

// TODO: Check if we can remove a check for non-power-2 number of

1913

// scalars after full support of non-power-2 vectorization.

1914

return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());

1915

};

1916

1917

// If the initial strategy fails for any of the operand indexes, then we

1918

// perform reordering again in a second pass. This helps avoid assigning

1919

// high priority to the failed strategy, and should improve reordering for

1920

// the non-failed operand indexes.

1921

for (int Pass = 0; Pass != 2; ++Pass) {

1922

// Check if no need to reorder operands since they're are perfect or

1923

// shuffled diamond match.

1924

// Need to to do it to avoid extra external use cost counting for

1925

// shuffled matches, which may cause regressions.

1926

if (SkipReordering())

1927

break;

1928

// Skip the second pass if the first pass did not fail.

1929

bool StrategyFailed = false;

1930

// Mark all operand data as free to use.

1931

clearUsed();

1932

// We keep the original operand order for the FirstLane, so reorder the

1933

// rest of the lanes. We are visiting the nodes in a circular fashion,

1934

// using FirstLane as the center point and increasing the radius

1935

// distance.

1936

SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);

1937

for (unsigned I = 0; I < NumOperands; ++I)

1938

MainAltOps[I].push_back(getData(I, FirstLane).V);

1939

1940

for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {

1941

// Visit the lane on the right and then the lane on the left.

1942

for (int Direction : {+1, -1}) {

1943

int Lane = FirstLane + Direction * Distance;

1944

if (Lane < 0 || Lane >= (int)NumLanes)

1945

continue;

1946

int LastLane = Lane - Direction;

1947

assert(LastLane >= 0 && LastLane < (int)NumLanes &&(static_cast <bool> (LastLane >= 0 && LastLane
< (int)NumLanes && "Out of bounds") ? void (0) : __assert_fail
("LastLane >= 0 && LastLane < (int)NumLanes && \"Out of bounds\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 1948, __extension__
__PRETTY_FUNCTION__))

1948

"Out of bounds")(static_cast <bool> (LastLane >= 0 && LastLane
< (int)NumLanes && "Out of bounds") ? void (0) : __assert_fail
("LastLane >= 0 && LastLane < (int)NumLanes && \"Out of bounds\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 1948, __extension__
__PRETTY_FUNCTION__));

1949

// Look for a good match for each operand.

1950

for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {

1951

// Search for the operand that matches SortedOps[OpIdx][Lane-1].

1952

Optional<unsigned> BestIdx = getBestOperand(

1953

OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);

1954

// By not selecting a value, we allow the operands that follow to

1955

// select a better matching value. We will get a non-null value in

1956

// the next run of getBestOperand().

1957

if (BestIdx) {

1958

// Swap the current operand with the one returned by

1959

// getBestOperand().

1960

swap(OpIdx, *BestIdx, Lane);

1961

} else {

1962

// We failed to find a best operand, set mode to 'Failed'.

1963

ReorderingModes[OpIdx] = ReorderingMode::Failed;

1964

// Enable the second pass.

1965

StrategyFailed = true;

1966

}

1967

// Try to get the alternate opcode and follow it during analysis.

1968

if (MainAltOps[OpIdx].size() != 2) {

1969

OperandData &AltOp = getData(OpIdx, Lane);

1970

InstructionsState OpS =

1971

getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V});

1972

if (OpS.getOpcode() && OpS.isAltShuffle())

1973

MainAltOps[OpIdx].push_back(AltOp.V);

1974

}

1975

}

1976

}

1977

}

1978

// Skip second pass if the strategy did not fail.

1979

if (!StrategyFailed)

1980

break;

1981

}

1982

}

1983

1984

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

1985

LLVM_DUMP_METHOD__attribute__((noinline)) __attribute__((__used__)) static StringRef getModeStr(ReorderingMode RMode) {

1986

switch (RMode) {

1987

case ReorderingMode::Load:

1988

return "Load";

1989

case ReorderingMode::Opcode:

1990

return "Opcode";

1991

case ReorderingMode::Constant:

1992

return "Constant";

1993

case ReorderingMode::Splat:

1994

return "Splat";

1995

case ReorderingMode::Failed:

1996

return "Failed";

1997

}

1998

llvm_unreachable("Unimplemented Reordering Type")::llvm::llvm_unreachable_internal("Unimplemented Reordering Type"
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 1998);

1999

}

2000

2001

LLVM_DUMP_METHOD__attribute__((noinline)) __attribute__((__used__)) static raw_ostream &printMode(ReorderingMode RMode,

2002

raw_ostream &OS) {

2003

return OS << getModeStr(RMode);

2004

}

2005

2006

/// Debug print.

2007

LLVM_DUMP_METHOD__attribute__((noinline)) __attribute__((__used__)) static void dumpMode(ReorderingMode RMode) {

2008

printMode(RMode, dbgs());

2009

}

2010

2011

/// Streams the name of \p RMode into \p OS.
friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
  raw_ostream &Out = printMode(RMode, OS);
  return Out;
}

2014

2015

LLVM_DUMP_METHOD__attribute__((noinline)) __attribute__((__used__)) raw_ostream &print(raw_ostream &OS) const {

2016

const unsigned Indent = 2;

2017

unsigned Cnt = 0;

2018

for (const OperandDataVec &OpDataVec : OpsVec) {

2019

OS << "Operand " << Cnt++ << "\n";

2020

for (const OperandData &OpData : OpDataVec) {

2021

OS.indent(Indent) << "{";

2022

if (Value *V = OpData.V)

2023

OS << *V;

2024

else

2025

OS << "null";

2026

OS << ", APO:" << OpData.APO << "}\n";

2027

}

2028

OS << "\n";

2029

}

2030

return OS;

2031

}

2032

2033

/// Debug print.

2034

LLVM_DUMP_METHOD__attribute__((noinline)) __attribute__((__used__)) void dump() const { print(dbgs()); }

2035

#endif

2036

};

2037

2038

/// Evaluate each pair in \p Candidates and return index into \p Candidates

2039

/// for a pair which have highest score deemed to have best chance to form

2040

/// root of profitable tree to vectorize. Return None if no candidate scored

2041

/// above the LookAheadHeuristics::ScoreFail.

2042

/// \param Limit Lower limit of the cost, considered to be good enough score.

2043

Optional<int>

2044

findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,

2045

int Limit = LookAheadHeuristics::ScoreFail) {

2046

LookAheadHeuristics LookAhead(*DL, *SE, *this, /*NumLanes=*/2,

2047

RootLookAheadMaxDepth);

2048

int BestScore = Limit;

2049

Optional<int> Index;

2050

for (int I : seq<int>(0, Candidates.size())) {

2051

int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,

2052

Candidates[I].second,

2053

/*U1=*/nullptr, /*U2=*/nullptr,

2054

/*Level=*/1, None);

2055

if (Score > BestScore) {

2056

BestScore = Score;

2057

Index = I;

2058

}

2059

}

2060

return Index;

2061

}

2062

2063

/// Checks if the instruction is marked for deletion.

2064

bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }

2065

2066

/// Removes an instruction from its block and eventually deletes it.

2067

/// It's like Instruction::eraseFromParent() except that the actual deletion

2068

/// is delayed until BoUpSLP is destructed.

2069

void eraseInstruction(Instruction *I) {

2070

DeletedInstructions.insert(I);

2071

}

2072

2073

/// Checks if the instruction was already analyzed for being possible

2074

/// reduction root.

2075

bool isAnalyzedReductionRoot(Instruction *I) const {

2076

return AnalyzedReductionsRoots.count(I);

2077

}

2078

/// Register given instruction as already analyzed for being possible

2079

/// reduction root.

2080

void analyzedReductionRoot(Instruction *I) {

2081

AnalyzedReductionsRoots.insert(I);

2082

}

2083

/// Checks if the provided list of reduced values was checked already for

2084

/// vectorization.

2085

bool areAnalyzedReductionVals(ArrayRef<Value *> VL) {

2086

return AnalyzedReductionVals.contains(hash_value(VL));

2087

}

2088

/// Adds the list of reduced values to list of already checked values for the

2089

/// vectorization.

2090

void analyzedReductionVals(ArrayRef<Value *> VL) {

2091

AnalyzedReductionVals.insert(hash_value(VL));

2092

}

2093

/// Clear the list of the analyzed reduction root instructions.

2094

void clearReductionData() {

2095

AnalyzedReductionsRoots.clear();

2096

AnalyzedReductionVals.clear();

2097

}

2098

/// Checks if the given value is gathered in one of the nodes.

2099

bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {

2100

return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });

2101

}

2102

2103

~BoUpSLP();

2104

2105

private:

2106

/// Check if the operands on the edges \p Edges of the \p UserTE allow

2107

/// reordering (i.e. the operands can be reordered because they have only one

2108

/// user and are reorderable).

2109

/// \param ReorderableGathers List of all gather nodes that require reordering

2110

/// (e.g., gather of extractelements or partially vectorizable loads).

2111

/// \param GatherOps List of gather operand nodes for \p UserTE that require

2112

/// reordering, subset of \p NonVectorized.

2113

bool

2114

canReorderOperands(TreeEntry *UserTE,

2115

SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,

2116

ArrayRef<TreeEntry *> ReorderableGathers,

2117

SmallVectorImpl<TreeEntry *> &GatherOps);

2118

2119

/// Checks if the given \p TE is a gather node with clustered reused scalars

2120

/// and reorders it per given \p Mask.

2121

void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

2122

2123

/// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,

2124

/// if any. If it is not vectorized (gather node), returns nullptr.

2125

TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {

2126

ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);

2127

TreeEntry *TE = nullptr;

2128

const auto *It = find_if(VL, [this, &TE](Value *V) {

2129

TE = getTreeEntry(V);

2130

return TE;

2131

});

2132

if (It != VL.end() && TE->isSame(VL))

2133

return TE;

2134

return nullptr;

2135

}

2136

2137

/// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,

2138

/// if any. If it is not vectorized (gather node), returns nullptr.

2139

const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,

2140

unsigned OpIdx) const {

2141

return const_cast<BoUpSLP *>(this)->getVectorizedOperand(

2142

const_cast<TreeEntry *>(UserTE), OpIdx);

2143

}

2144

2145

/// Checks if all users of \p I are the part of the vectorization tree.

2146

bool areAllUsersVectorized(Instruction *I,

2147

ArrayRef<Value *> VectorizedVals) const;

2148

2149

/// Return information about the vector formed for the specified index

2150

/// of a vector of (the same) instruction.

2151

TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> VL,

2152

unsigned OpIdx);

2153

2154

/// \returns the cost of the vectorizable entry.

2155

InstructionCost getEntryCost(const TreeEntry *E,

2156

ArrayRef<Value *> VectorizedVals);

2157

2158

/// This is the recursive part of buildTree.

2159

void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,

2160

const EdgeInfo &EI);

2161

2162

/// \returns true if the ExtractElement/ExtractValue instructions in \p VL can

2163

/// be vectorized to use the original vector (or aggregate "bitcast" to a

2164

/// vector) and sets \p CurrentOrder to the identity permutation; otherwise

2165

/// returns false, setting \p CurrentOrder to either an empty vector or a

2166

/// non-identity permutation that allows to reuse extract instructions.

2167

bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,

2168

SmallVectorImpl<unsigned> &CurrentOrder) const;

2169

2170

/// Vectorize a single entry in the tree.

2171

Value *vectorizeTree(TreeEntry *E);

2172

2173

/// Vectorize a single entry in the tree, starting in \p VL.

2174

Value *vectorizeTree(ArrayRef<Value *> VL);

2175

2176

/// Create a new vector from a list of scalar values. Produces a sequence

2177

/// which exploits values reused across lanes, and arranges the inserts

2178

/// for ease of later optimization.

2179

Value *createBuildVector(ArrayRef<Value *> VL);

2180

2181

/// \returns the scalarization cost for this type. Scalarization in this

2182

/// context means the creation of vectors from a group of scalars. If \p

2183

/// NeedToShuffle is true, need to add a cost of reshuffling some of the

2184

/// vector elements.

2185

InstructionCost getGatherCost(FixedVectorType *Ty,

2186

const APInt &ShuffledIndices,

2187

bool NeedToShuffle) const;

2188

2189

/// Returns the instruction in the bundle, which can be used as a base point

2190

/// for scheduling. Usually it is the last instruction in the bundle, except

2191

/// for the case when all operands are external (in this case, it is the first

2192

/// instruction in the list).

2193

Instruction &getLastInstructionInBundle(const TreeEntry *E);

2194

2195

/// Checks if the gathered \p VL can be represented as shuffle(s) of previous

2196

/// tree entries.

2197

/// \returns ShuffleKind, if gathered values can be represented as shuffles of

2198

/// previous tree entries. \p Mask is filled with the shuffle mask.

2199

Optional<TargetTransformInfo::ShuffleKind>

2200

isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,

2201

SmallVectorImpl<const TreeEntry *> &Entries);

2202

2203

/// \returns the scalarization cost for this list of values. Assuming that

2204

/// this subtree gets vectorized, we may need to extract the values from the

2205

/// roots. This method calculates the cost of extracting the values.

2206

InstructionCost getGatherCost(ArrayRef<Value *> VL) const;

2207

2208

/// Set the Builder insert point to one after the last instruction in

2209

/// the bundle

2210

void setInsertPointAfterBundle(const TreeEntry *E);

2211

2212

/// \returns a vector from a collection of scalars in \p VL.

2213

Value *gather(ArrayRef<Value *> VL);

2214

2215

/// \returns whether the VectorizableTree is fully vectorizable and will

2216

/// be beneficial even if the tree height is tiny.

2217

bool isFullyVectorizableTinyTree(bool ForReduction) const;

2218

2219

/// Reorder commutative or alt operands to get better probability of

2220

/// generating vectorized code.

2221

static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,

2222

SmallVectorImpl<Value *> &Left,

2223

SmallVectorImpl<Value *> &Right,

2224

const DataLayout &DL,

2225

ScalarEvolution &SE,

2226

const BoUpSLP &R);

2227

2228

/// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the

2229

/// users of \p TE and collects the stores. It returns the map from the store

2230

/// pointers to the collected stores.

2231

DenseMap<Value *, SmallVector<StoreInst *, 4>>

2232

collectUserStores(const BoUpSLP::TreeEntry *TE) const;

2233

2234

/// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the

2235

/// stores in \p StoresVec can form a vector instruction. If so it returns true

2236

/// and populates \p ReorderIndices with the shuffle indices of the stores

2237

/// when compared to the sorted vector.

2238

bool canFormVector(const SmallVector<StoreInst *, 4> &StoresVec,

2239

OrdersType &ReorderIndices) const;

2240

2241

/// Iterates through the users of \p TE, looking for scalar stores that can be

2242

/// potentially vectorized in a future SLP-tree. If found, it keeps track of

2243

/// their order and builds an order index vector for each store bundle. It

2244

/// returns all these order vectors found.

2245

/// We run this after the tree has formed, otherwise we may come across user

2246

/// instructions that are not yet in the tree.

2247

SmallVector<OrdersType, 1>

2248

findExternalStoreUsersReorderIndices(TreeEntry *TE) const;

2249

2250

struct TreeEntry {

2251

using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;

2252

/// Constructs an entry that keeps a reference to \p Container.
TreeEntry(VecTreeTy &Container) : Container(Container) {}

2253

2254

/// \returns true if the scalars in VL are equal to this entry.

2255

bool isSame(ArrayRef<Value *> VL) const {

2256

auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {

2257

if (Mask.size() != VL.size() && VL.size() == Scalars.size())

2258

return std::equal(VL.begin(), VL.end(), Scalars.begin());

2259

return VL.size() == Mask.size() &&

2260

std::equal(VL.begin(), VL.end(), Mask.begin(),

2261

[Scalars](Value *V, int Idx) {

2262

return (isa<UndefValue>(V) &&

2263

Idx == UndefMaskElem) ||

2264

(Idx != UndefMaskElem && V == Scalars[Idx]);

2265

});

2266

};

2267

if (!ReorderIndices.empty()) {

2268

// TODO: implement matching if the nodes are just reordered, still can

2269

// treat the vector as the same if the list of scalars matches VL

2270

// directly, without reordering.

2271

SmallVector<int> Mask;

2272

inversePermutation(ReorderIndices, Mask);

2273

if (VL.size() == Scalars.size())

2274

return IsSame(Scalars, Mask);

2275

if (VL.size() == ReuseShuffleIndices.size()) {

2276

::addMask(Mask, ReuseShuffleIndices);

2277

return IsSame(Scalars, Mask);

2278

}

2279

return false;

2280

}

2281

return IsSame(Scalars, ReuseShuffleIndices);

2282

}

2283

2284

/// \returns true if current entry has same operands as \p TE.

2285

bool hasEqualOperands(const TreeEntry &TE) const {

2286

if (TE.getNumOperands() != getNumOperands())

2287

return false;

2288

SmallBitVector Used(getNumOperands());

2289

for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {

2290

unsigned PrevCount = Used.count();

2291

for (unsigned K = 0; K < E; ++K) {

2292

if (Used.test(K))

2293

continue;

2294

if (getOperand(K) == TE.getOperand(I)) {

2295

Used.set(K);

2296

break;

2297

}

2298

}

2299

// Check if we actually found the matching operand.

2300

if (PrevCount == Used.count())

2301

return false;

2302

}

2303

return true;

2304

}

2305

2306

/// \return Final vectorization factor for the node. Defined by the total

2307

/// number of vectorized scalars, including those, used several times in the

2308

/// entry and counted in the \a ReuseShuffleIndices, if any.

2309

unsigned getVectorFactor() const {

2310

if (!ReuseShuffleIndices.empty())

2311

return ReuseShuffleIndices.size();

2312

return Scalars.size();

2313

};

2314

2315

/// A vector of scalars.

2316

ValueList Scalars;

2317

2318

/// The Scalars are vectorized into this value. It is initialized to Null.

2319

Value *VectorizedValue = nullptr;

2320

2321

/// Do we need to gather this sequence or vectorize it

2322

/// (either with vector instruction or with scatter/gather

2323

/// intrinsics for store/load)?

2324

enum EntryState { Vectorize, ScatterVectorize, NeedToGather };

2325

EntryState State;

2326

2327

/// Does this sequence require some shuffling?

2328

SmallVector<int, 4> ReuseShuffleIndices;

2329

2330

/// Does this entry require reordering?

2331

SmallVector<unsigned, 4> ReorderIndices;

2332

2333

/// Points back to the VectorizableTree.

2334

///

2335

/// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has

2336

/// to be a pointer and needs to be able to initialize the child iterator.

2337

/// Thus we need a reference back to the container to translate the indices

2338

/// to entries.

2339

VecTreeTy &Container;

2340

2341

/// The TreeEntry index containing the user of this entry. We can actually

2342

/// have multiple users so the data structure is not truly a tree.

2343

SmallVector<EdgeInfo, 1> UserTreeIndices;

2344

2345

/// The index of this treeEntry in VectorizableTree.

2346

int Idx = -1;

2347

2348

private:

2349

/// The operands of each instruction in each lane Operands[op_index][lane].

2350

/// Note: This helps avoid the replication of the code that performs the

2351

/// reordering of operands during buildTree_rec() and vectorizeTree().

2352

SmallVector<ValueList, 2> Operands;

2353

2354

/// The main/alternate instruction.

2355

Instruction *MainOp = nullptr;

2356

Instruction *AltOp = nullptr;

2357

2358

public:

2359

/// Set this bundle's \p OpIdx'th operand to \p OpVL.

2360

void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {

2361

if (Operands.size() < OpIdx + 1)

2362

Operands.resize(OpIdx + 1);

2363

assert(Operands[OpIdx].empty() && "Already resized?")(static_cast <bool> (Operands[OpIdx].empty() &&
"Already resized?") ? void (0) : __assert_fail ("Operands[OpIdx].empty() && \"Already resized?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2363, __extension__
__PRETTY_FUNCTION__));

2364

assert(OpVL.size() <= Scalars.size() &&(static_cast <bool> (OpVL.size() <= Scalars.size() &&
"Number of operands is greater than the number of scalars.")
? void (0) : __assert_fail ("OpVL.size() <= Scalars.size() && \"Number of operands is greater than the number of scalars.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2365, __extension__
__PRETTY_FUNCTION__))

2365

"Number of operands is greater than the number of scalars.")(static_cast <bool> (OpVL.size() <= Scalars.size() &&
"Number of operands is greater than the number of scalars.")
? void (0) : __assert_fail ("OpVL.size() <= Scalars.size() && \"Number of operands is greater than the number of scalars.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2365, __extension__
__PRETTY_FUNCTION__));

2366

Operands[OpIdx].resize(OpVL.size());

2367

copy(OpVL, Operands[OpIdx].begin());

2368

}

2369

2370

/// Set the operands of this bundle in their original order.

2371

void setOperandsInOrder() {

2372

assert(Operands.empty() && "Already initialized?")(static_cast <bool> (Operands.empty() && "Already initialized?"
) ? void (0) : __assert_fail ("Operands.empty() && \"Already initialized?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2372, __extension__
__PRETTY_FUNCTION__));

2373

auto *I0 = cast<Instruction>(Scalars[0]);

2374

Operands.resize(I0->getNumOperands());

2375

unsigned NumLanes = Scalars.size();

2376

for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();

2377

OpIdx != NumOperands; ++OpIdx) {

2378

Operands[OpIdx].resize(NumLanes);

2379

for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {

2380

auto *I = cast<Instruction>(Scalars[Lane]);

2381

assert(I->getNumOperands() == NumOperands &&(static_cast <bool> (I->getNumOperands() == NumOperands
&& "Expected same number of operands") ? void (0) : __assert_fail
("I->getNumOperands() == NumOperands && \"Expected same number of operands\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2382, __extension__
__PRETTY_FUNCTION__))

2382

"Expected same number of operands")(static_cast <bool> (I->getNumOperands() == NumOperands
&& "Expected same number of operands") ? void (0) : __assert_fail
("I->getNumOperands() == NumOperands && \"Expected same number of operands\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2382, __extension__
__PRETTY_FUNCTION__));

2383

Operands[OpIdx][Lane] = I->getOperand(OpIdx);

2384

}

2385

}

2386

}

2387

2388

/// Reorders operands of the node to the given mask \p Mask.

2389

void reorderOperands(ArrayRef<int> Mask) {

2390

for (ValueList &Operand : Operands)

2391

reorderScalars(Operand, Mask);

2392

}

2393

2394

/// \returns the \p OpIdx operand of this TreeEntry.

2395

ValueList &getOperand(unsigned OpIdx) {

2396

assert(OpIdx < Operands.size() && "Off bounds")(static_cast <bool> (OpIdx < Operands.size() &&
"Off bounds") ? void (0) : __assert_fail ("OpIdx < Operands.size() && \"Off bounds\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2396, __extension__
__PRETTY_FUNCTION__));

2397

return Operands[OpIdx];

2398

}

2399

2400

/// \returns the \p OpIdx operand of this TreeEntry.

2401

ArrayRef<Value *> getOperand(unsigned OpIdx) const {

2402

assert(OpIdx < Operands.size() && "Off bounds")(static_cast <bool> (OpIdx < Operands.size() &&
"Off bounds") ? void (0) : __assert_fail ("OpIdx < Operands.size() && \"Off bounds\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2402, __extension__
__PRETTY_FUNCTION__));

2403

return Operands[OpIdx];

2404

}

2405

2406

/// \returns the number of operands.

2407

unsigned getNumOperands() const { return Operands.size(); }

2408

2409

/// \return the single \p OpIdx operand.

2410

Value *getSingleOperand(unsigned OpIdx) const {

2411

assert(OpIdx < Operands.size() && "Off bounds")(static_cast <bool> (OpIdx < Operands.size() &&
"Off bounds") ? void (0) : __assert_fail ("OpIdx < Operands.size() && \"Off bounds\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2411, __extension__
__PRETTY_FUNCTION__));

2412

assert(!Operands[OpIdx].empty() && "No operand available")(static_cast <bool> (!Operands[OpIdx].empty() &&
"No operand available") ? void (0) : __assert_fail ("!Operands[OpIdx].empty() && \"No operand available\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2412, __extension__
__PRETTY_FUNCTION__));

2413

return Operands[OpIdx][0];

2414

}

2415

2416

/// Some of the instructions in the list have alternate opcodes.

2417

bool isAltShuffle() const { return MainOp != AltOp; }

2418

2419

/// \returns true if the opcode of \p I equals either the main or the
/// alternate opcode of this entry.
bool isOpcodeOrAlt(Instruction *I) const {
  const unsigned Opc = I->getOpcode();
  if (Opc == getOpcode())
    return true;
  return Opc == getAltOpcode();
}

2424

2425

/// Chooses the correct key for scheduling data. If \p Op has the same (or

2426

/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is

2427

/// \p OpValue.

2428

Value *isOneOf(Value *Op) const {

2429

auto *I = dyn_cast<Instruction>(Op);

2430

if (I && isOpcodeOrAlt(I))

2431

return Op;

2432

return MainOp;

2433

}

2434

2435

void setOperations(const InstructionsState &S) {

2436

MainOp = S.MainOp;

2437

AltOp = S.AltOp;

2438

}

2439

2440

/// \returns the main instruction of this entry (may be null).
Instruction *getMainOp() const { return MainOp; }

2443

2444

/// \returns the alternate instruction of this entry (may be null).
Instruction *getAltOp() const { return AltOp; }

2447

2448

/// The main/alternate opcodes for the list of instructions.

2449

unsigned getOpcode() const {

2450

return MainOp ? MainOp->getOpcode() : 0;

2451

}

2452

2453

unsigned getAltOpcode() const {

2454

return AltOp ? AltOp->getOpcode() : 0;

2455

}

2456

2457

/// When ReuseReorderShuffleIndices is empty it just returns position of \p

2458

/// V within vector of Scalars. Otherwise, try to remap on its reuse index.

2459

int findLaneForValue(Value *V) const {

2460

unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));

2461

assert(FoundLane < Scalars.size() && "Couldn't find extract lane")(static_cast <bool> (FoundLane < Scalars.size() &&
"Couldn't find extract lane") ? void (0) : __assert_fail ("FoundLane < Scalars.size() && \"Couldn't find extract lane\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2461, __extension__
__PRETTY_FUNCTION__));

2462

if (!ReorderIndices.empty())

2463

FoundLane = ReorderIndices[FoundLane];

2464

assert(FoundLane < Scalars.size() && "Couldn't find extract lane")(static_cast <bool> (FoundLane < Scalars.size() &&
"Couldn't find extract lane") ? void (0) : __assert_fail ("FoundLane < Scalars.size() && \"Couldn't find extract lane\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2464, __extension__
__PRETTY_FUNCTION__));

2465

if (!ReuseShuffleIndices.empty()) {

2466

FoundLane = std::distance(ReuseShuffleIndices.begin(),

2467

find(ReuseShuffleIndices, FoundLane));

2468

}

2469

return FoundLane;

2470

}

2471

2472

#ifndef NDEBUG

2473

/// Debug printer.

2474

LLVM_DUMP_METHOD__attribute__((noinline)) __attribute__((__used__)) void dump() const {

2475

dbgs() << Idx << ".\n";

2476

for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {

2477

dbgs() << "Operand " << OpI << ":\n";

2478

for (const Value *V : Operands[OpI])

2479

dbgs().indent(2) << *V << "\n";

2480

}

2481

dbgs() << "Scalars: \n";

2482

for (Value *V : Scalars)

2483

dbgs().indent(2) << *V << "\n";

2484

dbgs() << "State: ";

2485

switch (State) {

2486

case Vectorize:

2487

dbgs() << "Vectorize\n";

2488

break;

2489

case ScatterVectorize:

2490

dbgs() << "ScatterVectorize\n";

2491

break;

2492

case NeedToGather:

2493

dbgs() << "NeedToGather\n";

2494

break;

2495

}

2496

dbgs() << "MainOp: ";

2497

if (MainOp)

2498

dbgs() << *MainOp << "\n";

2499

else

2500

dbgs() << "NULL\n";

2501

dbgs() << "AltOp: ";

2502

if (AltOp)

2503

dbgs() << *AltOp << "\n";

2504

else

2505

dbgs() << "NULL\n";

2506

dbgs() << "VectorizedValue: ";

2507

if (VectorizedValue)

2508

dbgs() << *VectorizedValue << "\n";

2509

else

2510

dbgs() << "NULL\n";

2511

dbgs() << "ReuseShuffleIndices: ";

2512

if (ReuseShuffleIndices.empty())

2513

dbgs() << "Empty";

2514

else

2515

for (int ReuseIdx : ReuseShuffleIndices)

2516

dbgs() << ReuseIdx << ", ";

2517

dbgs() << "\n";

2518

dbgs() << "ReorderIndices: ";

2519

for (unsigned ReorderIdx : ReorderIndices)

2520

dbgs() << ReorderIdx << ", ";

2521

dbgs() << "\n";

2522

dbgs() << "UserTreeIndices: ";

2523

for (const auto &EInfo : UserTreeIndices)

2524

dbgs() << EInfo << ", ";

2525

dbgs() << "\n";

2526

}

2527

#endif

2528

};

2529

2530

#ifndef NDEBUG

2531

void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,

2532

InstructionCost VecCost,

2533

InstructionCost ScalarCost) const {

2534

dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump();

2535

dbgs() << "SLP: Costs:\n";

2536

dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";

2537

dbgs() << "SLP: VectorCost = " << VecCost << "\n";

2538

dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";

2539

dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " <<

2540

ReuseShuffleCost + VecCost - ScalarCost << "\n";

2541

}

2542

#endif

2543

2544

/// Create a new VectorizableTree entry.

2545

TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle,

2546

const InstructionsState &S,

2547

const EdgeInfo &UserTreeIdx,

2548

ArrayRef<int> ReuseShuffleIndices = None,

2549

ArrayRef<unsigned> ReorderIndices = None) {

2550

TreeEntry::EntryState EntryState =

2551

Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;

2552

return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,

2553

ReuseShuffleIndices, ReorderIndices);

2554

}

2555

2556

/// Create a new VectorizableTree entry with the explicit state
/// \p EntryState and append it to VectorizableTree.
/// \param VL Scalars of the entry, in their original order.
/// \param EntryState State of the entry; must be NeedToGather exactly when
/// \p Bundle is empty (asserted).
/// \param Bundle Scheduling bundle for non-gather entries.
/// \param S Main/alternate instruction state used when no reordering is
/// requested.
/// \param UserTreeIdx Edge to the user entry; recorded only if it has a
/// UserTE.
/// \param ReuseShuffleIndices Copied into the entry.
/// \param ReorderIndices If non-empty, the entry's scalars are \p VL
/// permuted by these indices (out-of-range indices become undef) and the
/// instruction state is recomputed from the permuted scalars.
/// \returns the newly created entry, owned by VectorizableTree.
TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                        TreeEntry::EntryState EntryState,
                        Optional<ScheduleData *> Bundle,
                        const InstructionsState &S,
                        const EdgeInfo &UserTreeIdx,
                        ArrayRef<int> ReuseShuffleIndices = None,
                        ArrayRef<unsigned> ReorderIndices = None) {
  assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
          (Bundle && EntryState != TreeEntry::NeedToGather)) &&
         "Need to vectorize gather entry?");
  VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
  TreeEntry *Last = VectorizableTree.back().get();
  Last->Idx = VectorizableTree.size() - 1;
  Last->State = EntryState;
  Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                   ReuseShuffleIndices.end());
  if (ReorderIndices.empty()) {
    Last->Scalars.assign(VL.begin(), VL.end());
    Last->setOperations(S);
  } else {
    // Reorder scalars and build final mask.
    Last->Scalars.assign(VL.size(), nullptr);
    transform(ReorderIndices, Last->Scalars.begin(),
              [VL](unsigned Idx) -> Value * {
                if (Idx >= VL.size())
                  return UndefValue::get(VL.front()->getType());
                return VL[Idx];
              });
    // Named differently from the parameter S, which is still needed by the
    // bundle assertion below.
    InstructionsState ReorderedS = getSameOpcode(Last->Scalars);
    Last->setOperations(ReorderedS);
    Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
  }
  if (Last->State != TreeEntry::NeedToGather) {
    for (Value *V : VL) {
      assert(!getTreeEntry(V) && "Scalar already in tree!");
      ScalarToTreeEntry[V] = Last;
    }
    // Update the scheduler bundle to point to this TreeEntry.
    ScheduleData *BundleMember = *Bundle;
    assert((BundleMember || isa<PHINode>(S.MainOp) ||
            isVectorLikeInstWithConstOps(S.MainOp) ||
            doesNotNeedToSchedule(VL)) &&
           "Bundle and VL out of sync");
    if (BundleMember) {
      for (Value *V : VL) {
        if (doesNotNeedToBeScheduled(V))
          continue;
        assert(BundleMember && "Unexpected end of bundle.");
        BundleMember->TE = Last;
        BundleMember = BundleMember->NextInBundle;
      }
    }
    assert(!BundleMember && "Bundle and VL out of sync");
  } else {
    MustGather.insert(VL.begin(), VL.end());
  }

  if (UserTreeIdx.UserTE)
    Last->UserTreeIndices.push_back(UserTreeIdx);

  return Last;
}

2618

2619

/// -- Vectorization State --

2620

/// Holds all of the tree entries.

2621

TreeEntry::VecTreeTy VectorizableTree;

2622

2623

#ifndef NDEBUG

2624

/// Debug printer.

2625

LLVM_DUMP_METHOD__attribute__((noinline)) __attribute__((__used__)) void dumpVectorizableTree() const {

2626

for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {

2627

VectorizableTree[Id]->dump();

2628

dbgs() << "\n";

2629

}

2630

}

2631

#endif

2632

2633

/// Returns whatever ScalarToTreeEntry.lookup() yields for scalar \p V, i.e.
/// the tree entry recorded for it (if any).
TreeEntry *getTreeEntry(Value *V) {
  return ScalarToTreeEntry.lookup(V);
}

2634

2635

/// Const overload: same lookup in ScalarToTreeEntry for scalar \p V, but the
/// result is handed out as a pointer to const.
const TreeEntry *getTreeEntry(Value *V) const { return ScalarToTreeEntry.lookup(V); }

2638

2639

/// Maps a specific scalar to its tree entry.

2640

SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;

2641

2642

/// Maps a value to the proposed vectorizable size.

2643

SmallDenseMap<Value *, unsigned> InstrElementSize;

2644

2645

/// A list of scalars that we found that we need to keep as scalars.

2646

ValueSet MustGather;

2647

2648

/// This POD struct describes one external user in the vectorized tree.

2649

struct ExternalUser {

2650

ExternalUser(Value *S, llvm::User *U, int L)

2651

: Scalar(S), User(U), Lane(L) {}

2652

2653

// Which scalar in our function.

2654

Value *Scalar;

2655

2656

// Which user that uses the scalar.

2657

llvm::User *User;

2658

2659

// Which lane does the scalar belong to.

2660

int Lane;

2661

};

2662

using UserList = SmallVector<ExternalUser, 16>;

2663

2664

/// Checks if two instructions may access the same memory.

2665

///

2666

/// \p Loc1 is the location of \p Inst1. It is passed explicitly because it

2667

/// is invariant in the calling loop.

2668

bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,

2669

Instruction *Inst2) {

2670

// First check if the result is already in the cache.

2671

AliasCacheKey key = std::make_pair(Inst1, Inst2);

2672

Optional<bool> &result = AliasCache[key];

2673

if (result) {

2674

return result.value();

2675

}

2676

bool aliased = true;

2677

if (Loc1.Ptr && isSimple(Inst1))

2678

aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));

2679

// Store the result in the cache.

2680

result = aliased;

2681

return aliased;

2682

}

2683

2684

using AliasCacheKey = std::pair<Instruction *, Instruction *>;

2685

2686

/// Cache for alias results.

2687

/// TODO: consider moving this to the AliasAnalysis itself.

2688

DenseMap<AliasCacheKey, Optional<bool>> AliasCache;

2689

2690

// Cache for pointerMayBeCaptured calls inside AA. This is preserved

2691

// globally through SLP because we don't perform any action which

2692

// invalidates capture results.

2693

BatchAAResults BatchAA;

2694

2695

/// Temporary store for deleted instructions. Instructions will be deleted

2696

/// eventually when the BoUpSLP is destructed. The deferral is required to

2697

/// ensure that there are no incorrect collisions in the AliasCache, which

2698

/// can happen if a new instruction is allocated at the same address as a

2699

/// previously deleted instruction.

2700

DenseSet<Instruction *> DeletedInstructions;

2701

2702

/// Set of the instructions that have already been analyzed for reductions.

2703

SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;

2704

2705

/// Set of hashes for the list of reduction values already being analyzed.

2706

DenseSet<size_t> AnalyzedReductionVals;

2707

2708

/// A list of values that need to be extracted out of the tree.

2709

/// This list holds pairs of (Internal Scalar : External User). External User

2710

/// can be nullptr, it means that this Internal Scalar will be used later,

2711

/// after vectorization.

2712

UserList ExternalUses;

2713

2714

/// Values used only by @llvm.assume calls.

2715

SmallPtrSet<const Value *, 32> EphValues;

2716

2717

/// Holds all of the instructions that we gathered.

2718

SetVector<Instruction *> GatherShuffleSeq;

2719

2720

/// A list of blocks that we are going to CSE.

2721

SetVector<BasicBlock *> CSEBlocks;

2722

2723

/// Contains all scheduling relevant data for an instruction.

2724

/// A ScheduleData either represents a single instruction or a member of an

2725

/// instruction bundle (= a group of instructions which is combined into a

2726

/// vector instruction).

2727

struct ScheduleData {

2728

// The initial value for the dependency counters. It means that the

2729

// dependencies are not calculated yet.

2730

enum { InvalidDeps = -1 };

2731

2732

ScheduleData() = default;

2733

2734

void init(int BlockSchedulingRegionID, Value *OpVal) {

2735

FirstInBundle = this;

2736

NextInBundle = nullptr;

2737

NextLoadStore = nullptr;

2738

IsScheduled = false;

2739

SchedulingRegionID = BlockSchedulingRegionID;

2740

clearDependencies();

2741

OpValue = OpVal;

2742

TE = nullptr;

2743

}

2744

2745

/// Verify basic self consistency properties

2746

void verify() {

2747

if (hasValidDependencies()) {

2748

assert(UnscheduledDeps <= Dependencies && "invariant")(static_cast <bool> (UnscheduledDeps <= Dependencies
&& "invariant") ? void (0) : __assert_fail ("UnscheduledDeps <= Dependencies && \"invariant\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2748, __extension__
__PRETTY_FUNCTION__));

2749

} else {

2750

assert(UnscheduledDeps == Dependencies && "invariant")(static_cast <bool> (UnscheduledDeps == Dependencies &&
"invariant") ? void (0) : __assert_fail ("UnscheduledDeps == Dependencies && \"invariant\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2750, __extension__
__PRETTY_FUNCTION__));

2751

}

2752

2753

if (IsScheduled) {

2754

assert(isSchedulingEntity() &&(static_cast <bool> (isSchedulingEntity() && "unexpected scheduled state"
) ? void (0) : __assert_fail ("isSchedulingEntity() && \"unexpected scheduled state\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2755, __extension__
__PRETTY_FUNCTION__))

2755

"unexpected scheduled state")(static_cast <bool> (isSchedulingEntity() && "unexpected scheduled state"
) ? void (0) : __assert_fail ("isSchedulingEntity() && \"unexpected scheduled state\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2755, __extension__
__PRETTY_FUNCTION__));

2756

for (const ScheduleData *BundleMember = this; BundleMember;

2757

BundleMember = BundleMember->NextInBundle) {

2758

assert(BundleMember->hasValidDependencies() &&(static_cast <bool> (BundleMember->hasValidDependencies
() && BundleMember->UnscheduledDeps == 0 &&
"unexpected scheduled state") ? void (0) : __assert_fail ("BundleMember->hasValidDependencies() && BundleMember->UnscheduledDeps == 0 && \"unexpected scheduled state\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2760, __extension__
__PRETTY_FUNCTION__))

2759

BundleMember->UnscheduledDeps == 0 &&(static_cast <bool> (BundleMember->hasValidDependencies
() && BundleMember->UnscheduledDeps == 0 &&
"unexpected scheduled state") ? void (0) : __assert_fail ("BundleMember->hasValidDependencies() && BundleMember->UnscheduledDeps == 0 && \"unexpected scheduled state\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2760, __extension__
__PRETTY_FUNCTION__))

2760

"unexpected scheduled state")(static_cast <bool> (BundleMember->hasValidDependencies
() && BundleMember->UnscheduledDeps == 0 &&
"unexpected scheduled state") ? void (0) : __assert_fail ("BundleMember->hasValidDependencies() && BundleMember->UnscheduledDeps == 0 && \"unexpected scheduled state\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2760, __extension__
__PRETTY_FUNCTION__));

2761

assert((BundleMember == this || !BundleMember->IsScheduled) &&(static_cast <bool> ((BundleMember == this || !BundleMember
->IsScheduled) && "only bundle is marked scheduled"
) ? void (0) : __assert_fail ("(BundleMember == this || !BundleMember->IsScheduled) && \"only bundle is marked scheduled\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2762, __extension__
__PRETTY_FUNCTION__))

2762

"only bundle is marked scheduled")(static_cast <bool> ((BundleMember == this || !BundleMember
->IsScheduled) && "only bundle is marked scheduled"
) ? void (0) : __assert_fail ("(BundleMember == this || !BundleMember->IsScheduled) && \"only bundle is marked scheduled\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2762, __extension__
__PRETTY_FUNCTION__));

2763

}

2764

}

2765

2766

assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&(static_cast <bool> (Inst->getParent() == FirstInBundle
->Inst->getParent() && "all bundle members must be in same basic block"
) ? void (0) : __assert_fail ("Inst->getParent() == FirstInBundle->Inst->getParent() && \"all bundle members must be in same basic block\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2767, __extension__
__PRETTY_FUNCTION__))

2767

"all bundle members must be in same basic block")(static_cast <bool> (Inst->getParent() == FirstInBundle
->Inst->getParent() && "all bundle members must be in same basic block"
) ? void (0) : __assert_fail ("Inst->getParent() == FirstInBundle->Inst->getParent() && \"all bundle members must be in same basic block\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2767, __extension__
__PRETTY_FUNCTION__));

2768

}

2769

2770

/// Returns true if the dependency information has been calculated.

2771

/// Note that depenendency validity can vary between instructions within

2772

/// a single bundle.

2773

bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

2774

2775

/// Returns true for single instructions and for bundle representatives

2776

/// (= the head of a bundle).

2777

bool isSchedulingEntity() const { return FirstInBundle == this; }

2778

2779

/// Returns true if it represents an instruction bundle and not only a

2780

/// single instruction.

2781

bool isPartOfBundle() const {

2782

return NextInBundle != nullptr || FirstInBundle != this || TE;

2783

}

2784

2785

/// Returns true if it is ready for scheduling, i.e. it has no more

2786

/// unscheduled depending instructions/bundles.

2787

bool isReady() const {

2788

assert(isSchedulingEntity() &&(static_cast <bool> (isSchedulingEntity() && "can't consider non-scheduling entity for ready list"
) ? void (0) : __assert_fail ("isSchedulingEntity() && \"can't consider non-scheduling entity for ready list\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2789, __extension__
__PRETTY_FUNCTION__))

2789

"can't consider non-scheduling entity for ready list")(static_cast <bool> (isSchedulingEntity() && "can't consider non-scheduling entity for ready list"
) ? void (0) : __assert_fail ("isSchedulingEntity() && \"can't consider non-scheduling entity for ready list\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2789, __extension__
__PRETTY_FUNCTION__));

2790

return unscheduledDepsInBundle() == 0 && !IsScheduled;

2791

}

2792

2793

/// Modifies the number of unscheduled dependencies for this instruction,

2794

/// and returns the number of remaining dependencies for the containing

2795

/// bundle.

2796

int incrementUnscheduledDeps(int Incr) {

2797

assert(hasValidDependencies() &&(static_cast <bool> (hasValidDependencies() && "increment of unscheduled deps would be meaningless"
) ? void (0) : __assert_fail ("hasValidDependencies() && \"increment of unscheduled deps would be meaningless\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2798, __extension__
__PRETTY_FUNCTION__))

2798

"increment of unscheduled deps would be meaningless")(static_cast <bool> (hasValidDependencies() && "increment of unscheduled deps would be meaningless"
) ? void (0) : __assert_fail ("hasValidDependencies() && \"increment of unscheduled deps would be meaningless\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2798, __extension__
__PRETTY_FUNCTION__));

2799

UnscheduledDeps += Incr;

2800

return FirstInBundle->unscheduledDepsInBundle();

2801

}

2802

2803

/// Sets the number of unscheduled dependencies to the number of

2804

/// dependencies.

2805

void resetUnscheduledDeps() {

2806

UnscheduledDeps = Dependencies;

2807

}

2808

2809

/// Clears all dependency information.

2810

void clearDependencies() {

2811

Dependencies = InvalidDeps;

2812

resetUnscheduledDeps();

2813

MemoryDependencies.clear();

2814

ControlDependencies.clear();

2815

}

2816

2817

int unscheduledDepsInBundle() const {

2818

assert(isSchedulingEntity() && "only meaningful on the bundle")(static_cast <bool> (isSchedulingEntity() && "only meaningful on the bundle"
) ? void (0) : __assert_fail ("isSchedulingEntity() && \"only meaningful on the bundle\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 2818, __extension__
__PRETTY_FUNCTION__));

2819

int Sum = 0;

2820

for (const ScheduleData *BundleMember = this; BundleMember;

2821

BundleMember = BundleMember->NextInBundle) {

2822

if (BundleMember->UnscheduledDeps == InvalidDeps)

2823

return InvalidDeps;

2824

Sum += BundleMember->UnscheduledDeps;

2825

}

2826

return Sum;

2827

}

2828

2829

void dump(raw_ostream &os) const {

2830

if (!isSchedulingEntity()) {

2831

os << "/ " << *Inst;

2832

} else if (NextInBundle) {

2833

os << '[' << *Inst;

2834

ScheduleData *SD = NextInBundle;

2835

while (SD) {

2836

os << ';' << *SD->Inst;

2837

SD = SD->NextInBundle;

2838

}

2839

os << ']';

2840

} else {

2841

os << *Inst;

2842

}

2843

}

2844

2845

Instruction *Inst = nullptr;

2846

2847

/// Opcode of the current instruction in the schedule data.

2848

Value *OpValue = nullptr;

2849

2850

/// The TreeEntry that this instruction corresponds to.

2851

TreeEntry *TE = nullptr;

2852

2853

/// Points to the head in an instruction bundle (and always to this for

2854

/// single instructions).

2855

ScheduleData *FirstInBundle = nullptr;

2856

2857

/// Single linked list of all instructions in a bundle. Null if it is a

2858

/// single instruction.

2859

ScheduleData *NextInBundle = nullptr;

2860

2861

/// Single linked list of all memory instructions (e.g. load, store, call)

2862

/// in the block - until the end of the scheduling region.

2863

ScheduleData *NextLoadStore = nullptr;

2864

2865

/// The dependent memory instructions.

2866

/// This list is derived on demand in calculateDependencies().

2867

SmallVector<ScheduleData *, 4> MemoryDependencies;

2868

2869

/// List of instructions which this instruction could be control dependent

2870

/// on. Allowing such nodes to be scheduled below this one could introduce

2871

/// a runtime fault which didn't exist in the original program.

2872

/// ex: this is a load or udiv following a readonly call which inf loops

2873

SmallVector<ScheduleData *, 4> ControlDependencies;

2874

2875

/// This ScheduleData is in the current scheduling region if this matches

2876

/// the current SchedulingRegionID of BlockScheduling.

2877

int SchedulingRegionID = 0;

2878

2879

/// Used for getting a "good" final ordering of instructions.

2880

int SchedulingPriority = 0;

2881

2882

/// The number of dependencies. Constitutes of the number of users of the

2883

/// instruction plus the number of dependent memory instructions (if any).

2884

/// This value is calculated on demand.

2885

/// If InvalidDeps, the number of dependencies is not calculated yet.

2886

int Dependencies = InvalidDeps;

2887

2888

/// The number of dependencies minus the number of dependencies of scheduled

2889

/// instructions. As soon as this is zero, the instruction/bundle gets ready

2890

/// for scheduling.

2891

/// Note that this is negative as long as Dependencies is not calculated.

2892

int UnscheduledDeps = InvalidDeps;

2893

2894

/// True if this instruction is scheduled (or considered as scheduled in the

2895

/// dry-run).

2896

bool IsScheduled = false;

2897

};

2898

2899

#ifndef NDEBUG

2900

/// Streams \p SD to \p os by forwarding to ScheduleData::dump() and returns
/// \p os so that insertions can be chained.
friend inline raw_ostream &operator<<(raw_ostream &os,
                                      const BoUpSLP::ScheduleData &SD) {
  SD.dump(os);
  return os;
}

2905

#endif

2906

2907

friend struct GraphTraits<BoUpSLP *>;

2908

friend struct DOTGraphTraits<BoUpSLP *>;

2909

2910

/// Contains all scheduling data for a basic block.

2911

/// It does not schedule instructions, which are not memory read/write

2912

/// instructions and their operands are either constants, or arguments, or

2913

/// phis, or instructions from others blocks, or their users are phis or from

2914

/// the other blocks. The resulting vector instructions can be placed at the

2915

/// beginning of the basic block without scheduling (if operands do not need

2916

/// to be scheduled) or at the end of the block (if users are outside of the

2917

/// block). It allows to save some compile time and memory used by the

2918

/// compiler.

2919

/// ScheduleData is assigned for each instruction in between the boundaries of

2920

/// the tree entry, even for those, which are not part of the graph. It is

2921

/// required to correctly follow the dependencies between the instructions and

2922

/// their correct scheduling. The ScheduleData is not allocated for the

2923

/// instructions, which do not require scheduling, like phis, nodes with

2924

/// extractelements/insertelements only or nodes with instructions, with

2925

/// uses/operands outside of the block.

2926

struct BlockScheduling {

2927

/// Creates the scheduling state for block \p BB. ChunkSize is taken from
/// BB->size() and ChunkPos starts out equal to ChunkSize.
// NOTE(review): ChunkPos is initialized from the ChunkSize member, so this
// relies on ChunkSize being declared before ChunkPos - confirm against the
// member declarations.
BlockScheduling(BasicBlock *BB)
    : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

2929

2930

void clear() {

2931

ReadyInsts.clear();

2932

ScheduleStart = nullptr;

2933

ScheduleEnd = nullptr;

2934

FirstLoadStoreInRegion = nullptr;

2935

LastLoadStoreInRegion = nullptr;

2936

RegionHasStackSave = false;

2937

2938

// Reduce the maximum schedule region size by the size of the

2939

// previous scheduling run.

2940

ScheduleRegionSizeLimit -= ScheduleRegionSize;

2941

if (ScheduleRegionSizeLimit < MinScheduleRegionSize)

2942

ScheduleRegionSizeLimit = MinScheduleRegionSize;

2943

ScheduleRegionSize = 0;

2944

2945

// Make a new scheduling region, i.e. all existing ScheduleData is not

2946

// in the new region yet.

2947

++SchedulingRegionID;

2948

}

2949

2950

/// Returns the ScheduleData registered for \p I in ScheduleDataMap, provided
/// \p I lives in this block and the data belongs to the current scheduling
/// region; returns nullptr otherwise.
ScheduleData *getScheduleData(Instruction *I) {
  // An instruction from another block can't possibly be in the map, so the
  // lookup is skipped.
  if (I->getParent() != BB)
    return nullptr;
  ScheduleData *SD = ScheduleDataMap.lookup(I);
  return (SD && isInSchedulingRegion(SD)) ? SD : nullptr;
}

2959

2960

/// Value overload: forwards to the Instruction overload when \p V is an
/// instruction and returns nullptr for any other kind of value.
ScheduleData *getScheduleData(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  return I ? getScheduleData(I) : nullptr;
}

2965

2966

/// Returns the ScheduleData of \p V for key \p Key. When \p V is its own key
/// this is the primary schedule data; otherwise the entry stored under
/// \p Key in ExtraScheduleDataMap[\p V] is used. Data outside the current
/// scheduling region, or a missing entry, yields nullptr.
ScheduleData *getScheduleData(Value *V, Value *Key) {
  if (V == Key)
    return getScheduleData(V);

  auto ExtraIt = ExtraScheduleDataMap.find(V);
  if (ExtraIt == ExtraScheduleDataMap.end())
    return nullptr;

  ScheduleData *SD = ExtraIt->second.lookup(Key);
  return (SD && isInSchedulingRegion(SD)) ? SD : nullptr;
}

2977

2978

/// Returns true if \p SD carries the ID of the current scheduling region.
bool isInSchedulingRegion(ScheduleData *SD) const { return SchedulingRegionID == SD->SchedulingRegionID; }

2981

2982

/// Marks an instruction as scheduled and puts all dependent ready

2983

/// instructions into the ready-list.

2984

template <typename ReadyListType>

2985

void schedule(ScheduleData *SD, ReadyListType &ReadyList) {

2986

SD->IsScheduled = true;

2987

LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: schedule " << *SD <<
"\n"; } } while (false);

2988

2989

for (ScheduleData *BundleMember = SD; BundleMember;

2990

BundleMember = BundleMember->NextInBundle) {

2991

if (BundleMember->Inst != BundleMember->OpValue)

2992

continue;

2993

2994

// Handle the def-use chain dependencies.

2995

2996

// Decrement the unscheduled counter and insert to ready list if ready.

2997

auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {

2998

doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {

2999

if (OpDef && OpDef->hasValidDependencies() &&

3000

OpDef->incrementUnscheduledDeps(-1) == 0) {

3001

// There are no more unscheduled dependencies after

3002

// decrementing, so we can put the dependent instruction

3003

// into the ready list.

3004

ScheduleData *DepBundle = OpDef->FirstInBundle;

3005

assert(!DepBundle->IsScheduled &&(static_cast <bool> (!DepBundle->IsScheduled &&
"already scheduled bundle gets ready") ? void (0) : __assert_fail
("!DepBundle->IsScheduled && \"already scheduled bundle gets ready\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3006, __extension__
__PRETTY_FUNCTION__))

3006

"already scheduled bundle gets ready")(static_cast <bool> (!DepBundle->IsScheduled &&
"already scheduled bundle gets ready") ? void (0) : __assert_fail
("!DepBundle->IsScheduled && \"already scheduled bundle gets ready\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3006, __extension__
__PRETTY_FUNCTION__));

3007

ReadyList.insert(DepBundle);

3008

LLVM_DEBUG(dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: gets ready (def): " <<
*DepBundle << "\n"; } } while (false)

3009

<< "SLP: gets ready (def): " << *DepBundle << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: gets ready (def): " <<
*DepBundle << "\n"; } } while (false);

3010

}

3011

});

3012

};

3013

3014

// If BundleMember is a vector bundle, its operands may have been

3015

// reordered during buildTree(). We therefore need to get its operands

3016

// through the TreeEntry.

3017

if (TreeEntry *TE = BundleMember->TE) {

3018

// Need to search for the lane since the tree entry can be reordered.

3019

int Lane = std::distance(TE->Scalars.begin(),

3020

find(TE->Scalars, BundleMember->Inst));

3021

assert(Lane >= 0 && "Lane not set")(static_cast <bool> (Lane >= 0 && "Lane not set"
) ? void (0) : __assert_fail ("Lane >= 0 && \"Lane not set\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3021, __extension__
__PRETTY_FUNCTION__));

3022

3023

// Since vectorization tree is being built recursively this assertion

3024

// ensures that the tree entry has all operands set before reaching

3025

// this code. Couple of exceptions known at the moment are extracts

3026

// where their second (immediate) operand is not added. Since

3027

// immediates do not affect scheduler behavior this is considered

3028

// okay.

3029

auto *In = BundleMember->Inst;

3030

assert(In &&(static_cast <bool> (In && (isa<ExtractValueInst
, ExtractElementInst>(In) || In->getNumOperands() == TE
->getNumOperands()) && "Missed TreeEntry operands?"
) ? void (0) : __assert_fail ("In && (isa<ExtractValueInst, ExtractElementInst>(In) || In->getNumOperands() == TE->getNumOperands()) && \"Missed TreeEntry operands?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3033, __extension__
__PRETTY_FUNCTION__))

3031

(isa<ExtractValueInst, ExtractElementInst>(In) ||(static_cast <bool> (In && (isa<ExtractValueInst
, ExtractElementInst>(In) || In->getNumOperands() == TE
->getNumOperands()) && "Missed TreeEntry operands?"
) ? void (0) : __assert_fail ("In && (isa<ExtractValueInst, ExtractElementInst>(In) || In->getNumOperands() == TE->getNumOperands()) && \"Missed TreeEntry operands?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3033, __extension__
__PRETTY_FUNCTION__))

3032

In->getNumOperands() == TE->getNumOperands()) &&(static_cast <bool> (In && (isa<ExtractValueInst
, ExtractElementInst>(In) || In->getNumOperands() == TE
->getNumOperands()) && "Missed TreeEntry operands?"
) ? void (0) : __assert_fail ("In && (isa<ExtractValueInst, ExtractElementInst>(In) || In->getNumOperands() == TE->getNumOperands()) && \"Missed TreeEntry operands?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3033, __extension__
__PRETTY_FUNCTION__))

3033

"Missed TreeEntry operands?")(static_cast <bool> (In && (isa<ExtractValueInst
, ExtractElementInst>(In) || In->getNumOperands() == TE
->getNumOperands()) && "Missed TreeEntry operands?"
) ? void (0) : __assert_fail ("In && (isa<ExtractValueInst, ExtractElementInst>(In) || In->getNumOperands() == TE->getNumOperands()) && \"Missed TreeEntry operands?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3033, __extension__
__PRETTY_FUNCTION__));

3034

(void)In; // fake use to avoid build failure when assertions disabled

3035

3036

for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();

3037

OpIdx != NumOperands; ++OpIdx)

3038

if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))

3039

DecrUnsched(I);

3040

} else {

3041

// If BundleMember is a stand-alone instruction, no operand reordering

3042

// has taken place, so we directly access its operands.

3043

for (Use &U : BundleMember->Inst->operands())

3044

if (auto *I = dyn_cast<Instruction>(U.get()))

3045

DecrUnsched(I);

3046

}

3047

// Handle the memory dependencies.

3048

for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {

3049

if (MemoryDepSD->hasValidDependencies() &&

3050

MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {

3051

// There are no more unscheduled dependencies after decrementing,

3052

// so we can put the dependent instruction into the ready list.

3053

ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;

3054

assert(!DepBundle->IsScheduled &&(static_cast <bool> (!DepBundle->IsScheduled &&
"already scheduled bundle gets ready") ? void (0) : __assert_fail
("!DepBundle->IsScheduled && \"already scheduled bundle gets ready\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3055, __extension__
__PRETTY_FUNCTION__))

3055

"already scheduled bundle gets ready")(static_cast <bool> (!DepBundle->IsScheduled &&
"already scheduled bundle gets ready") ? void (0) : __assert_fail
("!DepBundle->IsScheduled && \"already scheduled bundle gets ready\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3055, __extension__
__PRETTY_FUNCTION__));

3056

ReadyList.insert(DepBundle);

3057

LLVM_DEBUG(dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: gets ready (mem): " <<
*DepBundle << "\n"; } } while (false)

3058

<< "SLP: gets ready (mem): " << *DepBundle << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: gets ready (mem): " <<
*DepBundle << "\n"; } } while (false);

3059

}

3060

}

3061

// Handle the control dependencies.

3062

for (ScheduleData *DepSD : BundleMember->ControlDependencies) {

3063

if (DepSD->incrementUnscheduledDeps(-1) == 0) {

3064

// There are no more unscheduled dependencies after decrementing,

3065

// so we can put the dependent instruction into the ready list.

3066

ScheduleData *DepBundle = DepSD->FirstInBundle;

3067

assert(!DepBundle->IsScheduled &&(static_cast <bool> (!DepBundle->IsScheduled &&
"already scheduled bundle gets ready") ? void (0) : __assert_fail
("!DepBundle->IsScheduled && \"already scheduled bundle gets ready\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3068, __extension__
__PRETTY_FUNCTION__))

3068

"already scheduled bundle gets ready")(static_cast <bool> (!DepBundle->IsScheduled &&
"already scheduled bundle gets ready") ? void (0) : __assert_fail
("!DepBundle->IsScheduled && \"already scheduled bundle gets ready\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3068, __extension__
__PRETTY_FUNCTION__));

3069

ReadyList.insert(DepBundle);

3070

LLVM_DEBUG(dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: gets ready (ctl): " <<
*DepBundle << "\n"; } } while (false)

3071

<< "SLP: gets ready (ctl): " << *DepBundle << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: gets ready (ctl): " <<
*DepBundle << "\n"; } } while (false);

3072

}

3073

}

3074

3075

}

3076

}

3077

3078

/// Verify basic self consistency properties of the data structure.

3079

void verify() {

3080

if (!ScheduleStart)

3081

return;

3082

3083

assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&(static_cast <bool> (ScheduleStart->getParent() == ScheduleEnd
->getParent() && ScheduleStart->comesBefore(ScheduleEnd
) && "Not a valid scheduling region?") ? void (0) : __assert_fail
("ScheduleStart->getParent() == ScheduleEnd->getParent() && ScheduleStart->comesBefore(ScheduleEnd) && \"Not a valid scheduling region?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3085, __extension__
__PRETTY_FUNCTION__))

3084

ScheduleStart->comesBefore(ScheduleEnd) &&(static_cast <bool> (ScheduleStart->getParent() == ScheduleEnd
->getParent() && ScheduleStart->comesBefore(ScheduleEnd
) && "Not a valid scheduling region?") ? void (0) : __assert_fail
("ScheduleStart->getParent() == ScheduleEnd->getParent() && ScheduleStart->comesBefore(ScheduleEnd) && \"Not a valid scheduling region?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3085, __extension__
__PRETTY_FUNCTION__))

3085

"Not a valid scheduling region?")(static_cast <bool> (ScheduleStart->getParent() == ScheduleEnd
->getParent() && ScheduleStart->comesBefore(ScheduleEnd
) && "Not a valid scheduling region?") ? void (0) : __assert_fail
("ScheduleStart->getParent() == ScheduleEnd->getParent() && ScheduleStart->comesBefore(ScheduleEnd) && \"Not a valid scheduling region?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3085, __extension__
__PRETTY_FUNCTION__));

3086

3087

for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {

3088

auto *SD = getScheduleData(I);

3089

if (!SD)

3090

continue;

3091

assert(isInSchedulingRegion(SD) &&(static_cast <bool> (isInSchedulingRegion(SD) &&
"primary schedule data not in window?") ? void (0) : __assert_fail
("isInSchedulingRegion(SD) && \"primary schedule data not in window?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3092, __extension__
__PRETTY_FUNCTION__))

3092

"primary schedule data not in window?")(static_cast <bool> (isInSchedulingRegion(SD) &&
"primary schedule data not in window?") ? void (0) : __assert_fail
("isInSchedulingRegion(SD) && \"primary schedule data not in window?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3092, __extension__
__PRETTY_FUNCTION__));

3093

assert(isInSchedulingRegion(SD->FirstInBundle) &&(static_cast <bool> (isInSchedulingRegion(SD->FirstInBundle
) && "entire bundle in window!") ? void (0) : __assert_fail
("isInSchedulingRegion(SD->FirstInBundle) && \"entire bundle in window!\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3094, __extension__
__PRETTY_FUNCTION__))

3094

"entire bundle in window!")(static_cast <bool> (isInSchedulingRegion(SD->FirstInBundle
) && "entire bundle in window!") ? void (0) : __assert_fail
("isInSchedulingRegion(SD->FirstInBundle) && \"entire bundle in window!\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3094, __extension__
__PRETTY_FUNCTION__));

3095

(void)SD;

3096

doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });

3097

}

3098

3099

for (auto *SD : ReadyInsts) {

3100

assert(SD->isSchedulingEntity() && SD->isReady() &&(static_cast <bool> (SD->isSchedulingEntity() &&
SD->isReady() && "item in ready list not ready?")
? void (0) : __assert_fail ("SD->isSchedulingEntity() && SD->isReady() && \"item in ready list not ready?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3101, __extension__
__PRETTY_FUNCTION__))

3101

"item in ready list not ready?")(static_cast <bool> (SD->isSchedulingEntity() &&
SD->isReady() && "item in ready list not ready?")
? void (0) : __assert_fail ("SD->isSchedulingEntity() && SD->isReady() && \"item in ready list not ready?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3101, __extension__
__PRETTY_FUNCTION__));

3102

(void)SD;

3103

}

3104

}

3105

3106

void doForAllOpcodes(Value *V,

3107

function_ref<void(ScheduleData *SD)> Action) {

3108

if (ScheduleData *SD = getScheduleData(V))

3109

Action(SD);

3110

auto I = ExtraScheduleDataMap.find(V);

3111

if (I != ExtraScheduleDataMap.end())

3112

for (auto &P : I->second)

3113

if (isInSchedulingRegion(P.second))

3114

Action(P.second);

3115

}

3116

3117

/// Put all instructions into the ReadyList which are ready for scheduling.

3118

template <typename ReadyListType>

3119

void initialFillReadyList(ReadyListType &ReadyList) {

3120

for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {

3121

doForAllOpcodes(I, [&](ScheduleData *SD) {

3122

if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&

3123

SD->isReady()) {

3124

ReadyList.insert(SD);

3125

LLVM_DEBUG(dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: initially in ready list: "
<< *SD << "\n"; } } while (false)

3126

<< "SLP: initially in ready list: " << *SD << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: initially in ready list: "
<< *SD << "\n"; } } while (false);

3127

}

3128

});

3129

}

3130

}

3131

3132

/// Build a bundle from the ScheduleData nodes corresponding to the

3133

/// scalar instruction for each lane.

3134

ScheduleData *buildBundle(ArrayRef<Value *> VL);

3135

3136

/// Checks if a bundle of instructions can be scheduled, i.e. has no

3137

/// cyclic dependencies. This is only a dry-run, no instructions are

3138

/// actually moved at this stage.

3139

/// \returns the scheduling bundle. The returned Optional value is non-None

3140

/// if \p VL is allowed to be scheduled.

3141

Optional<ScheduleData *>

3142

tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,

3143

const InstructionsState &S);

3144

3145

/// Un-bundles a group of instructions.

3146

void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);

3147

3148

/// Allocates schedule data chunk.

3149

ScheduleData *allocateScheduleDataChunks();

3150

3151

/// Extends the scheduling region so that V is inside the region.

3152

/// \returns true if the region size is within the limit.

3153

bool extendSchedulingRegion(Value *V, const InstructionsState &S);

3154

3155

/// Initialize the ScheduleData structures for new instructions in the

3156

/// scheduling region.

3157

void initScheduleData(Instruction *FromI, Instruction *ToI,

3158

ScheduleData *PrevLoadStore,

3159

ScheduleData *NextLoadStore);

3160

3161

/// Updates the dependency information of a bundle and of all instructions/

3162

/// bundles which depend on the original bundle.

3163

void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,

3164

BoUpSLP *SLP);

3165

3166

/// Sets all instructions in the scheduling region to un-scheduled.

3167

void resetSchedule();

3168

3169

BasicBlock *BB;

3170

3171

/// Simple memory allocation for ScheduleData.

3172

std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

3173

3174

/// The size of a ScheduleData array in ScheduleDataChunks.

3175

int ChunkSize;

3176

3177

/// The allocator position in the current chunk, which is the last entry

3178

/// of ScheduleDataChunks.

3179

int ChunkPos;

3180

3181

/// Attaches ScheduleData to Instruction.

3182

/// Note that the mapping survives during all vectorization iterations, i.e.

3183

/// ScheduleData structures are recycled.

3184

DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;

3185

3186

/// Attaches ScheduleData to Instruction with the leading key.

3187

DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>

3188

ExtraScheduleDataMap;

3189

3190

/// The ready-list for scheduling (only used for the dry-run).

3191

SetVector<ScheduleData *> ReadyInsts;

3192

3193

/// The first instruction of the scheduling region.

3194

Instruction *ScheduleStart = nullptr;

3195

3196

/// The first instruction _after_ the scheduling region.

3197

Instruction *ScheduleEnd = nullptr;

3198

3199

/// The first memory accessing instruction in the scheduling region

3200

/// (can be null).

3201

ScheduleData *FirstLoadStoreInRegion = nullptr;

3202

3203

/// The last memory accessing instruction in the scheduling region

3204

/// (can be null).

3205

ScheduleData *LastLoadStoreInRegion = nullptr;

3206

3207

/// Is there an llvm.stacksave or llvm.stackrestore in the scheduling

3208

/// region? Used to optimize the dependence calculation for the

3209

/// common case where there isn't.

3210

bool RegionHasStackSave = false;

3211

3212

/// The current size of the scheduling region.

3213

int ScheduleRegionSize = 0;

3214

3215

/// The maximum size allowed for the scheduling region.

3216

int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;

3217

3218

/// The ID of the scheduling region. For a new vectorization iteration this

3219

/// is incremented which "removes" all ScheduleData from the region.

3220

/// Make sure that the initial SchedulingRegionID is greater than the

3221

/// initial SchedulingRegionID in ScheduleData (which is 0).

3222

int SchedulingRegionID = 1;

3223

};

3224

3225

/// Attaches the BlockScheduling structures to basic blocks.

3226

MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

3227

3228

/// Performs the "real" scheduling. Done before vectorization is actually

3229

/// performed in a basic block.

3230

void scheduleBlock(BlockScheduling *BS);

3231

3232

/// List of users to ignore during scheduling and that don't need extracting.

3233

const SmallDenseSet<Value *> *UserIgnoreList = nullptr;

3234

3235

/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of

3236

/// sorted SmallVectors of unsigned.

3237

struct OrdersTypeDenseMapInfo {

3238

static OrdersType getEmptyKey() {

3239

OrdersType V;

3240

V.push_back(~1U);

3241

return V;

3242

}

3243

3244

static OrdersType getTombstoneKey() {

3245

OrdersType V;

3246

V.push_back(~2U);

3247

return V;

3248

}

3249

3250

static unsigned getHashValue(const OrdersType &V) {

3251

return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));

3252

}

3253

3254

static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {

3255

return LHS == RHS;

3256

}

3257

};

3258

3259

// Analysis and block reference.

3260

Function *F;

3261

ScalarEvolution *SE;

3262

TargetTransformInfo *TTI;

3263

TargetLibraryInfo *TLI;

3264

LoopInfo *LI;

3265

DominatorTree *DT;

3266

AssumptionCache *AC;

3267

DemandedBits *DB;

3268

const DataLayout *DL;

3269

OptimizationRemarkEmitter *ORE;

3270

3271

unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.

3272

unsigned MinVecRegSize; // Set by cl::opt (default: 128).

3273

3274

/// Instruction builder to construct the vectorized tree.

3275

IRBuilder<> Builder;

3276

3277

/// A map of scalar integer values to the smallest bit width with which they

3278

/// can legally be represented. The values map to (width, signed) pairs,

3279

/// where "width" indicates the minimum bit width and "signed" is True if the

3280

/// value must be signed-extended, rather than zero-extended, back to its

3281

/// original width.

3282

MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;

3283

};

3284

3285

} // end namespace slpvectorizer

3286

3287

template <> struct GraphTraits<BoUpSLP *> {

3288

using TreeEntry = BoUpSLP::TreeEntry;

3289

3290

/// NodeRef has to be a pointer per the GraphWriter.

3291

using NodeRef = TreeEntry *;

3292

3293

using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;

3294

3295

/// Add the VectorizableTree to the index iterator to be able to return

3296

/// TreeEntry pointers.

3297

struct ChildIteratorType

3298

: public iterator_adaptor_base<

3299

ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {

3300

ContainerTy &VectorizableTree;

3301

3302

ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,

3303

ContainerTy &VT)

3304

: ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}

3305

3306

NodeRef operator*() { return I->UserTE; }

3307

};

3308

3309

static NodeRef getEntryNode(BoUpSLP &R) {

3310

return R.VectorizableTree[0].get();

3311

}

3312

3313

static ChildIteratorType child_begin(NodeRef N) {

3314

return {N->UserTreeIndices.begin(), N->Container};

3315

}

3316

3317

static ChildIteratorType child_end(NodeRef N) {

3318

return {N->UserTreeIndices.end(), N->Container};

3319

}

3320

3321

/// For the node iterator we just need to turn the TreeEntry iterator into a

3322

/// TreeEntry* iterator so that it dereferences to NodeRef.

3323

class nodes_iterator {

3324

using ItTy = ContainerTy::iterator;

3325

ItTy It;

3326

3327

public:

3328

nodes_iterator(const ItTy &It2) : It(It2) {}

3329

NodeRef operator*() { return It->get(); }

3330

nodes_iterator operator++() {

3331

++It;

3332

return *this;

3333

}

3334

bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }

3335

};

3336

3337

static nodes_iterator nodes_begin(BoUpSLP *R) {

3338

return nodes_iterator(R->VectorizableTree.begin());

3339

}

3340

3341

static nodes_iterator nodes_end(BoUpSLP *R) {

3342

return nodes_iterator(R->VectorizableTree.end());

3343

}

3344

3345

static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }

3346

};

3347

3348

template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {

3349

using TreeEntry = BoUpSLP::TreeEntry;

3350

3351

DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}

3352

3353

std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {

3354

std::string Str;

3355

raw_string_ostream OS(Str);

3356

if (isSplat(Entry->Scalars))

3357

OS << "<splat> ";

3358

for (auto *V : Entry->Scalars) {

3359

OS << *V;

3360

if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {

3361

return EU.Scalar == V;

3362

}))

3363

OS << " <extract>";

3364

OS << "\n";

3365

}

3366

return Str;

3367

}

3368

3369

static std::string getNodeAttributes(const TreeEntry *Entry,

3370

const BoUpSLP *) {

3371

if (Entry->State == TreeEntry::NeedToGather)

3372

return "color=red";

3373

return "";

3374

}

3375

};

3376

3377

} // end namespace llvm

3378

3379

BoUpSLP::~BoUpSLP() {
  // Operands of the deleted instructions that are not themselves scheduled
  // for deletion, have a single user and would be trivially dead.
  SmallVector<WeakTrackingVH> DeadInsts;

  // First pass: collect those operands and drop the references held by every
  // deleted instruction.
  for (auto *Deleted : DeletedInstructions) {
    for (Use &Operand : Deleted->operands()) {
      auto *OpInst = dyn_cast<Instruction>(Operand.get());
      if (!OpInst || DeletedInstructions.count(OpInst))
        continue;
      if (!OpInst->hasOneUser() ||
          !wouldInstructionBeTriviallyDead(OpInst, TLI))
        continue;
      DeadInsts.emplace_back(OpInst);
    }
    Deleted->dropAllReferences();
  }

  // Second pass: erase the instructions themselves.
  for (auto *Deleted : DeletedInstructions) {
    assert(Deleted->use_empty() &&
           "trying to erase instruction with users.");
    Deleted->eraseFromParent();
  }

  // Cleanup any dead scalar code feeding the vectorized instructions
  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);

#ifdef EXPENSIVE_CHECKS
  // If we could guarantee that this call is not extremely slow, we could
  // remove the ifdef limitation (see PR47712).
  assert(!verifyFunction(*F, &dbgs()));
#endif
}

3405

3406

/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses

3407

/// contains original mask for the scalars reused in the node. Procedure

3408

/// transform this mask in accordance with the given \p Mask.

3409

static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {

3410

assert(!Mask.empty() && Reuses.size() == Mask.size() &&(static_cast <bool> (!Mask.empty() && Reuses.size
() == Mask.size() && "Expected non-empty mask.") ? void
(0) : __assert_fail ("!Mask.empty() && Reuses.size() == Mask.size() && \"Expected non-empty mask.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3411, __extension__
__PRETTY_FUNCTION__))

3411

"Expected non-empty mask.")(static_cast <bool> (!Mask.empty() && Reuses.size
() == Mask.size() && "Expected non-empty mask.") ? void
(0) : __assert_fail ("!Mask.empty() && Reuses.size() == Mask.size() && \"Expected non-empty mask.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3411, __extension__
__PRETTY_FUNCTION__));

3412

SmallVector<int> Prev(Reuses.begin(), Reuses.end());

3413

Prev.swap(Reuses);

3414

for (unsigned I = 0, E = Prev.size(); I < E; ++I)

3415

if (Mask[I] != UndefMaskElem)

3416

Reuses[Mask[I]] = Prev[I];

3417

}

3418

3419

/// Reorders the given \p Order according to the given \p Mask. \p Order - is

3420

/// the original order of the scalars. Procedure transforms the provided order

3421

/// in accordance with the given \p Mask. If the resulting \p Order is just an

3422

/// identity order, \p Order is cleared.

3423

static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) {

3424

assert(!Mask.empty() && "Expected non-empty mask.")(static_cast <bool> (!Mask.empty() && "Expected non-empty mask."
) ? void (0) : __assert_fail ("!Mask.empty() && \"Expected non-empty mask.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3424, __extension__
__PRETTY_FUNCTION__));

3425

SmallVector<int> MaskOrder;

3426

if (Order.empty()) {

3427

MaskOrder.resize(Mask.size());

3428

std::iota(MaskOrder.begin(), MaskOrder.end(), 0);

3429

} else {

3430

inversePermutation(Order, MaskOrder);

3431

}

3432

reorderReuses(MaskOrder, Mask);

3433

if (ShuffleVectorInst::isIdentityMask(MaskOrder)) {

3434

Order.clear();

3435

return;

3436

}

3437

Order.assign(Mask.size(), Mask.size());

3438

for (unsigned I = 0, E = Mask.size(); I < E; ++I)

3439

if (MaskOrder[I] != UndefMaskElem)

3440

Order[MaskOrder[I]] = I;

3441

fixupOrderingIndices(Order);

3442

}

3443

3444

/// For the gather node \p TE, looks for load / extractelement / extractvalue
/// scalars that also belong to one (and only one) other tree entry and
/// derives an order for the gather from the lanes they occupy there.
/// \returns None if the scalars belong to different tree entries, if a lane
/// does not fit into the gather node, or if too few scalars were matched; an
/// empty order if the matched scalars are already in identity order; the
/// computed order otherwise.
Optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
  const unsigned NumScalars = TE.Scalars.size();
  // NumScalars is used as the "not assigned yet" marker.
  OrdersType CurrentOrder(NumScalars, NumScalars);
  SmallBitVector UsedPositions(NumScalars);
  const TreeEntry *STE = nullptr;
  // Try to find all gathered scalars that get vectorized in another
  // vectorized node. Only a single such tree node is allowed, otherwise the
  // order of the gathered scalars cannot be identified.
  for (unsigned Pos = 0; Pos < NumScalars; ++Pos) {
    Value *Scalar = TE.Scalars[Pos];
    if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(Scalar))
      continue;
    const auto *LocalSTE = getTreeEntry(Scalar);
    if (!LocalSTE)
      continue;
    if (!STE)
      STE = LocalSTE;
    else if (STE != LocalSTE)
      // Take the order only from the single vector node.
      return None;
    unsigned Lane =
        std::distance(STE->Scalars.begin(), find(STE->Scalars, Scalar));
    if (Lane >= NumScalars)
      return None;
    if (CurrentOrder[Lane] != NumScalars) {
      // The lane is already taken: only an identity match may replace the
      // previous assignment.
      if (Lane != Pos)
        continue;
      UsedPositions.reset(CurrentOrder[Lane]);
    }
    // The partial identity (where only some elements of the gather node are
    // in the identity order) is good.
    CurrentOrder[Lane] = Pos;
    UsedPositions.set(Pos);
  }

  // Need to keep the order if we have a vector entry and at least 2 scalars or
  // the vectorized entry has just 2 scalars.
  if (!STE || !(UsedPositions.count() > 1 || STE->Scalars.size() == 2))
    return None;

  // Unassigned slots do not break the identity.
  auto IsIdentityOrUnset = [NumScalars](const auto &Slot) {
    return Slot.value() == Slot.index() || Slot.value() == NumScalars;
  };
  if (all_of(enumerate(CurrentOrder), IsIdentityOrUnset)) {
    CurrentOrder.clear();
    return CurrentOrder;
  }

  // Distribute the positions that were not matched over the unassigned
  // slots, in increasing order.
  auto *Slot = CurrentOrder.begin();
  unsigned Pos = 0;
  while (Pos < NumScalars) {
    if (UsedPositions.test(Pos)) {
      ++Pos;
      continue;
    }
    if (*Slot == NumScalars)
      *Slot = Pos++;
    ++Slot;
  }
  return CurrentOrder;
}

3509

3510

namespace {
/// Tracks the state we can represent the loads in the given sequence.
enum class LoadsState {
  /// The loads have to be gathered.
  Gather,
  /// The (sorted) loads are consecutive.
  Vectorize,
  /// The loads can be emitted as a legal masked gather.
  ScatterVectorize
};
} // anonymous namespace

3514

3515

/// Checks if the given array of loads can be represented as a vectorized,

3516

/// scatter or just simple gather.

3517

static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,

3518

const TargetTransformInfo &TTI,

3519

const DataLayout &DL, ScalarEvolution &SE,

3520

LoopInfo &LI,

3521

SmallVectorImpl<unsigned> &Order,

3522

SmallVectorImpl<Value *> &PointerOps) {

3523

// Check that a vectorized load would load the same memory as a scalar

3524

// load. For example, we don't want to vectorize loads that are smaller

3525

// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM

3526

// treats loading/storing it as an i8 struct. If we vectorize loads/stores

3527

// from such a struct, we read/write packed bits disagreeing with the

3528

// unvectorized version.

3529

Type *ScalarTy = VL0->getType();

3530

3531

if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy))

3532

return LoadsState::Gather;

3533

3534

// Make sure all loads in the bundle are simple - we can't vectorize

3535

// atomic or volatile loads.

3536

PointerOps.clear();

3537

PointerOps.resize(VL.size());

3538

auto *POIter = PointerOps.begin();

3539

for (Value *V : VL) {

3540

auto *L = cast<LoadInst>(V);

3541

if (!L->isSimple())

3542

return LoadsState::Gather;

3543

*POIter = L->getPointerOperand();

3544

++POIter;

3545

}

3546

3547

Order.clear();

3548

// Check the order of pointer operands or that all pointers are the same.

3549

bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);

3550

if (IsSorted || all_of(PointerOps, [&PointerOps](Value *P) {

3551

if (getUnderlyingObject(P) != getUnderlyingObject(PointerOps.front()))

3552

return false;

3553

auto *GEP = dyn_cast<GetElementPtrInst>(P);

3554

if (!GEP)

3555

return false;

3556

auto *GEP0 = cast<GetElementPtrInst>(PointerOps.front());

3557

return GEP->getNumOperands() == 2 &&

3558

((isConstant(GEP->getOperand(1)) &&

3559

isConstant(GEP0->getOperand(1))) ||

3560

getSameOpcode({GEP->getOperand(1), GEP0->getOperand(1)})

3561

.getOpcode());

3562

})) {

3563

if (IsSorted) {

3564

Value *Ptr0;

3565

Value *PtrN;

3566

if (Order.empty()) {

3567

Ptr0 = PointerOps.front();

3568

PtrN = PointerOps.back();

3569

} else {

3570

Ptr0 = PointerOps[Order.front()];

3571

PtrN = PointerOps[Order.back()];

3572

}

3573

Optional<int> Diff =

3574

getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);

3575

// Check that the sorted loads are consecutive.

3576

if (static_cast<unsigned>(*Diff) == VL.size() - 1)

3577

return LoadsState::Vectorize;

3578

}

3579

// TODO: need to improve analysis of the pointers, if not all of them are

3580

// GEPs or have > 2 operands, we end up with a gather node, which just

3581

// increases the cost.

3582

Loop *L = LI.getLoopFor(cast<LoadInst>(VL0)->getParent());

3583

bool ProfitableGatherPointers =

3584

static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {

3585

return L && L->isLoopInvariant(V);

3586

})) <= VL.size() / 2 && VL.size() > 2;

3587

if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {

3588

auto *GEP = dyn_cast<GetElementPtrInst>(P);

3589

return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||

3590

(GEP && GEP->getNumOperands() == 2);

3591

})) {

3592

Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();

3593

for (Value *V : VL)

3594

CommonAlignment =

3595

std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());

3596

auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());

3597

if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&

3598

!TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))

3599

return LoadsState::ScatterVectorize;

3600

}

3601

}

3602

3603

return LoadsState::Gather;

3604

}

3605

3606

bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,

3607

const DataLayout &DL, ScalarEvolution &SE,

3608

SmallVectorImpl<unsigned> &SortedIndices) {

3609

assert(llvm::all_of((static_cast <bool> (llvm::all_of( VL, [](const Value *
V) { return V->getType()->isPointerTy(); }) && "Expected list of pointer operands."
) ? void (0) : __assert_fail ("llvm::all_of( VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && \"Expected list of pointer operands.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3611, __extension__
__PRETTY_FUNCTION__))

3610

VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&(static_cast <bool> (llvm::all_of( VL, [](const Value *
V) { return V->getType()->isPointerTy(); }) && "Expected list of pointer operands."
) ? void (0) : __assert_fail ("llvm::all_of( VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && \"Expected list of pointer operands.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3611, __extension__
__PRETTY_FUNCTION__))

3611

"Expected list of pointer operands.")(static_cast <bool> (llvm::all_of( VL, [](const Value *
V) { return V->getType()->isPointerTy(); }) && "Expected list of pointer operands."
) ? void (0) : __assert_fail ("llvm::all_of( VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && \"Expected list of pointer operands.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3611, __extension__
__PRETTY_FUNCTION__));

3612

// Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each

3613

// Ptr into, sort and return the sorted indices with values next to one

3614

// another.

3615

MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;

3616

Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));

3617

3618

unsigned Cnt = 1;

3619

for (Value *Ptr : VL.drop_front()) {

3620

bool Found = any_of(Bases, [&](auto &Base) {

3621

Optional<int> Diff =

3622

getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,

3623

/*StrictCheck=*/true);

3624

if (!Diff)

3625

return false;

3626

3627

Base.second.emplace_back(Ptr, *Diff, Cnt++);

3628

return true;

3629

});

3630

3631

if (!Found) {

3632

// If we haven't found enough to usefully cluster, return early.

3633

if (Bases.size() > VL.size() / 2 - 1)

3634

return false;

3635

3636

// Not found already - add a new Base

3637

Bases[Ptr].emplace_back(Ptr, 0, Cnt++);

3638

}

3639

}

3640

3641

// For each of the bases sort the pointers by Offset and check if any of the

3642

// base become consecutively allocated.

3643

bool AnyConsecutive = false;

3644

for (auto &Base : Bases) {

3645

auto &Vec = Base.second;

3646

if (Vec.size() > 1) {

3647

llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,

3648

const std::tuple<Value *, int, unsigned> &Y) {

3649

return std::get<1>(X) < std::get<1>(Y);

3650

});

3651

int InitialOffset = std::get<1>(Vec[0]);

3652

AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](auto &P) {

3653

return std::get<1>(P.value()) == int(P.index()) + InitialOffset;

3654

});

3655

}

3656

}

3657

3658

// Fill SortedIndices array only if it looks worth-while to sort the ptrs.

3659

SortedIndices.clear();

3660

if (!AnyConsecutive)

3661

return false;

3662

3663

for (auto &Base : Bases) {

3664

for (auto &T : Base.second)

3665

SortedIndices.push_back(std::get<2>(T));

3666

}

3667

3668

assert(SortedIndices.size() == VL.size() &&(static_cast <bool> (SortedIndices.size() == VL.size() &&
"Expected SortedIndices to be the size of VL") ? void (0) : __assert_fail
("SortedIndices.size() == VL.size() && \"Expected SortedIndices to be the size of VL\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3669, __extension__
__PRETTY_FUNCTION__))

3669

"Expected SortedIndices to be the size of VL")(static_cast <bool> (SortedIndices.size() == VL.size() &&
"Expected SortedIndices to be the size of VL") ? void (0) : __assert_fail
("SortedIndices.size() == VL.size() && \"Expected SortedIndices to be the size of VL\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 3669, __extension__
__PRETTY_FUNCTION__));

3670

return true;

3671

}

3672

3673

/// For a gather node made only of simple loads, tries to cluster the load
/// pointers with clusterSortPtrAccesses().
/// \returns the order produced by the clustering, or None if some scalar is
/// not a simple load or the clustering fails.
Optional<BoUpSLP::OrdersType>
BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
  Type *ScalarTy = TE.Scalars[0]->getType();

  // Collect the pointer operands; bail out on anything but a simple load.
  SmallVector<Value *> LoadPtrs;
  LoadPtrs.reserve(TE.Scalars.size());
  for (Value *Scalar : TE.Scalars) {
    auto *Load = dyn_cast<LoadInst>(Scalar);
    if (!Load || !Load->isSimple())
      return None;
    LoadPtrs.push_back(Load->getPointerOperand());
  }

  BoUpSLP::OrdersType Order;
  if (!clusterSortPtrAccesses(LoadPtrs, ScalarTy, *DL, *SE, Order))
    return None;
  return Order;
}

3692

3693

/// Computes the ordering information for the node \p TE.
///  - Nodes with reuse shuffle indices: an order of vector-factor size built
///    from the reuse mask, or None if the mask is not clustered or the
///    resulting order is the identity.
///  - Vectorized non-alternate load/extractelement/extractvalue nodes (and,
///    if \p TopToBottom is set, store/insertelement nodes): the node's
///    ReorderIndices.
///  - Gather nodes: the order found by analyzing extractelements, scalars
///    reused from another tree entry, or (for 4 or more scalars) partially
///    ordered loads.
/// \returns None in all other cases.
Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
                                                         bool TopToBottom) {
  // No need to reorder if need to shuffle reuses, still need to shuffle the
  // node.
  if (!TE.ReuseShuffleIndices.empty()) {
    // Check if reuse shuffle indices can be improved by reordering.
    // For this, check that reuse mask is "clustered", i.e. each scalar values
    // is used once in each submask of size <number_of_scalars>.
    // Example: 4 scalar values.
    // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
    //                           0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
    //                           element 3 is used twice in the second submask.
    const unsigned NumScalars = TE.Scalars.size();
    if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                     NumScalars))
      return None;
    const unsigned VF = TE.getVectorFactor();
    // Try build correct order for extractelement instructions.
    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                TE.ReuseShuffleIndices.end());
    auto HasInRangeExtractIndex = [NumScalars](Value *V) {
      Optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
      return Idx && *Idx < NumScalars;
    };
    if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
        all_of(TE.Scalars, HasInRangeExtractIndex)) {
      SmallVector<int> ReorderMask(NumScalars, UndefMaskElem);
      if (!TE.ReorderIndices.empty())
        inversePermutation(TE.ReorderIndices, ReorderMask);
      else
        std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
      // Replace every defined reuse index by the position of the
      // corresponding extract index inside ReorderMask.
      for (unsigned Lane = 0; Lane < VF; ++Lane) {
        int &ReuseIdx = ReusedMask[Lane];
        if (ReuseIdx == UndefMaskElem)
          continue;
        Value *Scalar = TE.Scalars[ReorderMask[ReuseIdx]];
        Optional<unsigned> ExtractIdx =
            getExtractIndex(cast<Instruction>(Scalar));
        ReuseIdx = std::distance(ReorderMask.begin(),
                                 find(ReorderMask, *ExtractIdx));
      }
    }
    // Build the order of the VF size, need to reorder reuses shuffles, they are
    // always of VF size.
    OrdersType ResOrder(VF);
    std::iota(ResOrder.begin(), ResOrder.end(), 0);
    auto *OutIt = ResOrder.begin();
    for (unsigned Base = 0; Base < VF; Base += NumScalars) {
      OrdersType ClusterOrder(TE.ReorderIndices);
      SmallVector<int> SubMask(
          makeArrayRef(ReusedMask).slice(Base, NumScalars));
      if (SubMask.front() == UndefMaskElem)
        std::iota(SubMask.begin(), SubMask.end(), 0);
      reorderOrder(ClusterOrder, SubMask);
      // Shift the per-cluster order by the cluster start offset.
      transform(ClusterOrder, OutIt,
                [Base](unsigned Pos) { return Pos + Base; });
      std::advance(OutIt, NumScalars);
    }
    const bool IsIdentity =
        all_of(enumerate(ResOrder), [](const auto &Data) {
          return Data.index() == Data.value();
        });
    // The combined order is the identity: return an empty Optional.
    if (IsIdentity)
      return None;
    return ResOrder;
  }

  if (TE.State == TreeEntry::Vectorize) {
    const bool HasReorderableMainOp =
        isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
        (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()));
    if (HasReorderableMainOp && !TE.isAltShuffle())
      return TE.ReorderIndices;
  }

  if (TE.State != TreeEntry::NeedToGather)
    return None;

  // TODO: add analysis of other gather nodes with extractelement
  // instructions and other values/instructions, not only undefs.
  auto IsUndefOrExtract = [](Value *V) {
    return isa<UndefValue, ExtractElementInst>(V);
  };
  auto IsExtract = [](Value *V) { return isa<ExtractElementInst>(V); };
  auto ExtractsFromFixedVector = [](Value *V) {
    auto *EE = dyn_cast<ExtractElementInst>(V);
    return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
  };
  if (((TE.getOpcode() == Instruction::ExtractElement &&
        !TE.isAltShuffle()) ||
       (all_of(TE.Scalars, IsUndefOrExtract) &&
        any_of(TE.Scalars, IsExtract))) &&
      all_of(TE.Scalars, ExtractsFromFixedVector) &&
      allSameType(TE.Scalars)) {
    // Check that gather of extractelements can be represented as
    // just a shuffle of a single vector.
    OrdersType ExtractOrder;
    bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), ExtractOrder);
    if (Reuse || !ExtractOrder.empty()) {
      if (!ExtractOrder.empty())
        fixupOrderingIndices(ExtractOrder);
      return ExtractOrder;
    }
  }
  if (Optional<OrdersType> ReusedOrder = findReusedOrderedScalars(TE))
    return ReusedOrder;
  if (TE.Scalars.size() >= 4)
    if (Optional<OrdersType> LoadsOrder = findPartiallyOrderedLoads(TE))
      return LoadsOrder;
  return None;
}

3791

3792

/// Checks if the given mask is a "clustered" mask with the same clusters of

3793

/// size \p Sz, which are not identity submasks.

3794

static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,

3795

unsigned Sz) {

3796

ArrayRef<int> FirstCluster = Mask.slice(0, Sz);

3797

if (ShuffleVectorInst::isIdentityMask(FirstCluster))

3798

return false;

3799

for (unsigned I = 0, E = Mask.size(); I < E; I += Sz) {

3800

ArrayRef<int> Cluster = Mask.slice(I, Sz);

3801

if (Cluster != FirstCluster)

3802

return false;

3803

}

3804

return true;

3805

}

3806

3807

/// Reorders the node \p TE, which has reuse shuffle indices, according to
/// \p Mask.
///
/// Unless \p TE is a gather node without reorder indices whose reuse mask is
/// made of repeated, non-identity clusters of TE.Scalars.size() elements, only
/// the reuse mask is reordered with \p Mask. Otherwise the scalars themselves
/// are reordered with the first cluster of the reuse mask and every cluster of
/// the reuse mask is reset to the identity.
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // For vectorized and non-clustered reused - just reorder reuses mask.
  const unsigned Sz = TE.Scalars.size();
  if (TE.State != TreeEntry::NeedToGather || !TE.ReorderIndices.empty() ||
      !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
                                                   Sz) ||
      !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz)) {
    reorderReuses(TE.ReuseShuffleIndices, Mask);
    return;
  }
  // Try to improve gathered nodes with clustered reuses, if possible.
  reorderScalars(TE.Scalars, makeArrayRef(TE.ReuseShuffleIndices).slice(0, Sz));
  // Fill the reuses mask with the identity submasks. Each cluster spans
  // exactly Sz elements, [It, It + Sz); going any further would write past
  // the end of the mask on the last cluster.
  for (auto It = TE.ReuseShuffleIndices.begin(),
            End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(It, Sz))
    std::iota(It, std::next(It, Sz), 0);
}

3825

3826

void BoUpSLP::reorderTopToBottom() {

3827

// Maps VF to the graph nodes.

3828

DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;

3829

// ExtractElement gather nodes which can be vectorized and need to handle

3830

// their ordering.

3831

DenseMap<const TreeEntry *, OrdersType> GathersToOrders;

3832

3833

// AltShuffles can also have a preferred ordering that leads to fewer

3834

// instructions, e.g., the addsub instruction in x86.

3835

DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;

3836

3837

// Maps a TreeEntry to the reorder indices of external users.

3838

DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>

3839

ExternalUserReorderMap;

3840

// FIXME: Workaround for syntax error reported by MSVC buildbots.

3841

TargetTransformInfo &TTIRef = *TTI;

3842

// Find all reorderable nodes with the given VF.

3843

// Currently the are vectorized stores,loads,extracts + some gathering of

3844

// extracts.

3845

for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries,

3846

&GathersToOrders, &ExternalUserReorderMap,

3847

&AltShufflesToOrders](

3848

const std::unique_ptr<TreeEntry> &TE) {

3849

// Look for external users that will probably be vectorized.

3850

SmallVector<OrdersType, 1> ExternalUserReorderIndices =

3851

findExternalStoreUsersReorderIndices(TE.get());

3852

if (!ExternalUserReorderIndices.empty()) {

3853

VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());

3854

ExternalUserReorderMap.try_emplace(TE.get(),

3855

std::move(ExternalUserReorderIndices));

3856

}

3857

3858

// Patterns like [fadd,fsub] can be combined into a single instruction in

3859

// x86. Reordering them into [fsub,fadd] blocks this pattern. So we need

3860

// to take into account their order when looking for the most used order.

3861

if (TE->isAltShuffle()) {

3862

VectorType *VecTy =

3863

FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());

3864

unsigned Opcode0 = TE->getOpcode();

3865

unsigned Opcode1 = TE->getAltOpcode();

3866

// The opcode mask selects between the two opcodes.

3867

SmallBitVector OpcodeMask(TE->Scalars.size(), false);

3868

for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))

3869

if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)

3870

OpcodeMask.set(Lane);

3871

// If this pattern is supported by the target then we consider the order.

3872

if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {

3873

VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());

3874

AltShufflesToOrders.try_emplace(TE.get(), OrdersType());

3875

}

3876

// TODO: Check the reverse order too.

3877

}

3878

3879

if (Optional<OrdersType> CurrentOrder =

3880

getReorderingData(*TE, /*TopToBottom=*/true)) {

3881

// Do not include ordering for nodes used in the alt opcode vectorization,

3882

// better to reorder them during bottom-to-top stage. If follow the order

3883

// here, it causes reordering of the whole graph though actually it is

3884

// profitable just to reorder the subgraph that starts from the alternate

3885

// opcode vectorization node. Such nodes already end-up with the shuffle

3886

// instruction and it is just enough to change this shuffle rather than

3887

// rotate the scalars for the whole graph.

3888

unsigned Cnt = 0;

3889

const TreeEntry *UserTE = TE.get();

3890

while (UserTE && Cnt < RecursionMaxDepth) {

3891

if (UserTE->UserTreeIndices.size() != 1)

3892

break;

3893

if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {

3894

return EI.UserTE->State == TreeEntry::Vectorize &&

3895

EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;

3896

}))

3897

return;

3898

UserTE = UserTE->UserTreeIndices.back().UserTE;

3899

++Cnt;

3900

}

3901

VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());

3902

if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())

3903

GathersToOrders.try_emplace(TE.get(), *CurrentOrder);

3904

}

3905

});

3906

3907

// Reorder the graph nodes according to their vectorization factor.

3908

for (unsigned VF = VectorizableTree.front()->Scalars.size(); VF > 1;

3909

VF /= 2) {

3910

auto It = VFToOrderedEntries.find(VF);

3911

if (It == VFToOrderedEntries.end())

3912

continue;

3913

// Try to find the most profitable order. We just are looking for the most

3914

// used order and reorder scalar elements in the nodes according to this

3915

// mostly used order.

3916

ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();

3917

// All operands are reordered and used only in this node - propagate the

3918

// most used order to the user node.

3919

MapVector<OrdersType, unsigned,

3920

DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>

3921

OrdersUses;

3922

SmallPtrSet<const TreeEntry *, 4> VisitedOps;

3923

for (const TreeEntry *OpTE : OrderedEntries) {

3924

// No need to reorder this nodes, still need to extend and to use shuffle,

3925

// just need to merge reordering shuffle and the reuse shuffle.

3926

if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))

3927

continue;

3928

// Count number of orders uses.

3929

const auto &Order = [OpTE, &GathersToOrders,

3930

&AltShufflesToOrders]() -> const OrdersType & {

3931

if (OpTE->State == TreeEntry::NeedToGather ||

3932

!OpTE->ReuseShuffleIndices.empty()) {

3933

auto It = GathersToOrders.find(OpTE);

3934

if (It != GathersToOrders.end())

3935

return It->second;

3936

}

3937

if (OpTE->isAltShuffle()) {

3938

auto It = AltShufflesToOrders.find(OpTE);

3939

if (It != AltShufflesToOrders.end())

3940

return It->second;

3941

}

3942

return OpTE->ReorderIndices;

3943

}();

3944

// First consider the order of the external scalar users.

3945

auto It = ExternalUserReorderMap.find(OpTE);

3946

if (It != ExternalUserReorderMap.end()) {

3947

const auto &ExternalUserReorderIndices = It->second;

3948

// If the OpTE vector factor != number of scalars - use natural order,

3949

// it is an attempt to reorder node with reused scalars but with

3950

// external uses.

3951

if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {

3952

OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=

3953

ExternalUserReorderIndices.size();

3954

} else {

3955

for (const OrdersType &ExtOrder : ExternalUserReorderIndices)

3956

++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;

3957

}

3958

// No other useful reorder data in this entry.

3959

if (Order.empty())

3960

continue;

3961

}

3962

// Stores actually store the mask, not the order, need to invert.

3963

if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&

3964

OpTE->getOpcode() == Instruction::Store && !Order.empty()) {

3965

SmallVector<int> Mask;

3966

inversePermutation(Order, Mask);

3967

unsigned E = Order.size();

3968

OrdersType CurrentOrder(E, E);

3969

transform(Mask, CurrentOrder.begin(), [E](int Idx) {

3970

return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);

3971

});

3972

fixupOrderingIndices(CurrentOrder);

3973

++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;

3974

} else {

3975

++OrdersUses.insert(std::make_pair(Order, 0)).first->second;

3976

}

3977

}

3978

// Set order of the user node.

3979

if (OrdersUses.empty())

3980

continue;

3981

// Choose the most used order.

3982

ArrayRef<unsigned> BestOrder = OrdersUses.front().first;

3983

unsigned Cnt = OrdersUses.front().second;

3984

for (const auto &Pair : drop_begin(OrdersUses)) {

3985

if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {

3986

BestOrder = Pair.first;

3987

Cnt = Pair.second;

3988

}

3989

}

3990

// Set order of the user node.

3991

if (BestOrder.empty())

3992

continue;

3993

SmallVector<int> Mask;

3994

inversePermutation(BestOrder, Mask);

3995

SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);

3996

unsigned E = BestOrder.size();

3997

transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {

3998

return I < E ? static_cast<int>(I) : UndefMaskElem;

3999

});

4000

// Do an actual reordering, if profitable.

4001

for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {

4002

// Just do the reordering for the nodes with the given VF.

4003

if (TE->Scalars.size() != VF) {

4004

if (TE->ReuseShuffleIndices.size() == VF) {

4005

// Need to reorder the reuses masks of the operands with smaller VF to

4006

// be able to find the match between the graph nodes and scalar

4007

// operands of the given node during vectorization/cost estimation.

4008

assert(all_of(TE->UserTreeIndices,(static_cast <bool> (all_of(TE->UserTreeIndices, [VF
, &TE](const EdgeInfo &EI) { return EI.UserTE->Scalars
.size() == VF || EI.UserTE->Scalars.size() == TE->Scalars
.size(); }) && "All users must be of VF size.") ? void
(0) : __assert_fail ("all_of(TE->UserTreeIndices, [VF, &TE](const EdgeInfo &EI) { return EI.UserTE->Scalars.size() == VF || EI.UserTE->Scalars.size() == TE->Scalars.size(); }) && \"All users must be of VF size.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4014, __extension__
__PRETTY_FUNCTION__))

4009

[VF, &TE](const EdgeInfo &EI) {(static_cast <bool> (all_of(TE->UserTreeIndices, [VF
, &TE](const EdgeInfo &EI) { return EI.UserTE->Scalars
.size() == VF || EI.UserTE->Scalars.size() == TE->Scalars
.size(); }) && "All users must be of VF size.") ? void
(0) : __assert_fail ("all_of(TE->UserTreeIndices, [VF, &TE](const EdgeInfo &EI) { return EI.UserTE->Scalars.size() == VF || EI.UserTE->Scalars.size() == TE->Scalars.size(); }) && \"All users must be of VF size.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4014, __extension__
__PRETTY_FUNCTION__))

4010

return EI.UserTE->Scalars.size() == VF ||(static_cast <bool> (all_of(TE->UserTreeIndices, [VF
, &TE](const EdgeInfo &EI) { return EI.UserTE->Scalars
.size() == VF || EI.UserTE->Scalars.size() == TE->Scalars
.size(); }) && "All users must be of VF size.") ? void
(0) : __assert_fail ("all_of(TE->UserTreeIndices, [VF, &TE](const EdgeInfo &EI) { return EI.UserTE->Scalars.size() == VF || EI.UserTE->Scalars.size() == TE->Scalars.size(); }) && \"All users must be of VF size.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4014, __extension__
__PRETTY_FUNCTION__))

4011

EI.UserTE->Scalars.size() ==(static_cast <bool> (all_of(TE->UserTreeIndices, [VF
, &TE](const EdgeInfo &EI) { return EI.UserTE->Scalars
.size() == VF || EI.UserTE->Scalars.size() == TE->Scalars
.size(); }) && "All users must be of VF size.") ? void
(0) : __assert_fail ("all_of(TE->UserTreeIndices, [VF, &TE](const EdgeInfo &EI) { return EI.UserTE->Scalars.size() == VF || EI.UserTE->Scalars.size() == TE->Scalars.size(); }) && \"All users must be of VF size.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4014, __extension__
__PRETTY_FUNCTION__))

4012

TE->Scalars.size();(static_cast <bool> (all_of(TE->UserTreeIndices, [VF
, &TE](const EdgeInfo &EI) { return EI.UserTE->Scalars
.size() == VF || EI.UserTE->Scalars.size() == TE->Scalars
.size(); }) && "All users must be of VF size.") ? void
(0) : __assert_fail ("all_of(TE->UserTreeIndices, [VF, &TE](const EdgeInfo &EI) { return EI.UserTE->Scalars.size() == VF || EI.UserTE->Scalars.size() == TE->Scalars.size(); }) && \"All users must be of VF size.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4014, __extension__
__PRETTY_FUNCTION__))

4013

}) &&(static_cast <bool> (all_of(TE->UserTreeIndices, [VF
, &TE](const EdgeInfo &EI) { return EI.UserTE->Scalars
.size() == VF || EI.UserTE->Scalars.size() == TE->Scalars
.size(); }) && "All users must be of VF size.") ? void
(0) : __assert_fail ("all_of(TE->UserTreeIndices, [VF, &TE](const EdgeInfo &EI) { return EI.UserTE->Scalars.size() == VF || EI.UserTE->Scalars.size() == TE->Scalars.size(); }) && \"All users must be of VF size.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4014, __extension__
__PRETTY_FUNCTION__))

4014

"All users must be of VF size.")(static_cast <bool> (all_of(TE->UserTreeIndices, [VF
, &TE](const EdgeInfo &EI) { return EI.UserTE->Scalars
.size() == VF || EI.UserTE->Scalars.size() == TE->Scalars
.size(); }) && "All users must be of VF size.") ? void
(0) : __assert_fail ("all_of(TE->UserTreeIndices, [VF, &TE](const EdgeInfo &EI) { return EI.UserTE->Scalars.size() == VF || EI.UserTE->Scalars.size() == TE->Scalars.size(); }) && \"All users must be of VF size.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4014, __extension__
__PRETTY_FUNCTION__));

4015

// Update ordering of the operands with the smaller VF than the given

4016

// one.

4017

reorderNodeWithReuses(*TE, Mask);

4018

}

4019

continue;

4020

}

4021

if (TE->State == TreeEntry::Vectorize &&

4022

isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,

4023

InsertElementInst>(TE->getMainOp()) &&

4024

!TE->isAltShuffle()) {

4025

// Build correct orders for extract{element,value}, loads and

4026

// stores.

4027

reorderOrder(TE->ReorderIndices, Mask);

4028

if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))

4029

TE->reorderOperands(Mask);

4030

} else {

4031

// Reorder the node and its operands.

4032

TE->reorderOperands(Mask);

4033

assert(TE->ReorderIndices.empty() &&(static_cast <bool> (TE->ReorderIndices.empty() &&
"Expected empty reorder sequence.") ? void (0) : __assert_fail
("TE->ReorderIndices.empty() && \"Expected empty reorder sequence.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4034, __extension__
__PRETTY_FUNCTION__))

4034

"Expected empty reorder sequence.")(static_cast <bool> (TE->ReorderIndices.empty() &&
"Expected empty reorder sequence.") ? void (0) : __assert_fail
("TE->ReorderIndices.empty() && \"Expected empty reorder sequence.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4034, __extension__
__PRETTY_FUNCTION__));

4035

reorderScalars(TE->Scalars, Mask);

4036

}

4037

if (!TE->ReuseShuffleIndices.empty()) {

4038

// Apply reversed order to keep the original ordering of the reused

4039

// elements to avoid extra reorder indices shuffling.

4040

OrdersType CurrentOrder;

4041

reorderOrder(CurrentOrder, MaskOrder);

4042

SmallVector<int> NewReuses;

4043

inversePermutation(CurrentOrder, NewReuses);

4044

addMask(NewReuses, TE->ReuseShuffleIndices);

4045

TE->ReuseShuffleIndices.swap(NewReuses);

4046

}

4047

}

4048

}

4049

}

4050

4051

bool BoUpSLP::canReorderOperands(

4052

TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,

4053

ArrayRef<TreeEntry *> ReorderableGathers,

4054

SmallVectorImpl<TreeEntry *> &GatherOps) {

4055

for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {

4056

if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {

4057

return OpData.first == I &&

4058

OpData.second->State == TreeEntry::Vectorize;

4059

}))

4060

continue;

4061

if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {

4062

// Do not reorder if operand node is used by many user nodes.

4063

if (any_of(TE->UserTreeIndices,

4064

[UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))

4065

return false;

4066

// Add the node to the list of the ordered nodes with the identity

4067

// order.

4068

Edges.emplace_back(I, TE);

4069

// Add ScatterVectorize nodes to the list of operands, where just

4070

// reordering of the scalars is required. Similar to the gathers, so

4071

// simply add to the list of gathered ops.

4072

// If there are reused scalars, process this node as a regular vectorize

4073

// node, just reorder reuses mask.

4074

if (TE->State != TreeEntry::Vectorize && TE->ReuseShuffleIndices.empty())

4075

GatherOps.push_back(TE);

4076

continue;

4077

}

4078

TreeEntry *Gather = nullptr;

4079

if (count_if(ReorderableGathers,

4080

[&Gather, UserTE, I](TreeEntry *TE) {

4081

assert(TE->State != TreeEntry::Vectorize &&(static_cast <bool> (TE->State != TreeEntry::Vectorize
&& "Only non-vectorized nodes are expected.") ? void
(0) : __assert_fail ("TE->State != TreeEntry::Vectorize && \"Only non-vectorized nodes are expected.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4082, __extension__
__PRETTY_FUNCTION__))

4082

"Only non-vectorized nodes are expected.")(static_cast <bool> (TE->State != TreeEntry::Vectorize
&& "Only non-vectorized nodes are expected.") ? void
(0) : __assert_fail ("TE->State != TreeEntry::Vectorize && \"Only non-vectorized nodes are expected.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4082, __extension__
__PRETTY_FUNCTION__));

4083

if (any_of(TE->UserTreeIndices,

4084

[UserTE, I](const EdgeInfo &EI) {

4085

return EI.UserTE == UserTE && EI.EdgeIdx == I;

4086

})) {

4087

assert(TE->isSame(UserTE->getOperand(I)) &&(static_cast <bool> (TE->isSame(UserTE->getOperand
(I)) && "Operand entry does not match operands.") ? void
(0) : __assert_fail ("TE->isSame(UserTE->getOperand(I)) && \"Operand entry does not match operands.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4088, __extension__
__PRETTY_FUNCTION__))

4088

"Operand entry does not match operands.")(static_cast <bool> (TE->isSame(UserTE->getOperand
(I)) && "Operand entry does not match operands.") ? void
(0) : __assert_fail ("TE->isSame(UserTE->getOperand(I)) && \"Operand entry does not match operands.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4088, __extension__
__PRETTY_FUNCTION__));

4089

Gather = TE;

4090

return true;

4091

}

4092

return false;

4093

}) > 1 &&

4094

!all_of(UserTE->getOperand(I), isConstant))

4095

return false;

4096

if (Gather)

4097

GatherOps.push_back(Gather);

4098

}

4099

return true;

4100

}

4101

4102

void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {

4103

SetVector<TreeEntry *> OrderedEntries;

4104

DenseMap<const TreeEntry *, OrdersType> GathersToOrders;

4105

// Find all reorderable leaf nodes with the given VF.

4106

// Currently the are vectorized loads,extracts without alternate operands +

4107

// some gathering of extracts.

4108

SmallVector<TreeEntry *> NonVectorized;

4109

for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders,

4110

&NonVectorized](

4111

const std::unique_ptr<TreeEntry> &TE) {

4112

if (TE->State != TreeEntry::Vectorize)

4113

NonVectorized.push_back(TE.get());

4114

if (Optional<OrdersType> CurrentOrder =

4115

getReorderingData(*TE, /*TopToBottom=*/false)) {

4116

OrderedEntries.insert(TE.get());

4117

if (TE->State != TreeEntry::Vectorize || !TE->ReuseShuffleIndices.empty())

4118

GathersToOrders.try_emplace(TE.get(), *CurrentOrder);

4119

}

4120

});

4121

4122

// 1. Propagate order to the graph nodes, which use only reordered nodes.

4123

// I.e., if the node has operands, that are reordered, try to make at least

4124

// one operand order in the natural order and reorder others + reorder the

4125

// user node itself.

4126

SmallPtrSet<const TreeEntry *, 4> Visited;

4127

while (!OrderedEntries.empty()) {

4128

// 1. Filter out only reordered nodes.

4129

// 2. If the entry has multiple uses - skip it and jump to the next node.

4130

DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;

4131

SmallVector<TreeEntry *> Filtered;

4132

for (TreeEntry *TE : OrderedEntries) {

4133

if (!(TE->State == TreeEntry::Vectorize ||

4134

(TE->State == TreeEntry::NeedToGather &&

4135

GathersToOrders.count(TE))) ||

4136

TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||

4137

!all_of(drop_begin(TE->UserTreeIndices),

4138

[TE](const EdgeInfo &EI) {

4139

return EI.UserTE == TE->UserTreeIndices.front().UserTE;

4140

}) ||

4141

!Visited.insert(TE).second) {

4142

Filtered.push_back(TE);

4143

continue;

4144

}

4145

// Build a map between user nodes and their operands order to speedup

4146

// search. The graph currently does not provide this dependency directly.

4147

for (EdgeInfo &EI : TE->UserTreeIndices) {

4148

TreeEntry *UserTE = EI.UserTE;

4149

auto It = Users.find(UserTE);

4150

if (It == Users.end())

4151

It = Users.insert({UserTE, {}}).first;

4152

It->second.emplace_back(EI.EdgeIdx, TE);

4153

}

4154

}

4155

// Erase filtered entries.

4156

for_each(Filtered,

4157

[&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); });

4158

SmallVector<

4159

std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>

4160

UsersVec(Users.begin(), Users.end());

4161

sort(UsersVec, [](const auto &Data1, const auto &Data2) {

4162

return Data1.first->Idx > Data2.first->Idx;

4163

});

4164

for (auto &Data : UsersVec) {

4165

// Check that operands are used only in the User node.

4166

SmallVector<TreeEntry *> GatherOps;

4167

if (!canReorderOperands(Data.first, Data.second, NonVectorized,

4168

GatherOps)) {

4169

for_each(Data.second,

4170

[&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {

4171

OrderedEntries.remove(Op.second);

4172

});

4173

continue;

4174

}

4175

// All operands are reordered and used only in this node - propagate the

4176

// most used order to the user node.

4177

MapVector<OrdersType, unsigned,

4178

DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>

4179

OrdersUses;

4180

// Do the analysis for each tree entry only once, otherwise the order of

4181

// the same node my be considered several times, though might be not

4182

// profitable.

4183

SmallPtrSet<const TreeEntry *, 4> VisitedOps;

4184

SmallPtrSet<const TreeEntry *, 4> VisitedUsers;

4185

for (const auto &Op : Data.second) {

4186

TreeEntry *OpTE = Op.second;

4187

if (!VisitedOps.insert(OpTE).second)

4188

continue;

4189

if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))

4190

continue;

4191

const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {

4192

if (OpTE->State == TreeEntry::NeedToGather ||

4193

!OpTE->ReuseShuffleIndices.empty())

4194

return GathersToOrders.find(OpTE)->second;

4195

return OpTE->ReorderIndices;

4196

}();

4197

unsigned NumOps = count_if(

4198

Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {

4199

return P.second == OpTE;

4200

});

4201

// Stores actually store the mask, not the order, need to invert.

4202

if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&

4203

OpTE->getOpcode() == Instruction::Store && !Order.empty()) {

4204

SmallVector<int> Mask;

4205

inversePermutation(Order, Mask);

4206

unsigned E = Order.size();

4207

OrdersType CurrentOrder(E, E);

4208

transform(Mask, CurrentOrder.begin(), [E](int Idx) {

4209

return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);

4210

});

4211

fixupOrderingIndices(CurrentOrder);

4212

OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=

4213

NumOps;

4214

} else {

4215

OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;

4216

}

4217

auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));

4218

const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](

4219

const TreeEntry *TE) {

4220

if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||

4221

(TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||

4222

(IgnoreReorder && TE->Idx == 0))

4223

return true;

4224

if (TE->State == TreeEntry::NeedToGather) {

4225

auto It = GathersToOrders.find(TE);

4226

if (It != GathersToOrders.end())

4227

return !It->second.empty();

4228

return true;

4229

}

4230

return false;

4231

};

4232

for (const EdgeInfo &EI : OpTE->UserTreeIndices) {

4233

TreeEntry *UserTE = EI.UserTE;

4234

if (!VisitedUsers.insert(UserTE).second)

4235

continue;

4236

// May reorder user node if it requires reordering, has reused

4237

// scalars, is an alternate op vectorize node or its op nodes require

4238

// reordering.

4239

if (AllowsReordering(UserTE))

4240

continue;

4241

// Check if users allow reordering.

4242

// Currently look up just 1 level of operands to avoid increase of

4243

// the compile time.

4244

// Profitable to reorder if definitely more operands allow

4245

// reordering rather than those with natural order.

4246

ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];

4247

if (static_cast<unsigned>(count_if(

4248

Ops, [UserTE, &AllowsReordering](

4249

const std::pair<unsigned, TreeEntry *> &Op) {

4250

return AllowsReordering(Op.second) &&

4251

all_of(Op.second->UserTreeIndices,

4252

[UserTE](const EdgeInfo &EI) {

4253

return EI.UserTE == UserTE;

4254

});

4255

})) <= Ops.size() / 2)

4256

++Res.first->second;

4257

}

4258

}

4259

// If no orders - skip current nodes and jump to the next one, if any.

4260

if (OrdersUses.empty()) {

4261

for_each(Data.second,

4262

[&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {

4263

OrderedEntries.remove(Op.second);

4264

});

4265

continue;

4266

}

4267

// Choose the best order.

4268

ArrayRef<unsigned> BestOrder = OrdersUses.front().first;

4269

unsigned Cnt = OrdersUses.front().second;

4270

for (const auto &Pair : drop_begin(OrdersUses)) {

4271

if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {

4272

BestOrder = Pair.first;

4273

Cnt = Pair.second;

4274

}

4275

}

4276

// Set order of the user node (reordering of operands and user nodes).

4277

if (BestOrder.empty()) {

4278

for_each(Data.second,

4279

[&OrderedEntries](const std::pair<unsigned, TreeEntry *> &Op) {

4280

OrderedEntries.remove(Op.second);

4281

});

4282

continue;

4283

}

4284

// Erase operands from OrderedEntries list and adjust their orders.

4285

VisitedOps.clear();

4286

SmallVector<int> Mask;

4287

inversePermutation(BestOrder, Mask);

4288

SmallVector<int> MaskOrder(BestOrder.size(), UndefMaskElem);

4289

unsigned E = BestOrder.size();

4290

transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {

4291

return I < E ? static_cast<int>(I) : UndefMaskElem;

4292

});

4293

for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {

4294

TreeEntry *TE = Op.second;

4295

OrderedEntries.remove(TE);

4296

if (!VisitedOps.insert(TE).second)

4297

continue;

4298

if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {

4299

reorderNodeWithReuses(*TE, Mask);

4300

continue;

4301

}

4302

// Gathers are processed separately.

4303

if (TE->State != TreeEntry::Vectorize)

4304

continue;

4305

assert((BestOrder.size() == TE->ReorderIndices.size() ||(static_cast <bool> ((BestOrder.size() == TE->ReorderIndices
.size() || TE->ReorderIndices.empty()) && "Non-matching sizes of user/operand entries."
) ? void (0) : __assert_fail ("(BestOrder.size() == TE->ReorderIndices.size() || TE->ReorderIndices.empty()) && \"Non-matching sizes of user/operand entries.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4307, __extension__
__PRETTY_FUNCTION__))

4306

TE->ReorderIndices.empty()) &&(static_cast <bool> ((BestOrder.size() == TE->ReorderIndices
.size() || TE->ReorderIndices.empty()) && "Non-matching sizes of user/operand entries."
) ? void (0) : __assert_fail ("(BestOrder.size() == TE->ReorderIndices.size() || TE->ReorderIndices.empty()) && \"Non-matching sizes of user/operand entries.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4307, __extension__
__PRETTY_FUNCTION__))

4307

"Non-matching sizes of user/operand entries.")(static_cast <bool> ((BestOrder.size() == TE->ReorderIndices
.size() || TE->ReorderIndices.empty()) && "Non-matching sizes of user/operand entries."
) ? void (0) : __assert_fail ("(BestOrder.size() == TE->ReorderIndices.size() || TE->ReorderIndices.empty()) && \"Non-matching sizes of user/operand entries.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4307, __extension__
__PRETTY_FUNCTION__));

4308

reorderOrder(TE->ReorderIndices, Mask);

4309

if (IgnoreReorder && TE == VectorizableTree.front().get())

4310

IgnoreReorder = false;

4311

}

4312

// For gathers just need to reorder its scalars.

4313

for (TreeEntry *Gather : GatherOps) {

4314

assert(Gather->ReorderIndices.empty() &&(static_cast <bool> (Gather->ReorderIndices.empty() &&
"Unexpected reordering of gathers.") ? void (0) : __assert_fail
("Gather->ReorderIndices.empty() && \"Unexpected reordering of gathers.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4315, __extension__
__PRETTY_FUNCTION__))

4315

"Unexpected reordering of gathers.")(static_cast <bool> (Gather->ReorderIndices.empty() &&
"Unexpected reordering of gathers.") ? void (0) : __assert_fail
("Gather->ReorderIndices.empty() && \"Unexpected reordering of gathers.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4315, __extension__
__PRETTY_FUNCTION__));

4316

if (!Gather->ReuseShuffleIndices.empty()) {

4317

// Just reorder reuses indices.

4318

reorderReuses(Gather->ReuseShuffleIndices, Mask);

4319

continue;

4320

}

4321

reorderScalars(Gather->Scalars, Mask);

4322

OrderedEntries.remove(Gather);

4323

}

4324

// Reorder operands of the user node and set the ordering for the user

4325

// node itself.

4326

if (Data.first->State != TreeEntry::Vectorize ||

4327

!isa<ExtractElementInst, ExtractValueInst, LoadInst>(

4328

Data.first->getMainOp()) ||

4329

Data.first->isAltShuffle())

4330

Data.first->reorderOperands(Mask);

4331

if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||

4332

Data.first->isAltShuffle()) {

4333

reorderScalars(Data.first->Scalars, Mask);

4334

reorderOrder(Data.first->ReorderIndices, MaskOrder);

4335

if (Data.first->ReuseShuffleIndices.empty() &&

4336

!Data.first->ReorderIndices.empty() &&

4337

!Data.first->isAltShuffle()) {

4338

// Insert user node to the list to try to sink reordering deeper in

4339

// the graph.

4340

OrderedEntries.insert(Data.first);

4341

}

4342

} else {

4343

reorderOrder(Data.first->ReorderIndices, Mask);

4344

}

4345

}

4346

}

4347

// If the reordering is unnecessary, just remove the reorder.

4348

if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&

4349

VectorizableTree.front()->ReuseShuffleIndices.empty())

4350

VectorizableTree.front()->ReorderIndices.clear();

4351

}

4352

4353

void BoUpSLP::buildExternalUses(

4354

const ExtraValueToDebugLocsMap &ExternallyUsedValues) {

4355

// Collect the values that we need to extract from the tree.

4356

for (auto &TEPtr : VectorizableTree) {

4357

TreeEntry *Entry = TEPtr.get();

4358

4359

// No need to handle users of gathered values.

4360

if (Entry->State == TreeEntry::NeedToGather)

4361

continue;

4362

4363

// For each lane:

4364

for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {

4365

Value *Scalar = Entry->Scalars[Lane];

4366

int FoundLane = Entry->findLaneForValue(Scalar);

4367

4368

// Check if the scalar is externally used as an extra arg.

4369

auto ExtI = ExternallyUsedValues.find(Scalar);

4370

if (ExtI != ExternallyUsedValues.end()) {

4371

LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Need to extract: Extra arg from lane "
<< Lane << " from " << *Scalar << ".\n"
; } } while (false)

4372

<< Lane << " from " << *Scalar << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Need to extract: Extra arg from lane "
<< Lane << " from " << *Scalar << ".\n"
; } } while (false);

4373

ExternalUses.emplace_back(Scalar, nullptr, FoundLane);

4374

}

4375

for (User *U : Scalar->users()) {

4376

LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Checking user:" << *U <<
".\n"; } } while (false);

4377

4378

Instruction *UserInst = dyn_cast<Instruction>(U);

4379

if (!UserInst)

4380

continue;

4381

4382

if (isDeleted(UserInst))

4383

continue;

4384

4385

// Skip in-tree scalars that become vectors

4386

if (TreeEntry *UseEntry = getTreeEntry(U)) {

4387

Value *UseScalar = UseEntry->Scalars[0];

4388

// Some in-tree scalars will remain as scalar in vectorized

4389

// instructions. If that is the case, the one in Lane 0 will

4390

// be used.

4391

if (UseScalar != U ||

4392

UseEntry->State == TreeEntry::ScatterVectorize ||

4393

!InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {

4394

LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *Udo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: \tInternal user will be removed:"
<< *U << ".\n"; } } while (false)

4395

<< ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: \tInternal user will be removed:"
<< *U << ".\n"; } } while (false);

4396

assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state")(static_cast <bool> (UseEntry->State != TreeEntry::NeedToGather
&& "Bad state") ? void (0) : __assert_fail ("UseEntry->State != TreeEntry::NeedToGather && \"Bad state\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4396, __extension__
__PRETTY_FUNCTION__));

4397

continue;

4398

}

4399

}

4400

4401

// Ignore users in the user ignore list.

4402

if (UserIgnoreList && UserIgnoreList->contains(UserInst))

4403

continue;

4404

4405

LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Need to extract:" << *
U << " from lane " << Lane << " from " <<
*Scalar << ".\n"; } } while (false)

4406

<< Lane << " from " << *Scalar << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Need to extract:" << *
U << " from lane " << Lane << " from " <<
*Scalar << ".\n"; } } while (false);

4407

ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));

4408

}

4409

}

4410

}

4411

}

4412

4413

/// Groups the simple store users of the scalars of \p TE by the underlying
/// object of their pointer operand.
///
/// \returns a map from underlying pointer object to the stores collected for
/// it; at most one store is kept per pointer object per lane.
DenseMap<Value *, SmallVector<StoreInst *, 4>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  // To save compilation time, scalars with this many uses (or more) are not
  // visited.
  constexpr unsigned UsersLimit = 4;

  DenseMap<Value *, SmallVector<StoreInst *, 4>> StoresByObject;
  for (unsigned Lane = 0, NumLanes = TE->Scalars.size(); Lane < NumLanes;
       ++Lane) {
    Value *Scalar = TE->Scalars[Lane];
    // Too many users: stop the collection altogether.
    if (Scalar->hasNUsesOrMore(UsersLimit))
      break;

    // Collect stores per pointer object.
    for (User *U : Scalar->users()) {
      auto *Store = dyn_cast<StoreInst>(U);
      if (!Store || !Store->isSimple())
        continue;
      Type *StoredTy = Store->getValueOperand()->getType();
      if (!isValidElementType(StoredTy))
        continue;
      // Skip stores that already have a tree entry.
      if (getTreeEntry(U))
        continue;

      Value *Object = getUnderlyingObject(Store->getPointerOperand());
      auto &Collected = StoresByObject[Object];
      // For now just keep one store per pointer object per lane.
      // TODO: Extend this to support multiple stores per pointer per lane
      if (Collected.size() > Lane)
        continue;

      if (!Collected.empty()) {
        StoreInst *Last = Collected.back();
        // Skip if in different BBs.
        if (Store->getParent() != Last->getParent())
          continue;
        // Make sure that the stores are of the same type.
        if (StoredTy != Last->getValueOperand()->getType())
          continue;
      }
      Collected.push_back(Store);
    }
  }
  return StoresByObject;
}

4453

4454

bool BoUpSLP::canFormVector(const SmallVector<StoreInst *, 4> &StoresVec,

4455

OrdersType &ReorderIndices) const {

4456

// We check whether the stores in StoreVec can form a vector by sorting them

4457

// and checking whether they are consecutive.

4458

4459

// To avoid calling getPointersDiff() while sorting we create a vector of

4460

// pairs {store, offset from first} and sort this instead.

4461

SmallVector<std::pair<StoreInst *, int>, 4> StoreOffsetVec(StoresVec.size());

4462

StoreInst *S0 = StoresVec[0];

4463

StoreOffsetVec[0] = {S0, 0};

4464

Type *S0Ty = S0->getValueOperand()->getType();

4465

Value *S0Ptr = S0->getPointerOperand();

4466

for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {

4467

StoreInst *SI = StoresVec[Idx];

4468

Optional<int> Diff =

4469

getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),

4470

SI->getPointerOperand(), *DL, *SE,

4471

/*StrictCheck=*/true);

4472

// We failed to compare the pointers so just abandon this StoresVec.

4473

if (!Diff)

4474

return false;

4475

StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};

4476

}

4477

4478

// Sort the vector based on the pointers. We create a copy because we may

4479

// need the original later for calculating the reorder (shuffle) indices.

4480

stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,

4481

const std::pair<StoreInst *, int> &Pair2) {

4482

int Offset1 = Pair1.second;

4483

int Offset2 = Pair2.second;

4484

return Offset1 < Offset2;

4485

});

4486

4487

// Check if the stores are consecutive by checking if their difference is 1.

4488

for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))

4489

if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx-1].second + 1)

4490

return false;

4491

4492

// Calculate the shuffle indices according to their offset against the sorted

4493

// StoreOffsetVec.

4494

ReorderIndices.reserve(StoresVec.size());

4495

for (StoreInst *SI : StoresVec) {

4496

unsigned Idx = find_if(StoreOffsetVec,

4497

[SI](const std::pair<StoreInst *, int> &Pair) {

4498

return Pair.first == SI;

4499

}) -

4500

StoreOffsetVec.begin();

4501

ReorderIndices.push_back(Idx);

4502

}

4503

// Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in

4504

// reorderTopToBottom() and reorderBottomToTop(), so we are following the

4505

// same convention here.

4506

auto IsIdentityOrder = [](const OrdersType &Order) {

4507

for (unsigned Idx : seq<unsigned>(0, Order.size()))

4508

if (Idx != Order[Idx])

4509

return false;

4510

return true;

4511

};

4512

if (IsIdentityOrder(ReorderIndices))

4513

ReorderIndices.clear();

4514

4515

return true;

4516

}

4517

4518

#ifndef NDEBUG

4519

LLVM_DUMP_METHOD__attribute__((noinline)) __attribute__((__used__)) static void dumpOrder(const BoUpSLP::OrdersType &Order) {

4520

for (unsigned Idx : Order)

4521

dbgs() << Idx << ", ";

4522

dbgs() << "\n";

4523

}

4524

#endif

4525

4526

/// Looks at the store users of the scalars of \p TE and, for every group of
/// stores (one group per underlying pointer object) that can form a vector,
/// computes the corresponding reorder indices.
///
/// \returns one OrdersType per store group accepted by canFormVector().
SmallVector<BoUpSLP::OrdersType, 1>
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  const unsigned NumLanes = TE->Scalars.size();

  // Reorder indices for each candidate store vector that is a user of the
  // current TreeEntry.
  SmallVector<OrdersType, 1> Result;

  // Inspect the stores collected per pointer and look for vectorization
  // candidates.
  DenseMap<Value *, SmallVector<StoreInst *, 4>> StoresByObject =
      collectUserStores(TE);
  for (const auto &Entry : StoresByObject) {
    const SmallVector<StoreInst *, 4> &Stores = Entry.second;

    // A vector can only be formed from exactly NumLanes stores.
    if (Stores.size() != NumLanes)
      continue;

    // Keep the order only if the stores are consecutive, i.e. the scalars in
    // Stores can form a vector instruction.
    OrdersType Order;
    if (canFormVector(Stores, Order))
      Result.push_back(Order);
  }
  return Result;
}

4557

4558

void BoUpSLP::buildTree(ArrayRef<Value *> Roots,

4559

const SmallDenseSet<Value *> &UserIgnoreLst) {

4560

deleteTree();

4561

UserIgnoreList = &UserIgnoreLst;

4562

if (!allSameType(Roots))

4563

return;

4564

buildTree_rec(Roots, 0, EdgeInfo());

4565

}

4566

4567

/// Discards the current tree and, if all \p Roots have the same type, builds
/// a new tree from them.
void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  deleteTree();
  // Roots of differing types are not handled; leave the tree empty.
  if (allSameType(Roots))
    buildTree_rec(Roots, /*Depth=*/0, EdgeInfo());
}

4573

4574

/// \return true if the specified list of values has only one instruction that

4575

/// requires scheduling, false otherwise.

4576

#ifndef NDEBUG

4577

static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {

4578

Value *NeedsScheduling = nullptr;

4579

for (Value *V : VL) {

4580

if (doesNotNeedToBeScheduled(V))

4581

continue;

4582

if (!NeedsScheduling) {

4583

NeedsScheduling = V;

4584

continue;

4585

}

4586

return false;

4587

}

4588

return NeedsScheduling;

4589

}

4590

#endif

4591

4592

/// Generates key/subkey pair for the given value to provide effective sorting

4593

/// of the values and better detection of the vectorizable values sequences. The

4594

/// keys/subkeys can be used for better sorting of the values themselves (keys)

4595

/// and in values subgroups (subkeys).

4596

static std::pair<size_t, size_t> generateKeySubkey(

4597

Value *V, const TargetLibraryInfo *TLI,

4598

function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,

4599

bool AllowAlternate) {

4600

hash_code Key = hash_value(V->getValueID() + 2);

4601

hash_code SubKey = hash_value(0);

4602

// Sort the loads by the distance between the pointers.

4603

if (auto *LI = dyn_cast<LoadInst>(V)) {

4604

Key = hash_combine(hash_value(Instruction::Load), Key);

4605

if (LI->isSimple())

4606

SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));

4607

else

4608

SubKey = hash_value(LI);

4609

} else if (isVectorLikeInstWithConstOps(V)) {

4610

// Sort extracts by the vector operands.

4611

if (isa<ExtractElementInst, UndefValue>(V))

4612

Key = hash_value(Value::UndefValueVal + 1);

4613

if (auto *EI = dyn_cast<ExtractElementInst>(V)) {

4614

if (!isUndefVector(EI->getVectorOperand()) &&

4615

!isa<UndefValue>(EI->getIndexOperand()))

4616

SubKey = hash_value(EI->getVectorOperand());

4617

}

4618

} else if (auto *I = dyn_cast<Instruction>(V)) {

4619

// Sort other instructions just by the opcodes except for CMPInst.

4620

// For CMP also sort by the predicate kind.

4621

if ((isa<BinaryOperator, CastInst>(I)) &&

4622

isValidForAlternation(I->getOpcode())) {

4623

if (AllowAlternate)

4624

Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);

4625

else

4626

Key = hash_combine(hash_value(I->getOpcode()), Key);

4627

SubKey = hash_combine(

4628

hash_value(I->getOpcode()), hash_value(I->getType()),

4629

hash_value(isa<BinaryOperator>(I)

4630

? I->getType()

4631

: cast<CastInst>(I)->getOperand(0)->getType()));

4632

// For casts, look through the only operand to improve compile time.

4633

if (isa<CastInst>(I)) {

4634

std::pair<size_t, size_t> OpVals =

4635

generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,

4636

/*=AllowAlternate*/ true);

4637

Key = hash_combine(OpVals.first, Key);

4638

SubKey = hash_combine(OpVals.first, SubKey);

4639

}

4640

} else if (auto *CI = dyn_cast<CmpInst>(I)) {

4641

CmpInst::Predicate Pred = CI->getPredicate();

4642

if (CI->isCommutative())

4643

Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));

4644

CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(Pred);

4645

SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),

4646

hash_value(SwapPred),

4647

hash_value(CI->getOperand(0)->getType()));

4648

} else if (auto *Call = dyn_cast<CallInst>(I)) {

4649

Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, TLI);

4650

if (isTriviallyVectorizable(ID)) {

4651

SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));

4652

} else if (!VFDatabase(*Call).getMappings(*Call).empty()) {

4653

SubKey = hash_combine(hash_value(I->getOpcode()),

4654

hash_value(Call->getCalledFunction()));

4655

} else {

4656

Key = hash_combine(hash_value(Call), Key);

4657

SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));

4658

}

4659

for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())

4660

SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),

4661

hash_value(Op.Tag), SubKey);

4662

} else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {

4663

if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))

4664

SubKey = hash_value(Gep->getPointerOperand());

4665

else

4666

SubKey = hash_value(Gep);

4667

} else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&

4668

!isa<ConstantInt>(I->getOperand(1))) {

4669

// Do not try to vectorize instructions with potentially high cost.

4670

SubKey = hash_value(I);

4671

} else {

4672

SubKey = hash_value(I->getOpcode());

4673

}

4674

Key = hash_combine(hash_value(I->getParent()), Key);

4675

}

4676

return std::make_pair(Key, SubKey);

4677

}

4678

4679

/// Checks if the specified instruction \p I is an alternate operation for

4680

/// the given \p MainOp and \p AltOp instructions.

4681

static bool isAlternateInstruction(const Instruction *I,

4682

const Instruction *MainOp,

4683

const Instruction *AltOp);

4684

4685

void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,

4686

const EdgeInfo &UserTreeIdx) {

4687

assert((allConstant(VL) || allSameType(VL)) && "Invalid types!")(static_cast <bool> ((allConstant(VL) || allSameType(VL
)) && "Invalid types!") ? void (0) : __assert_fail ("(allConstant(VL) || allSameType(VL)) && \"Invalid types!\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4687, __extension__
__PRETTY_FUNCTION__));

4688

4689

SmallVector<int> ReuseShuffleIndicies;

4690

SmallVector<Value *> UniqueValues;

4691

auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues,

4692

&UserTreeIdx,

4693

this](const InstructionsState &S) {

4694

// Check that every instruction appears once in this bundle.

4695

DenseMap<Value *, unsigned> UniquePositions;

4696

for (Value *V : VL) {

4697

if (isConstant(V)) {

4698

ReuseShuffleIndicies.emplace_back(

4699

isa<UndefValue>(V) ? UndefMaskElem : UniqueValues.size());

4700

UniqueValues.emplace_back(V);

4701

continue;

4702

}

4703

auto Res = UniquePositions.try_emplace(V, UniqueValues.size());

4704

ReuseShuffleIndicies.emplace_back(Res.first->second);

4705

if (Res.second)

4706

UniqueValues.emplace_back(V);

4707

}

4708

size_t NumUniqueScalarValues = UniqueValues.size();

4709

if (NumUniqueScalarValues == VL.size()) {

4710

ReuseShuffleIndicies.clear();

4711

} else {

4712

LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Shuffle for reused scalars.\n"
; } } while (false);

4713

if (NumUniqueScalarValues <= 1 ||

4714

(UniquePositions.size() == 1 && all_of(UniqueValues,

4715

[](Value *V) {

4716

return isa<UndefValue>(V) ||

4717

!isConstant(V);

4718

})) ||

4719

!llvm::isPowerOf2_32(NumUniqueScalarValues)) {

4720

LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Scalar used twice in bundle.\n"
; } } while (false);

4721

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);

4722

return false;

4723

}

4724

VL = UniqueValues;

4725

}

4726

return true;

4727

};

4728

4729

InstructionsState S = getSameOpcode(VL);

4730

4731

// Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of

4732

// a load), in which case peek through to include it in the tree, without

4733

// ballooning over-budget.

4734

if (Depth >= RecursionMaxDepth &&

4735

!(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&

4736

VL.size() >= 4 &&

4737

(match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {

4738

return match(I,

4739

m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&

4740

cast<Instruction>(I)->getOpcode() ==

4741

cast<Instruction>(S.MainOp)->getOpcode();

4742

})))) {

4743

LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Gathering due to max recursion depth.\n"
; } } while (false);

4744

if (TryToFindDuplicates(S))

4745

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

4746

ReuseShuffleIndicies);

4747

return;

4748

}

4749

4750

// Don't handle scalable vectors

4751

if (S.getOpcode() == Instruction::ExtractElement &&

4752

isa<ScalableVectorType>(

4753

cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {

4754

LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Gathering due to scalable vector type.\n"
; } } while (false);

4755

if (TryToFindDuplicates(S))

4756

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

4757

ReuseShuffleIndicies);

4758

return;

4759

}

4760

4761

// Don't handle vectors.

4762

if (S.OpValue->getType()->isVectorTy() &&

4763

!isa<InsertElementInst>(S.OpValue)) {

4764

LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Gathering due to vector type.\n"
; } } while (false);

4765

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);

4766

return;

4767

}

4768

4769

if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))

4770

if (SI->getValueOperand()->getType()->isVectorTy()) {

4771

LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Gathering due to store vector type.\n"
; } } while (false);

4772

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);

4773

return;

4774

}

4775

4776

// If all of the operands are identical or constant we have a simple solution.

4777

// If we deal with insert/extract instructions, they all must have constant

4778

// indices, otherwise we should gather them, not try to vectorize.

4779

// If alternate op node with 2 elements with gathered operands - do not

4780

// vectorize.

4781

auto &&NotProfitableForVectorization = [&S, this,

4782

Depth](ArrayRef<Value *> VL) {

4783

if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)

4784

return false;

4785

if (VectorizableTree.size() < MinTreeSize)

4786

return false;

4787

if (Depth >= RecursionMaxDepth - 1)

4788

return true;

4789

// Check if all operands are extracts, part of vector node or can build a

4790

// regular vectorize node.

4791

SmallVector<unsigned, 2> InstsCount(VL.size(), 0);

4792

for (Value *V : VL) {

4793

auto *I = cast<Instruction>(V);

4794

InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {

4795

return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);

4796

}));

4797

}

4798

bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);

4799

if ((IsCommutative &&

4800

std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||

4801

(!IsCommutative &&

4802

all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))

4803

return true;

4804

assert(VL.size() == 2 && "Expected only 2 alternate op instructions.")(static_cast <bool> (VL.size() == 2 && "Expected only 2 alternate op instructions."
) ? void (0) : __assert_fail ("VL.size() == 2 && \"Expected only 2 alternate op instructions.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4804, __extension__
__PRETTY_FUNCTION__));

4805

SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;

4806

auto *I1 = cast<Instruction>(VL.front());

4807

auto *I2 = cast<Instruction>(VL.back());

4808

for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)

4809

Candidates.emplace_back().emplace_back(I1->getOperand(Op),

4810

I2->getOperand(Op));

4811

if (static_cast<unsigned>(count_if(

4812

Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {

4813

return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);

4814

})) >= S.MainOp->getNumOperands() / 2)

4815

return false;

4816

if (S.MainOp->getNumOperands() > 2)

4817

return true;

4818

if (IsCommutative) {

4819

// Check permuted operands.

4820

Candidates.clear();

4821

for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)

4822

Candidates.emplace_back().emplace_back(I1->getOperand(Op),

4823

I2->getOperand((Op + 1) % E));

4824

if (any_of(

4825

Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {

4826

return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);

4827

}))

4828

return false;

4829

}

4830

return true;

4831

};

4832

SmallVector<unsigned> SortedIndices;

4833

BasicBlock *BB = nullptr;

4834

bool IsScatterVectorizeUserTE =

4835

UserTreeIdx.UserTE &&

4836

UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;

4837

bool AreAllSameInsts =

4838

(S.getOpcode() && allSameBlock(VL)) ||

4839

(S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&

4840

VL.size() > 2 &&

4841

all_of(VL,

4842

[&BB](Value *V) {

4843

auto *I = dyn_cast<GetElementPtrInst>(V);

4844

if (!I)

4845

return doesNotNeedToBeScheduled(V);

4846

if (!BB)

4847

BB = I->getParent();

4848

return BB == I->getParent() && I->getNumOperands() == 2;

4849

}) &&

4850

BB &&

4851

sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,

4852

SortedIndices));

4853

if (allConstant(VL) || isSplat(VL) || !AreAllSameInsts ||

4854

(isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(

4855

S.OpValue) &&

4856

!all_of(VL, isVectorLikeInstWithConstOps)) ||

4857

NotProfitableForVectorization(VL)) {

4858

LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n"
; } } while (false);

4859

if (TryToFindDuplicates(S))

4860

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

4861

ReuseShuffleIndicies);

4862

return;

4863

}

4864

4865

// We now know that this is a vector of instructions of the same type from

4866

// the same block.

4867

4868

// Don't vectorize ephemeral values.

4869

if (!EphValues.empty()) {

4870

for (Value *V : VL) {

4871

if (EphValues.count(V)) {

4872

LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *Vdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: The instruction (" << *
V << ") is ephemeral.\n"; } } while (false)

4873

<< ") is ephemeral.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: The instruction (" << *
V << ") is ephemeral.\n"; } } while (false);

4874

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);

4875

return;

4876

}

4877

}

4878

}

4879

4880

// Check if this is a duplicate of another entry.

4881

if (TreeEntry *E = getTreeEntry(S.OpValue)) {

4882

LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: \tChecking bundle: " <<
*S.OpValue << ".\n"; } } while (false);

4883

if (!E->isSame(VL)) {

4884

LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Gathering due to partial overlap.\n"
; } } while (false);

4885

if (TryToFindDuplicates(S))

4886

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

4887

ReuseShuffleIndicies);

4888

return;

4889

}

4890

// Record the reuse of the tree node. FIXME, currently this is only used to

4891

// properly draw the graph rather than for the actual vectorization.

4892

E->UserTreeIndices.push_back(UserTreeIdx);

4893

LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValuedo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Perfect diamond merge at " <<
*S.OpValue << ".\n"; } } while (false)

4894

<< ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Perfect diamond merge at " <<
*S.OpValue << ".\n"; } } while (false);

4895

return;

4896

}

4897

4898

// Check that none of the instructions in the bundle are already in the tree.

4899

for (Value *V : VL) {

4900

if (!IsScatterVectorizeUserTE && !isa<Instruction>(V))

4901

continue;

4902

if (getTreeEntry(V)) {

4903

LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *Vdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: The instruction (" << *
V << ") is already in tree.\n"; } } while (false)

4904

<< ") is already in tree.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: The instruction (" << *
V << ") is already in tree.\n"; } } while (false);

4905

if (TryToFindDuplicates(S))

4906

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

4907

ReuseShuffleIndicies);

4908

return;

4909

}

4910

}

4911

4912

// The reduction nodes (stored in UserIgnoreList) also should stay scalar.

4913

if (UserIgnoreList && !UserIgnoreList->empty()) {

4914

for (Value *V : VL) {

4915

if (UserIgnoreList && UserIgnoreList->contains(V)) {

4916

LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Gathering due to gathered scalar.\n"
; } } while (false);

4917

if (TryToFindDuplicates(S))

4918

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

4919

ReuseShuffleIndicies);

4920

return;

4921

}

4922

}

4923

}

4924

4925

// Special processing for sorted pointers for ScatterVectorize node with

4926

// constant indices only.

4927

if (AreAllSameInsts && !(S.getOpcode() && allSameBlock(VL)) &&

4928

UserTreeIdx.UserTE &&

4929

UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize) {

4930

assert(S.OpValue->getType()->isPointerTy() &&(static_cast <bool> (S.OpValue->getType()->isPointerTy
() && count_if(VL, [](Value *V) { return isa<GetElementPtrInst
>(V); }) >= 2 && "Expected pointers only.") ? void
(0) : __assert_fail ("S.OpValue->getType()->isPointerTy() && count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >= 2 && \"Expected pointers only.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4933, __extension__
__PRETTY_FUNCTION__))

4931

count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >=(static_cast <bool> (S.OpValue->getType()->isPointerTy
() && count_if(VL, [](Value *V) { return isa<GetElementPtrInst
>(V); }) >= 2 && "Expected pointers only.") ? void
(0) : __assert_fail ("S.OpValue->getType()->isPointerTy() && count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >= 2 && \"Expected pointers only.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4933, __extension__
__PRETTY_FUNCTION__))

4932

2 &&(static_cast <bool> (S.OpValue->getType()->isPointerTy
() && count_if(VL, [](Value *V) { return isa<GetElementPtrInst
>(V); }) >= 2 && "Expected pointers only.") ? void
(0) : __assert_fail ("S.OpValue->getType()->isPointerTy() && count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >= 2 && \"Expected pointers only.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4933, __extension__
__PRETTY_FUNCTION__))

4933

"Expected pointers only.")(static_cast <bool> (S.OpValue->getType()->isPointerTy
() && count_if(VL, [](Value *V) { return isa<GetElementPtrInst
>(V); }) >= 2 && "Expected pointers only.") ? void
(0) : __assert_fail ("S.OpValue->getType()->isPointerTy() && count_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); }) >= 2 && \"Expected pointers only.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4933, __extension__
__PRETTY_FUNCTION__));

4934

// Reset S to make it GetElementPtr kind of node.

4935

const auto *It = find_if(VL, [](Value *V) { return isa<GetElementPtrInst>(V); });

4936

assert(It != VL.end() && "Expected at least one GEP.")(static_cast <bool> (It != VL.end() && "Expected at least one GEP."
) ? void (0) : __assert_fail ("It != VL.end() && \"Expected at least one GEP.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4936, __extension__
__PRETTY_FUNCTION__));

4937

S = getSameOpcode(*It);

4938

}

4939

4940

// Check that all of the users of the scalars that we want to vectorize are

4941

// schedulable.

4942

auto *VL0 = cast<Instruction>(S.OpValue);

4943

BB = VL0->getParent();

4944

4945

if (!DT->isReachableFromEntry(BB)) {

4946

// Don't go into unreachable blocks. They may contain instructions with

4947

// dependency cycles which confuse the final scheduling.

4948

LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: bundle in unreachable block.\n"
; } } while (false);

4949

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);

4950

return;

4951

}

4952

4953

// Don't go into catchswitch blocks, which can happen with PHIs.

4954

// Such blocks can only have PHIs and the catchswitch. There is no

4955

// place to insert a shuffle if we need to, so just avoid that issue.

4956

if (isa<CatchSwitchInst>(BB->getTerminator())) {

4957

LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: bundle in catchswitch block.\n"
; } } while (false);

4958

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);

4959

return;

4960

}

4961

4962

// Check that every instruction appears once in this bundle.

4963

if (!TryToFindDuplicates(S))

4964

return;

4965

4966

auto &BSRef = BlocksSchedules[BB];

4967

if (!BSRef)

4968

BSRef = std::make_unique<BlockScheduling>(BB);

4969

4970

BlockScheduling &BS = *BSRef;

4971

4972

Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);

4973

#ifdef EXPENSIVE_CHECKS

4974

// Make sure we didn't break any internal invariants

4975

BS.verify();

4976

#endif

4977

if (!Bundle) {

4978

LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: We are not able to schedule this bundle!\n"
; } } while (false);

4979

assert((!BS.getScheduleData(VL0) ||(static_cast <bool> ((!BS.getScheduleData(VL0) || !BS.getScheduleData
(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"
) ? void (0) : __assert_fail ("(!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && \"tryScheduleBundle should cancelScheduling on failure\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4981, __extension__
__PRETTY_FUNCTION__))

4980

!BS.getScheduleData(VL0)->isPartOfBundle()) &&(static_cast <bool> ((!BS.getScheduleData(VL0) || !BS.getScheduleData
(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"
) ? void (0) : __assert_fail ("(!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && \"tryScheduleBundle should cancelScheduling on failure\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4981, __extension__
__PRETTY_FUNCTION__))

4981

"tryScheduleBundle should cancelScheduling on failure")(static_cast <bool> ((!BS.getScheduleData(VL0) || !BS.getScheduleData
(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"
) ? void (0) : __assert_fail ("(!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && \"tryScheduleBundle should cancelScheduling on failure\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 4981, __extension__
__PRETTY_FUNCTION__));

4982

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

4983

ReuseShuffleIndicies);

4984

return;

4985

}

4986

LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: We are able to schedule this bundle.\n"
; } } while (false);

4987

4988

unsigned ShuffleOrOp = S.isAltShuffle() ?

4989

(unsigned) Instruction::ShuffleVector : S.getOpcode();

4990

switch (ShuffleOrOp) {

4991

case Instruction::PHI: {

4992

auto *PH = cast<PHINode>(VL0);

4993

4994

// Check for terminator values (e.g. invoke).

4995

for (Value *V : VL)

4996

for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {

4997

Instruction *Term = dyn_cast<Instruction>(Incoming);

4998

if (Term && Term->isTerminator()) {

4999

LLVM_DEBUG(dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"
; } } while (false)

5000

<< "SLP: Need to swizzle PHINodes (terminator use).\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"
; } } while (false);

5001

BS.cancelScheduling(VL, VL0);

5002

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

5003

ReuseShuffleIndicies);

5004

return;

5005

}

5006

}

5007

5008

TreeEntry *TE =

5009

newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);

5010

LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: added a vector of PHINodes.\n"
; } } while (false);

5011

5012

// Keeps the reordered operands to avoid code duplication.

5013

SmallVector<ValueList, 2> OperandsVec;

5014

for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {

5015

if (!DT->isReachableFromEntry(PH->getIncomingBlock(I))) {

5016

ValueList Operands(VL.size(), PoisonValue::get(PH->getType()));

5017

TE->setOperand(I, Operands);

5018

OperandsVec.push_back(Operands);

5019

continue;

5020

}

5021

ValueList Operands;

5022

// Prepare the operand vector.

5023

for (Value *V : VL)

5024

Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(

5025

PH->getIncomingBlock(I)));

5026

TE->setOperand(I, Operands);

5027

OperandsVec.push_back(Operands);

5028

}

5029

for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)

5030

buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});

5031

return;

5032

}

5033

case Instruction::ExtractValue:

5034

case Instruction::ExtractElement: {

5035

OrdersType CurrentOrder;

5036

bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);

5037

if (Reuse) {

5038

LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Reusing or shuffling extract sequence.\n"
; } } while (false);

5039

newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,

5040

ReuseShuffleIndicies);

5041

// This is a special case, as it does not gather, but at the same time

5042

// we are not extending buildTree_rec() towards the operands.

5043

ValueList Op0;

5044

Op0.assign(VL.size(), VL0->getOperand(0));

5045

VectorizableTree.back()->setOperand(0, Op0);

5046

return;

5047

}

5048

if (!CurrentOrder.empty()) {

5049

LLVM_DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { { dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
"with order"; for (unsigned Idx : CurrentOrder) dbgs() <<
" " << Idx; dbgs() << "\n"; }; } } while (false)

5050

dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { { dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
"with order"; for (unsigned Idx : CurrentOrder) dbgs() <<
" " << Idx; dbgs() << "\n"; }; } } while (false)

5051

"with order";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { { dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
"with order"; for (unsigned Idx : CurrentOrder) dbgs() <<
" " << Idx; dbgs() << "\n"; }; } } while (false)

5052

for (unsigned Idx : CurrentOrder)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { { dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
"with order"; for (unsigned Idx : CurrentOrder) dbgs() <<
" " << Idx; dbgs() << "\n"; }; } } while (false)

5053

dbgs() << " " << Idx;do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { { dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
"with order"; for (unsigned Idx : CurrentOrder) dbgs() <<
" " << Idx; dbgs() << "\n"; }; } } while (false)

5054

dbgs() << "\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { { dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
"with order"; for (unsigned Idx : CurrentOrder) dbgs() <<
" " << Idx; dbgs() << "\n"; }; } } while (false)

5055

})do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { { dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
"with order"; for (unsigned Idx : CurrentOrder) dbgs() <<
" " << Idx; dbgs() << "\n"; }; } } while (false);

5056

fixupOrderingIndices(CurrentOrder);

5057

// Insert new order with initial value 0, if it does not exist,

5058

// otherwise return the iterator to the existing one.

5059

newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,

5060

ReuseShuffleIndicies, CurrentOrder);

5061

// This is a special case, as it does not gather, but at the same time

5062

// we are not extending buildTree_rec() towards the operands.

5063

ValueList Op0;

5064

Op0.assign(VL.size(), VL0->getOperand(0));

5065

VectorizableTree.back()->setOperand(0, Op0);

5066

return;

5067

}

5068

LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Gather extract sequence.\n";
} } while (false);

5069

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

5070

ReuseShuffleIndicies);

5071

BS.cancelScheduling(VL, VL0);

5072

return;

5073

}

5074

case Instruction::InsertElement: {

5075

assert(ReuseShuffleIndicies.empty() && "All inserts should be unique")(static_cast <bool> (ReuseShuffleIndicies.empty() &&
"All inserts should be unique") ? void (0) : __assert_fail (
"ReuseShuffleIndicies.empty() && \"All inserts should be unique\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 5075, __extension__
__PRETTY_FUNCTION__));

5076

5077

// Check that we have a buildvector and not a shuffle of 2 or more

5078

// different vectors.

5079

ValueSet SourceVectors;

5080

for (Value *V : VL) {

5081

SourceVectors.insert(cast<Instruction>(V)->getOperand(0));

5082

assert(getInsertIndex(V) != None && "Non-constant or undef index?")(static_cast <bool> (getInsertIndex(V) != None &&
"Non-constant or undef index?") ? void (0) : __assert_fail (
"getInsertIndex(V) != None && \"Non-constant or undef index?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 5082, __extension__
__PRETTY_FUNCTION__));

5083

}

5084

5085

if (count_if(VL, [&SourceVectors](Value *V) {

5086

return !SourceVectors.contains(V);

5087

}) >= 2) {

5088

// Found 2nd source vector - cancel.

5089

LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Gather of insertelement vectors with "
"different source vectors.\n"; } } while (false)

5090

"different source vectors.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Gather of insertelement vectors with "
"different source vectors.\n"; } } while (false);

5091

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);

5092

BS.cancelScheduling(VL, VL0);

5093

return;

5094

}

5095

5096

auto OrdCompare = [](const std::pair<int, int> &P1,

5097

const std::pair<int, int> &P2) {

5098

return P1.first > P2.first;

5099

};

5100

PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,

5101

decltype(OrdCompare)>

5102

Indices(OrdCompare);

5103

for (int I = 0, E = VL.size(); I < E; ++I) {

5104

unsigned Idx = *getInsertIndex(VL[I]);

5105

Indices.emplace(Idx, I);

5106

}

5107

OrdersType CurrentOrder(VL.size(), VL.size());

5108

bool IsIdentity = true;

5109

for (int I = 0, E = VL.size(); I < E; ++I) {

5110

CurrentOrder[Indices.top().second] = I;

5111

IsIdentity &= Indices.top().second == I;

5112

Indices.pop();

5113

}

5114

if (IsIdentity)

5115

CurrentOrder.clear();

5116

TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,

5117

None, CurrentOrder);

5118

LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: added inserts bundle.\n"; } }
while (false);

5119

5120

constexpr int NumOps = 2;

5121

ValueList VectorOperands[NumOps];

5122

for (int I = 0; I < NumOps; ++I) {

5123

for (Value *V : VL)

5124

VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I));

5125

5126

TE->setOperand(I, VectorOperands[I]);

5127

}

5128

buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1});

5129

return;

5130

}

5131

case Instruction::Load: {

5132

// Check that a vectorized load would load the same memory as a scalar

5133

// load. For example, we don't want to vectorize loads that are smaller

5134

// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM

5135

// treats loading/storing it as an i8 struct. If we vectorize loads/stores

5136

// from such a struct, we read/write packed bits disagreeing with the

5137

// unvectorized version.

5138

SmallVector<Value *> PointerOps;

5139

OrdersType CurrentOrder;

5140

TreeEntry *TE = nullptr;

5141

switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, CurrentOrder,

5142

PointerOps)) {

5143

case LoadsState::Vectorize:

5144

if (CurrentOrder.empty()) {

5145

// Original loads are consecutive and do not require reordering.

5146

TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,

5147

ReuseShuffleIndicies);

5148

LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: added a vector of loads.\n";
} } while (false);

5149

} else {

5150

fixupOrderingIndices(CurrentOrder);

5151

// Need to reorder.

5152

TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,

5153

ReuseShuffleIndicies, CurrentOrder);

5154

LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: added a vector of jumbled loads.\n"
; } } while (false);

5155

}

5156

TE->setOperandsInOrder();

5157

break;

5158

case LoadsState::ScatterVectorize:

5159

// Vectorizing non-consecutive loads with `llvm.masked.gather`.

5160

TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,

5161

UserTreeIdx, ReuseShuffleIndicies);

5162

TE->setOperandsInOrder();

5163

buildTree_rec(PointerOps, Depth + 1, {TE, 0});

5164

LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: added a vector of non-consecutive loads.\n"
; } } while (false);

5165

break;

5166

case LoadsState::Gather:

5167

BS.cancelScheduling(VL, VL0);

5168

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

5169

ReuseShuffleIndicies);

5170

#ifndef NDEBUG

5171

Type *ScalarTy = VL0->getType();

5172

if (DL->getTypeSizeInBits(ScalarTy) !=

5173

DL->getTypeAllocSizeInBits(ScalarTy))

5174

LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Gathering loads of non-packed type.\n"
; } } while (false);

5175

else if (any_of(VL, [](Value *V) {

5176

return !cast<LoadInst>(V)->isSimple();

5177

}))

5178

LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Gathering non-simple loads.\n"
; } } while (false);

5179

else

5180

LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Gathering non-consecutive loads.\n"
; } } while (false);

5181

#endif // NDEBUG

5182

break;

5183

}

5184

return;

5185

}

5186

case Instruction::ZExt:

5187

case Instruction::SExt:

5188

case Instruction::FPToUI:

5189

case Instruction::FPToSI:

5190

case Instruction::FPExt:

5191

case Instruction::PtrToInt:

5192

case Instruction::IntToPtr:

5193

case Instruction::SIToFP:

5194

case Instruction::UIToFP:

5195

case Instruction::Trunc:

5196

case Instruction::FPTrunc:

5197

case Instruction::BitCast: {

5198

Type *SrcTy = VL0->getOperand(0)->getType();

5199

for (Value *V : VL) {

5200

Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();

5201

if (Ty != SrcTy || !isValidElementType(Ty)) {

5202

BS.cancelScheduling(VL, VL0);

5203

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

5204

ReuseShuffleIndicies);

5205

LLVM_DEBUG(dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Gathering casts with different src types.\n"
; } } while (false)

5206

<< "SLP: Gathering casts with different src types.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Gathering casts with different src types.\n"
; } } while (false);

5207

return;

5208

}

5209

}

5210

TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,

5211

ReuseShuffleIndicies);

5212

LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: added a vector of casts.\n";
} } while (false);

5213

5214

TE->setOperandsInOrder();

5215

for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {

5216

ValueList Operands;

5217

// Prepare the operand vector.

5218

for (Value *V : VL)

5219

Operands.push_back(cast<Instruction>(V)->getOperand(i));

5220

5221

buildTree_rec(Operands, Depth + 1, {TE, i});

5222

}

5223

return;

5224

}

5225

case Instruction::ICmp:

5226

case Instruction::FCmp: {

5227

// Check that all of the compares have the same predicate.

5228

CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();

5229

CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);

5230

Type *ComparedTy = VL0->getOperand(0)->getType();

5231

for (Value *V : VL) {

5232

CmpInst *Cmp = cast<CmpInst>(V);

5233

if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||

5234

Cmp->getOperand(0)->getType() != ComparedTy) {

5235

BS.cancelScheduling(VL, VL0);

5236

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

5237

ReuseShuffleIndicies);

5238

LLVM_DEBUG(dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Gathering cmp with different predicate.\n"
; } } while (false)

5239

<< "SLP: Gathering cmp with different predicate.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Gathering cmp with different predicate.\n"
; } } while (false);

5240

return;

5241

}

5242

}

5243

5244

TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,

5245

ReuseShuffleIndicies);

5246

LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: added a vector of compares.\n"
; } } while (false);

5247

5248

ValueList Left, Right;

5249

if (cast<CmpInst>(VL0)->isCommutative()) {

5250

// Commutative predicate - collect + sort operands of the instructions

5251

// so that each side is more likely to have the same opcode.

5252

assert(P0 == SwapP0 && "Commutative Predicate mismatch")(static_cast <bool> (P0 == SwapP0 && "Commutative Predicate mismatch"
) ? void (0) : __assert_fail ("P0 == SwapP0 && \"Commutative Predicate mismatch\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 5252, __extension__
__PRETTY_FUNCTION__));

5253

reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);

5254

} else {

5255

// Collect operands - commute if it uses the swapped predicate.

5256

for (Value *V : VL) {

5257

auto *Cmp = cast<CmpInst>(V);

5258

Value *LHS = Cmp->getOperand(0);

5259

Value *RHS = Cmp->getOperand(1);

5260

if (Cmp->getPredicate() != P0)

5261

std::swap(LHS, RHS);

5262

Left.push_back(LHS);

5263

Right.push_back(RHS);

5264

}

5265

}

5266

TE->setOperand(0, Left);

5267

TE->setOperand(1, Right);

5268

buildTree_rec(Left, Depth + 1, {TE, 0});

5269

buildTree_rec(Right, Depth + 1, {TE, 1});

5270

return;

5271

}

5272

case Instruction::Select:

5273

case Instruction::FNeg:

5274

case Instruction::Add:

5275

case Instruction::FAdd:

5276

case Instruction::Sub:

5277

case Instruction::FSub:

5278

case Instruction::Mul:

5279

case Instruction::FMul:

5280

case Instruction::UDiv:

5281

case Instruction::SDiv:

5282

case Instruction::FDiv:

5283

case Instruction::URem:

5284

case Instruction::SRem:

5285

case Instruction::FRem:

5286

case Instruction::Shl:

5287

case Instruction::LShr:

5288

case Instruction::AShr:

5289

case Instruction::And:

5290

case Instruction::Or:

5291

case Instruction::Xor: {

5292

TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,

5293

ReuseShuffleIndicies);

5294

LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: added a vector of un/bin op.\n"
; } } while (false);

5295

5296

// Sort operands of the instructions so that each side is more likely to

5297

// have the same opcode.

5298

if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {

5299

ValueList Left, Right;

5300

reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);

5301

TE->setOperand(0, Left);

5302

TE->setOperand(1, Right);

5303

buildTree_rec(Left, Depth + 1, {TE, 0});

5304

buildTree_rec(Right, Depth + 1, {TE, 1});

5305

return;

5306

}

5307

5308

TE->setOperandsInOrder();

5309

for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {

5310

ValueList Operands;

5311

// Prepare the operand vector.

5312

for (Value *V : VL)

5313

Operands.push_back(cast<Instruction>(V)->getOperand(i));

5314

5315

buildTree_rec(Operands, Depth + 1, {TE, i});

5316

}

5317

return;

5318

}

5319

case Instruction::GetElementPtr: {

5320

// We don't combine GEPs with complicated (nested) indexing.

5321

for (Value *V : VL) {

5322

auto *I = dyn_cast<GetElementPtrInst>(V);

5323

if (!I)

5324

continue;

5325

if (I->getNumOperands() != 2) {

5326

LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"
; } } while (false);

5327

BS.cancelScheduling(VL, VL0);

5328

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

5329

ReuseShuffleIndicies);

5330

return;

5331

}

5332

}

5333

5334

// We can't combine several GEPs into one vector if they operate on

5335

// different types.

5336

Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();

5337

for (Value *V : VL) {

5338

auto *GEP = dyn_cast<GEPOperator>(V);

5339

if (!GEP)

5340

continue;

5341

Type *CurTy = GEP->getSourceElementType();

5342

if (Ty0 != CurTy) {

5343

LLVM_DEBUG(dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: not-vectorizable GEP (different types).\n"
; } } while (false)

5344

<< "SLP: not-vectorizable GEP (different types).\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: not-vectorizable GEP (different types).\n"
; } } while (false);

5345

BS.cancelScheduling(VL, VL0);

5346

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

5347

ReuseShuffleIndicies);

5348

return;

5349

}

5350

}

5351

5352

// We don't combine GEPs with non-constant indexes.

5353

Type *Ty1 = VL0->getOperand(1)->getType();

5354

for (Value *V : VL) {

5355

auto *I = dyn_cast<GetElementPtrInst>(V);

5356

if (!I)

5357

continue;

5358

auto *Op = I->getOperand(1);

5359

if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||

5360

(Op->getType() != Ty1 &&

5361

((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||

5362

Op->getType()->getScalarSizeInBits() >

5363

DL->getIndexSizeInBits(

5364

V->getType()->getPointerAddressSpace())))) {

5365

LLVM_DEBUG(dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"
; } } while (false)

5366

<< "SLP: not-vectorizable GEP (non-constant indexes).\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"
; } } while (false);

5367

BS.cancelScheduling(VL, VL0);

5368

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

5369

ReuseShuffleIndicies);

5370

return;

5371

}

5372

}

5373

5374

TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,

5375

ReuseShuffleIndicies);

5376

LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: added a vector of GEPs.\n"; }
} while (false);

5377

SmallVector<ValueList, 2> Operands(2);

5378

// Prepare the operand vector for pointer operands.

5379

for (Value *V : VL) {

5380

auto *GEP = dyn_cast<GetElementPtrInst>(V);

5381

if (!GEP) {

5382

Operands.front().push_back(V);

5383

continue;

5384

}

5385

Operands.front().push_back(GEP->getPointerOperand());

5386

}

5387

TE->setOperand(0, Operands.front());

5388

// Need to cast all indices to the same type before vectorization to

5389

// avoid crash.

5390

// Required to be able to find correct matches between different gather

5391

// nodes and reuse the vectorized values rather than trying to gather them

5392

// again.

5393

int IndexIdx = 1;

5394

Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();

5395

Type *Ty = all_of(VL,

5396

[VL0Ty, IndexIdx](Value *V) {

5397

auto *GEP = dyn_cast<GetElementPtrInst>(V);

5398

if (!GEP)

5399

return true;

5400

return VL0Ty == GEP->getOperand(IndexIdx)->getType();

5401

})

5402

? VL0Ty

5403

: DL->getIndexType(cast<GetElementPtrInst>(VL0)

5404

->getPointerOperandType()

5405

->getScalarType());

5406

// Prepare the operand vector.

5407

for (Value *V : VL) {

5408

auto *I = dyn_cast<GetElementPtrInst>(V);

5409

if (!I) {

5410

Operands.back().push_back(

5411

ConstantInt::get(Ty, 0, /*isSigned=*/false));

5412

continue;

5413

}

5414

auto *Op = I->getOperand(IndexIdx);

5415

auto *CI = dyn_cast<ConstantInt>(Op);

5416

if (!CI)

5417

Operands.back().push_back(Op);

5418

else

5419

Operands.back().push_back(ConstantExpr::getIntegerCast(

5420

CI, Ty, CI->getValue().isSignBitSet()));

5421

}

5422

TE->setOperand(IndexIdx, Operands.back());

5423

5424

for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)

5425

buildTree_rec(Operands[I], Depth + 1, {TE, I});

5426

return;

5427

}

5428

case Instruction::Store: {

5429

// Check if the stores are consecutive or if we need to swizzle them.

5430

llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();

5431

// Avoid types that are padded when being allocated as scalars, while

5432

// being packed together in a vector (such as i1).

5433

if (DL->getTypeSizeInBits(ScalarTy) !=

5434

DL->getTypeAllocSizeInBits(ScalarTy)) {

5435

BS.cancelScheduling(VL, VL0);

5436

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

5437

ReuseShuffleIndicies);

5438

LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Gathering stores of non-packed type.\n"
; } } while (false);

5439

return;

5440

}

5441

// Make sure all stores in the bundle are simple - we can't vectorize

5442

// atomic or volatile stores.

5443

SmallVector<Value *, 4> PointerOps(VL.size());

5444

ValueList Operands(VL.size());

5445

auto POIter = PointerOps.begin();

5446

auto OIter = Operands.begin();

5447

for (Value *V : VL) {

5448

auto *SI = cast<StoreInst>(V);

5449

if (!SI->isSimple()) {

5450

BS.cancelScheduling(VL, VL0);

5451

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

5452

ReuseShuffleIndicies);

5453

LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Gathering non-simple stores.\n"
; } } while (false);

5454

return;

5455

}

5456

*POIter = SI->getPointerOperand();

5457

*OIter = SI->getValueOperand();

5458

++POIter;

5459

++OIter;

5460

}

5461

5462

OrdersType CurrentOrder;

5463

// Check the order of pointer operands.

5464

if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {

5465

Value *Ptr0;

5466

Value *PtrN;

5467

if (CurrentOrder.empty()) {

5468

Ptr0 = PointerOps.front();

5469

PtrN = PointerOps.back();

5470

} else {

5471

Ptr0 = PointerOps[CurrentOrder.front()];

5472

PtrN = PointerOps[CurrentOrder.back()];

5473

}

5474

Optional<int> Dist =

5475

getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);

5476

// Check that the sorted pointer operands are consecutive.

5477

if (static_cast<unsigned>(*Dist) == VL.size() - 1) {

5478

if (CurrentOrder.empty()) {

5479

// Original stores are consecutive and do not require reordering.

5480

TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,

5481

UserTreeIdx, ReuseShuffleIndicies);

5482

TE->setOperandsInOrder();

5483

buildTree_rec(Operands, Depth + 1, {TE, 0});

5484

LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: added a vector of stores.\n"
; } } while (false);

5485

} else {

5486

fixupOrderingIndices(CurrentOrder);

5487

TreeEntry *TE =

5488

newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,

5489

ReuseShuffleIndicies, CurrentOrder);

5490

TE->setOperandsInOrder();

5491

buildTree_rec(Operands, Depth + 1, {TE, 0});

5492

LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: added a vector of jumbled stores.\n"
; } } while (false);

5493

}

5494

return;

5495

}

5496

}

5497

5498

BS.cancelScheduling(VL, VL0);

5499

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

5500

ReuseShuffleIndicies);

5501

LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Non-consecutive store.\n"; }
} while (false);

5502

return;

5503

}

5504

case Instruction::Call: {

5505

// Check if the calls are all to the same vectorizable intrinsic or

5506

// library function.

5507

CallInst *CI = cast<CallInst>(VL0);

5508

Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

5509

5510

VFShape Shape = VFShape::get(

5511

*CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())),

5512

false /*HasGlobalPred*/);

5513

Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);

5514

5515

if (!VecFunc && !isTriviallyVectorizable(ID)) {

5516

BS.cancelScheduling(VL, VL0);

5517

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

5518

ReuseShuffleIndicies);

5519

LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Non-vectorizable call.\n"; }
} while (false);

5520

return;

5521

}

5522

Function *F = CI->getCalledFunction();

5523

unsigned NumArgs = CI->arg_size();

5524

SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);

5525

for (unsigned j = 0; j != NumArgs; ++j)

5526

if (isVectorIntrinsicWithScalarOpAtArg(ID, j))

5527

ScalarArgs[j] = CI->getArgOperand(j);

5528

for (Value *V : VL) {

5529

CallInst *CI2 = dyn_cast<CallInst>(V);

5530

if (!CI2 || CI2->getCalledFunction() != F ||

5531

getVectorIntrinsicIDForCall(CI2, TLI) != ID ||

5532

(VecFunc &&

5533

VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||

5534

!CI->hasIdenticalOperandBundleSchema(*CI2)) {

5535

BS.cancelScheduling(VL, VL0);

5536

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

5537

ReuseShuffleIndicies);

5538

LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *Vdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: mismatched calls:" << *
CI << "!=" << *V << "\n"; } } while (false)

5539

<< "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: mismatched calls:" << *
CI << "!=" << *V << "\n"; } } while (false);

5540

return;

5541

}

5542

// Some intrinsics have scalar arguments and should be same in order for

5543

// them to be vectorized.

5544

for (unsigned j = 0; j != NumArgs; ++j) {

5545

if (isVectorIntrinsicWithScalarOpAtArg(ID, j)) {

5546

Value *A1J = CI2->getArgOperand(j);

5547

if (ScalarArgs[j] != A1J) {

5548

BS.cancelScheduling(VL, VL0);

5549

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

5550

ReuseShuffleIndicies);

5551

LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CIdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: mismatched arguments in call:"
<< *CI << " argument " << ScalarArgs[j] <<
"!=" << A1J << "\n"; } } while (false)

5552

<< " argument " << ScalarArgs[j] << "!=" << A1Jdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: mismatched arguments in call:"
<< *CI << " argument " << ScalarArgs[j] <<
"!=" << A1J << "\n"; } } while (false)

5553

<< "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: mismatched arguments in call:"
<< *CI << " argument " << ScalarArgs[j] <<
"!=" << A1J << "\n"; } } while (false);

5554

return;

5555

}

5556

}

5557

}

5558

// Verify that the bundle operands are identical between the two calls.

5559

if (CI->hasOperandBundles() &&

5560

!std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),

5561

CI->op_begin() + CI->getBundleOperandsEndIndex(),

5562

CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {

5563

BS.cancelScheduling(VL, VL0);

5564

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

5565

ReuseShuffleIndicies);

5566

LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: mismatched bundle operands in calls:"
<< *CI << "!=" << *V << '\n'; } } while
(false)

5567

<< *CI << "!=" << *V << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: mismatched bundle operands in calls:"
<< *CI << "!=" << *V << '\n'; } } while
(false);

5568

return;

5569

}

5570

}

5571

5572

TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,

5573

ReuseShuffleIndicies);

5574

TE->setOperandsInOrder();

5575

for (unsigned i = 0, e = CI->arg_size(); i != e; ++i) {

5576

// For scalar operands no need to create an entry since no need to

5577

// vectorize it.

5578

if (isVectorIntrinsicWithScalarOpAtArg(ID, i))

5579

continue;

5580

ValueList Operands;

5581

// Prepare the operand vector.

5582

for (Value *V : VL) {

5583

auto *CI2 = cast<CallInst>(V);

5584

Operands.push_back(CI2->getArgOperand(i));

5585

}

5586

buildTree_rec(Operands, Depth + 1, {TE, i});

5587

}

5588

return;

5589

}

5590

case Instruction::ShuffleVector: {

5591

// If this is not an alternate sequence of opcode like add-sub

5592

// then do not vectorize this instruction.

5593

if (!S.isAltShuffle()) {

5594

BS.cancelScheduling(VL, VL0);

5595

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

5596

ReuseShuffleIndicies);

5597

LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: ShuffleVector are not vectorized.\n"
; } } while (false);

5598

return;

5599

}

5600

TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,

5601

ReuseShuffleIndicies);

5602

LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: added a ShuffleVector op.\n"
; } } while (false);

5603

5604

// Reorder operands if reordering would enable vectorization.

5605

auto *CI = dyn_cast<CmpInst>(VL0);

5606

if (isa<BinaryOperator>(VL0) || CI) {

5607

ValueList Left, Right;

5608

if (!CI || all_of(VL, [](Value *V) {

5609

return cast<CmpInst>(V)->isCommutative();

5610

})) {

5611

reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);

5612

} else {

5613

auto *MainCI = cast<CmpInst>(S.MainOp);

5614

auto *AltCI = cast<CmpInst>(S.AltOp);

5615

CmpInst::Predicate MainP = MainCI->getPredicate();

5616

CmpInst::Predicate AltP = AltCI->getPredicate();

5617

assert(MainP != AltP &&(static_cast <bool> (MainP != AltP && "Expected different main/alternate predicates."
) ? void (0) : __assert_fail ("MainP != AltP && \"Expected different main/alternate predicates.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 5618, __extension__
__PRETTY_FUNCTION__))

5618

"Expected different main/alternate predicates.")(static_cast <bool> (MainP != AltP && "Expected different main/alternate predicates."
) ? void (0) : __assert_fail ("MainP != AltP && \"Expected different main/alternate predicates.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 5618, __extension__
__PRETTY_FUNCTION__));

5619

// Collect operands - commute if it uses the swapped predicate or

5620

// alternate operation.

5621

for (Value *V : VL) {

5622

auto *Cmp = cast<CmpInst>(V);

5623

Value *LHS = Cmp->getOperand(0);

5624

Value *RHS = Cmp->getOperand(1);

5625

5626

if (isAlternateInstruction(Cmp, MainCI, AltCI)) {

5627

if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))

5628

std::swap(LHS, RHS);

5629

} else {

5630

if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))

5631

std::swap(LHS, RHS);

5632

}

5633

Left.push_back(LHS);

5634

Right.push_back(RHS);

5635

}

5636

}

5637

TE->setOperand(0, Left);

5638

TE->setOperand(1, Right);

5639

buildTree_rec(Left, Depth + 1, {TE, 0});

5640

buildTree_rec(Right, Depth + 1, {TE, 1});

5641

return;

5642

}

5643

5644

TE->setOperandsInOrder();

5645

for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {

5646

ValueList Operands;

5647

// Prepare the operand vector.

5648

for (Value *V : VL)

5649

Operands.push_back(cast<Instruction>(V)->getOperand(i));

5650

5651

buildTree_rec(Operands, Depth + 1, {TE, i});

5652

}

5653

return;

5654

}

5655

default:

5656

BS.cancelScheduling(VL, VL0);

5657

newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,

5658

ReuseShuffleIndicies);

5659

LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Gathering unknown instruction.\n"
; } } while (false);

5660

return;

5661

}

5662

}

5663

5664

/// Checks whether the (possibly nested) aggregate or vector type \p T can be
/// viewed as one flat fixed-width vector of a single scalar element type.
///
/// Homogeneous structs, arrays and fixed-width vectors are peeled one level at
/// a time and their element counts are multiplied together.
///
/// \param T  the type to flatten.
/// \param DL data layout used to compare the store size of \p T with the
///           store size of the flattened vector type.
/// \returns the number of scalar elements of the flattened vector, or 0 if
///          \p T is empty, is a non-homogeneous struct, bottoms out in an
///          element type rejected by isValidElementType, or if the flattened
///          vector's store size is outside [MinVecRegSize, MaxVecRegSize] or
///          differs from the store size of \p T.
unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
  unsigned N = 1;
  Type *EltTy = T;

  // Only fixed-width vectors are peeled here. A scalable vector leaves the loop
  // untouched instead of reaching cast<FixedVectorType> below, and is then left
  // to the isValidElementType check (expected to reject it - confirm).
  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    // An empty struct has no first element to descend into, and a zero-length
    // array would produce a zero-element vector type; neither can be mapped.
    if (EltTy->isEmptyTy())
      return 0;
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // Check that struct is homogeneous.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }

  if (!isValidElementType(EltTy))
    return 0;
  // The flattened vector must fit the register size limits and must occupy
  // exactly as many bits in memory as the original type.
  uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL.getTypeStoreSizeInBits(T))
    return 0;
  return N;
}

5693

5694

bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,

5695

SmallVectorImpl<unsigned> &CurrentOrder) const {

5696

const auto *It = find_if(VL, [](Value *V) {

5697

return isa<ExtractElementInst, ExtractValueInst>(V);

5698

});

5699

assert(It != VL.end() && "Expected at least one extract instruction.")(static_cast <bool> (It != VL.end() && "Expected at least one extract instruction."
) ? void (0) : __assert_fail ("It != VL.end() && \"Expected at least one extract instruction.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 5699, __extension__
__PRETTY_FUNCTION__));

5700

auto *E0 = cast<Instruction>(*It);

5701

assert(all_of(VL,(static_cast <bool> (all_of(VL, [](Value *V) { return isa
<UndefValue, ExtractElementInst, ExtractValueInst>( V);
}) && "Invalid opcode") ? void (0) : __assert_fail (
"all_of(VL, [](Value *V) { return isa<UndefValue, ExtractElementInst, ExtractValueInst>( V); }) && \"Invalid opcode\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 5706, __extension__
__PRETTY_FUNCTION__))

5702

[](Value *V) {(static_cast <bool> (all_of(VL, [](Value *V) { return isa
<UndefValue, ExtractElementInst, ExtractValueInst>( V);
}) && "Invalid opcode") ? void (0) : __assert_fail (
"all_of(VL, [](Value *V) { return isa<UndefValue, ExtractElementInst, ExtractValueInst>( V); }) && \"Invalid opcode\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 5706, __extension__
__PRETTY_FUNCTION__))

5703

return isa<UndefValue, ExtractElementInst, ExtractValueInst>((static_cast <bool> (all_of(VL, [](Value *V) { return isa
<UndefValue, ExtractElementInst, ExtractValueInst>( V);
}) && "Invalid opcode") ? void (0) : __assert_fail (
"all_of(VL, [](Value *V) { return isa<UndefValue, ExtractElementInst, ExtractValueInst>( V); }) && \"Invalid opcode\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 5706, __extension__
__PRETTY_FUNCTION__))

5704

V);(static_cast <bool> (all_of(VL, [](Value *V) { return isa
<UndefValue, ExtractElementInst, ExtractValueInst>( V);
}) && "Invalid opcode") ? void (0) : __assert_fail (
"all_of(VL, [](Value *V) { return isa<UndefValue, ExtractElementInst, ExtractValueInst>( V); }) && \"Invalid opcode\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 5706, __extension__
__PRETTY_FUNCTION__))

5705

}) &&(static_cast <bool> (all_of(VL, [](Value *V) { return isa
<UndefValue, ExtractElementInst, ExtractValueInst>( V);
}) && "Invalid opcode") ? void (0) : __assert_fail (
"all_of(VL, [](Value *V) { return isa<UndefValue, ExtractElementInst, ExtractValueInst>( V); }) && \"Invalid opcode\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 5706, __extension__
__PRETTY_FUNCTION__))

5706

"Invalid opcode")(static_cast <bool> (all_of(VL, [](Value *V) { return isa
<UndefValue, ExtractElementInst, ExtractValueInst>( V);
}) && "Invalid opcode") ? void (0) : __assert_fail (
"all_of(VL, [](Value *V) { return isa<UndefValue, ExtractElementInst, ExtractValueInst>( V); }) && \"Invalid opcode\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 5706, __extension__
__PRETTY_FUNCTION__));

5707

// Check if all of the extracts come from the same vector and from the

5708

// correct offset.

5709

Value *Vec = E0->getOperand(0);

5710

5711

CurrentOrder.clear();

5712

5713

// We have to extract from a vector/aggregate with the same number of elements.

5714

unsigned NElts;

5715

if (E0->getOpcode() == Instruction::ExtractValue) {

5716

const DataLayout &DL = E0->getModule()->getDataLayout();

5717

NElts = canMapToVector(Vec->getType(), DL);

5718

if (!NElts)

5719

return false;

5720

// Check if load can be rewritten as load of vector.

5721

LoadInst *LI = dyn_cast<LoadInst>(Vec);

5722

if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))

5723

return false;

5724

} else {

5725

NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();

5726

}

5727

5728

if (NElts != VL.size())

5729

return false;

5730

5731

// Check that all of the indices extract from the correct offset.

5732

bool ShouldKeepOrder = true;

5733

unsigned E = VL.size();

5734

// Assign to all items the initial value E + 1 so we can check if the extract

5735

// instruction index was used already.

5736

// Also, later we can check that all the indices are used and we have a

5737

// consecutive access in the extract instructions, by checking that no

5738

// element of CurrentOrder still has value E + 1.

5739

CurrentOrder.assign(E, E);

5740

unsigned I = 0;

5741

for (; I < E; ++I) {

5742

auto *Inst = dyn_cast<Instruction>(VL[I]);

5743

if (!Inst)

5744

continue;

5745

if (Inst->getOperand(0) != Vec)

5746

break;

5747

if (auto *EE = dyn_cast<ExtractElementInst>(Inst))

5748

if (isa<UndefValue>(EE->getIndexOperand()))

5749

continue;

5750

Optional<unsigned> Idx = getExtractIndex(Inst);

5751

if (!Idx)

5752

break;

5753

const unsigned ExtIdx = *Idx;

5754

if (ExtIdx != I) {

5755

if (ExtIdx >= E || CurrentOrder[ExtIdx] != E)

5756

break;

5757

ShouldKeepOrder = false;

5758

CurrentOrder[ExtIdx] = I;

5759

} else {

5760

if (CurrentOrder[I] != E)

5761

break;

5762

CurrentOrder[I] = I;

5763

}

5764

}

5765

if (I < E) {

5766

CurrentOrder.clear();

5767

return false;

5768

}

5769

if (ShouldKeepOrder)

5770

CurrentOrder.clear();

5771

5772

return ShouldKeepOrder;

5773

}

5774

5775

bool BoUpSLP::areAllUsersVectorized(Instruction *I,

5776

ArrayRef<Value *> VectorizedVals) const {

5777

return (I->hasOneUse() && is_contained(VectorizedVals, I)) ||

5778

all_of(I->users(), [this](User *U) {

5779

return ScalarToTreeEntry.count(U) > 0 ||

5780

isVectorLikeInstWithConstOps(U) ||

5781

(isa<ExtractElementInst>(U) && MustGather.contains(U));

5782

});

5783

}

5784

5785

static std::pair<InstructionCost, InstructionCost>

5786

getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,

5787

TargetTransformInfo *TTI, TargetLibraryInfo *TLI) {

5788

Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

5789

5790

// Calculate the cost of the scalar and vector calls.

5791

SmallVector<Type *, 4> VecTys;

5792

for (Use &Arg : CI->args())

5793

VecTys.push_back(

5794

FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));

5795

FastMathFlags FMF;

5796

if (auto *FPCI = dyn_cast<FPMathOperator>(CI))

5797

FMF = FPCI->getFastMathFlags();

5798

SmallVector<const Value *> Arguments(CI->args());

5799

IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF,

5800

dyn_cast<IntrinsicInst>(CI));

5801

auto IntrinsicCost =

5802

TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);

5803

5804

auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>(

5805

VecTy->getNumElements())),

5806

false /*HasGlobalPred*/);

5807

Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);

5808

auto LibCost = IntrinsicCost;

5809

if (!CI->isNoBuiltin() && VecFunc) {

5810

// Calculate the cost of the vector library call.

5811

// If the corresponding vector call is cheaper, return its cost.

5812

LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,

5813

TTI::TCK_RecipThroughput);

5814

}

5815

return {IntrinsicCost, LibCost};

5816

}

5817

5818

/// Compute the cost of creating a vector of type \p VecTy containing the

5819

/// extracted values from \p VL.

5820

static InstructionCost

5821

computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,

5822

TargetTransformInfo::ShuffleKind ShuffleKind,

5823

ArrayRef<int> Mask, TargetTransformInfo &TTI) {

5824

unsigned NumOfParts = TTI.getNumberOfParts(VecTy);

5825

5826

if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || !NumOfParts ||

5827

VecTy->getNumElements() < NumOfParts)

5828

return TTI.getShuffleCost(ShuffleKind, VecTy, Mask);

5829

5830

bool AllConsecutive = true;

5831

unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts;

5832

unsigned Idx = -1;

5833

InstructionCost Cost = 0;

5834

5835

// Process extracts in blocks of EltsPerVector to check if the source vector

5836

// operand can be re-used directly. If not, add the cost of creating a shuffle

5837

// to extract the values into a vector register.

5838

SmallVector<int> RegMask(EltsPerVector, UndefMaskElem);

5839

for (auto *V : VL) {

5840

++Idx;

5841

5842

// Reached the start of a new vector registers.

5843

if (Idx % EltsPerVector == 0) {

5844

RegMask.assign(EltsPerVector, UndefMaskElem);

5845

AllConsecutive = true;

5846

continue;

5847

}

5848

5849

// Need to exclude undefs from analysis.

5850

if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)

5851

continue;

5852

5853

// Check all extracts for a vector register on the target directly

5854

// extract values in order.

5855

unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));

5856

if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != UndefMaskElem) {

5857

unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));

5858

AllConsecutive &= PrevIdx + 1 == CurrentIdx &&

5859

CurrentIdx % EltsPerVector == Idx % EltsPerVector;

5860

RegMask[Idx % EltsPerVector] = CurrentIdx % EltsPerVector;

5861

}

5862

5863

if (AllConsecutive)

5864

continue;

5865

5866

// Skip all indices, except for the last index per vector block.

5867

if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size())

5868

continue;

5869

5870

// If we have a series of extracts which are not consecutive and hence

5871

// cannot re-use the source vector register directly, compute the shuffle

5872

// cost to extract the vector with EltsPerVector elements.

5873

Cost += TTI.getShuffleCost(

5874

TargetTransformInfo::SK_PermuteSingleSrc,

5875

FixedVectorType::get(VecTy->getElementType(), EltsPerVector), RegMask);

5876

}

5877

return Cost;

5878

}

5879

5880

/// Build shuffle mask for shuffle graph entries and lists of main and alternate

5881

/// operations operands.

5882

static void

5883

buildShuffleEntryMask(ArrayRef<Value *> VL, ArrayRef<unsigned> ReorderIndices,

5884

ArrayRef<int> ReusesIndices,

5885

const function_ref<bool(Instruction *)> IsAltOp,

5886

SmallVectorImpl<int> &Mask,

5887

SmallVectorImpl<Value *> *OpScalars = nullptr,

5888

SmallVectorImpl<Value *> *AltScalars = nullptr) {

5889

unsigned Sz = VL.size();

5890

Mask.assign(Sz, UndefMaskElem);

5891

SmallVector<int> OrderMask;

5892

if (!ReorderIndices.empty())

5893

inversePermutation(ReorderIndices, OrderMask);

5894

for (unsigned I = 0; I < Sz; ++I) {

5895

unsigned Idx = I;

5896

if (!ReorderIndices.empty())

5897

Idx = OrderMask[I];

5898

auto *OpInst = cast<Instruction>(VL[Idx]);

5899

if (IsAltOp(OpInst)) {

5900

Mask[I] = Sz + Idx;

5901

if (AltScalars)

5902

AltScalars->push_back(OpInst);

5903

} else {

5904

Mask[I] = Idx;

5905

if (OpScalars)

5906

OpScalars->push_back(OpInst);

5907

}

5908

}

5909

if (!ReusesIndices.empty()) {

5910

SmallVector<int> NewMask(ReusesIndices.size(), UndefMaskElem);

5911

transform(ReusesIndices, NewMask.begin(), [&Mask](int Idx) {

5912

return Idx != UndefMaskElem ? Mask[Idx] : UndefMaskElem;

5913

});

5914

Mask.swap(NewMask);

5915

}

5916

}

5917

5918

static bool isAlternateInstruction(const Instruction *I,

5919

const Instruction *MainOp,

5920

const Instruction *AltOp) {

5921

if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {

5922

auto *AltCI = cast<CmpInst>(AltOp);

5923

CmpInst::Predicate MainP = MainCI->getPredicate();

5924

CmpInst::Predicate AltP = AltCI->getPredicate();

5925

assert(MainP != AltP && "Expected different main/alternate predicates.")(static_cast <bool> (MainP != AltP && "Expected different main/alternate predicates."
) ? void (0) : __assert_fail ("MainP != AltP && \"Expected different main/alternate predicates.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 5925, __extension__
__PRETTY_FUNCTION__));

5926

auto *CI = cast<CmpInst>(I);

5927

if (isCmpSameOrSwapped(MainCI, CI))

5928

return false;

5929

if (isCmpSameOrSwapped(AltCI, CI))

5930

return true;

5931

CmpInst::Predicate P = CI->getPredicate();

5932

CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);

5933

5934

assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&(static_cast <bool> ((MainP == P || AltP == P || MainP ==
SwappedP || AltP == SwappedP) && "CmpInst expected to match either main or alternate predicate or "
"their swap.") ? void (0) : __assert_fail ("(MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) && \"CmpInst expected to match either main or alternate predicate or \" \"their swap.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 5936, __extension__
__PRETTY_FUNCTION__))

5935

"CmpInst expected to match either main or alternate predicate or "(static_cast <bool> ((MainP == P || AltP == P || MainP ==
SwappedP || AltP == SwappedP) && "CmpInst expected to match either main or alternate predicate or "
"their swap.") ? void (0) : __assert_fail ("(MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) && \"CmpInst expected to match either main or alternate predicate or \" \"their swap.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 5936, __extension__
__PRETTY_FUNCTION__))

5936

"their swap.")(static_cast <bool> ((MainP == P || AltP == P || MainP ==
SwappedP || AltP == SwappedP) && "CmpInst expected to match either main or alternate predicate or "
"their swap.") ? void (0) : __assert_fail ("(MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) && \"CmpInst expected to match either main or alternate predicate or \" \"their swap.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 5936, __extension__
__PRETTY_FUNCTION__));

5937

(void)AltP;

5938

return MainP != P && MainP != SwappedP;

5939

}

5940

return I->getOpcode() == AltOp->getOpcode();

5941

}

5942

5943

/// Classifies operand number \p OpIdx across all instructions in \p VL for
/// the cost model.
///
/// The kind is constant when every such operand satisfies isConstant and is
/// not undef, and uniform when every such operand is the same value as the
/// first one. The property is power-of-two / negated power-of-two when every
/// such operand is a ConstantInt with that property; negated power-of-two wins
/// if both hold.
///
/// \param VL    non-empty list of instructions.
/// \param OpIdx index of the operand to inspect on each instruction.
TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> VL,
                                              unsigned OpIdx) {
  assert(!VL.empty());

  // Fetches the inspected operand of one bundle member.
  auto OperandOf = [OpIdx](Value *V) -> Value * {
    return cast<Instruction>(V)->getOperand(OpIdx);
  };
  const Value *Op0 = OperandOf(VL.front());

  // TODO: We should allow undef elements in each of the checks below.
  const bool IsConstant = all_of(VL, [&](Value *V) {
    Value *Op = OperandOf(V);
    return isConstant(Op) && !isa<UndefValue>(Op);
  });
  const bool IsUniform =
      all_of(VL, [&](Value *V) { return OperandOf(V) == Op0; });
  const bool IsPowerOfTwo = all_of(VL, [&](Value *V) {
    auto *CI = dyn_cast<ConstantInt>(OperandOf(V));
    return CI && CI->getValue().isPowerOf2();
  });
  const bool IsNegatedPowerOfTwo = all_of(VL, [&](Value *V) {
    auto *CI = dyn_cast<ConstantInt>(OperandOf(V));
    return CI && CI->getValue().isNegatedPowerOf2();
  });

  TTI::OperandValueKind VK;
  if (IsConstant)
    VK = IsUniform ? TTI::OK_UniformConstantValue
                   : TTI::OK_NonUniformConstantValue;
  else
    VK = IsUniform ? TTI::OK_UniformValue : TTI::OK_AnyValue;

  TTI::OperandValueProperties VP = TTI::OP_None;
  if (IsNegatedPowerOfTwo)
    VP = TTI::OP_NegatedPowerOf2;
  else if (IsPowerOfTwo)
    VP = TTI::OP_PowerOf2;

  return {VK, VP};
}

5986

5987

InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,

5988

ArrayRef<Value *> VectorizedVals) {

5989

ArrayRef<Value*> VL = E->Scalars;

5990

5991

Type *ScalarTy = VL[0]->getType();

5992

if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))

5993

ScalarTy = SI->getValueOperand()->getType();

5994

else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))

5995

ScalarTy = CI->getOperand(0)->getType();

5996

else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))

5997

ScalarTy = IE->getOperand(1)->getType();

5998

auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());

5999

TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

6000

6001

// If we have computed a smaller type for the expression, update VecTy so

6002

// that the costs will be accurate.

6003

if (MinBWs.count(VL[0]))

6004

VecTy = FixedVectorType::get(

6005

IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());

6006

unsigned EntryVF = E->getVectorFactor();

6007

auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF);

6008

6009

bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();

6010

// FIXME: it tries to fix a problem with MSVC buildbots.

6011

TargetTransformInfo &TTIRef = *TTI;

6012

auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy,

6013

VectorizedVals, E](InstructionCost &Cost) {

6014

DenseMap<Value *, int> ExtractVectorsTys;

6015

SmallPtrSet<Value *, 4> CheckedExtracts;

6016

for (auto *V : VL) {

6017

if (isa<UndefValue>(V))

6018

continue;

6019

// If all users of instruction are going to be vectorized and this

6020

// instruction itself is not going to be vectorized, consider this

6021

// instruction as dead and remove its cost from the final cost of the

6022

// vectorized tree.

6023

// Also, avoid adjusting the cost for extractelements with multiple uses

6024

// in different graph entries.

6025

const TreeEntry *VE = getTreeEntry(V);

6026

if (!CheckedExtracts.insert(V).second ||

6027

!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||

6028

(VE && VE != E))

6029

continue;

6030

auto *EE = cast<ExtractElementInst>(V);

6031

Optional<unsigned> EEIdx = getExtractIndex(EE);

6032

if (!EEIdx)

6033

continue;

6034

unsigned Idx = *EEIdx;

6035

if (TTIRef.getNumberOfParts(VecTy) !=

6036

TTIRef.getNumberOfParts(EE->getVectorOperandType())) {

6037

auto It =

6038

ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first;

6039

It->getSecond() = std::min<int>(It->second, Idx);

6040

}

6041

// Take credit for instruction that will become dead.

6042

if (EE->hasOneUse()) {

6043

Instruction *Ext = EE->user_back();

6044

if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) {

6045

return isa<GetElementPtrInst>(U);

6046

})) {

6047

// Use getExtractWithExtendCost() to calculate the cost of

6048

// extractelement/ext pair.

6049

Cost -=

6050

TTIRef.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),

6051

EE->getVectorOperandType(), Idx);

6052

// Add back the cost of s|zext which is subtracted separately.

6053

Cost += TTIRef.getCastInstrCost(

6054

Ext->getOpcode(), Ext->getType(), EE->getType(),

6055

TTI::getCastContextHint(Ext), CostKind, Ext);

6056

continue;

6057

}

6058

}

6059

Cost -= TTIRef.getVectorInstrCost(*EE, EE->getVectorOperandType(), Idx);

6060

}

6061

// Add a cost for subvector extracts/inserts if required.

6062

for (const auto &Data : ExtractVectorsTys) {

6063

auto *EEVTy = cast<FixedVectorType>(Data.first->getType());

6064

unsigned NumElts = VecTy->getNumElements();

6065

if (Data.second % NumElts == 0)

6066

continue;

6067

if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) {

6068

unsigned Idx = (Data.second / NumElts) * NumElts;

6069

unsigned EENumElts = EEVTy->getNumElements();

6070

if (Idx + NumElts <= EENumElts) {

6071

Cost +=

6072

TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,

6073

EEVTy, None, CostKind, Idx, VecTy);

6074

} else {

6075

// Need to round up the subvector type vectorization factor to avoid a

6076

// crash in cost model functions. Make SubVT so that Idx + VF of SubVT

6077

// <= EENumElts.

6078

auto *SubVT =

6079

FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx);

6080

Cost +=

6081

TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,

6082

EEVTy, None, CostKind, Idx, SubVT);

6083

}

6084

} else {

6085

Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_InsertSubvector,

6086

VecTy, None, CostKind, 0, EEVTy);

6087

}

6088

}

6089

};

6090

if (E->State == TreeEntry::NeedToGather) {

6091

if (allConstant(VL))

6092

return 0;

6093

if (isa<InsertElementInst>(VL[0]))

6094

return InstructionCost::getInvalid();

6095

SmallVector<int> Mask;

6096

SmallVector<const TreeEntry *> Entries;

6097

Optional<TargetTransformInfo::ShuffleKind> Shuffle =

6098

isGatherShuffledEntry(E, Mask, Entries);

6099

if (Shuffle) {

6100

InstructionCost GatherCost = 0;

6101

if (ShuffleVectorInst::isIdentityMask(Mask)) {

6102

// Perfect match in the graph, will reuse the previously vectorized

6103

// node. Cost is 0.

6104

LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: perfect diamond match for gather bundle that starts with "
<< *VL.front() << ".\n"; } } while (false)

6105

dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: perfect diamond match for gather bundle that starts with "
<< *VL.front() << ".\n"; } } while (false)

6106

<< "SLP: perfect diamond match for gather bundle that starts with "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: perfect diamond match for gather bundle that starts with "
<< *VL.front() << ".\n"; } } while (false)

6107

<< *VL.front() << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: perfect diamond match for gather bundle that starts with "
<< *VL.front() << ".\n"; } } while (false);

6108

if (NeedToShuffleReuses)

6109

GatherCost =

6110

TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,

6111

FinalVecTy, E->ReuseShuffleIndices);

6112

} else {

6113

LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: shuffled " << Entries.
size() << " entries for bundle that starts with " <<
*VL.front() << ".\n"; } } while (false)

6114

<< " entries for bundle that starts with "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: shuffled " << Entries.
size() << " entries for bundle that starts with " <<
*VL.front() << ".\n"; } } while (false)

6115

<< *VL.front() << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: shuffled " << Entries.
size() << " entries for bundle that starts with " <<
*VL.front() << ".\n"; } } while (false);

6116

// Detected that instead of gather we can emit a shuffle of single/two

6117

// previously vectorized nodes. Add the cost of the permutation rather

6118

// than gather.

6119

::addMask(Mask, E->ReuseShuffleIndices);

6120

GatherCost = TTI->getShuffleCost(*Shuffle, FinalVecTy, Mask);

6121

}

6122

return GatherCost;

6123

}

6124

if ((E->getOpcode() == Instruction::ExtractElement ||

6125

all_of(E->Scalars,

6126

[](Value *V) {

6127

return isa<ExtractElementInst, UndefValue>(V);

6128

})) &&

6129

allSameType(VL)) {

6130

// Check that gather of extractelements can be represented as just a

6131

// shuffle of a single/two vectors the scalars are extracted from.

6132

SmallVector<int> Mask;

6133

Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =

6134

isFixedVectorShuffle(VL, Mask);

6135

if (ShuffleKind) {

6136

// Found the bunch of extractelement instructions that must be gathered

6137

// into a vector and can be represented as a permutation elements in a

6138

// single input vector or of 2 input vectors.

6139

InstructionCost Cost =

6140

computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);

6141

AdjustExtractsCost(Cost);

6142

if (NeedToShuffleReuses)

6143

Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,

6144

FinalVecTy, E->ReuseShuffleIndices);

6145

return Cost;

6146

}

6147

}

6148

if (isSplat(VL)) {

6149

// Found the broadcasting of the single scalar, calculate the cost as the

6150

// broadcast.

6151

assert(VecTy == FinalVecTy &&(static_cast <bool> (VecTy == FinalVecTy && "No reused scalars expected for broadcast."
) ? void (0) : __assert_fail ("VecTy == FinalVecTy && \"No reused scalars expected for broadcast.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6152, __extension__
__PRETTY_FUNCTION__))

6152

"No reused scalars expected for broadcast.")(static_cast <bool> (VecTy == FinalVecTy && "No reused scalars expected for broadcast."
) ? void (0) : __assert_fail ("VecTy == FinalVecTy && \"No reused scalars expected for broadcast.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6152, __extension__
__PRETTY_FUNCTION__));

6153

return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,

6154

/*Mask=*/None, CostKind, /*Index=*/0,

6155

/*SubTp=*/nullptr, /*Args=*/VL[0]);

6156

}

6157

InstructionCost ReuseShuffleCost = 0;

6158

if (NeedToShuffleReuses)

6159

ReuseShuffleCost = TTI->getShuffleCost(

6160

TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);

6161

// Improve gather cost for gather of loads, if we can group some of the

6162

// loads into vector loads.

6163

if (VL.size() > 2 && E->getOpcode() == Instruction::Load &&

6164

!E->isAltShuffle()) {

6165

BoUpSLP::ValueSet VectorizedLoads;

6166

unsigned StartIdx = 0;

6167

unsigned VF = VL.size() / 2;

6168

unsigned VectorizedCnt = 0;

6169

unsigned ScatterVectorizeCnt = 0;

6170

const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType());

6171

for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) {

6172

for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;

6173

Cnt += VF) {

6174

ArrayRef<Value *> Slice = VL.slice(Cnt, VF);

6175

if (!VectorizedLoads.count(Slice.front()) &&

6176

!VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {

6177

SmallVector<Value *> PointerOps;

6178

OrdersType CurrentOrder;

6179

LoadsState LS =

6180

canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, *SE, *LI,

6181

CurrentOrder, PointerOps);

6182

switch (LS) {

6183

case LoadsState::Vectorize:

6184

case LoadsState::ScatterVectorize:

6185

// Mark the vectorized loads so that we don't vectorize them

6186

// again.

6187

if (LS == LoadsState::Vectorize)

6188

++VectorizedCnt;

6189

else

6190

++ScatterVectorizeCnt;

6191

VectorizedLoads.insert(Slice.begin(), Slice.end());

6192

// If we vectorized initial block, no need to try to vectorize it

6193

// again.

6194

if (Cnt == StartIdx)

6195

StartIdx += VF;

6196

break;

6197

case LoadsState::Gather:

6198

break;

6199

}

6200

}

6201

}

6202

// Check if the whole array was vectorized already - exit.

6203

if (StartIdx >= VL.size())

6204

break;

6205

// Found vectorizable parts - exit.

6206

if (!VectorizedLoads.empty())

6207

break;

6208

}

6209

if (!VectorizedLoads.empty()) {

6210

InstructionCost GatherCost = 0;

6211

unsigned NumParts = TTI->getNumberOfParts(VecTy);

6212

bool NeedInsertSubvectorAnalysis =

6213

!NumParts || (VL.size() / VF) > NumParts;

6214

// Get the cost for gathered loads.

6215

for (unsigned I = 0, End = VL.size(); I < End; I += VF) {

6216

if (VectorizedLoads.contains(VL[I]))

6217

continue;

6218

GatherCost += getGatherCost(VL.slice(I, VF));

6219

}

6220

// The cost for vectorized loads.

6221

InstructionCost ScalarsCost = 0;

6222

for (Value *V : VectorizedLoads) {

6223

auto *LI = cast<LoadInst>(V);

6224

ScalarsCost += TTI->getMemoryOpCost(

6225

Instruction::Load, LI->getType(), LI->getAlign(),

6226

LI->getPointerAddressSpace(), CostKind,

6227

{TTI::OK_AnyValue, TTI::OP_None}, LI);

6228

}

6229

auto *LI = cast<LoadInst>(E->getMainOp());

6230

auto *LoadTy = FixedVectorType::get(LI->getType(), VF);

6231

Align Alignment = LI->getAlign();

6232

GatherCost += VectorizedCnt *

6233

TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment,

6234

LI->getPointerAddressSpace(),

6235

CostKind, {TTI::OK_AnyValue,

6236

TTI::OP_None}, LI);

6237

GatherCost += ScatterVectorizeCnt *

6238

TTI->getGatherScatterOpCost(

6239

Instruction::Load, LoadTy, LI->getPointerOperand(),

6240

/*VariableMask=*/false, Alignment, CostKind, LI);

6241

if (NeedInsertSubvectorAnalysis) {

6242

// Add the cost for the subvectors insert.

6243

for (int I = VF, E = VL.size(); I < E; I += VF)

6244

GatherCost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy,

6245

None, CostKind, I, LoadTy);

6246

}

6247

return ReuseShuffleCost + GatherCost - ScalarsCost;

6248

}

6249

}

6250

return ReuseShuffleCost + getGatherCost(VL);

6251

}

6252

InstructionCost CommonCost = 0;

6253

SmallVector<int> Mask;

6254

if (!E->ReorderIndices.empty()) {

6255

SmallVector<int> NewMask;

6256

if (E->getOpcode() == Instruction::Store) {

6257

// For stores the order is actually a mask.

6258

NewMask.resize(E->ReorderIndices.size());

6259

copy(E->ReorderIndices, NewMask.begin());

6260

} else {

6261

inversePermutation(E->ReorderIndices, NewMask);

6262

}

6263

::addMask(Mask, NewMask);

6264

}

6265

if (NeedToShuffleReuses)

6266

::addMask(Mask, E->ReuseShuffleIndices);

6267

if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask))

6268

CommonCost =

6269

TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);

6270

assert((E->State == TreeEntry::Vectorize ||(static_cast <bool> ((E->State == TreeEntry::Vectorize
|| E->State == TreeEntry::ScatterVectorize) && "Unhandled state"
) ? void (0) : __assert_fail ("(E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize) && \"Unhandled state\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6272, __extension__
__PRETTY_FUNCTION__))

6271

E->State == TreeEntry::ScatterVectorize) &&(static_cast <bool> ((E->State == TreeEntry::Vectorize
|| E->State == TreeEntry::ScatterVectorize) && "Unhandled state"
) ? void (0) : __assert_fail ("(E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize) && \"Unhandled state\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6272, __extension__
__PRETTY_FUNCTION__))

6272

"Unhandled state")(static_cast <bool> ((E->State == TreeEntry::Vectorize
|| E->State == TreeEntry::ScatterVectorize) && "Unhandled state"
) ? void (0) : __assert_fail ("(E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize) && \"Unhandled state\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6272, __extension__
__PRETTY_FUNCTION__));

6273

assert(E->getOpcode() &&(static_cast <bool> (E->getOpcode() && ((allSameType
(VL) && allSameBlock(VL)) || (E->getOpcode() == Instruction
::GetElementPtr && E->getMainOp()->getType()->
isPointerTy())) && "Invalid VL") ? void (0) : __assert_fail
("E->getOpcode() && ((allSameType(VL) && allSameBlock(VL)) || (E->getOpcode() == Instruction::GetElementPtr && E->getMainOp()->getType()->isPointerTy())) && \"Invalid VL\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6277, __extension__
__PRETTY_FUNCTION__))

6274

((allSameType(VL) && allSameBlock(VL)) ||(static_cast <bool> (E->getOpcode() && ((allSameType
(VL) && allSameBlock(VL)) || (E->getOpcode() == Instruction
::GetElementPtr && E->getMainOp()->getType()->
isPointerTy())) && "Invalid VL") ? void (0) : __assert_fail
("E->getOpcode() && ((allSameType(VL) && allSameBlock(VL)) || (E->getOpcode() == Instruction::GetElementPtr && E->getMainOp()->getType()->isPointerTy())) && \"Invalid VL\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6277, __extension__
__PRETTY_FUNCTION__))

6275

(E->getOpcode() == Instruction::GetElementPtr &&(static_cast <bool> (E->getOpcode() && ((allSameType
(VL) && allSameBlock(VL)) || (E->getOpcode() == Instruction
::GetElementPtr && E->getMainOp()->getType()->
isPointerTy())) && "Invalid VL") ? void (0) : __assert_fail
("E->getOpcode() && ((allSameType(VL) && allSameBlock(VL)) || (E->getOpcode() == Instruction::GetElementPtr && E->getMainOp()->getType()->isPointerTy())) && \"Invalid VL\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6277, __extension__
__PRETTY_FUNCTION__))

6276

E->getMainOp()->getType()->isPointerTy())) &&(static_cast <bool> (E->getOpcode() && ((allSameType
(VL) && allSameBlock(VL)) || (E->getOpcode() == Instruction
::GetElementPtr && E->getMainOp()->getType()->
isPointerTy())) && "Invalid VL") ? void (0) : __assert_fail
("E->getOpcode() && ((allSameType(VL) && allSameBlock(VL)) || (E->getOpcode() == Instruction::GetElementPtr && E->getMainOp()->getType()->isPointerTy())) && \"Invalid VL\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6277, __extension__
__PRETTY_FUNCTION__))

6277

"Invalid VL")(static_cast <bool> (E->getOpcode() && ((allSameType
(VL) && allSameBlock(VL)) || (E->getOpcode() == Instruction
::GetElementPtr && E->getMainOp()->getType()->
isPointerTy())) && "Invalid VL") ? void (0) : __assert_fail
("E->getOpcode() && ((allSameType(VL) && allSameBlock(VL)) || (E->getOpcode() == Instruction::GetElementPtr && E->getMainOp()->getType()->isPointerTy())) && \"Invalid VL\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6277, __extension__
__PRETTY_FUNCTION__));

6278

Instruction *VL0 = E->getMainOp();

6279

unsigned ShuffleOrOp =

6280

E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();

6281

switch (ShuffleOrOp) {

6282

case Instruction::PHI:

6283

return 0;

6284

6285

case Instruction::ExtractValue:

6286

case Instruction::ExtractElement: {

6287

// The common cost of removing ExtractElement/ExtractValue instructions +

6288

// the cost of shuffles, if required to reshuffle the original vector.

6289

if (NeedToShuffleReuses) {

6290

unsigned Idx = 0;

6291

for (unsigned I : E->ReuseShuffleIndices) {

6292

if (ShuffleOrOp == Instruction::ExtractElement) {

6293

auto *EE = cast<ExtractElementInst>(VL[I]);

6294

CommonCost -= TTI->getVectorInstrCost(

6295

*EE, EE->getVectorOperandType(), *getExtractIndex(EE));

6296

} else {

6297

CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,

6298

VecTy, Idx);

6299

++Idx;

6300

}

6301

}

6302

Idx = EntryVF;

6303

for (Value *V : VL) {

6304

if (ShuffleOrOp == Instruction::ExtractElement) {

6305

auto *EE = cast<ExtractElementInst>(V);

6306

CommonCost += TTI->getVectorInstrCost(

6307

*EE, EE->getVectorOperandType(), *getExtractIndex(EE));

6308

} else {

6309

--Idx;

6310

CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,

6311

VecTy, Idx);

6312

}

6313

}

6314

}

6315

if (ShuffleOrOp == Instruction::ExtractValue) {

6316

for (unsigned I = 0, E = VL.size(); I < E; ++I) {

6317

auto *EI = cast<Instruction>(VL[I]);

6318

// Take credit for instruction that will become dead.

6319

if (EI->hasOneUse()) {

6320

Instruction *Ext = EI->user_back();

6321

if (isa<SExtInst, ZExtInst>(Ext) &&

6322

all_of(Ext->users(),

6323

[](User *U) { return isa<GetElementPtrInst>(U); })) {

6324

// Use getExtractWithExtendCost() to calculate the cost of

6325

// extractelement/ext pair.

6326

CommonCost -= TTI->getExtractWithExtendCost(

6327

Ext->getOpcode(), Ext->getType(), VecTy, I);

6328

// Add back the cost of s|zext which is subtracted separately.

6329

CommonCost += TTI->getCastInstrCost(

6330

Ext->getOpcode(), Ext->getType(), EI->getType(),

6331

TTI::getCastContextHint(Ext), CostKind, Ext);

6332

continue;

6333

}

6334

}

6335

CommonCost -=

6336

TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);

6337

}

6338

} else {

6339

AdjustExtractsCost(CommonCost);

6340

}

6341

return CommonCost;

6342

}

6343

case Instruction::InsertElement: {

6344

assert(E->ReuseShuffleIndices.empty() &&(static_cast <bool> (E->ReuseShuffleIndices.empty() &&
"Unique insertelements only are expected.") ? void (0) : __assert_fail
("E->ReuseShuffleIndices.empty() && \"Unique insertelements only are expected.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6345, __extension__
__PRETTY_FUNCTION__))

6345

"Unique insertelements only are expected.")(static_cast <bool> (E->ReuseShuffleIndices.empty() &&
"Unique insertelements only are expected.") ? void (0) : __assert_fail
("E->ReuseShuffleIndices.empty() && \"Unique insertelements only are expected.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6345, __extension__
__PRETTY_FUNCTION__));

6346

auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());

6347

unsigned const NumElts = SrcVecTy->getNumElements();

6348

unsigned const NumScalars = VL.size();

6349

6350

unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);

6351

6352

SmallVector<int> InsertMask(NumElts, UndefMaskElem);

6353

unsigned OffsetBeg = *getInsertIndex(VL.front());

6354

unsigned OffsetEnd = OffsetBeg;

6355

InsertMask[OffsetBeg] = 0;

6356

for (auto [I, V] : enumerate(VL.drop_front())) {

6357

unsigned Idx = *getInsertIndex(V);

6358

if (OffsetBeg > Idx)

6359

OffsetBeg = Idx;

6360

else if (OffsetEnd < Idx)

6361

OffsetEnd = Idx;

6362

InsertMask[Idx] = I + 1;

6363

}

6364

unsigned VecScalarsSz = PowerOf2Ceil(NumElts);

6365

if (NumOfParts > 0)

6366

VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);

6367

unsigned VecSz =

6368

(1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *

6369

VecScalarsSz;

6370

unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);

6371

unsigned InsertVecSz = std::min<unsigned>(

6372

PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),

6373

((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) *

6374

VecScalarsSz);

6375

bool IsWholeSubvector =

6376

OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);

6377

// Check if we can safely insert a subvector. If it is not possible, just

6378

// generate a whole-sized vector and shuffle the source vector and the new

6379

// subvector.

6380

if (OffsetBeg + InsertVecSz > VecSz) {

6381

// Align OffsetBeg to generate correct mask.

6382

OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);

6383

InsertVecSz = VecSz;

6384

}

6385

6386

APInt DemandedElts = APInt::getZero(NumElts);

6387

// TODO: Add support for Instruction::InsertValue.

6388

SmallVector<int> Mask;

6389

if (!E->ReorderIndices.empty()) {

6390

inversePermutation(E->ReorderIndices, Mask);

6391

Mask.append(InsertVecSz - Mask.size(), UndefMaskElem);

6392

} else {

6393

Mask.assign(VecSz, UndefMaskElem);

6394

std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);

6395

}

6396

bool IsIdentity = true;

6397

SmallVector<int> PrevMask(InsertVecSz, UndefMaskElem);

6398

Mask.swap(PrevMask);

6399

for (unsigned I = 0; I < NumScalars; ++I) {

6400

unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);

6401

DemandedElts.setBit(InsertIdx);

6402

IsIdentity &= InsertIdx - OffsetBeg == I;

6403

Mask[InsertIdx - OffsetBeg] = I;

6404

}

6405

assert(Offset < NumElts && "Failed to find vector index offset")(static_cast <bool> (Offset < NumElts && "Failed to find vector index offset"
) ? void (0) : __assert_fail ("Offset < NumElts && \"Failed to find vector index offset\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6405, __extension__
__PRETTY_FUNCTION__));

6406

6407

InstructionCost Cost = 0;

6408

Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,

6409

/*Insert*/ true, /*Extract*/ false);

6410

6411

// First cost - resize to actual vector size if not identity shuffle or

6412

// need to shift the vector.

6413

// Do not calculate the cost if the actual size is the register size and

6414

// we can merge this shuffle with the following SK_Select.

6415

auto *InsertVecTy =

6416

FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz);

6417

if (!IsIdentity)

6418

Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,

6419

InsertVecTy, Mask);

6420

auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {

6421

return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));

6422

}));

6423

// Second cost - permutation with subvector, if some elements are from the

6424

// initial vector or inserting a subvector.

6425

// TODO: Implement the analysis of the FirstInsert->getOperand(0)

6426

// subvector of ActualVecTy.

6427

if (!isUndefVector(FirstInsert->getOperand(0), InsertMask) &&

6428

NumScalars != NumElts && !IsWholeSubvector) {

6429

if (InsertVecSz != VecSz) {

6430

auto *ActualVecTy =

6431

FixedVectorType::get(SrcVecTy->getElementType(), VecSz);

6432

Cost +=

6433

TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy, None,

6434

CostKind, OffsetBeg - Offset, InsertVecTy);

6435

} else {

6436

for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)

6437

Mask[I] = I;

6438

for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;

6439

I <= End; ++I)

6440

if (Mask[I] != UndefMaskElem)

6441

Mask[I] = I + VecSz;

6442

for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)

6443

Mask[I] = I;

6444

Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);

6445

}

6446

}

6447

return Cost;

6448

}

6449

case Instruction::ZExt:

6450

case Instruction::SExt:

6451

case Instruction::FPToUI:

6452

case Instruction::FPToSI:

6453

case Instruction::FPExt:

6454

case Instruction::PtrToInt:

6455

case Instruction::IntToPtr:

6456

case Instruction::SIToFP:

6457

case Instruction::UIToFP:

6458

case Instruction::Trunc:

6459

case Instruction::FPTrunc:

6460

case Instruction::BitCast: {

6461

Type *SrcTy = VL0->getOperand(0)->getType();

6462

InstructionCost ScalarEltCost =

6463

TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,

6464

TTI::getCastContextHint(VL0), CostKind, VL0);

6465

if (NeedToShuffleReuses) {

6466

CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;

6467

}

6468

6469

// Calculate the cost of this instruction.

6470

InstructionCost ScalarCost = VL.size() * ScalarEltCost;

6471

6472

auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());

6473

InstructionCost VecCost = 0;

6474

// Check if the values are candidates to demote.

6475

if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {

6476

VecCost = CommonCost + TTI->getCastInstrCost(

6477

E->getOpcode(), VecTy, SrcVecTy,

6478

TTI::getCastContextHint(VL0), CostKind, VL0);

6479

}

6480

LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dumpTreeCosts(E, CommonCost, VecCost, ScalarCost);
} } while (false);

6481

return VecCost - ScalarCost;

6482

}

6483

case Instruction::FCmp:

6484

case Instruction::ICmp:

6485

case Instruction::Select: {

6486

// Calculate the cost of this instruction.

6487

InstructionCost ScalarEltCost =

6488

TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),

6489

CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);

6490

if (NeedToShuffleReuses) {

6491

CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;

6492

}

6493

auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());

6494

InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;

6495

6496

// Check if all entries in VL are either compares or selects with compares

6497

// as condition that have the same predicates.

6498

CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE;

6499

bool First = true;

6500

for (auto *V : VL) {

6501

CmpInst::Predicate CurrentPred;

6502

auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());

6503

if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) &&

6504

!match(V, MatchCmp)) ||

6505

(!First && VecPred != CurrentPred)) {

6506

VecPred = CmpInst::BAD_ICMP_PREDICATE;

6507

break;

6508

}

6509

First = false;

6510

VecPred = CurrentPred;

6511

}

6512

6513

InstructionCost VecCost = TTI->getCmpSelInstrCost(

6514

E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);

6515

// Check if it is possible and profitable to use min/max for selects in

6516

// VL.

6517

//

6518

auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);

6519

if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {

6520

IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,

6521

{VecTy, VecTy});

6522

InstructionCost IntrinsicCost =

6523

TTI->getIntrinsicInstrCost(CostAttrs, CostKind);

6524

// If the selects are the only uses of the compares, they will be dead

6525

// and we can adjust the cost by removing their cost.

6526

if (IntrinsicAndUse.second)

6527

IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy,

6528

MaskTy, VecPred, CostKind);

6529

VecCost = std::min(VecCost, IntrinsicCost);

6530

}

6531

LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dumpTreeCosts(E, CommonCost, VecCost, ScalarCost);
} } while (false);

6532

return CommonCost + VecCost - ScalarCost;

6533

}

6534

case Instruction::FNeg:

6535

case Instruction::Add:

6536

case Instruction::FAdd:

6537

case Instruction::Sub:

6538

case Instruction::FSub:

6539

case Instruction::Mul:

6540

case Instruction::FMul:

6541

case Instruction::UDiv:

6542

case Instruction::SDiv:

6543

case Instruction::FDiv:

6544

case Instruction::URem:

6545

case Instruction::SRem:

6546

case Instruction::FRem:

6547

case Instruction::Shl:

6548

case Instruction::LShr:

6549

case Instruction::AShr:

6550

case Instruction::And:

6551

case Instruction::Or:

6552

case Instruction::Xor: {

6553

const unsigned OpIdx = isa<BinaryOperator>(VL0) ? 1 : 0;

6554

6555

InstructionCost ScalarCost = 0;

6556

for (auto *V : VL) {

6557

auto *VI = cast<Instruction>(V);

6558

TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));

6559

TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(VI->getOperand(OpIdx));

6560

SmallVector<const Value *, 4> Operands(VI->operand_values());

6561

ScalarCost +=

6562

TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind,

6563

Op1Info, Op2Info, Operands, VI);

6564

}

6565

if (NeedToShuffleReuses) {

6566

CommonCost -= (EntryVF - VL.size()) * ScalarCost/VL.size();

6567

}

6568

TTI::OperandValueInfo Op1Info = getOperandInfo(VL, 0);

6569

TTI::OperandValueInfo Op2Info = getOperandInfo(VL, OpIdx);

6570

InstructionCost VecCost =

6571

TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind,

6572

Op1Info, Op2Info);

6573

LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dumpTreeCosts(E, CommonCost, VecCost, ScalarCost);
} } while (false);

6574

return CommonCost + VecCost - ScalarCost;

6575

}

6576

case Instruction::GetElementPtr: {

6577

TargetTransformInfo::OperandValueKind Op1VK =

6578

TargetTransformInfo::OK_AnyValue;

6579

TargetTransformInfo::OperandValueKind Op2VK =

6580

any_of(VL,

6581

[](Value *V) {

6582

return isa<GetElementPtrInst>(V) &&

6583

!isConstant(

6584

cast<GetElementPtrInst>(V)->getOperand(1));

6585

})

6586

? TargetTransformInfo::OK_AnyValue

6587

: TargetTransformInfo::OK_UniformConstantValue;

6588

6589

InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(

6590

Instruction::Add, ScalarTy, CostKind,

6591

{Op1VK, TargetTransformInfo::OP_None},

6592

{Op2VK, TargetTransformInfo::OP_None});

6593

if (NeedToShuffleReuses) {

6594

CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;

6595

}

6596

InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;

6597

InstructionCost VecCost = TTI->getArithmeticInstrCost(

6598

Instruction::Add, VecTy, CostKind,

6599

{Op1VK, TargetTransformInfo::OP_None},

6600

{Op2VK, TargetTransformInfo::OP_None});

6601

LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dumpTreeCosts(E, CommonCost, VecCost, ScalarCost);
} } while (false);

6602

return CommonCost + VecCost - ScalarCost;

6603

}

6604

case Instruction::Load: {

6605

// Cost of wide load - cost of scalar loads.

6606

Align Alignment = cast<LoadInst>(VL0)->getAlign();

6607

InstructionCost ScalarEltCost =

6608

TTI->getMemoryOpCost(Instruction::Load, ScalarTy, Alignment, 0,

6609

CostKind, {TTI::OK_AnyValue, TTI::OP_None}, VL0);

6610

if (NeedToShuffleReuses) {

6611

CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;

6612

}

6613

InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;

6614

InstructionCost VecLdCost;

6615

if (E->State == TreeEntry::Vectorize) {

6616

VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment, 0,

6617

CostKind, {TTI::OK_AnyValue, TTI::OP_None}, VL0);

6618

} else {

6619

assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState")(static_cast <bool> (E->State == TreeEntry::ScatterVectorize
&& "Unknown EntryState") ? void (0) : __assert_fail (
"E->State == TreeEntry::ScatterVectorize && \"Unknown EntryState\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6619, __extension__
__PRETTY_FUNCTION__));

6620

Align CommonAlignment = Alignment;

6621

for (Value *V : VL)

6622

CommonAlignment =

6623

std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());

6624

VecLdCost = TTI->getGatherScatterOpCost(

6625

Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),

6626

/*VariableMask=*/false, CommonAlignment, CostKind, VL0);

6627

}

6628

LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost
); } } while (false);

6629

return CommonCost + VecLdCost - ScalarLdCost;

6630

}

6631

case Instruction::Store: {

6632

// We know that we can merge the stores. Calculate the cost.

6633

bool IsReorder = !E->ReorderIndices.empty();

6634

auto *SI =

6635

cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);

6636

Align Alignment = SI->getAlign();

6637

InstructionCost ScalarStCost = 0;

6638

for (auto *V : VL) {

6639

auto *VI = cast<Instruction>(V);

6640

TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getOperand(0));

6641

ScalarStCost +=

6642

TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0,

6643

CostKind, OpInfo, VI);

6644

}

6645

TTI::OperandValueInfo OpInfo = getOperandInfo(VL, 0);

6646

InstructionCost VecStCost =

6647

TTI->getMemoryOpCost(Instruction::Store, VecTy, Alignment, 0, CostKind,

6648

OpInfo);

6649

LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost
); } } while (false);

6650

return CommonCost + VecStCost - ScalarStCost;

6651

}

6652

case Instruction::Call: {

6653

CallInst *CI = cast<CallInst>(VL0);

6654

Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

6655

6656

// Calculate the cost of the scalar and vector calls.

6657

IntrinsicCostAttributes CostAttrs(ID, *CI, 1);

6658

InstructionCost ScalarEltCost =

6659

TTI->getIntrinsicInstrCost(CostAttrs, CostKind);

6660

if (NeedToShuffleReuses) {

6661

CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;

6662

}

6663

InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;

6664

6665

auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);

6666

InstructionCost VecCallCost =

6667

std::min(VecCallCosts.first, VecCallCosts.second);

6668

6669

LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCostdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Call cost " << VecCallCost
- ScalarCallCost << " (" << VecCallCost <<
"-" << ScalarCallCost << ")" << " for " <<
*CI << "\n"; } } while (false)

6670

<< " (" << VecCallCost << "-" << ScalarCallCost << ")"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Call cost " << VecCallCost
- ScalarCallCost << " (" << VecCallCost <<
"-" << ScalarCallCost << ")" << " for " <<
*CI << "\n"; } } while (false)

6671

<< " for " << *CI << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Call cost " << VecCallCost
- ScalarCallCost << " (" << VecCallCost <<
"-" << ScalarCallCost << ")" << " for " <<
*CI << "\n"; } } while (false);

6672

6673

return CommonCost + VecCallCost - ScalarCallCost;

6674

}

6675

case Instruction::ShuffleVector: {

6676

assert(E->isAltShuffle() &&(static_cast <bool> (E->isAltShuffle() && ((
Instruction::isBinaryOp(E->getOpcode()) && Instruction
::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E
->getOpcode()) && Instruction::isCast(E->getAltOpcode
())) || (isa<CmpInst>(VL0) && isa<CmpInst>
(E->getAltOp()))) && "Invalid Shuffle Vector Operand"
) ? void (0) : __assert_fail ("E->isAltShuffle() && ((Instruction::isBinaryOp(E->getOpcode()) && Instruction::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E->getOpcode()) && Instruction::isCast(E->getAltOpcode())) || (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && \"Invalid Shuffle Vector Operand\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6682, __extension__
__PRETTY_FUNCTION__))

6677

((Instruction::isBinaryOp(E->getOpcode()) &&(static_cast <bool> (E->isAltShuffle() && ((
Instruction::isBinaryOp(E->getOpcode()) && Instruction
::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E
->getOpcode()) && Instruction::isCast(E->getAltOpcode
())) || (isa<CmpInst>(VL0) && isa<CmpInst>
(E->getAltOp()))) && "Invalid Shuffle Vector Operand"
) ? void (0) : __assert_fail ("E->isAltShuffle() && ((Instruction::isBinaryOp(E->getOpcode()) && Instruction::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E->getOpcode()) && Instruction::isCast(E->getAltOpcode())) || (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && \"Invalid Shuffle Vector Operand\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6682, __extension__
__PRETTY_FUNCTION__))

6678

Instruction::isBinaryOp(E->getAltOpcode())) ||(static_cast <bool> (E->isAltShuffle() && ((
Instruction::isBinaryOp(E->getOpcode()) && Instruction
::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E
->getOpcode()) && Instruction::isCast(E->getAltOpcode
())) || (isa<CmpInst>(VL0) && isa<CmpInst>
(E->getAltOp()))) && "Invalid Shuffle Vector Operand"
) ? void (0) : __assert_fail ("E->isAltShuffle() && ((Instruction::isBinaryOp(E->getOpcode()) && Instruction::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E->getOpcode()) && Instruction::isCast(E->getAltOpcode())) || (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && \"Invalid Shuffle Vector Operand\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6682, __extension__
__PRETTY_FUNCTION__))

6679

(Instruction::isCast(E->getOpcode()) &&(static_cast <bool> (E->isAltShuffle() && ((
Instruction::isBinaryOp(E->getOpcode()) && Instruction
::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E
->getOpcode()) && Instruction::isCast(E->getAltOpcode
())) || (isa<CmpInst>(VL0) && isa<CmpInst>
(E->getAltOp()))) && "Invalid Shuffle Vector Operand"
) ? void (0) : __assert_fail ("E->isAltShuffle() && ((Instruction::isBinaryOp(E->getOpcode()) && Instruction::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E->getOpcode()) && Instruction::isCast(E->getAltOpcode())) || (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && \"Invalid Shuffle Vector Operand\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6682, __extension__
__PRETTY_FUNCTION__))

6680

Instruction::isCast(E->getAltOpcode())) ||(static_cast <bool> (E->isAltShuffle() && ((
Instruction::isBinaryOp(E->getOpcode()) && Instruction
::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E
->getOpcode()) && Instruction::isCast(E->getAltOpcode
())) || (isa<CmpInst>(VL0) && isa<CmpInst>
(E->getAltOp()))) && "Invalid Shuffle Vector Operand"
) ? void (0) : __assert_fail ("E->isAltShuffle() && ((Instruction::isBinaryOp(E->getOpcode()) && Instruction::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E->getOpcode()) && Instruction::isCast(E->getAltOpcode())) || (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && \"Invalid Shuffle Vector Operand\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6682, __extension__
__PRETTY_FUNCTION__))

6681

(isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&(static_cast <bool> (E->isAltShuffle() && ((
Instruction::isBinaryOp(E->getOpcode()) && Instruction
::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E
->getOpcode()) && Instruction::isCast(E->getAltOpcode
())) || (isa<CmpInst>(VL0) && isa<CmpInst>
(E->getAltOp()))) && "Invalid Shuffle Vector Operand"
) ? void (0) : __assert_fail ("E->isAltShuffle() && ((Instruction::isBinaryOp(E->getOpcode()) && Instruction::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E->getOpcode()) && Instruction::isCast(E->getAltOpcode())) || (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && \"Invalid Shuffle Vector Operand\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6682, __extension__
__PRETTY_FUNCTION__))

6682

"Invalid Shuffle Vector Operand")(static_cast <bool> (E->isAltShuffle() && ((
Instruction::isBinaryOp(E->getOpcode()) && Instruction
::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E
->getOpcode()) && Instruction::isCast(E->getAltOpcode
())) || (isa<CmpInst>(VL0) && isa<CmpInst>
(E->getAltOp()))) && "Invalid Shuffle Vector Operand"
) ? void (0) : __assert_fail ("E->isAltShuffle() && ((Instruction::isBinaryOp(E->getOpcode()) && Instruction::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E->getOpcode()) && Instruction::isCast(E->getAltOpcode())) || (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && \"Invalid Shuffle Vector Operand\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6682, __extension__
__PRETTY_FUNCTION__));

6683

InstructionCost ScalarCost = 0;

6684

if (NeedToShuffleReuses) {

6685

for (unsigned Idx : E->ReuseShuffleIndices) {

6686

Instruction *I = cast<Instruction>(VL[Idx]);

6687

CommonCost -= TTI->getInstructionCost(I, CostKind);

6688

}

6689

for (Value *V : VL) {

6690

Instruction *I = cast<Instruction>(V);

6691

CommonCost += TTI->getInstructionCost(I, CostKind);

6692

}

6693

}

6694

for (Value *V : VL) {

6695

Instruction *I = cast<Instruction>(V);

6696

assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode")(static_cast <bool> (E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"
) ? void (0) : __assert_fail ("E->isOpcodeOrAlt(I) && \"Unexpected main/alternate opcode\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6696, __extension__
__PRETTY_FUNCTION__));

6697

ScalarCost += TTI->getInstructionCost(I, CostKind);

6698

}

6699

// VecCost is equal to sum of the cost of creating 2 vectors

6700

// and the cost of creating shuffle.

6701

InstructionCost VecCost = 0;

6702

// Try to find the previous shuffle node with the same operands and same

6703

// main/alternate ops.

6704

auto &&TryFindNodeWithEqualOperands = [this, E]() {

6705

for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {

6706

if (TE.get() == E)

6707

break;

6708

if (TE->isAltShuffle() &&

6709

((TE->getOpcode() == E->getOpcode() &&

6710

TE->getAltOpcode() == E->getAltOpcode()) ||

6711

(TE->getOpcode() == E->getAltOpcode() &&

6712

TE->getAltOpcode() == E->getOpcode())) &&

6713

TE->hasEqualOperands(*E))

6714

return true;

6715

}

6716

return false;

6717

};

6718

if (TryFindNodeWithEqualOperands()) {

6719

LLVM_DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { { dbgs() << "SLP: diamond match for alternate node found.\n"
; E->dump(); }; } } while (false)

6720

dbgs() << "SLP: diamond match for alternate node found.\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { { dbgs() << "SLP: diamond match for alternate node found.\n"
; E->dump(); }; } } while (false)

6721

E->dump();do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { { dbgs() << "SLP: diamond match for alternate node found.\n"
; E->dump(); }; } } while (false)

6722

})do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { { dbgs() << "SLP: diamond match for alternate node found.\n"
; E->dump(); }; } } while (false);

6723

// No need to add new vector costs here since we're going to reuse

6724

// same main/alternate vector ops, just do different shuffling.

6725

} else if (Instruction::isBinaryOp(E->getOpcode())) {

6726

VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);

6727

VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,

6728

CostKind);

6729

} else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {

6730

VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,

6731

Builder.getInt1Ty(),

6732

CI0->getPredicate(), CostKind, VL0);

6733

VecCost += TTI->getCmpSelInstrCost(

6734

E->getOpcode(), ScalarTy, Builder.getInt1Ty(),

6735

cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,

6736

E->getAltOp());

6737

} else {

6738

Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();

6739

Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();

6740

auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());

6741

auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());

6742

VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,

6743

TTI::CastContextHint::None, CostKind);

6744

VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,

6745

TTI::CastContextHint::None, CostKind);

6746

}

6747

6748

if (E->ReuseShuffleIndices.empty()) {

6749

CommonCost =

6750

TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy);

6751

} else {

6752

SmallVector<int> Mask;

6753

buildShuffleEntryMask(

6754

E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,

6755

[E](Instruction *I) {

6756

assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode")(static_cast <bool> (E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"
) ? void (0) : __assert_fail ("E->isOpcodeOrAlt(I) && \"Unexpected main/alternate opcode\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6756, __extension__
__PRETTY_FUNCTION__));

6757

return I->getOpcode() == E->getAltOpcode();

6758

},

6759

Mask);

6760

CommonCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,

6761

FinalVecTy, Mask);

6762

}

6763

LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost))do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dumpTreeCosts(E, CommonCost, VecCost, ScalarCost);
} } while (false);

6764

return CommonCost + VecCost - ScalarCost;

6765

}

6766

default:

6767

llvm_unreachable("Unknown instruction")::llvm::llvm_unreachable_internal("Unknown instruction", "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp"
, 6767);

6768

}

6769

}

6770

6771

bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {

6772

LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Check whether the tree with height "
<< VectorizableTree.size() << " is fully vectorizable .\n"
; } } while (false)

6773

<< VectorizableTree.size() << " is fully vectorizable .\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Check whether the tree with height "
<< VectorizableTree.size() << " is fully vectorizable .\n"
; } } while (false);

6774

6775

auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {

6776

SmallVector<int> Mask;

6777

return TE->State == TreeEntry::NeedToGather &&

6778

!any_of(TE->Scalars,

6779

[this](Value *V) { return EphValues.contains(V); }) &&

6780

(allConstant(TE->Scalars) || isSplat(TE->Scalars) ||

6781

TE->Scalars.size() < Limit ||

6782

((TE->getOpcode() == Instruction::ExtractElement ||

6783

all_of(TE->Scalars,

6784

[](Value *V) {

6785

return isa<ExtractElementInst, UndefValue>(V);

6786

})) &&

6787

isFixedVectorShuffle(TE->Scalars, Mask)) ||

6788

(TE->State == TreeEntry::NeedToGather &&

6789

TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));

6790

};

6791

6792

// We only handle trees of heights 1 and 2.

6793

if (VectorizableTree.size() == 1 &&

6794

(VectorizableTree[0]->State == TreeEntry::Vectorize ||

6795

(ForReduction &&

6796

AreVectorizableGathers(VectorizableTree[0].get(),

6797

VectorizableTree[0]->Scalars.size()) &&

6798

VectorizableTree[0]->getVectorFactor() > 2)))

6799

return true;

6800

6801

if (VectorizableTree.size() != 2)

6802

return false;

6803

6804

// Handle splat and all-constants stores. Also try to vectorize tiny trees

6805

// with the second gather nodes if they have less scalar operands rather than

6806

// the initial tree element (may be profitable to shuffle the second gather)

6807

// or they are extractelements, which form shuffle.

6808

SmallVector<int> Mask;

6809

if (VectorizableTree[0]->State == TreeEntry::Vectorize &&

6810

AreVectorizableGathers(VectorizableTree[1].get(),

6811

VectorizableTree[0]->Scalars.size()))

6812

return true;

6813

6814

// Gathering cost would be too much for tiny trees.

6815

if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||

6816

(VectorizableTree[1]->State == TreeEntry::NeedToGather &&

6817

VectorizableTree[0]->State != TreeEntry::ScatterVectorize))

6818

return false;

6819

6820

return true;

6821

}

6822

6823

static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,

6824

TargetTransformInfo *TTI,

6825

bool MustMatchOrInst) {

6826

// Look past the root to find a source value. Arbitrarily follow the

6827

// path through operand 0 of any 'or'. Also, peek through optional

6828

// shift-left-by-multiple-of-8-bits.

6829

Value *ZextLoad = Root;

6830

const APInt *ShAmtC;

6831

bool FoundOr = false;

6832

while (!isa<ConstantExpr>(ZextLoad) &&

6833

(match(ZextLoad, m_Or(m_Value(), m_Value())) ||

6834

(match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&

6835

ShAmtC->urem(8) == 0))) {

6836

auto *BinOp = cast<BinaryOperator>(ZextLoad);

6837

ZextLoad = BinOp->getOperand(0);

6838

if (BinOp->getOpcode() == Instruction::Or)

6839

FoundOr = true;

6840

}

6841

// Check if the input is an extended load of the required or/shift expression.

6842

Value *Load;

6843

if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||

6844

!match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))

6845

return false;

6846

6847

// Require that the total load bit width is a legal integer type.

6848

// For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.

6849

// But <16 x i8> --> i128 is not, so the backend probably can't reduce it.

6850

Type *SrcTy = Load->getType();

6851

unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;

6852

if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))

6853

return false;

6854

6855

// Everything matched - assume that we can fold the whole sequence using

6856

// load combining.

6857

LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Assume load combining for tree starting at "
<< *(cast<Instruction>(Root)) << "\n"; } }
while (false)

6858

<< *(cast<Instruction>(Root)) << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Assume load combining for tree starting at "
<< *(cast<Instruction>(Root)) << "\n"; } }
while (false);

6859

6860

return true;

6861

}

6862

6863

/// Checks whether an 'or' reduction rooted at the first scalar of the first
/// tree node matches the load-combine pattern.
///
/// \param RdxKind kind of the reduction; anything other than RecurKind::Or
/// yields false without looking at the tree.
/// \returns the result of isLoadCombineCandidateImpl for the first scalar,
/// without requiring an 'or' instruction inside the matched chain.
bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
  if (RdxKind == RecurKind::Or) {
    const auto &RootScalars = VectorizableTree[0]->Scalars;
    return isLoadCombineCandidateImpl(RootScalars[0], RootScalars.size(), TTI,
                                      /* MatchOr */ false);
  }
  return false;
}

6872

6873

bool BoUpSLP::isLoadCombineCandidate() const {

6874

// Peek through a final sequence of stores and check if all operations are

6875

// likely to be load-combined.

6876

unsigned NumElts = VectorizableTree[0]->Scalars.size();

6877

for (Value *Scalar : VectorizableTree[0]->Scalars) {

6878

Value *X;

6879

if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||

6880

!isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))

6881

return false;

6882

}

6883

return true;

6884

}

6885

6886

bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {

6887

// No need to vectorize inserts of gathered values.

6888

if (VectorizableTree.size() == 2 &&

6889

isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&

6890

VectorizableTree[1]->State == TreeEntry::NeedToGather &&

6891

(VectorizableTree[1]->getVectorFactor() <= 2 ||

6892

!(isSplat(VectorizableTree[1]->Scalars) ||

6893

allConstant(VectorizableTree[1]->Scalars))))

6894

return true;

6895

6896

// We can vectorize the tree if its size is greater than or equal to the

6897

// minimum size specified by the MinTreeSize command line option.

6898

if (VectorizableTree.size() >= MinTreeSize)

6899

return false;

6900

6901

// If we have a tiny tree (a tree whose size is less than MinTreeSize), we

6902

// can vectorize it if we can prove it fully vectorizable.

6903

if (isFullyVectorizableTinyTree(ForReduction))

6904

return false;

6905

6906

assert(VectorizableTree.empty()(static_cast <bool> (VectorizableTree.empty() ? ExternalUses
.empty() : true && "We shouldn't have any external users"
) ? void (0) : __assert_fail ("VectorizableTree.empty() ? ExternalUses.empty() : true && \"We shouldn't have any external users\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6908, __extension__
__PRETTY_FUNCTION__))

6907

? ExternalUses.empty()(static_cast <bool> (VectorizableTree.empty() ? ExternalUses
.empty() : true && "We shouldn't have any external users"
) ? void (0) : __assert_fail ("VectorizableTree.empty() ? ExternalUses.empty() : true && \"We shouldn't have any external users\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6908, __extension__
__PRETTY_FUNCTION__))

6908

: true && "We shouldn't have any external users")(static_cast <bool> (VectorizableTree.empty() ? ExternalUses
.empty() : true && "We shouldn't have any external users"
) ? void (0) : __assert_fail ("VectorizableTree.empty() ? ExternalUses.empty() : true && \"We shouldn't have any external users\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 6908, __extension__
__PRETTY_FUNCTION__));

6909

6910

// Otherwise, we can't vectorize the tree. It is both tiny and not fully

6911

// vectorizable.

6912

return true;

6913

}

6914

6915

/// Estimates the cost of keeping vectorized values live across calls that lie
/// between the instructions of the tree.
///
/// \returns the accumulated cost reported by
/// TTI->getCostOfKeepingLiveOverCall, scaled by the number of calls found
/// between each pair of consecutively visited tree instructions.
InstructionCost BoUpSLP::getSpillCost() const {
  // Walk from the bottom of the tree to the top, tracking which values are
  // live. When we see a call instruction that is not part of our tree,
  // query TTI to see if there is a cost to keeping values live over it
  // (for example, if spills and fills are required).
  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
  InstructionCost Cost = 0;

  SmallPtrSet<Instruction *, 4> LiveValues;
  Instruction *PrevInst = nullptr;

  // The entries in VectorizableTree are not necessarily ordered by their
  // position in basic blocks. Collect them and order them by dominance so later
  // instructions are guaranteed to be visited first. For instructions in
  // different basic blocks, we only scan to the beginning of the block, so
  // their order does not matter, as long as all instructions in a basic block
  // are grouped together. Using dominance ensures a deterministic order.
  SmallVector<Instruction *, 16> OrderedScalars;
  for (const auto &TEPtr : VectorizableTree) {
    // Only the first scalar of each node is used; non-instructions are
    // ignored.
    if (auto *FirstInst = dyn_cast<Instruction>(TEPtr->Scalars[0]))
      OrderedScalars.push_back(FirstInst);
  }
  llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
    auto *NodeA = DT->getNode(A->getParent());
    auto *NodeB = DT->getNode(B->getParent());
    assert(NodeA && "Should only process reachable instructions");
    assert(NodeB && "Should only process reachable instructions");
    assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    // Within one block the later instruction sorts first; across blocks the
    // DFS-in number of the dominator tree node decides.
    if (NodeA == NodeB)
      return B->comesBefore(A);
    return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
  });

  for (Instruction *Inst : OrderedScalars) {
    // The very first instruction only seeds the walk.
    if (!PrevInst) {
      PrevInst = Inst;
      continue;
    }

    // Update LiveValues.
    LiveValues.erase(PrevInst);
    for (auto &Op : PrevInst->operands()) {
      auto *OpInst = dyn_cast<Instruction>(Op.get());
      if (OpInst && getTreeEntry(OpInst))
        LiveValues.insert(OpInst);
    }

    LLVM_DEBUG({
      dbgs() << "SLP: #LV: " << LiveValues.size();
      for (auto *X : LiveValues)
        dbgs() << " " << X->getName();
      dbgs() << ", Looking at ";
      Inst->dump();
    });

    // Now find the sequence of instructions between PrevInst and Inst.
    unsigned NumCalls = 0;
    BasicBlock::reverse_iterator StopIt = ++Inst->getIterator().getReverse();
    BasicBlock::reverse_iterator WalkIt = PrevInst->getIterator().getReverse();
    while (StopIt != WalkIt) {
      // Ran off the top of PrevInst's block: restart from the bottom of the
      // block that contains Inst.
      if (WalkIt == PrevInst->getParent()->rend()) {
        WalkIt = Inst->getParent()->rbegin();
        continue;
      }

      // Debug information does not impact spill cost.
      Instruction *Cur = &*WalkIt;
      if (Cur != PrevInst && isa<CallInst>(Cur) && !isa<DbgInfoIntrinsic>(Cur))
        NumCalls++;

      ++WalkIt;
    }

    if (NumCalls) {
      // Model every live value as a vector of BundleWidth elements of its
      // scalar element type.
      SmallVector<Type *, 4> LiveVecTys;
      for (auto *Live : LiveValues) {
        auto *ScalarTy = Live->getType();
        if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
          ScalarTy = VectorTy->getElementType();
        LiveVecTys.push_back(FixedVectorType::get(ScalarTy, BundleWidth));
      }
      Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(LiveVecTys);
    }

    PrevInst = Inst;
  }

  return Cost;
}

7008

7009

/// Check if two insertelement instructions are from the same buildvector.

7010

static bool areTwoInsertFromSameBuildVector(

7011

InsertElementInst *VU, InsertElementInst *V,

7012

function_ref<Value *(InsertElementInst *)> GetBaseOperand) {

7013

// Instructions must be from the same basic blocks.

7014

if (VU->getParent() != V->getParent())

7015

return false;

7016

// Checks if 2 insertelements are from the same buildvector.

7017

if (VU->getType() != V->getType())

7018

return false;

7019

// Multiple used inserts are separate nodes.

7020

if (!VU->hasOneUse() && !V->hasOneUse())

7021

return false;

7022

auto *IE1 = VU;

7023

auto *IE2 = V;

7024

unsigned Idx1 = *getInsertIndex(IE1);

7025

unsigned Idx2 = *getInsertIndex(IE2);

7026

// Go through the vector operand of insertelement instructions trying to find

7027

// either VU as the original vector for IE2 or V as the original vector for

7028

// IE1.

7029

do {

7030

if (IE2 == VU)

7031

return VU->hasOneUse();

7032

if (IE1 == V)

7033

return V->hasOneUse();

7034

if (IE1) {

7035

if ((IE1 != VU && !IE1->hasOneUse()) ||

7036

getInsertIndex(IE1).value_or(Idx2) == Idx2)

7037

IE1 = nullptr;

7038

else

7039

IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));

7040

}

7041

if (IE2) {

7042

if ((IE2 != V && !IE2->hasOneUse()) ||

7043

getInsertIndex(IE2).value_or(Idx1) == Idx1)

7044

IE2 = nullptr;

7045

else

7046

IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));

7047

}

7048

} while (IE1 || IE2);

7049

return false;

7050

}

7051

7052

/// Checks if the \p IE1 instructions is followed by \p IE2 instruction in the

7053

/// buildvector sequence.

7054

static bool isFirstInsertElement(const InsertElementInst *IE1,

7055

const InsertElementInst *IE2) {

7056

if (IE1 == IE2)

7057

return false;

7058

const auto *I1 = IE1;

7059

const auto *I2 = IE2;

7060

const InsertElementInst *PrevI1;

7061

const InsertElementInst *PrevI2;

7062

unsigned Idx1 = *getInsertIndex(IE1);

7063

unsigned Idx2 = *getInsertIndex(IE2);

7064

do {

7065

if (I2 == IE1)

7066

return true;

7067

if (I1 == IE2)

7068

return false;

7069

PrevI1 = I1;

7070

PrevI2 = I2;

7071

if (I1 && (I1 == IE1 || I1->hasOneUse()) &&

7072

getInsertIndex(I1).value_or(Idx2) != Idx2)

7073

I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));

7074

if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&

7075

getInsertIndex(I2).value_or(Idx1) != Idx1)

7076

I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));

7077

} while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));

7078

llvm_unreachable("Two different buildvectors not expected.")::llvm::llvm_unreachable_internal("Two different buildvectors not expected."
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7078);

7079

}

7080

7081

namespace {

7082

/// Returns incoming Value *, if the requested type is Value * too, or a default

7083

/// value, otherwise.

7084

struct ValueSelect {

7085

template <typename U>

7086

static std::enable_if_t<std::is_same<Value *, U>::value, Value *>

7087

get(Value *V) {

7088

return V;

7089

}

7090

template <typename U>

7091

static std::enable_if_t<!std::is_same<Value *, U>::value, U> get(Value *) {

7092

return U();

7093

}

7094

};

7095

} // namespace

7096

7097

/// Does the analysis of the provided shuffle masks and performs the requested

7098

/// actions on the vectors with the given shuffle masks. It tries to do it in

7099

/// several steps.

7100

/// 1. If the Base vector is not undef vector, resizing the very first mask to

7101

/// have common VF and perform action for 2 input vectors (including non-undef

7102

/// Base). Other shuffle masks are combined with the resulting after the 1 stage

7103

/// and processed as a shuffle of 2 elements.

7104

/// 2. If the Base is undef vector and have only 1 shuffle mask, perform the

7105

/// action only for 1 vector with the given mask, if it is not the identity

7106

/// mask.

7107

/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2

7108

/// vectors, combing the masks properly between the steps.

7109

template <typename T>

7110

static T *performExtractsShuffleAction(

7111

MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,

7112

function_ref<unsigned(T *)> GetVF,

7113

function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,

7114

function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {

7115

assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.")(static_cast <bool> (!ShuffleMask.empty() && "Empty list of shuffles for inserts."
) ? void (0) : __assert_fail ("!ShuffleMask.empty() && \"Empty list of shuffles for inserts.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7115, __extension__
__PRETTY_FUNCTION__));

7116

SmallVector<int> Mask(ShuffleMask.begin()->second);

7117

auto VMIt = std::next(ShuffleMask.begin());

7118

T *Prev = nullptr;

7119

bool IsBaseNotUndef = !isUndefVector(Base, Mask);

7120

if (IsBaseNotUndef) {

7121

// Base is not undef, need to combine it with the next subvectors.

7122

std::pair<T *, bool> Res =

7123

ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);

7124

for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {

7125

if (Mask[Idx] == UndefMaskElem)

7126

Mask[Idx] = Idx;

7127

else

7128

Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;

7129

}

7130

auto *V = ValueSelect::get<T *>(Base);

7131

(void)V;

7132

assert((!V || GetVF(V) == Mask.size()) &&(static_cast <bool> ((!V || GetVF(V) == Mask.size()) &&
"Expected base vector of VF number of elements.") ? void (0)
: __assert_fail ("(!V || GetVF(V) == Mask.size()) && \"Expected base vector of VF number of elements.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7133, __extension__
__PRETTY_FUNCTION__))

7133

"Expected base vector of VF number of elements.")(static_cast <bool> ((!V || GetVF(V) == Mask.size()) &&
"Expected base vector of VF number of elements.") ? void (0)
: __assert_fail ("(!V || GetVF(V) == Mask.size()) && \"Expected base vector of VF number of elements.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7133, __extension__
__PRETTY_FUNCTION__));

7134

Prev = Action(Mask, {nullptr, Res.first});

7135

} else if (ShuffleMask.size() == 1) {

7136

// Base is undef and only 1 vector is shuffled - perform the action only for

7137

// single vector, if the mask is not the identity mask.

7138

std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,

7139

/*ForSingleMask=*/true);

7140

if (Res.second)

7141

// Identity mask is found.

7142

Prev = Res.first;

7143

else

7144

Prev = Action(Mask, {ShuffleMask.begin()->first});

7145

} else {

7146

// Base is undef and at least 2 input vectors shuffled - perform 2 vectors

7147

// shuffles step by step, combining shuffle between the steps.

7148

unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);

7149

unsigned Vec2VF = GetVF(VMIt->first);

7150

if (Vec1VF == Vec2VF) {

7151

// No need to resize the input vectors since they are of the same size, we

7152

// can shuffle them directly.

7153

ArrayRef<int> SecMask = VMIt->second;

7154

for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {

7155

if (SecMask[I] != UndefMaskElem) {

7156

assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars.")(static_cast <bool> (Mask[I] == UndefMaskElem &&
"Multiple uses of scalars.") ? void (0) : __assert_fail ("Mask[I] == UndefMaskElem && \"Multiple uses of scalars.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7156, __extension__
__PRETTY_FUNCTION__));

7157

Mask[I] = SecMask[I] + Vec1VF;

7158

}

7159

}

7160

Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});

7161

} else {

7162

// Vectors of different sizes - resize and reshuffle.

7163

std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,

7164

/*ForSingleMask=*/false);

7165

std::pair<T *, bool> Res2 =

7166

ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);

7167

ArrayRef<int> SecMask = VMIt->second;

7168

for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {

7169

if (Mask[I] != UndefMaskElem) {

7170

assert(SecMask[I] == UndefMaskElem && "Multiple uses of scalars.")(static_cast <bool> (SecMask[I] == UndefMaskElem &&
"Multiple uses of scalars.") ? void (0) : __assert_fail ("SecMask[I] == UndefMaskElem && \"Multiple uses of scalars.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7170, __extension__
__PRETTY_FUNCTION__));

7171

if (Res1.second)

7172

Mask[I] = I;

7173

} else if (SecMask[I] != UndefMaskElem) {

7174

assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars.")(static_cast <bool> (Mask[I] == UndefMaskElem &&
"Multiple uses of scalars.") ? void (0) : __assert_fail ("Mask[I] == UndefMaskElem && \"Multiple uses of scalars.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7174, __extension__
__PRETTY_FUNCTION__));

7175

Mask[I] = (Res2.second ? I : SecMask[I]) + VF;

7176

}

7177

}

7178

Prev = Action(Mask, {Res1.first, Res2.first});

7179

}

7180

VMIt = std::next(VMIt);

7181

}

7182

// Perform requested actions for the remaining masks/vectors.

7183

for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {

7184

// Shuffle other input vectors, if any.

7185

std::pair<T *, bool> Res =

7186

ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);

7187

ArrayRef<int> SecMask = VMIt->second;

7188

for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {

7189

if (SecMask[I] != UndefMaskElem) {

7190

assert((Mask[I] == UndefMaskElem || IsBaseNotUndef) &&(static_cast <bool> ((Mask[I] == UndefMaskElem || IsBaseNotUndef
) && "Multiple uses of scalars.") ? void (0) : __assert_fail
("(Mask[I] == UndefMaskElem || IsBaseNotUndef) && \"Multiple uses of scalars.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7191, __extension__
__PRETTY_FUNCTION__))

7191

"Multiple uses of scalars.")(static_cast <bool> ((Mask[I] == UndefMaskElem || IsBaseNotUndef
) && "Multiple uses of scalars.") ? void (0) : __assert_fail
("(Mask[I] == UndefMaskElem || IsBaseNotUndef) && \"Multiple uses of scalars.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7191, __extension__
__PRETTY_FUNCTION__));

7192

Mask[I] = (Res.second ? I : SecMask[I]) + VF;

7193

} else if (Mask[I] != UndefMaskElem) {

7194

Mask[I] = I;

7195

}

7196

}

7197

Prev = Action(Mask, {Prev, Res.first});

7198

}

7199

return Prev;

7200

}

7201

7202

InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {

7203

InstructionCost Cost = 0;

7204

LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n"; } } while (
false)

7205

<< VectorizableTree.size() << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n"; } } while (
false);

7206

7207

unsigned BundleWidth = VectorizableTree[0]->Scalars.size();

7208

7209

for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {

7210

TreeEntry &TE = *VectorizableTree[I];

7211

if (TE.State == TreeEntry::NeedToGather) {

7212

if (const TreeEntry *E = getTreeEntry(TE.getMainOp());

7213

E && E->getVectorFactor() == TE.getVectorFactor() &&

7214

E->isSame(TE.Scalars)) {

7215

// Some gather nodes might be absolutely the same as some vectorizable

7216

// nodes after reordering, need to handle it.

7217

LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle that starts with "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost 0 for bundle that starts with "
<< *TE.Scalars[0] << ".\n" << "SLP: Current total cost = "
<< Cost << "\n"; } } while (false)

7218

<< *TE.Scalars[0] << ".\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost 0 for bundle that starts with "
<< *TE.Scalars[0] << ".\n" << "SLP: Current total cost = "
<< Cost << "\n"; } } while (false)

7219

<< "SLP: Current total cost = " << Cost << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost 0 for bundle that starts with "
<< *TE.Scalars[0] << ".\n" << "SLP: Current total cost = "
<< Cost << "\n"; } } while (false);

7220

continue;

7221

}

7222

}

7223

7224

InstructionCost C = getEntryCost(&TE, VectorizedVals);

7225

Cost += C;

7226

LLVM_DEBUG(dbgs() << "SLP: Adding cost " << Cdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost " << C <<
" for bundle that starts with " << *TE.Scalars[0] <<
".\n" << "SLP: Current total cost = " << Cost <<
"\n"; } } while (false)

7227

<< " for bundle that starts with " << *TE.Scalars[0]do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost " << C <<
" for bundle that starts with " << *TE.Scalars[0] <<
".\n" << "SLP: Current total cost = " << Cost <<
"\n"; } } while (false)

7228

<< ".\n"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost " << C <<
" for bundle that starts with " << *TE.Scalars[0] <<
".\n" << "SLP: Current total cost = " << Cost <<
"\n"; } } while (false)

7229

<< "SLP: Current total cost = " << Cost << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost " << C <<
" for bundle that starts with " << *TE.Scalars[0] <<
".\n" << "SLP: Current total cost = " << Cost <<
"\n"; } } while (false);

7230

}

7231

7232

SmallPtrSet<Value *, 16> ExtractCostCalculated;

7233

InstructionCost ExtractCost = 0;

7234

SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;

7235

SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;

7236

SmallVector<APInt> DemandedElts;

7237

for (ExternalUser &EU : ExternalUses) {

7238

// We only add extract cost once for the same scalar.

7239

if (!isa_and_nonnull<InsertElementInst>(EU.User) &&

7240

!ExtractCostCalculated.insert(EU.Scalar).second)

7241

continue;

7242

7243

// Uses by ephemeral values are free (because the ephemeral value will be

7244

// removed prior to code generation, and so the extraction will be

7245

// removed as well).

7246

if (EphValues.count(EU.User))

7247

continue;

7248

7249

// No extract cost for vector "scalar"

7250

if (isa<FixedVectorType>(EU.Scalar->getType()))

7251

continue;

7252

7253

// Already counted the cost for external uses when tried to adjust the cost

7254

// for extractelements, no need to add it again.

7255

if (isa<ExtractElementInst>(EU.Scalar))

7256

continue;

7257

7258

// If found user is an insertelement, do not calculate extract cost but try

7259

// to detect it as a final shuffled/identity match.

7260

if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {

7261

if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {

7262

Optional<unsigned> InsertIdx = getInsertIndex(VU);

7263

if (InsertIdx) {

7264

const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);

7265

auto *It = find_if(

7266

FirstUsers,

7267

[this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {

7268

return areTwoInsertFromSameBuildVector(

7269

VU, cast<InsertElementInst>(Pair.first),

7270

[this](InsertElementInst *II) -> Value * {

7271

Value *Op0 = II->getOperand(0);

7272

if (getTreeEntry(II) && !getTreeEntry(Op0))

7273

return nullptr;

7274

return Op0;

7275

});

7276

});

7277

int VecId = -1;

7278

if (It == FirstUsers.end()) {

7279

(void)ShuffleMasks.emplace_back();

7280

SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];

7281

if (Mask.empty())

7282

Mask.assign(FTy->getNumElements(), UndefMaskElem);

7283

// Find the insertvector, vectorized in tree, if any.

7284

Value *Base = VU;

7285

while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {

7286

if (IEBase != EU.User &&

7287

(!IEBase->hasOneUse() ||

7288

getInsertIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))

7289

break;

7290

// Build the mask for the vectorized insertelement instructions.

7291

if (const TreeEntry *E = getTreeEntry(IEBase)) {

7292

VU = IEBase;

7293

do {

7294

IEBase = cast<InsertElementInst>(Base);

7295

int Idx = *getInsertIndex(IEBase);

7296

assert(Mask[Idx] == UndefMaskElem &&(static_cast <bool> (Mask[Idx] == UndefMaskElem &&
"InsertElementInstruction used already.") ? void (0) : __assert_fail
("Mask[Idx] == UndefMaskElem && \"InsertElementInstruction used already.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7297, __extension__
__PRETTY_FUNCTION__))

7297

"InsertElementInstruction used already.")(static_cast <bool> (Mask[Idx] == UndefMaskElem &&
"InsertElementInstruction used already.") ? void (0) : __assert_fail
("Mask[Idx] == UndefMaskElem && \"InsertElementInstruction used already.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7297, __extension__
__PRETTY_FUNCTION__));

7298

Mask[Idx] = Idx;

7299

Base = IEBase->getOperand(0);

7300

} while (E == getTreeEntry(Base));

7301

break;

7302

}

7303

Base = cast<InsertElementInst>(Base)->getOperand(0);

7304

}

7305

FirstUsers.emplace_back(VU, ScalarTE);

7306

DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));

7307

VecId = FirstUsers.size() - 1;

7308

} else {

7309

if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))

7310

It->first = VU;

7311

VecId = std::distance(FirstUsers.begin(), It);

7312

}

7313

int InIdx = *InsertIdx;

7314

SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];

7315

if (Mask.empty())

7316

Mask.assign(FTy->getNumElements(), UndefMaskElem);

7317

Mask[InIdx] = EU.Lane;

7318

DemandedElts[VecId].setBit(InIdx);

7319

continue;

7320

}

7321

}

7322

}

7323

7324

// If we plan to rewrite the tree in a smaller type, we will need to sign

7325

// extend the extracted value back to the original type. Here, we account

7326

// for the extract and the added cost of the sign extend if needed.

7327

auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);

7328

auto *ScalarRoot = VectorizableTree[0]->Scalars[0];

7329

if (MinBWs.count(ScalarRoot)) {

7330

auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);

7331

auto Extend =

7332

MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;

7333

VecTy = FixedVectorType::get(MinTy, BundleWidth);

7334

ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),

7335

VecTy, EU.Lane);

7336

} else {

7337

ExtractCost +=

7338

TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);

7339

}

7340

}

7341

7342

InstructionCost SpillCost = getSpillCost();

7343

Cost += SpillCost + ExtractCost;

7344

auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,

7345

bool) {

7346

InstructionCost C = 0;

7347

unsigned VF = Mask.size();

7348

unsigned VecVF = TE->getVectorFactor();

7349

if (VF != VecVF &&

7350

(any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||

7351

(all_of(Mask,

7352

[VF](int Idx) { return Idx < 2 * static_cast<int>(VF); }) &&

7353

!ShuffleVectorInst::isIdentityMask(Mask)))) {

7354

SmallVector<int> OrigMask(VecVF, UndefMaskElem);

7355

std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),

7356

OrigMask.begin());

7357

C = TTI->getShuffleCost(

7358

TTI::SK_PermuteSingleSrc,

7359

FixedVectorType::get(TE->getMainOp()->getType(), VecVF), OrigMask);

7360

LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost " << C <<
" for final shuffle of insertelement external users.\n"; TE->
dump(); dbgs() << "SLP: Current total cost = " <<
Cost << "\n"; } } while (false)

7361

dbgs() << "SLP: Adding cost " << Cdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost " << C <<
" for final shuffle of insertelement external users.\n"; TE->
dump(); dbgs() << "SLP: Current total cost = " <<
Cost << "\n"; } } while (false)

7362

<< " for final shuffle of insertelement external users.\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost " << C <<
" for final shuffle of insertelement external users.\n"; TE->
dump(); dbgs() << "SLP: Current total cost = " <<
Cost << "\n"; } } while (false)

7363

TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost " << C <<
" for final shuffle of insertelement external users.\n"; TE->
dump(); dbgs() << "SLP: Current total cost = " <<
Cost << "\n"; } } while (false);

7364

Cost += C;

7365

return std::make_pair(TE, true);

7366

}

7367

return std::make_pair(TE, false);

7368

};

7369

// Calculate the cost of the reshuffled vectors, if any.

7370

for (int I = 0, E = FirstUsers.size(); I < E; ++I) {

7371

Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);

7372

unsigned VF = ShuffleMasks[I].begin()->second.size();

7373

auto *FTy = FixedVectorType::get(

7374

cast<VectorType>(FirstUsers[I].first->getType())->getElementType(), VF);

7375

auto Vector = ShuffleMasks[I].takeVector();

7376

auto &&EstimateShufflesCost = [this, FTy,

7377

&Cost](ArrayRef<int> Mask,

7378

ArrayRef<const TreeEntry *> TEs) {

7379

assert((TEs.size() == 1 || TEs.size() == 2) &&(static_cast <bool> ((TEs.size() == 1 || TEs.size() == 2
) && "Expected exactly 1 or 2 tree entries.") ? void (
0) : __assert_fail ("(TEs.size() == 1 || TEs.size() == 2) && \"Expected exactly 1 or 2 tree entries.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7380, __extension__
__PRETTY_FUNCTION__))

7380

"Expected exactly 1 or 2 tree entries.")(static_cast <bool> ((TEs.size() == 1 || TEs.size() == 2
) && "Expected exactly 1 or 2 tree entries.") ? void (
0) : __assert_fail ("(TEs.size() == 1 || TEs.size() == 2) && \"Expected exactly 1 or 2 tree entries.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7380, __extension__
__PRETTY_FUNCTION__));

7381

if (TEs.size() == 1) {

7382

int Limit = 2 * Mask.size();

7383

if (!all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) ||

7384

!ShuffleVectorInst::isIdentityMask(Mask)) {

7385

InstructionCost C =

7386

TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask);

7387

LLVM_DEBUG(dbgs() << "SLP: Adding cost " << Cdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost " << C <<
" for final shuffle of insertelement " "external users.\n"; TEs
.front()->dump(); dbgs() << "SLP: Current total cost = "
<< Cost << "\n"; } } while (false)

7388

<< " for final shuffle of insertelement "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost " << C <<
" for final shuffle of insertelement " "external users.\n"; TEs
.front()->dump(); dbgs() << "SLP: Current total cost = "
<< Cost << "\n"; } } while (false)

7389

"external users.\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost " << C <<
" for final shuffle of insertelement " "external users.\n"; TEs
.front()->dump(); dbgs() << "SLP: Current total cost = "
<< Cost << "\n"; } } while (false)

7390

TEs.front()->dump();do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost " << C <<
" for final shuffle of insertelement " "external users.\n"; TEs
.front()->dump(); dbgs() << "SLP: Current total cost = "
<< Cost << "\n"; } } while (false)

7391

dbgs() << "SLP: Current total cost = " << Cost << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost " << C <<
" for final shuffle of insertelement " "external users.\n"; TEs
.front()->dump(); dbgs() << "SLP: Current total cost = "
<< Cost << "\n"; } } while (false);

7392

Cost += C;

7393

}

7394

} else {

7395

InstructionCost C =

7396

TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, FTy, Mask);

7397

LLVM_DEBUG(dbgs() << "SLP: Adding cost " << Cdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost " << C <<
" for final shuffle of vector node and external " "insertelement users.\n"
; if (TEs.front()) { TEs.front()->dump(); } TEs.back()->
dump(); dbgs() << "SLP: Current total cost = " <<
Cost << "\n"; } } while (false)

7398

<< " for final shuffle of vector node and external "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost " << C <<
" for final shuffle of vector node and external " "insertelement users.\n"
; if (TEs.front()) { TEs.front()->dump(); } TEs.back()->
dump(); dbgs() << "SLP: Current total cost = " <<
Cost << "\n"; } } while (false)

7399

"insertelement users.\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost " << C <<
" for final shuffle of vector node and external " "insertelement users.\n"
; if (TEs.front()) { TEs.front()->dump(); } TEs.back()->
dump(); dbgs() << "SLP: Current total cost = " <<
Cost << "\n"; } } while (false)

7400

if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost " << C <<
" for final shuffle of vector node and external " "insertelement users.\n"
; if (TEs.front()) { TEs.front()->dump(); } TEs.back()->
dump(); dbgs() << "SLP: Current total cost = " <<
Cost << "\n"; } } while (false)

7401

dbgs() << "SLP: Current total cost = " << Cost << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost " << C <<
" for final shuffle of vector node and external " "insertelement users.\n"
; if (TEs.front()) { TEs.front()->dump(); } TEs.back()->
dump(); dbgs() << "SLP: Current total cost = " <<
Cost << "\n"; } } while (false);

7402

Cost += C;

7403

}

7404

return TEs.back();

7405

};

7406

(void)performExtractsShuffleAction<const TreeEntry>(

7407

makeMutableArrayRef(Vector.data(), Vector.size()), Base,

7408

[](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,

7409

EstimateShufflesCost);

7410

InstructionCost InsertCost = TTI->getScalarizationOverhead(

7411

cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],

7412

/*Insert*/ true, /*Extract*/ false);

7413

Cost -= InsertCost;

7414

}

7415

7416

#ifndef NDEBUG

7417

SmallString<256> Str;

7418

{

7419

raw_svector_ostream OS(Str);

7420

OS << "SLP: Spill Cost = " << SpillCost << ".\n"

7421

<< "SLP: Extract Cost = " << ExtractCost << ".\n"

7422

<< "SLP: Total Cost = " << Cost << ".\n";

7423

}

7424

LLVM_DEBUG(dbgs() << Str)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << Str; } } while (false);

7425

if (ViewSLPTree)

7426

ViewGraph(this, "SLP" + F->getName(), false, Str);

7427

#endif

7428

7429

return Cost;

7430

}

7431

7432

/// Checks whether the gather node \p TE can be expressed as a shuffle of one
/// or two nodes that appear before it in VectorizableTree.
///
/// \param TE      The gather node being analysed.
/// \param Mask    Reset to TE->Scalars.size() undef elements on entry; on
///                success holds, for every non-undef scalar, its lane in the
///                chosen source node(s). Lanes taken from the second source
///                are offset by the common vector factor.
/// \param Entries Cleared on entry; on success holds the one or two source
///                nodes the mask refers to.
/// \returns SK_PermuteSingleSrc or SK_PermuteTwoSrc depending on how many
///          source nodes were selected, or None if the scalars cannot be
///          covered by at most two earlier nodes. When None is returned, Mask
///          and Entries may have been partially filled.
Optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
                               SmallVectorImpl<const TreeEntry *> &Entries) {
  using TreeEntrySet = SmallPtrSet<const TreeEntry *, 4>;

  // TODO: currently checking only for Scalars in the tree entry, need to count
  // reused elements too for better cost estimation.
  Mask.assign(TE->Scalars.size(), UndefMaskElem);
  Entries.clear();

  // Map every scalar to the gather nodes that precede TE in the tree and
  // contain that scalar.
  DenseMap<Value *, TreeEntrySet> GatherNodesOfValue;
  for (const std::unique_ptr<TreeEntry> &Node : VectorizableTree) {
    const TreeEntry *Candidate = Node.get();
    if (Candidate == TE)
      break;
    if (Candidate->State == TreeEntry::NeedToGather)
      for (Value *Scalar : Candidate->Scalars)
        GatherNodesOfValue[Scalar].insert(Candidate);
  }

  // For each gathered scalar collect the set of nodes that provide it and
  // intersect that set with the sets collected so far. A single surviving set
  // means a permutation of one vector; two sets mean a permutation of two
  // input vectors; anything beyond that is not a shuffle.
  SmallVector<TreeEntrySet> SourceSets;
  DenseMap<Value *, int> SourceSetOfValue;
  for (Value *Scalar : TE->Scalars) {
    if (isa<UndefValue>(Scalar))
      continue;

    // All nodes in which this scalar is available.
    TreeEntrySet Providers;
    auto Found = GatherNodesOfValue.find(Scalar);
    if (Found != GatherNodesOfValue.end())
      Providers = Found->second;
    if (const TreeEntry *VectorizedIn = getTreeEntry(Scalar))
      Providers.insert(VectorizedIn);
    if (Providers.empty())
      return None;

    // The very first scalar simply seeds the first source set. It is not
    // recorded in SourceSetOfValue; the later lookup() yields 0 for it.
    if (SourceSets.empty()) {
      SourceSets.push_back(Providers);
      continue;
    }

    // Look for an already known source set that shares a node with the
    // providers of this scalar and narrow that set to the intersection.
    TreeEntrySet AllProviders(Providers);
    unsigned SetIdx = 0;
    for (unsigned NumSets = SourceSets.size(); SetIdx != NumSets; ++SetIdx) {
      TreeEntrySet &Known = SourceSets[SetIdx];
      set_intersect(Providers, Known);
      if (!Providers.empty()) {
        Known.swap(Providers);
        break;
      }
      Providers = AllProviders;
    }

    // Nothing in common with any known set: this scalar needs another source
    // vector. More than two sources is not a permutation, so fall back to the
    // regular gather in that case.
    if (SetIdx == SourceSets.size()) {
      if (SourceSets.size() == 2)
        return None;
      SourceSets.push_back(AllProviders);
      SetIdx = SourceSets.size() - 1;
    }
    SourceSetOfValue.try_emplace(Scalar, SetIdx);
  }

  if (SourceSets.empty()) {
    assert(all_of(TE->Scalars, UndefValue::classof) &&
           "Expected vector of undefs only.");
    return None;
  }

  unsigned VF = 0;
  if (SourceSets.size() == 1) {
    TreeEntrySet &OnlySet = SourceSets.front();
    // Prefer a node that holds exactly the same scalars; the mask is then the
    // identity sequence.
    for (const TreeEntry *Candidate : OnlySet) {
      if (!Candidate->isSame(TE->Scalars))
        continue;
      Entries.push_back(Candidate);
      std::iota(Mask.begin(), Mask.end(), 0);
      return TargetTransformInfo::SK_PermuteSingleSrc;
    }
    // No perfect match, just shuffle, so choose the first tree node.
    Entries.push_back(*OnlySet.begin());
  } else {
    assert(SourceSets.size() == 2 && "Expected at max 2 permuted entries.");
    // Pick one node from each set such that both share a vector factor.
    DenseMap<int, const TreeEntry *> FirstSetByVF;
    for (const TreeEntry *FirstSrc : SourceSets.front())
      FirstSetByVF.try_emplace(FirstSrc->getVectorFactor(), FirstSrc);
    for (const TreeEntry *SecondSrc : SourceSets.back()) {
      auto Match = FirstSetByVF.find(SecondSrc->getVectorFactor());
      if (Match == FirstSetByVF.end())
        continue;
      VF = Match->first;
      Entries.push_back(Match->second);
      Entries.push_back(SecondSrc);
      break;
    }
    // No 2 source vectors with the same vector factor - give up and do regular
    // gather.
    if (Entries.empty())
      return None;
  }

  // Build a shuffle mask for better cost estimation and vector emission.
  for (int Lane = 0, NumLanes = TE->Scalars.size(); Lane < NumLanes; ++Lane) {
    Value *Scalar = TE->Scalars[Lane];
    if (isa<UndefValue>(Scalar))
      continue;
    unsigned SrcIdx = SourceSetOfValue.lookup(Scalar);
    const TreeEntry *Src = Entries[SrcIdx];
    int SrcLane = Src->findLaneForValue(Scalar);
    Mask[Lane] = SrcIdx * VF + SrcLane;
    // Extra check required by isSingleSourceMaskImpl function (called by
    // ShuffleVectorInst::isSingleSourceMask).
    if (Mask[Lane] >= 2 * NumLanes)
      return None;
  }

  if (Entries.size() == 1)
    return TargetTransformInfo::SK_PermuteSingleSrc;
  if (Entries.size() == 2)
    return TargetTransformInfo::SK_PermuteTwoSrc;
  return None;
}

7570

7571

/// Returns the cost of building a vector of type \p Ty from scalars.
///
/// \param Ty              The vector type being built.
/// \param ShuffledIndices Lanes that are excluded from the insertion cost;
///                        only the complement of this set is passed to the
///                        scalarization-overhead query.
/// \param NeedToShuffle   When true, the cost of one single-source permute of
///                        \p Ty is added on top of the insertion cost.
InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty,
                                       const APInt &ShuffledIndices,
                                       bool NeedToShuffle) const {
  // Only lanes that are not marked in ShuffledIndices are costed as inserts.
  const APInt InsertedLanes = ~ShuffledIndices;
  InstructionCost InsertCost = TTI->getScalarizationOverhead(
      Ty, InsertedLanes, /*Insert*/ true, /*Extract*/ false);
  if (!NeedToShuffle)
    return InsertCost;
  InsertCost +=
      TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
  return InsertCost;
}

7581

7582

/// Returns the cost of gathering the scalars in \p VL into one vector.
///
/// The element type is the type of VL[0], or the stored value's type when
/// VL[0] is a store. Constant lanes and repeated non-constant lanes are marked
/// as not needing an insert; if any non-constant value repeats, the cost of an
/// extra shuffle is requested from the three-argument overload.
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
  // Find the type of the operands in VL.
  auto *FirstStore = dyn_cast<StoreInst>(VL[0]);
  Type *ElemTy = FirstStore ? FirstStore->getValueOperand()->getType()
                            : VL[0]->getType();
  const unsigned NumLanes = VL.size();
  auto *GatherTy = FixedVectorType::get(ElemTy, NumLanes);

  // Find the cost of inserting/extracting values from the vector.
  // Check if the same elements are inserted several times and count them as
  // shuffle candidates.
  APInt LanesWithoutInsert = APInt::getZero(NumLanes);
  DenseSet<Value *> SeenValues;
  bool HasRepeatedNonConst = false;
  // Iterate in reverse order to consider insert elements with the high cost.
  for (unsigned Lane = NumLanes; Lane-- > 0;) {
    Value *Scalar = VL[Lane];
    // No need to shuffle duplicates for constants.
    const bool IsConst = isConstant(Scalar);
    // Constants are never recorded in the seen-set.
    const bool IsRepeat = !IsConst && !SeenValues.insert(Scalar).second;
    if (IsConst || IsRepeat)
      LanesWithoutInsert.setBit(Lane);
    if (IsRepeat)
      HasRepeatedNonConst = true;
  }
  return getGatherCost(GatherTy, LanesWithoutInsert, HasRepeatedNonConst);
}

7609

7610

// Perform operand reordering on the instructions in VL and return the reordered

7611

// operands in Left and Right.

7612

void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,

7613

SmallVectorImpl<Value *> &Left,

7614

SmallVectorImpl<Value *> &Right,

7615

const DataLayout &DL,

7616

ScalarEvolution &SE,

7617

const BoUpSLP &R) {

7618

if (VL.empty())

7619

return;

7620

VLOperands Ops(VL, DL, SE, R);

7621

// Reorder the operands in place.

7622

Ops.reorder();

7623

Left = Ops.getVL(0);

7624

Right = Ops.getVL(1);

7625

}

7626

7627

Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {

7628

// Get the basic block this bundle is in. All instructions in the bundle

7629

// should be in this block (except for extractelement-like instructions with

7630

// constant indices).

7631

auto *Front = E->getMainOp();

7632

auto *BB = Front->getParent();

7633

assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {(static_cast <bool> (llvm::all_of(E->Scalars, [=](Value
*V) -> bool { if (E->getOpcode() == Instruction::GetElementPtr
&& !isa<GetElementPtrInst>(V)) return true; auto
*I = cast<Instruction>(V); return !E->isOpcodeOrAlt
(I) || I->getParent() == BB || isVectorLikeInstWithConstOps
(I); })) ? void (0) : __assert_fail ("llvm::all_of(E->Scalars, [=](Value *V) -> bool { if (E->getOpcode() == Instruction::GetElementPtr && !isa<GetElementPtrInst>(V)) return true; auto *I = cast<Instruction>(V); return !E->isOpcodeOrAlt(I) || I->getParent() == BB || isVectorLikeInstWithConstOps(I); })"
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7640, __extension__
__PRETTY_FUNCTION__))

7634

if (E->getOpcode() == Instruction::GetElementPtr &&(static_cast <bool> (llvm::all_of(E->Scalars, [=](Value
*V) -> bool { if (E->getOpcode() == Instruction::GetElementPtr
&& !isa<GetElementPtrInst>(V)) return true; auto
*I = cast<Instruction>(V); return !E->isOpcodeOrAlt
(I) || I->getParent() == BB || isVectorLikeInstWithConstOps
(I); })) ? void (0) : __assert_fail ("llvm::all_of(E->Scalars, [=](Value *V) -> bool { if (E->getOpcode() == Instruction::GetElementPtr && !isa<GetElementPtrInst>(V)) return true; auto *I = cast<Instruction>(V); return !E->isOpcodeOrAlt(I) || I->getParent() == BB || isVectorLikeInstWithConstOps(I); })"
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7640, __extension__
__PRETTY_FUNCTION__))

7635

!isa<GetElementPtrInst>(V))(static_cast <bool> (llvm::all_of(E->Scalars, [=](Value
*V) -> bool { if (E->getOpcode() == Instruction::GetElementPtr
&& !isa<GetElementPtrInst>(V)) return true; auto
*I = cast<Instruction>(V); return !E->isOpcodeOrAlt
(I) || I->getParent() == BB || isVectorLikeInstWithConstOps
(I); })) ? void (0) : __assert_fail ("llvm::all_of(E->Scalars, [=](Value *V) -> bool { if (E->getOpcode() == Instruction::GetElementPtr && !isa<GetElementPtrInst>(V)) return true; auto *I = cast<Instruction>(V); return !E->isOpcodeOrAlt(I) || I->getParent() == BB || isVectorLikeInstWithConstOps(I); })"
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7640, __extension__
__PRETTY_FUNCTION__))

7636

return true;(static_cast <bool> (llvm::all_of(E->Scalars, [=](Value
*V) -> bool { if (E->getOpcode() == Instruction::GetElementPtr
&& !isa<GetElementPtrInst>(V)) return true; auto
*I = cast<Instruction>(V); return !E->isOpcodeOrAlt
(I) || I->getParent() == BB || isVectorLikeInstWithConstOps
(I); })) ? void (0) : __assert_fail ("llvm::all_of(E->Scalars, [=](Value *V) -> bool { if (E->getOpcode() == Instruction::GetElementPtr && !isa<GetElementPtrInst>(V)) return true; auto *I = cast<Instruction>(V); return !E->isOpcodeOrAlt(I) || I->getParent() == BB || isVectorLikeInstWithConstOps(I); })"
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7640, __extension__
__PRETTY_FUNCTION__))

7637

auto *I = cast<Instruction>(V);(static_cast <bool> (llvm::all_of(E->Scalars, [=](Value
*V) -> bool { if (E->getOpcode() == Instruction::GetElementPtr
&& !isa<GetElementPtrInst>(V)) return true; auto
*I = cast<Instruction>(V); return !E->isOpcodeOrAlt
(I) || I->getParent() == BB || isVectorLikeInstWithConstOps
(I); })) ? void (0) : __assert_fail ("llvm::all_of(E->Scalars, [=](Value *V) -> bool { if (E->getOpcode() == Instruction::GetElementPtr && !isa<GetElementPtrInst>(V)) return true; auto *I = cast<Instruction>(V); return !E->isOpcodeOrAlt(I) || I->getParent() == BB || isVectorLikeInstWithConstOps(I); })"
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7640, __extension__
__PRETTY_FUNCTION__))

7638

return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||(static_cast <bool> (llvm::all_of(E->Scalars, [=](Value
*V) -> bool { if (E->getOpcode() == Instruction::GetElementPtr
&& !isa<GetElementPtrInst>(V)) return true; auto
*I = cast<Instruction>(V); return !E->isOpcodeOrAlt
(I) || I->getParent() == BB || isVectorLikeInstWithConstOps
(I); })) ? void (0) : __assert_fail ("llvm::all_of(E->Scalars, [=](Value *V) -> bool { if (E->getOpcode() == Instruction::GetElementPtr && !isa<GetElementPtrInst>(V)) return true; auto *I = cast<Instruction>(V); return !E->isOpcodeOrAlt(I) || I->getParent() == BB || isVectorLikeInstWithConstOps(I); })"
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7640, __extension__
__PRETTY_FUNCTION__))

7639

isVectorLikeInstWithConstOps(I);(static_cast <bool> (llvm::all_of(E->Scalars, [=](Value
*V) -> bool { if (E->getOpcode() == Instruction::GetElementPtr
&& !isa<GetElementPtrInst>(V)) return true; auto
*I = cast<Instruction>(V); return !E->isOpcodeOrAlt
(I) || I->getParent() == BB || isVectorLikeInstWithConstOps
(I); })) ? void (0) : __assert_fail ("llvm::all_of(E->Scalars, [=](Value *V) -> bool { if (E->getOpcode() == Instruction::GetElementPtr && !isa<GetElementPtrInst>(V)) return true; auto *I = cast<Instruction>(V); return !E->isOpcodeOrAlt(I) || I->getParent() == BB || isVectorLikeInstWithConstOps(I); })"
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7640, __extension__
__PRETTY_FUNCTION__))

7640

}))(static_cast <bool> (llvm::all_of(E->Scalars, [=](Value
*V) -> bool { if (E->getOpcode() == Instruction::GetElementPtr
&& !isa<GetElementPtrInst>(V)) return true; auto
*I = cast<Instruction>(V); return !E->isOpcodeOrAlt
(I) || I->getParent() == BB || isVectorLikeInstWithConstOps
(I); })) ? void (0) : __assert_fail ("llvm::all_of(E->Scalars, [=](Value *V) -> bool { if (E->getOpcode() == Instruction::GetElementPtr && !isa<GetElementPtrInst>(V)) return true; auto *I = cast<Instruction>(V); return !E->isOpcodeOrAlt(I) || I->getParent() == BB || isVectorLikeInstWithConstOps(I); })"
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7640, __extension__
__PRETTY_FUNCTION__));

7641

7642

auto &&FindLastInst = [E, Front, this, &BB]() {

7643

Instruction *LastInst = Front;

7644

for (Value *V : E->Scalars) {

7645

auto *I = dyn_cast<Instruction>(V);

7646

if (!I)

7647

continue;

7648

if (LastInst->getParent() == I->getParent()) {

7649

if (LastInst->comesBefore(I))

7650

LastInst = I;

7651

continue;

7652

}

7653

assert(isVectorLikeInstWithConstOps(LastInst) &&(static_cast <bool> (isVectorLikeInstWithConstOps(LastInst
) && isVectorLikeInstWithConstOps(I) && "Expected vector-like insts only."
) ? void (0) : __assert_fail ("isVectorLikeInstWithConstOps(LastInst) && isVectorLikeInstWithConstOps(I) && \"Expected vector-like insts only.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7655, __extension__
__PRETTY_FUNCTION__))

7654

isVectorLikeInstWithConstOps(I) &&(static_cast <bool> (isVectorLikeInstWithConstOps(LastInst
) && isVectorLikeInstWithConstOps(I) && "Expected vector-like insts only."
) ? void (0) : __assert_fail ("isVectorLikeInstWithConstOps(LastInst) && isVectorLikeInstWithConstOps(I) && \"Expected vector-like insts only.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7655, __extension__
__PRETTY_FUNCTION__))

7655

"Expected vector-like insts only.")(static_cast <bool> (isVectorLikeInstWithConstOps(LastInst
) && isVectorLikeInstWithConstOps(I) && "Expected vector-like insts only."
) ? void (0) : __assert_fail ("isVectorLikeInstWithConstOps(LastInst) && isVectorLikeInstWithConstOps(I) && \"Expected vector-like insts only.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7655, __extension__
__PRETTY_FUNCTION__));

7656

if (!DT->isReachableFromEntry(LastInst->getParent())) {

7657

LastInst = I;

7658

continue;

7659

}

7660

if (!DT->isReachableFromEntry(I->getParent()))

7661

continue;

7662

auto *NodeA = DT->getNode(LastInst->getParent());

7663

auto *NodeB = DT->getNode(I->getParent());

7664

assert(NodeA && "Should only process reachable instructions")(static_cast <bool> (NodeA && "Should only process reachable instructions"
) ? void (0) : __assert_fail ("NodeA && \"Should only process reachable instructions\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7664, __extension__
__PRETTY_FUNCTION__));

7665

assert(NodeB && "Should only process reachable instructions")(static_cast <bool> (NodeB && "Should only process reachable instructions"
) ? void (0) : __assert_fail ("NodeB && \"Should only process reachable instructions\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7665, __extension__
__PRETTY_FUNCTION__));

7666

assert((NodeA == NodeB) ==(static_cast <bool> ((NodeA == NodeB) == (NodeA->getDFSNumIn
() == NodeB->getDFSNumIn()) && "Different nodes should have different DFS numbers"
) ? void (0) : __assert_fail ("(NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && \"Different nodes should have different DFS numbers\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7668, __extension__
__PRETTY_FUNCTION__))

7667

(NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&(static_cast <bool> ((NodeA == NodeB) == (NodeA->getDFSNumIn
() == NodeB->getDFSNumIn()) && "Different nodes should have different DFS numbers"
) ? void (0) : __assert_fail ("(NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && \"Different nodes should have different DFS numbers\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7668, __extension__
__PRETTY_FUNCTION__))

7668

"Different nodes should have different DFS numbers")(static_cast <bool> ((NodeA == NodeB) == (NodeA->getDFSNumIn
() == NodeB->getDFSNumIn()) && "Different nodes should have different DFS numbers"
) ? void (0) : __assert_fail ("(NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && \"Different nodes should have different DFS numbers\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7668, __extension__
__PRETTY_FUNCTION__));

7669

if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())

7670

LastInst = I;

7671

}

7672

BB = LastInst->getParent();

7673

return LastInst;

7674

};

7675

7676

auto &&FindFirstInst = [E, Front]() {

7677

Instruction *FirstInst = Front;

7678

for (Value *V : E->Scalars) {

7679

auto *I = dyn_cast<Instruction>(V);

7680

if (!I)

7681

continue;

7682

if (I->comesBefore(FirstInst))

7683

FirstInst = I;

7684

}

7685

return FirstInst;

7686

};

7687

7688

// Set the insert point to the beginning of the basic block if the entry

7689

// should not be scheduled.

7690

if (E->State != TreeEntry::NeedToGather &&

7691

doesNotNeedToSchedule(E->Scalars)) {

7692

Instruction *InsertInst;

7693

if (all_of(E->Scalars, isUsedOutsideBlock))

7694

InsertInst = FindLastInst();

7695

else

7696

InsertInst = FindFirstInst();

7697

return *InsertInst;

7698

}

7699

7700

// The last instruction in the bundle in program order.

7701

Instruction *LastInst = nullptr;

7702

7703

// Find the last instruction. The common case should be that BB has been

7704

// scheduled, and the last instruction is VL.back(). So we start with

7705

// VL.back() and iterate over schedule data until we reach the end of the

7706

// bundle. The end of the bundle is marked by null ScheduleData.

7707

if (BlocksSchedules.count(BB)) {

7708

Value *V = E->isOneOf(E->Scalars.back());

7709

if (doesNotNeedToBeScheduled(V))

7710

V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);

7711

auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);

7712

if (Bundle && Bundle->isPartOfBundle())

7713

for (; Bundle; Bundle = Bundle->NextInBundle)

7714

if (Bundle->OpValue == Bundle->Inst)

7715

LastInst = Bundle->Inst;

7716

}

7717

7718

// LastInst can still be null at this point if there's either not an entry

7719

// for BB in BlocksSchedules or there's no ScheduleData available for

7720

// VL.back(). This can be the case if buildTree_rec aborts for various

7721

// reasons (e.g., the maximum recursion depth is reached, the maximum region

7722

// size is reached, etc.). ScheduleData is initialized in the scheduling

7723

// "dry-run".

7724

//

7725

// If this happens, we can still find the last instruction by brute force. We

7726

// iterate forwards from Front (inclusive) until we either see all

7727

// instructions in the bundle or reach the end of the block. If Front is the

7728

// last instruction in program order, LastInst will be set to Front, and we

7729

// will visit all the remaining instructions in the block.

7730

//

7731

// One of the reasons we exit early from buildTree_rec is to place an upper

7732

// bound on compile-time. Thus, taking an additional compile-time hit here is

7733

// not ideal. However, this should be exceedingly rare since it requires that

7734

// we both exit early from buildTree_rec and that the bundle be out-of-order

7735

// (causing us to iterate all the way to the end of the block).

7736

if (!LastInst)

7737

LastInst = FindLastInst();

7738

assert(LastInst && "Failed to find last instruction in bundle")(static_cast <bool> (LastInst && "Failed to find last instruction in bundle"
) ? void (0) : __assert_fail ("LastInst && \"Failed to find last instruction in bundle\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7738, __extension__
__PRETTY_FUNCTION__));

7739

return *LastInst;

7740

}

7741

7742

void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {

7743

auto *Front = E->getMainOp();

7744

Instruction *LastInst = &getLastInstructionInBundle(E);

7745

assert(LastInst && "Failed to find last instruction in bundle")(static_cast <bool> (LastInst && "Failed to find last instruction in bundle"
) ? void (0) : __assert_fail ("LastInst && \"Failed to find last instruction in bundle\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7745, __extension__
__PRETTY_FUNCTION__));

7746

// If the instruction is PHI, set the insert point after all the PHIs.

7747

bool IsPHI = isa<PHINode>(LastInst);

7748

if (IsPHI)

7749

LastInst = LastInst->getParent()->getFirstNonPHI();

7750

if (IsPHI || (E->State != TreeEntry::NeedToGather &&

7751

doesNotNeedToSchedule(E->Scalars))) {

7752

Builder.SetInsertPoint(LastInst);

7753

} else {

7754

// Set the insertion point after the last instruction in the bundle. Set the

7755

// debug location to Front.

7756

Builder.SetInsertPoint(LastInst->getParent(),

7757

std::next(LastInst->getIterator()));

7758

}

7759

Builder.SetCurrentDebugLocation(Front->getDebugLoc());

7760

}

7761

7762

/// Materializes the scalars of \p VL as a vector by chaining insertelement
/// instructions on top of a poison vector; lane I receives VL[I].
///
/// The inserts are emitted in three groups so that the cheaper/hoistable part
/// of the chain comes first:
///  1. constants,
///  2. remaining non-constant values,
///  3. "postponed" instructions: those whose block is reached from the insert
///     block by walking single predecessors, those that belong to a tree
///     entry, and those contained in the loop of the insert block. Emitting
///     them last gives loop-invariant code motion a better chance to hoist
///     the rest of the chain.
///
/// Every created insertelement instruction is recorded in GatherShuffleSeq,
/// its block in CSEBlocks, and, if the inserted scalar belongs to a tree
/// entry, an entry is appended to ExternalUses.
///
/// \returns the value produced by the last insert (or the poison vector if
/// the builder folded everything away).
Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
  const int NumLanes = VL.size();
  BasicBlock *const InsertBB = Builder.GetInsertBlock();
  Loop *const CurLoop = LI->getLoopFor(InsertBB);

  // Returns true if InstBB is reached when walking from FromBB upwards along
  // single-predecessor edges. The visited set stops the walk on cycles.
  auto ReachedViaSinglePreds = [](BasicBlock *InstBB, BasicBlock *FromBB) {
    SmallPtrSet<BasicBlock *, 4> Seen;
    for (BasicBlock *BB = FromBB; BB; BB = BB->getSinglePredecessor()) {
      if (BB == InstBB)
        return true;
      if (!Seen.insert(BB).second)
        return false;
    }
    return false;
  };

  // Collect the lanes whose insertion is postponed to the end of the chain.
  SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    auto *Inst = dyn_cast<Instruction>(VL[Lane]);
    if (!Inst)
      continue;
    const bool Postpone =
        ReachedViaSinglePreds(Inst->getParent(), InsertBB) ||
        getTreeEntry(Inst) || (CurLoop && CurLoop->contains(Inst));
    if (Postpone && PostponedIndices.insert(Lane).second)
      PostponedInsts.emplace_back(Inst, Lane);
  }

  // Emits a single insertelement and performs the bookkeeping for it.
  auto EmitInsert = [this](Value *Into, Value *Scalar,
                           unsigned Lane) -> Value * {
    Value *Result =
        Builder.CreateInsertElement(Into, Scalar, Builder.getInt32(Lane));
    if (auto *InsElt = dyn_cast<InsertElementInst>(Result)) {
      GatherShuffleSeq.insert(InsElt);
      CSEBlocks.insert(InsElt->getParent());
      // A scalar that is part of the tree must be extracted from its vector
      // later on, so remember this use together with the lane it lives in.
      if (TreeEntry *Entry = getTreeEntry(Scalar)) {
        const unsigned FoundLane = Entry->findLaneForValue(Scalar);
        ExternalUses.emplace_back(Scalar, InsElt, FoundLane);
      }
    }
    return Result;
  };

  // For a store the element type is taken from the stored value.
  Value *Val0 = VL[0];
  if (auto *Store = dyn_cast<StoreInst>(Val0))
    Val0 = Store->getValueOperand();
  FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size());
  Value *Vec = PoisonValue::get(VecTy);

  // First wave: constants. Non-constant, non-postponed lanes are queued.
  SmallVector<int> NonConsts;
  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    if (PostponedIndices.contains(Lane))
      continue;
    if (isConstant(VL[Lane]))
      Vec = EmitInsert(Vec, VL[Lane], Lane);
    else
      NonConsts.push_back(Lane);
  }

  // Second wave: the queued non-constant values.
  for (int Lane : NonConsts)
    Vec = EmitInsert(Vec, VL[Lane], Lane);

  // Third wave: instructions that are, or may be, part of the loop go last so
  // that everything emitted before them stays hoistable.
  for (const std::pair<Value *, unsigned> &Postponed : PostponedInsts)
    Vec = EmitInsert(Vec, Postponed.first, Postponed.second);

  return Vec;
}

7824

7825

namespace {

7826

/// Merges shuffle masks and emits final shuffle instruction, if required.

7827

class ShuffleInstructionBuilder {

7828

IRBuilderBase &Builder;

7829

const unsigned VF = 0;

7830

bool IsFinalized = false;

7831

SmallVector<int, 4> Mask;

7832

/// Holds all of the instructions that we gathered.

7833

SetVector<Instruction *> &GatherShuffleSeq;

7834

/// A list of blocks that we are going to CSE.

7835

SetVector<BasicBlock *> &CSEBlocks;

7836

7837

public:

7838

ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF,

7839

SetVector<Instruction *> &GatherShuffleSeq,

7840

SetVector<BasicBlock *> &CSEBlocks)

7841

: Builder(Builder), VF(VF), GatherShuffleSeq(GatherShuffleSeq),

7842

CSEBlocks(CSEBlocks) {}

7843

7844

/// Adds a mask, inverting it before applying.

7845

void addInversedMask(ArrayRef<unsigned> SubMask) {

7846

if (SubMask.empty())

7847

return;

7848

SmallVector<int, 4> NewMask;

7849

inversePermutation(SubMask, NewMask);

7850

addMask(NewMask);

7851

}

7852

7853

/// Functions adds masks, merging them into single one.

7854

void addMask(ArrayRef<unsigned> SubMask) {

7855

SmallVector<int, 4> NewMask(SubMask);

7856

addMask(NewMask);

7857

}

7858

7859

void addMask(ArrayRef<int> SubMask) { ::addMask(Mask, SubMask); }

7860

7861

Value *finalize(Value *V) {

7862

IsFinalized = true;

7863

unsigned ValueVF = cast<FixedVectorType>(V->getType())->getNumElements();

7864

if (VF == ValueVF && Mask.empty())

7865

return V;

7866

SmallVector<int, 4> NormalizedMask(VF, UndefMaskElem);

7867

std::iota(NormalizedMask.begin(), NormalizedMask.end(), 0);

7868

addMask(NormalizedMask);

7869

7870

if (VF == ValueVF && ShuffleVectorInst::isIdentityMask(Mask))

7871

return V;

7872

Value *Vec = Builder.CreateShuffleVector(V, Mask, "shuffle");

7873

if (auto *I = dyn_cast<Instruction>(Vec)) {

7874

GatherShuffleSeq.insert(I);

7875

CSEBlocks.insert(I->getParent());

7876

}

7877

return Vec;

7878

}

7879

7880

~ShuffleInstructionBuilder() {

7881

assert((IsFinalized || Mask.empty()) &&(static_cast <bool> ((IsFinalized || Mask.empty()) &&
"Shuffle construction must be finalized.") ? void (0) : __assert_fail
("(IsFinalized || Mask.empty()) && \"Shuffle construction must be finalized.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7882, __extension__
__PRETTY_FUNCTION__))

7882

"Shuffle construction must be finalized.")(static_cast <bool> ((IsFinalized || Mask.empty()) &&
"Shuffle construction must be finalized.") ? void (0) : __assert_fail
("(IsFinalized || Mask.empty()) && \"Shuffle construction must be finalized.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 7882, __extension__
__PRETTY_FUNCTION__));

7883

}

7884

};

7885

} // namespace

7886

7887

/// Produces the vector value for the operand list \p VL.
///
/// If the scalars share an opcode (for pointer-typed lists a GEP found in the
/// list is used as the representative) and the tree entry of the
/// representative value describes exactly \p VL, that entry is vectorized. If
/// the resulting vector has a different number of elements than \p VL, a
/// "shrink.shuffle" adapts it. In all other cases a build vector is created
/// through createBuildVector().
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
  const unsigned VF = VL.size();
  InstructionsState S = getSameOpcode(VL);
  // Special processing for GEPs bundle, which may include non-gep values.
  if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
    for (Value *Candidate : VL) {
      if (isa<GetElementPtrInst>(Candidate)) {
        S = getSameOpcode(Candidate);
        break;
      }
    }
  }

  TreeEntry *E = S.getOpcode() ? getTreeEntry(S.OpValue) : nullptr;
  if (!E || !E->isSame(VL)) {
    // Can't vectorize this, so simply build a new vector with each lane
    // corresponding to the requested value.
    return createBuildVector(VL);
  }

  Value *V = vectorizeTree(E);
  if (VF == cast<FixedVectorType>(V->getType())->getNumElements())
    return V;

  if (!E->ReuseShuffleIndices.empty()) {
    // Reshuffle to get only unique values.
    // If some of the scalars are duplicated in the vectorization tree
    // entry, we do not vectorize them but instead generate a mask for
    // the reuses. But if there are several users of the same entry,
    // they may have different vectorization factors. This is especially
    // important for PHI nodes. In this case, we need to adapt the
    // resulting instruction for the user vectorization factor and have
    // to reshuffle it again to take only unique elements of the vector.
    // Without this code the function incorrectly returns reduced vector
    // instruction with the same elements, not with the unique ones.

    // block:
    // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
    // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
    // ... (use %2)
    // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
    // br %block
    SmallVector<int> UniqueIdxs(VF, UndefMaskElem);
    SmallSet<int, 4> UsedIdxs;
    const int Sz = VL.size();
    const auto &Reuses = E->ReuseShuffleIndices;
    // For every index seen for the first time remember the position in the
    // reuse mask at which it occurs.
    for (int Pos = 0, End = Reuses.size(); Pos != End; ++Pos) {
      const int Idx = Reuses[Pos];
      if (Idx == Sz || Idx == UndefMaskElem)
        continue;
      if (UsedIdxs.insert(Idx).second)
        UniqueIdxs[Idx] = Pos;
    }
    assert(VF >= UsedIdxs.size() && "Expected vectorization factor "
                                    "less than original vector size.");
    UniqueIdxs.append(VF - UsedIdxs.size(), UndefMaskElem);
    V = Builder.CreateShuffleVector(V, UniqueIdxs, "shrink.shuffle");
  } else {
    assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
           "Expected vectorization factor less "
           "than original vector size.");
    // Keep just the leading VF elements.
    SmallVector<int> UniformMask(VF, 0);
    std::iota(UniformMask.begin(), UniformMask.end(), 0);
    V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle");
  }

  // Register the new shuffle for the later gather/shuffle optimizations.
  if (auto *I = dyn_cast<Instruction>(V)) {
    GatherShuffleSeq.insert(I);
    CSEBlocks.insert(I->getParent());
  }
  return V;
}

7955

/// Creates a build vector for the gather node matching \p VL.
///
/// For lists with more than two elements the function first tries to exploit
/// values that repeat across lanes: only the unique values are gathered and a
/// reuse mask recreates the requested lane layout afterwards. When that does
/// not pay off (almost all values are unique, or at most one unique value was
/// collected without it being a pure splat), the values are gathered directly
/// with trailing undefs cut off. The gathered list is padded with poison up
/// to the vectorization factor.
///
/// \returns the gathered vector, shuffled with the reuse mask if one is
/// needed.
Value *BoUpSLP::createBuildVector(ArrayRef<Value *> VL) {
  assert(any_of(VectorizableTree,
                [VL](const std::unique_ptr<TreeEntry> &TE) {
                  return TE->State == TreeEntry::NeedToGather && TE->isSame(VL);
                }) &&
         "Non-matching gather node.");
  unsigned VF = VL.size();
  // Exploit possible reuse of values across lanes.
  SmallVector<int> ReuseMask;
  SmallVector<Value *> UniqueValues;
  if (VL.size() > 2) {
    // Number of leading values left once trailing undefs are cut off.
    unsigned NumValues = VL.size();
    while (NumValues > 0 && isa<UndefValue>(VL[NumValues - 1]))
      --NumValues;
    VF = std::max<unsigned>(VF, PowerOf2Ceil(NumValues));

    // Maps a non-constant value to its slot in UniqueValues.
    DenseMap<Value *, unsigned> UniquePositions;
    // Counts the distinct non-constant values only.
    int UniqueVals = 0;
    for (Value *V : VL.drop_back(VL.size() - VF)) {
      if (isa<UndefValue>(V)) {
        ReuseMask.emplace_back(UndefMaskElem);
      } else if (isConstant(V)) {
        // Constants are not deduplicated, each one gets its own slot.
        ReuseMask.emplace_back(UniqueValues.size());
        UniqueValues.emplace_back(V);
      } else {
        auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
        ReuseMask.emplace_back(Res.first->second);
        if (Res.second) {
          UniqueValues.emplace_back(V);
          ++UniqueVals;
        }
      }
    }

    const size_t NumUnique = UniqueValues.size();
    if (UniqueVals == 1 && NumUnique == 1) {
      // Emit pure splat vector.
      ReuseMask.append(VF - ReuseMask.size(), UndefMaskElem);
    } else if (NumUnique >= VF - 1 || NumUnique <= 1) {
      // Reuse does not help here: drop the mask and gather the leading
      // values as they are.
      if (NumUnique == 0) {
        assert(all_of(VL, UndefValue::classof) && "Expected list of undefs.");
        NumValues = VF;
      }
      ReuseMask.clear();
      UniqueValues.clear();
      UniqueValues.append(VL.begin(), std::next(VL.begin(), NumValues));
    }
    // Pad with poison up to the vectorization factor.
    UniqueValues.append(VF - UniqueValues.size(),
                        PoisonValue::get(VL[0]->getType()));
    VL = UniqueValues;
  }

  ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq,
                                           CSEBlocks);
  Value *Vec = gather(VL);
  if (ReuseMask.empty())
    return Vec;

  // Recreate the requested lane layout from the unique values.
  ShuffleBuilder.addMask(ReuseMask);
  return ShuffleBuilder.finalize(Vec);
}

8017

8018

Value *BoUpSLP::vectorizeTree(TreeEntry *E) {

8019

IRBuilder<>::InsertPointGuard Guard(Builder);

8020

8021

if (E->VectorizedValue) {

8022

LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Diamond merged for " <<
*E->Scalars[0] << ".\n"; } } while (false);

8023

return E->VectorizedValue;

8024

}

8025

8026

bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();

8027

unsigned VF = E->getVectorFactor();

8028

ShuffleInstructionBuilder ShuffleBuilder(Builder, VF, GatherShuffleSeq,

8029

CSEBlocks);

8030

if (E->State == TreeEntry::NeedToGather) {

8031

if (E->getMainOp())

8032

setInsertPointAfterBundle(E);

8033

Value *Vec;

8034

SmallVector<int> Mask;

8035

SmallVector<const TreeEntry *> Entries;

8036

Optional<TargetTransformInfo::ShuffleKind> Shuffle =

8037

isGatherShuffledEntry(E, Mask, Entries);

8038

if (Shuffle) {

8039

assert((Entries.size() == 1 || Entries.size() == 2) &&(static_cast <bool> ((Entries.size() == 1 || Entries.size
() == 2) && "Expected shuffle of 1 or 2 entries.") ? void
(0) : __assert_fail ("(Entries.size() == 1 || Entries.size() == 2) && \"Expected shuffle of 1 or 2 entries.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8040, __extension__
__PRETTY_FUNCTION__))

8040

"Expected shuffle of 1 or 2 entries.")(static_cast <bool> ((Entries.size() == 1 || Entries.size
() == 2) && "Expected shuffle of 1 or 2 entries.") ? void
(0) : __assert_fail ("(Entries.size() == 1 || Entries.size() == 2) && \"Expected shuffle of 1 or 2 entries.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8040, __extension__
__PRETTY_FUNCTION__));

8041

Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue,

8042

Entries.back()->VectorizedValue, Mask);

8043

if (auto *I = dyn_cast<Instruction>(Vec)) {

8044

GatherShuffleSeq.insert(I);

8045

CSEBlocks.insert(I->getParent());

8046

}

8047

} else {

8048

Vec = gather(E->Scalars);

8049

}

8050

if (NeedToShuffleReuses) {

8051

ShuffleBuilder.addMask(E->ReuseShuffleIndices);

8052

Vec = ShuffleBuilder.finalize(Vec);

8053

}

8054

E->VectorizedValue = Vec;

8055

return Vec;

8056

}

8057

8058

assert((E->State == TreeEntry::Vectorize ||(static_cast <bool> ((E->State == TreeEntry::Vectorize
|| E->State == TreeEntry::ScatterVectorize) && "Unhandled state"
) ? void (0) : __assert_fail ("(E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize) && \"Unhandled state\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8060, __extension__
__PRETTY_FUNCTION__))

8059

E->State == TreeEntry::ScatterVectorize) &&(static_cast <bool> ((E->State == TreeEntry::Vectorize
|| E->State == TreeEntry::ScatterVectorize) && "Unhandled state"
) ? void (0) : __assert_fail ("(E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize) && \"Unhandled state\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8060, __extension__
__PRETTY_FUNCTION__))

8060

"Unhandled state")(static_cast <bool> ((E->State == TreeEntry::Vectorize
|| E->State == TreeEntry::ScatterVectorize) && "Unhandled state"
) ? void (0) : __assert_fail ("(E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize) && \"Unhandled state\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8060, __extension__
__PRETTY_FUNCTION__));

8061

unsigned ShuffleOrOp =

8062

E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();

8063

Instruction *VL0 = E->getMainOp();

8064

Type *ScalarTy = VL0->getType();

8065

if (auto *Store = dyn_cast<StoreInst>(VL0))

8066

ScalarTy = Store->getValueOperand()->getType();

8067

else if (auto *IE = dyn_cast<InsertElementInst>(VL0))

8068

ScalarTy = IE->getOperand(1)->getType();

8069

auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());

8070

switch (ShuffleOrOp) {

8071

case Instruction::PHI: {

8072

assert((E->ReorderIndices.empty() ||(static_cast <bool> ((E->ReorderIndices.empty() || E
!= VectorizableTree.front().get() || !E->UserTreeIndices.
empty()) && "PHI reordering is free.") ? void (0) : __assert_fail
("(E->ReorderIndices.empty() || E != VectorizableTree.front().get() || !E->UserTreeIndices.empty()) && \"PHI reordering is free.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8075, __extension__
__PRETTY_FUNCTION__))

8073

E != VectorizableTree.front().get() ||(static_cast <bool> ((E->ReorderIndices.empty() || E
!= VectorizableTree.front().get() || !E->UserTreeIndices.
empty()) && "PHI reordering is free.") ? void (0) : __assert_fail
("(E->ReorderIndices.empty() || E != VectorizableTree.front().get() || !E->UserTreeIndices.empty()) && \"PHI reordering is free.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8075, __extension__
__PRETTY_FUNCTION__))

8074

!E->UserTreeIndices.empty()) &&(static_cast <bool> ((E->ReorderIndices.empty() || E
!= VectorizableTree.front().get() || !E->UserTreeIndices.
empty()) && "PHI reordering is free.") ? void (0) : __assert_fail
("(E->ReorderIndices.empty() || E != VectorizableTree.front().get() || !E->UserTreeIndices.empty()) && \"PHI reordering is free.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8075, __extension__
__PRETTY_FUNCTION__))

8075

"PHI reordering is free.")(static_cast <bool> ((E->ReorderIndices.empty() || E
!= VectorizableTree.front().get() || !E->UserTreeIndices.
empty()) && "PHI reordering is free.") ? void (0) : __assert_fail
("(E->ReorderIndices.empty() || E != VectorizableTree.front().get() || !E->UserTreeIndices.empty()) && \"PHI reordering is free.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8075, __extension__
__PRETTY_FUNCTION__));

8076

auto *PH = cast<PHINode>(VL0);

8077

Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());

8078

Builder.SetCurrentDebugLocation(PH->getDebugLoc());

8079

PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());

8080

Value *V = NewPhi;

8081

8082

// Adjust insertion point once all PHI's have been generated.

8083

Builder.SetInsertPoint(&*PH->getParent()->getFirstInsertionPt());

8084

Builder.SetCurrentDebugLocation(PH->getDebugLoc());

8085

8086

ShuffleBuilder.addInversedMask(E->ReorderIndices);

8087

ShuffleBuilder.addMask(E->ReuseShuffleIndices);

8088

V = ShuffleBuilder.finalize(V);

8089

8090

E->VectorizedValue = V;

8091

8092

// PHINodes may have multiple entries from the same block. We want to

8093

// visit every block once.

8094

SmallPtrSet<BasicBlock*, 4> VisitedBBs;

8095

8096

for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {

8097

ValueList Operands;

8098

BasicBlock *IBB = PH->getIncomingBlock(i);

8099

8100

if (!VisitedBBs.insert(IBB).second) {

8101

NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);

8102

continue;

8103

}

8104

8105

Builder.SetInsertPoint(IBB->getTerminator());

8106

Builder.SetCurrentDebugLocation(PH->getDebugLoc());

8107

Value *Vec = vectorizeTree(E->getOperand(i));

8108

NewPhi->addIncoming(Vec, IBB);

8109

}

8110

8111

assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&(static_cast <bool> (NewPhi->getNumIncomingValues() ==
PH->getNumIncomingValues() && "Invalid number of incoming values"
) ? void (0) : __assert_fail ("NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && \"Invalid number of incoming values\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8112, __extension__
__PRETTY_FUNCTION__))

8112

"Invalid number of incoming values")(static_cast <bool> (NewPhi->getNumIncomingValues() ==
PH->getNumIncomingValues() && "Invalid number of incoming values"
) ? void (0) : __assert_fail ("NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && \"Invalid number of incoming values\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8112, __extension__
__PRETTY_FUNCTION__));

8113

return V;

8114

}

8115

8116

case Instruction::ExtractElement: {

8117

Value *V = E->getSingleOperand(0);

8118

Builder.SetInsertPoint(VL0);

8119

ShuffleBuilder.addInversedMask(E->ReorderIndices);

8120

ShuffleBuilder.addMask(E->ReuseShuffleIndices);

8121

V = ShuffleBuilder.finalize(V);

8122

E->VectorizedValue = V;

8123

return V;

8124

}

8125

case Instruction::ExtractValue: {

8126

auto *LI = cast<LoadInst>(E->getSingleOperand(0));

8127

Builder.SetInsertPoint(LI);

8128

auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());

8129

Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);

8130

LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());

8131

Value *NewV = propagateMetadata(V, E->Scalars);

8132

ShuffleBuilder.addInversedMask(E->ReorderIndices);

8133

ShuffleBuilder.addMask(E->ReuseShuffleIndices);

8134

NewV = ShuffleBuilder.finalize(NewV);

8135

E->VectorizedValue = NewV;

8136

return NewV;

8137

}

8138

case Instruction::InsertElement: {

8139

assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique")(static_cast <bool> (E->ReuseShuffleIndices.empty() &&
"All inserts should be unique") ? void (0) : __assert_fail (
"E->ReuseShuffleIndices.empty() && \"All inserts should be unique\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8139, __extension__
__PRETTY_FUNCTION__));

8140

Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));

8141

Value *V = vectorizeTree(E->getOperand(1));

8142

8143

// Create InsertVector shuffle if necessary

8144

auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {

8145

return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));

8146

}));

8147

const unsigned NumElts =

8148

cast<FixedVectorType>(FirstInsert->getType())->getNumElements();

8149

const unsigned NumScalars = E->Scalars.size();

8150

8151

unsigned Offset = *getInsertIndex(VL0);

8152

assert(Offset < NumElts && "Failed to find vector index offset")(static_cast <bool> (Offset < NumElts && "Failed to find vector index offset"
) ? void (0) : __assert_fail ("Offset < NumElts && \"Failed to find vector index offset\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8152, __extension__
__PRETTY_FUNCTION__));

8153

8154

// Create shuffle to resize vector

8155

SmallVector<int> Mask;

8156

if (!E->ReorderIndices.empty()) {

8157

inversePermutation(E->ReorderIndices, Mask);

8158

Mask.append(NumElts - NumScalars, UndefMaskElem);

8159

} else {

8160

Mask.assign(NumElts, UndefMaskElem);

8161

std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);

8162

}

8163

// Create InsertVector shuffle if necessary

8164

bool IsIdentity = true;

8165

SmallVector<int> PrevMask(NumElts, UndefMaskElem);

8166

Mask.swap(PrevMask);

8167

for (unsigned I = 0; I < NumScalars; ++I) {

8168

Value *Scalar = E->Scalars[PrevMask[I]];

8169

unsigned InsertIdx = *getInsertIndex(Scalar);

8170

IsIdentity &= InsertIdx - Offset == I;

8171

Mask[InsertIdx - Offset] = I;

8172

}

8173

if (!IsIdentity || NumElts != NumScalars) {

8174

V = Builder.CreateShuffleVector(V, Mask);

8175

if (auto *I = dyn_cast<Instruction>(V)) {

8176

GatherShuffleSeq.insert(I);

8177

CSEBlocks.insert(I->getParent());

8178

}

8179

}

8180

8181

SmallVector<int> InsertMask(NumElts, UndefMaskElem);

8182

for (unsigned I = 0; I < NumElts; I++) {

8183

if (Mask[I] != UndefMaskElem)

8184

InsertMask[Offset + I] = NumElts + I;

8185

}

8186

if (Offset != 0 ||

8187

!isUndefVector(FirstInsert->getOperand(0), InsertMask)) {

8188

for (unsigned I = 0; I < NumElts; I++) {

8189

if (InsertMask[I] == UndefMaskElem)

8190

InsertMask[I] = I;

8191

}

8192

8193

V = Builder.CreateShuffleVector(

8194

FirstInsert->getOperand(0), V, InsertMask,

8195

cast<Instruction>(E->Scalars.back())->getName());

8196

if (auto *I = dyn_cast<Instruction>(V)) {

8197

GatherShuffleSeq.insert(I);

8198

CSEBlocks.insert(I->getParent());

8199

}

8200

}

8201

8202

++NumVectorInstructions;

8203

E->VectorizedValue = V;

8204

return V;

8205

}

8206

case Instruction::ZExt:

8207

case Instruction::SExt:

8208

case Instruction::FPToUI:

8209

case Instruction::FPToSI:

8210

case Instruction::FPExt:

8211

case Instruction::PtrToInt:

8212

case Instruction::IntToPtr:

8213

case Instruction::SIToFP:

8214

case Instruction::UIToFP:

8215

case Instruction::Trunc:

8216

case Instruction::FPTrunc:

8217

case Instruction::BitCast: {

8218

setInsertPointAfterBundle(E);

8219

8220

Value *InVec = vectorizeTree(E->getOperand(0));

8221

8222

if (E->VectorizedValue) {

8223

LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Diamond merged for " <<
*VL0 << ".\n"; } } while (false);

8224

return E->VectorizedValue;

8225

}

8226

8227

auto *CI = cast<CastInst>(VL0);

8228

Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);

8229

ShuffleBuilder.addInversedMask(E->ReorderIndices);

8230

ShuffleBuilder.addMask(E->ReuseShuffleIndices);

8231

V = ShuffleBuilder.finalize(V);

8232

8233

E->VectorizedValue = V;

8234

++NumVectorInstructions;

8235

return V;

8236

}

8237

case Instruction::FCmp:

8238

case Instruction::ICmp: {

8239

setInsertPointAfterBundle(E);

8240

8241

Value *L = vectorizeTree(E->getOperand(0));

8242

Value *R = vectorizeTree(E->getOperand(1));

8243

8244

if (E->VectorizedValue) {

8245

LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Diamond merged for " <<
*VL0 << ".\n"; } } while (false);

8246

return E->VectorizedValue;

8247

}

8248

8249

CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();

8250

Value *V = Builder.CreateCmp(P0, L, R);

8251

propagateIRFlags(V, E->Scalars, VL0);

8252

ShuffleBuilder.addInversedMask(E->ReorderIndices);

8253

ShuffleBuilder.addMask(E->ReuseShuffleIndices);

8254

V = ShuffleBuilder.finalize(V);

8255

8256

E->VectorizedValue = V;

8257

++NumVectorInstructions;

8258

return V;

8259

}

8260

case Instruction::Select: {

8261

setInsertPointAfterBundle(E);

8262

8263

Value *Cond = vectorizeTree(E->getOperand(0));

8264

Value *True = vectorizeTree(E->getOperand(1));

8265

Value *False = vectorizeTree(E->getOperand(2));

8266

8267

if (E->VectorizedValue) {

8268

LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Diamond merged for " <<
*VL0 << ".\n"; } } while (false);

8269

return E->VectorizedValue;

8270

}

8271

8272

Value *V = Builder.CreateSelect(Cond, True, False);

8273

ShuffleBuilder.addInversedMask(E->ReorderIndices);

8274

ShuffleBuilder.addMask(E->ReuseShuffleIndices);

8275

V = ShuffleBuilder.finalize(V);

8276

8277

E->VectorizedValue = V;

8278

++NumVectorInstructions;

8279

return V;

8280

}

8281

case Instruction::FNeg: {

8282

setInsertPointAfterBundle(E);

8283

8284

Value *Op = vectorizeTree(E->getOperand(0));

8285

8286

if (E->VectorizedValue) {

8287

LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Diamond merged for " <<
*VL0 << ".\n"; } } while (false);

8288

return E->VectorizedValue;

8289

}

8290

8291

Value *V = Builder.CreateUnOp(

8292

static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);

8293

propagateIRFlags(V, E->Scalars, VL0);

8294

if (auto *I = dyn_cast<Instruction>(V))

8295

V = propagateMetadata(I, E->Scalars);

8296

8297

ShuffleBuilder.addInversedMask(E->ReorderIndices);

8298

ShuffleBuilder.addMask(E->ReuseShuffleIndices);

8299

V = ShuffleBuilder.finalize(V);

8300

8301

E->VectorizedValue = V;

8302

++NumVectorInstructions;

8303

8304

return V;

8305

}

8306

case Instruction::Add:

8307

case Instruction::FAdd:

8308

case Instruction::Sub:

8309

case Instruction::FSub:

8310

case Instruction::Mul:

8311

case Instruction::FMul:

8312

case Instruction::UDiv:

8313

case Instruction::SDiv:

8314

case Instruction::FDiv:

8315

case Instruction::URem:

8316

case Instruction::SRem:

8317

case Instruction::FRem:

8318

case Instruction::Shl:

8319

case Instruction::LShr:

8320

case Instruction::AShr:

8321

case Instruction::And:

8322

case Instruction::Or:

8323

case Instruction::Xor: {

8324

setInsertPointAfterBundle(E);

8325

8326

Value *LHS = vectorizeTree(E->getOperand(0));

8327

Value *RHS = vectorizeTree(E->getOperand(1));

8328

8329

if (E->VectorizedValue) {

8330

LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Diamond merged for " <<
*VL0 << ".\n"; } } while (false);

8331

return E->VectorizedValue;

8332

}

8333

8334

Value *V = Builder.CreateBinOp(

8335

static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,

8336

RHS);

8337

propagateIRFlags(V, E->Scalars, VL0);

8338

if (auto *I = dyn_cast<Instruction>(V))

8339

V = propagateMetadata(I, E->Scalars);

8340

8341

ShuffleBuilder.addInversedMask(E->ReorderIndices);

8342

ShuffleBuilder.addMask(E->ReuseShuffleIndices);

8343

V = ShuffleBuilder.finalize(V);

8344

8345

E->VectorizedValue = V;

8346

++NumVectorInstructions;

8347

8348

return V;

8349

}

8350

case Instruction::Load: {

8351

// Loads are inserted at the head of the tree because we don't want to

8352

// sink them all the way down past store instructions.

8353

setInsertPointAfterBundle(E);

8354

8355

LoadInst *LI = cast<LoadInst>(VL0);

8356

Instruction *NewLI;

8357

unsigned AS = LI->getPointerAddressSpace();

8358

Value *PO = LI->getPointerOperand();

8359

if (E->State == TreeEntry::Vectorize) {

8360

Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS));

8361

NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());

8362

8363

// The pointer operand uses an in-tree scalar so we add the new BitCast

8364

// or LoadInst to ExternalUses list to make sure that an extract will

8365

// be generated in the future.

8366

if (TreeEntry *Entry = getTreeEntry(PO)) {

8367

// Find which lane we need to extract.

8368

unsigned FoundLane = Entry->findLaneForValue(PO);

8369

ExternalUses.emplace_back(

8370

PO, PO != VecPtr ? cast<User>(VecPtr) : NewLI, FoundLane);

8371

}

8372

} else {

8373

assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state")(static_cast <bool> (E->State == TreeEntry::ScatterVectorize
&& "Unhandled state") ? void (0) : __assert_fail ("E->State == TreeEntry::ScatterVectorize && \"Unhandled state\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8373, __extension__
__PRETTY_FUNCTION__));

8374

Value *VecPtr = vectorizeTree(E->getOperand(0));

8375

// Use the minimum alignment of the gathered loads.

8376

Align CommonAlignment = LI->getAlign();

8377

for (Value *V : E->Scalars)

8378

CommonAlignment =

8379

std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());

8380

NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);

8381

}

8382

Value *V = propagateMetadata(NewLI, E->Scalars);

8383

8384

ShuffleBuilder.addInversedMask(E->ReorderIndices);

8385

ShuffleBuilder.addMask(E->ReuseShuffleIndices);

8386

V = ShuffleBuilder.finalize(V);

8387

E->VectorizedValue = V;

8388

++NumVectorInstructions;

8389

return V;

8390

}

8391

case Instruction::Store: {

8392

auto *SI = cast<StoreInst>(VL0);

8393

unsigned AS = SI->getPointerAddressSpace();

8394

8395

setInsertPointAfterBundle(E);

8396

8397

Value *VecValue = vectorizeTree(E->getOperand(0));

8398

ShuffleBuilder.addMask(E->ReorderIndices);

8399

VecValue = ShuffleBuilder.finalize(VecValue);

8400

8401

Value *ScalarPtr = SI->getPointerOperand();

8402

Value *VecPtr = Builder.CreateBitCast(

8403

ScalarPtr, VecValue->getType()->getPointerTo(AS));

8404

StoreInst *ST =

8405

Builder.CreateAlignedStore(VecValue, VecPtr, SI->getAlign());

8406

8407

// The pointer operand uses an in-tree scalar, so add the new BitCast or

8408

// StoreInst to ExternalUses to make sure that an extract will be

8409

// generated in the future.

8410

if (TreeEntry *Entry = getTreeEntry(ScalarPtr)) {

8411

// Find which lane we need to extract.

8412

unsigned FoundLane = Entry->findLaneForValue(ScalarPtr);

8413

ExternalUses.push_back(ExternalUser(

8414

ScalarPtr, ScalarPtr != VecPtr ? cast<User>(VecPtr) : ST,

8415

FoundLane));

8416

}

8417

8418

Value *V = propagateMetadata(ST, E->Scalars);

8419

8420

E->VectorizedValue = V;

8421

++NumVectorInstructions;

8422

return V;

8423

}

8424

case Instruction::GetElementPtr: {

8425

auto *GEP0 = cast<GetElementPtrInst>(VL0);

8426

setInsertPointAfterBundle(E);

8427

8428

Value *Op0 = vectorizeTree(E->getOperand(0));

8429

8430

SmallVector<Value *> OpVecs;

8431

for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {

8432

Value *OpVec = vectorizeTree(E->getOperand(J));

8433

OpVecs.push_back(OpVec);

8434

}

8435

8436

Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);

8437

if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {

8438

SmallVector<Value *> GEPs;

8439

for (Value *V : E->Scalars) {

8440

if (isa<GetElementPtrInst>(V))

8441

GEPs.push_back(V);

8442

}

8443

V = propagateMetadata(I, GEPs);

8444

}

8445

8446

ShuffleBuilder.addInversedMask(E->ReorderIndices);

8447

ShuffleBuilder.addMask(E->ReuseShuffleIndices);

8448

V = ShuffleBuilder.finalize(V);

8449

8450

E->VectorizedValue = V;

8451

++NumVectorInstructions;

8452

8453

return V;

8454

}

8455

case Instruction::Call: {

8456

CallInst *CI = cast<CallInst>(VL0);

8457

setInsertPointAfterBundle(E);

8458

8459

Intrinsic::ID IID = Intrinsic::not_intrinsic;

8460

if (Function *FI = CI->getCalledFunction())

8461

IID = FI->getIntrinsicID();

8462

8463

Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

8464

8465

auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);

8466

bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&

8467

VecCallCosts.first <= VecCallCosts.second;

8468

8469

Value *ScalarArg = nullptr;

8470

std::vector<Value *> OpVecs;

8471

SmallVector<Type *, 2> TysForDecl =

8472

{FixedVectorType::get(CI->getType(), E->Scalars.size())};

8473

for (int j = 0, e = CI->arg_size(); j < e; ++j) {

8474

ValueList OpVL;

8475

// Some intrinsics have scalar arguments. This argument should not be

8476

// vectorized.

8477

if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(IID, j)) {

8478

CallInst *CEI = cast<CallInst>(VL0);

8479

ScalarArg = CEI->getArgOperand(j);

8480

OpVecs.push_back(CEI->getArgOperand(j));

8481

if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j))

8482

TysForDecl.push_back(ScalarArg->getType());

8483

continue;

8484

}

8485

8486

Value *OpVec = vectorizeTree(E->getOperand(j));

8487

LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: OpVec[" << j << "]: "
<< *OpVec << "\n"; } } while (false);

8488

OpVecs.push_back(OpVec);

8489

if (isVectorIntrinsicWithOverloadTypeAtArg(IID, j))

8490

TysForDecl.push_back(OpVec->getType());

8491

}

8492

8493

Function *CF;

8494

if (!UseIntrinsic) {

8495

VFShape Shape =

8496

VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>(

8497

VecTy->getNumElements())),

8498

false /*HasGlobalPred*/);

8499

CF = VFDatabase(*CI).getVectorizedFunction(Shape);

8500

} else {

8501

CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl);

8502

}

8503

8504

SmallVector<OperandBundleDef, 1> OpBundles;

8505

CI->getOperandBundlesAsDefs(OpBundles);

8506

Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

8507

8508

// The scalar argument uses an in-tree scalar so we add the new vectorized

8509

// call to ExternalUses list to make sure that an extract will be

8510

// generated in the future.

8511

if (ScalarArg) {

8512

if (TreeEntry *Entry = getTreeEntry(ScalarArg)) {

8513

// Find which lane we need to extract.

8514

unsigned FoundLane = Entry->findLaneForValue(ScalarArg);

8515

ExternalUses.push_back(

8516

ExternalUser(ScalarArg, cast<User>(V), FoundLane));

8517

}

8518

}

8519

8520

propagateIRFlags(V, E->Scalars, VL0);

8521

ShuffleBuilder.addInversedMask(E->ReorderIndices);

8522

ShuffleBuilder.addMask(E->ReuseShuffleIndices);

8523

V = ShuffleBuilder.finalize(V);

8524

8525

E->VectorizedValue = V;

8526

++NumVectorInstructions;

8527

return V;

8528

}

8529

case Instruction::ShuffleVector: {

8530

assert(E->isAltShuffle() &&(static_cast <bool> (E->isAltShuffle() && ((
Instruction::isBinaryOp(E->getOpcode()) && Instruction
::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E
->getOpcode()) && Instruction::isCast(E->getAltOpcode
())) || (isa<CmpInst>(VL0) && isa<CmpInst>
(E->getAltOp()))) && "Invalid Shuffle Vector Operand"
) ? void (0) : __assert_fail ("E->isAltShuffle() && ((Instruction::isBinaryOp(E->getOpcode()) && Instruction::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E->getOpcode()) && Instruction::isCast(E->getAltOpcode())) || (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && \"Invalid Shuffle Vector Operand\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8536, __extension__
__PRETTY_FUNCTION__))

8531

((Instruction::isBinaryOp(E->getOpcode()) &&(static_cast <bool> (E->isAltShuffle() && ((
Instruction::isBinaryOp(E->getOpcode()) && Instruction
::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E
->getOpcode()) && Instruction::isCast(E->getAltOpcode
())) || (isa<CmpInst>(VL0) && isa<CmpInst>
(E->getAltOp()))) && "Invalid Shuffle Vector Operand"
) ? void (0) : __assert_fail ("E->isAltShuffle() && ((Instruction::isBinaryOp(E->getOpcode()) && Instruction::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E->getOpcode()) && Instruction::isCast(E->getAltOpcode())) || (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && \"Invalid Shuffle Vector Operand\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8536, __extension__
__PRETTY_FUNCTION__))

8532

Instruction::isBinaryOp(E->getAltOpcode())) ||(static_cast <bool> (E->isAltShuffle() && ((
Instruction::isBinaryOp(E->getOpcode()) && Instruction
::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E
->getOpcode()) && Instruction::isCast(E->getAltOpcode
())) || (isa<CmpInst>(VL0) && isa<CmpInst>
(E->getAltOp()))) && "Invalid Shuffle Vector Operand"
) ? void (0) : __assert_fail ("E->isAltShuffle() && ((Instruction::isBinaryOp(E->getOpcode()) && Instruction::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E->getOpcode()) && Instruction::isCast(E->getAltOpcode())) || (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && \"Invalid Shuffle Vector Operand\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8536, __extension__
__PRETTY_FUNCTION__))

8533

(Instruction::isCast(E->getOpcode()) &&(static_cast <bool> (E->isAltShuffle() && ((
Instruction::isBinaryOp(E->getOpcode()) && Instruction
::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E
->getOpcode()) && Instruction::isCast(E->getAltOpcode
())) || (isa<CmpInst>(VL0) && isa<CmpInst>
(E->getAltOp()))) && "Invalid Shuffle Vector Operand"
) ? void (0) : __assert_fail ("E->isAltShuffle() && ((Instruction::isBinaryOp(E->getOpcode()) && Instruction::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E->getOpcode()) && Instruction::isCast(E->getAltOpcode())) || (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && \"Invalid Shuffle Vector Operand\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8536, __extension__
__PRETTY_FUNCTION__))

8534

Instruction::isCast(E->getAltOpcode())) ||(static_cast <bool> (E->isAltShuffle() && ((
Instruction::isBinaryOp(E->getOpcode()) && Instruction
::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E
->getOpcode()) && Instruction::isCast(E->getAltOpcode
())) || (isa<CmpInst>(VL0) && isa<CmpInst>
(E->getAltOp()))) && "Invalid Shuffle Vector Operand"
) ? void (0) : __assert_fail ("E->isAltShuffle() && ((Instruction::isBinaryOp(E->getOpcode()) && Instruction::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E->getOpcode()) && Instruction::isCast(E->getAltOpcode())) || (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && \"Invalid Shuffle Vector Operand\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8536, __extension__
__PRETTY_FUNCTION__))

8535

(isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&(static_cast <bool> (E->isAltShuffle() && ((
Instruction::isBinaryOp(E->getOpcode()) && Instruction
::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E
->getOpcode()) && Instruction::isCast(E->getAltOpcode
())) || (isa<CmpInst>(VL0) && isa<CmpInst>
(E->getAltOp()))) && "Invalid Shuffle Vector Operand"
) ? void (0) : __assert_fail ("E->isAltShuffle() && ((Instruction::isBinaryOp(E->getOpcode()) && Instruction::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E->getOpcode()) && Instruction::isCast(E->getAltOpcode())) || (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && \"Invalid Shuffle Vector Operand\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8536, __extension__
__PRETTY_FUNCTION__))

8536

"Invalid Shuffle Vector Operand")(static_cast <bool> (E->isAltShuffle() && ((
Instruction::isBinaryOp(E->getOpcode()) && Instruction
::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E
->getOpcode()) && Instruction::isCast(E->getAltOpcode
())) || (isa<CmpInst>(VL0) && isa<CmpInst>
(E->getAltOp()))) && "Invalid Shuffle Vector Operand"
) ? void (0) : __assert_fail ("E->isAltShuffle() && ((Instruction::isBinaryOp(E->getOpcode()) && Instruction::isBinaryOp(E->getAltOpcode())) || (Instruction::isCast(E->getOpcode()) && Instruction::isCast(E->getAltOpcode())) || (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && \"Invalid Shuffle Vector Operand\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8536, __extension__
__PRETTY_FUNCTION__));

8537

8538

Value *LHS = nullptr, *RHS = nullptr;

8539

if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {

8540

setInsertPointAfterBundle(E);

8541

LHS = vectorizeTree(E->getOperand(0));

8542

RHS = vectorizeTree(E->getOperand(1));

8543

} else {

8544

setInsertPointAfterBundle(E);

8545

LHS = vectorizeTree(E->getOperand(0));

8546

}

8547

8548

if (E->VectorizedValue) {

8549

LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Diamond merged for " <<
*VL0 << ".\n"; } } while (false);

8550

return E->VectorizedValue;

8551

}

8552

8553

Value *V0, *V1;

8554

if (Instruction::isBinaryOp(E->getOpcode())) {

8555

V0 = Builder.CreateBinOp(

8556

static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);

8557

V1 = Builder.CreateBinOp(

8558

static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);

8559

} else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {

8560

V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);

8561

auto *AltCI = cast<CmpInst>(E->getAltOp());

8562

CmpInst::Predicate AltPred = AltCI->getPredicate();

8563

V1 = Builder.CreateCmp(AltPred, LHS, RHS);

8564

} else {

8565

V0 = Builder.CreateCast(

8566

static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);

8567

V1 = Builder.CreateCast(

8568

static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);

8569

}

8570

// Add V0 and V1 to later analysis to try to find and remove matching

8571

// instruction, if any.

8572

for (Value *V : {V0, V1}) {

8573

if (auto *I = dyn_cast<Instruction>(V)) {

8574

GatherShuffleSeq.insert(I);

8575

CSEBlocks.insert(I->getParent());

8576

}

8577

}

8578

8579

// Create shuffle to take alternate operations from the vector.

8580

// Also, gather up main and alt scalar ops to propagate IR flags to

8581

// each vector operation.

8582

ValueList OpScalars, AltScalars;

8583

SmallVector<int> Mask;

8584

buildShuffleEntryMask(

8585

E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,

8586

[E](Instruction *I) {

8587

assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode")(static_cast <bool> (E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"
) ? void (0) : __assert_fail ("E->isOpcodeOrAlt(I) && \"Unexpected main/alternate opcode\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8587, __extension__
__PRETTY_FUNCTION__));

8588

return isAlternateInstruction(I, E->getMainOp(), E->getAltOp());

8589

},

8590

Mask, &OpScalars, &AltScalars);

8591

8592

propagateIRFlags(V0, OpScalars);

8593

propagateIRFlags(V1, AltScalars);

8594

8595

Value *V = Builder.CreateShuffleVector(V0, V1, Mask);

8596

if (auto *I = dyn_cast<Instruction>(V)) {

8597

V = propagateMetadata(I, E->Scalars);

8598

GatherShuffleSeq.insert(I);

8599

CSEBlocks.insert(I->getParent());

8600

}

8601

V = ShuffleBuilder.finalize(V);

8602

8603

E->VectorizedValue = V;

8604

++NumVectorInstructions;

8605

8606

return V;

8607

}

8608

default:

8609

llvm_unreachable("unknown inst")::llvm::llvm_unreachable_internal("unknown inst", "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp"
, 8609);

8610

}

8611

return nullptr;

8612

}

8613

8614

/// Convenience overload of vectorizeTree that takes no arguments.
///
/// Creates a default-constructed ExtraValueToDebugLocsMap and forwards it to
/// the overload taking the map by reference. A named local is required because
/// that overload accepts a non-const reference.
///
/// \returns the value returned by the forwarding target.
Value *BoUpSLP::vectorizeTree() {
  ExtraValueToDebugLocsMap NoExtraExternalUses;
  return vectorizeTree(NoExtraExternalUses);
}

8618

8619

namespace {

8620

/// Data type for handling buildvector sequences with the reused scalars from

8621

/// other tree entries.

8622

struct ShuffledInsertData {

8623

/// List of insertelements to be replaced by shuffles.

8624

SmallVector<InsertElementInst *> InsertElements;

8625

/// The parent vectors and shuffle mask for the given list of inserts.

8626

MapVector<Value *, SmallVector<int>> ValueMasks;

8627

};

8628

} // namespace

8629

8630

Value *

8631

BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {

8632

// All blocks must be scheduled before any instructions are inserted.

8633

for (auto &BSIter : BlocksSchedules) {

8634

scheduleBlock(BSIter.second.get());

8635

}

8636

8637

Builder.SetInsertPoint(&F->getEntryBlock().front());

8638

auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());

8639

8640

// If the vectorized tree can be rewritten in a smaller type, we truncate the

8641

// vectorized root. InstCombine will then rewrite the entire expression. We

8642

// sign extend the extracted values below.

8643

auto *ScalarRoot = VectorizableTree[0]->Scalars[0];

8644

if (MinBWs.count(ScalarRoot)) {

8645

if (auto *I = dyn_cast<Instruction>(VectorRoot)) {

8646

// If current instr is a phi and not the last phi, insert it after the

8647

// last phi node.

8648

if (isa<PHINode>(I))

8649

Builder.SetInsertPoint(&*I->getParent()->getFirstInsertionPt());

8650

else

8651

Builder.SetInsertPoint(&*++BasicBlock::iterator(I));

8652

}

8653

auto BundleWidth = VectorizableTree[0]->Scalars.size();

8654

auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);

8655

auto *VecTy = FixedVectorType::get(MinTy, BundleWidth);

8656

auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);

8657

VectorizableTree[0]->VectorizedValue = Trunc;

8658

}

8659

8660

LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Extracting " << ExternalUses
.size() << " values .\n"; } } while (false)

8661

<< " values .\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Extracting " << ExternalUses
.size() << " values .\n"; } } while (false);

8662

8663

SmallVector<ShuffledInsertData> ShuffledInserts;

8664

// Maps vector instruction to original insertelement instruction

8665

DenseMap<Value *, InsertElementInst *> VectorToInsertElement;

8666

// Extract all of the elements with the external uses.

8667

for (const auto &ExternalUse : ExternalUses) {

8668

Value *Scalar = ExternalUse.Scalar;

8669

llvm::User *User = ExternalUse.User;

8670

8671

// Skip users that we already RAUW. This happens when one instruction

8672

// has multiple uses of the same value.

8673

if (User && !is_contained(Scalar->users(), User))

8674

continue;

8675

TreeEntry *E = getTreeEntry(Scalar);

8676

assert(E && "Invalid scalar")(static_cast <bool> (E && "Invalid scalar") ? void
(0) : __assert_fail ("E && \"Invalid scalar\"", "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp"
, 8676, __extension__ __PRETTY_FUNCTION__));

8677

assert(E->State != TreeEntry::NeedToGather &&(static_cast <bool> (E->State != TreeEntry::NeedToGather
&& "Extracting from a gather list") ? void (0) : __assert_fail
("E->State != TreeEntry::NeedToGather && \"Extracting from a gather list\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8678, __extension__
__PRETTY_FUNCTION__))

8678

"Extracting from a gather list")(static_cast <bool> (E->State != TreeEntry::NeedToGather
&& "Extracting from a gather list") ? void (0) : __assert_fail
("E->State != TreeEntry::NeedToGather && \"Extracting from a gather list\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8678, __extension__
__PRETTY_FUNCTION__));

8679

// Non-instruction pointers are not deleted, just skip them.

8680

if (E->getOpcode() == Instruction::GetElementPtr &&

8681

!isa<GetElementPtrInst>(Scalar))

8682

continue;

8683

8684

Value *Vec = E->VectorizedValue;

8685

assert(Vec && "Can't find vectorizable value")(static_cast <bool> (Vec && "Can't find vectorizable value"
) ? void (0) : __assert_fail ("Vec && \"Can't find vectorizable value\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8685, __extension__
__PRETTY_FUNCTION__));

8686

8687

Value *Lane = Builder.getInt32(ExternalUse.Lane);

8688

auto ExtractAndExtendIfNeeded = [&](Value *Vec) {

8689

if (Scalar->getType() != Vec->getType()) {

8690

Value *Ex;

8691

// "Reuse" the existing extract to improve final codegen.

8692

if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {

8693

Ex = Builder.CreateExtractElement(ES->getOperand(0),

8694

ES->getOperand(1));

8695

} else {

8696

Ex = Builder.CreateExtractElement(Vec, Lane);

8697

}

8698

// If necessary, sign-extend or zero-extend ScalarRoot

8699

// to the larger type.

8700

if (!MinBWs.count(ScalarRoot))

8701

return Ex;

8702

if (MinBWs[ScalarRoot].second)

8703

return Builder.CreateSExt(Ex, Scalar->getType());

8704

return Builder.CreateZExt(Ex, Scalar->getType());

8705

}

8706

assert(isa<FixedVectorType>(Scalar->getType()) &&(static_cast <bool> (isa<FixedVectorType>(Scalar->
getType()) && isa<InsertElementInst>(Scalar) &&
"In-tree scalar of vector type is not insertelement?") ? void
(0) : __assert_fail ("isa<FixedVectorType>(Scalar->getType()) && isa<InsertElementInst>(Scalar) && \"In-tree scalar of vector type is not insertelement?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8708, __extension__
__PRETTY_FUNCTION__))

8707

isa<InsertElementInst>(Scalar) &&(static_cast <bool> (isa<FixedVectorType>(Scalar->
getType()) && isa<InsertElementInst>(Scalar) &&
"In-tree scalar of vector type is not insertelement?") ? void
(0) : __assert_fail ("isa<FixedVectorType>(Scalar->getType()) && isa<InsertElementInst>(Scalar) && \"In-tree scalar of vector type is not insertelement?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8708, __extension__
__PRETTY_FUNCTION__))

8708

"In-tree scalar of vector type is not insertelement?")(static_cast <bool> (isa<FixedVectorType>(Scalar->
getType()) && isa<InsertElementInst>(Scalar) &&
"In-tree scalar of vector type is not insertelement?") ? void
(0) : __assert_fail ("isa<FixedVectorType>(Scalar->getType()) && isa<InsertElementInst>(Scalar) && \"In-tree scalar of vector type is not insertelement?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8708, __extension__
__PRETTY_FUNCTION__));

8709

auto *IE = cast<InsertElementInst>(Scalar);

8710

VectorToInsertElement.try_emplace(Vec, IE);

8711

return Vec;

8712

};

8713

// If User == nullptr, the Scalar is used as extra arg. Generate

8714

// ExtractElement instruction and update the record for this scalar in

8715

// ExternallyUsedValues.

8716

if (!User) {

8717

assert(ExternallyUsedValues.count(Scalar) &&(static_cast <bool> (ExternallyUsedValues.count(Scalar)
&& "Scalar with nullptr as an external user must be registered in "
"ExternallyUsedValues map") ? void (0) : __assert_fail ("ExternallyUsedValues.count(Scalar) && \"Scalar with nullptr as an external user must be registered in \" \"ExternallyUsedValues map\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8719, __extension__
__PRETTY_FUNCTION__))

8718

"Scalar with nullptr as an external user must be registered in "(static_cast <bool> (ExternallyUsedValues.count(Scalar)
&& "Scalar with nullptr as an external user must be registered in "
"ExternallyUsedValues map") ? void (0) : __assert_fail ("ExternallyUsedValues.count(Scalar) && \"Scalar with nullptr as an external user must be registered in \" \"ExternallyUsedValues map\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8719, __extension__
__PRETTY_FUNCTION__))

8719

"ExternallyUsedValues map")(static_cast <bool> (ExternallyUsedValues.count(Scalar)
&& "Scalar with nullptr as an external user must be registered in "
"ExternallyUsedValues map") ? void (0) : __assert_fail ("ExternallyUsedValues.count(Scalar) && \"Scalar with nullptr as an external user must be registered in \" \"ExternallyUsedValues map\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8719, __extension__
__PRETTY_FUNCTION__));

8720

if (auto *VecI = dyn_cast<Instruction>(Vec)) {

8721

Builder.SetInsertPoint(VecI->getParent(),

8722

std::next(VecI->getIterator()));

8723

} else {

8724

Builder.SetInsertPoint(&F->getEntryBlock().front());

8725

}

8726

Value *NewInst = ExtractAndExtendIfNeeded(Vec);

8727

CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());

8728

auto &NewInstLocs = ExternallyUsedValues[NewInst];

8729

auto It = ExternallyUsedValues.find(Scalar);

8730

assert(It != ExternallyUsedValues.end() &&(static_cast <bool> (It != ExternallyUsedValues.end() &&
"Externally used scalar is not found in ExternallyUsedValues"
) ? void (0) : __assert_fail ("It != ExternallyUsedValues.end() && \"Externally used scalar is not found in ExternallyUsedValues\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8731, __extension__
__PRETTY_FUNCTION__))

8731

"Externally used scalar is not found in ExternallyUsedValues")(static_cast <bool> (It != ExternallyUsedValues.end() &&
"Externally used scalar is not found in ExternallyUsedValues"
) ? void (0) : __assert_fail ("It != ExternallyUsedValues.end() && \"Externally used scalar is not found in ExternallyUsedValues\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8731, __extension__
__PRETTY_FUNCTION__));

8732

NewInstLocs.append(It->second);

8733

ExternallyUsedValues.erase(Scalar);

8734

// Required to update internally referenced instructions.

8735

Scalar->replaceAllUsesWith(NewInst);

8736

continue;

8737

}

8738

8739

if (auto *VU = dyn_cast<InsertElementInst>(User)) {

8740

// Skip if the scalar is another vector op or Vec is not an instruction.

8741

if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {

8742

if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {

8743

Optional<unsigned> InsertIdx = getInsertIndex(VU);

8744

if (InsertIdx) {

8745

// Need to use original vector, if the root is truncated.

8746

if (MinBWs.count(Scalar) &&

8747

VectorizableTree[0]->VectorizedValue == Vec)

8748

Vec = VectorRoot;

8749

auto *It =

8750

find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {

8751

// Checks if 2 insertelements are from the same buildvector.

8752

InsertElementInst *VecInsert = Data.InsertElements.front();

8753

return areTwoInsertFromSameBuildVector(

8754

VU, VecInsert,

8755

[](InsertElementInst *II) { return II->getOperand(0); });

8756

});

8757

unsigned Idx = *InsertIdx;

8758

if (It == ShuffledInserts.end()) {

8759

(void)ShuffledInserts.emplace_back();

8760

It = std::next(ShuffledInserts.begin(),

8761

ShuffledInserts.size() - 1);

8762

SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];

8763

if (Mask.empty())

8764

Mask.assign(FTy->getNumElements(), UndefMaskElem);

8765

// Find the insertvector, vectorized in tree, if any.

8766

Value *Base = VU;

8767

while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {

8768

if (IEBase != User &&

8769

(!IEBase->hasOneUse() ||

8770

getInsertIndex(IEBase).value_or(Idx) == Idx))

8771

break;

8772

// Build the mask for the vectorized insertelement instructions.

8773

if (const TreeEntry *E = getTreeEntry(IEBase)) {

8774

do {

8775

IEBase = cast<InsertElementInst>(Base);

8776

int IEIdx = *getInsertIndex(IEBase);

8777

assert(Mask[Idx] == UndefMaskElem &&(static_cast <bool> (Mask[Idx] == UndefMaskElem &&
"InsertElementInstruction used already.") ? void (0) : __assert_fail
("Mask[Idx] == UndefMaskElem && \"InsertElementInstruction used already.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8778, __extension__
__PRETTY_FUNCTION__))

8778

"InsertElementInstruction used already.")(static_cast <bool> (Mask[Idx] == UndefMaskElem &&
"InsertElementInstruction used already.") ? void (0) : __assert_fail
("Mask[Idx] == UndefMaskElem && \"InsertElementInstruction used already.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8778, __extension__
__PRETTY_FUNCTION__));

8779

Mask[IEIdx] = IEIdx;

8780

Base = IEBase->getOperand(0);

8781

} while (E == getTreeEntry(Base));

8782

break;

8783

}

8784

Base = cast<InsertElementInst>(Base)->getOperand(0);

8785

// After the vectorization the def-use chain has changed, need

8786

// to look through original insertelement instructions, if they

8787

// get replaced by vector instructions.

8788

auto It = VectorToInsertElement.find(Base);

8789

if (It != VectorToInsertElement.end())

8790

Base = It->second;

8791

}

8792

}

8793

SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];

8794

if (Mask.empty())

8795

Mask.assign(FTy->getNumElements(), UndefMaskElem);

8796

Mask[Idx] = ExternalUse.Lane;

8797

It->InsertElements.push_back(cast<InsertElementInst>(User));

8798

continue;

8799

}

8800

}

8801

}

8802

}

8803

8804

// Generate extracts for out-of-tree users.

8805

// Find the insertion point for the extractelement lane.

8806

if (auto *VecI = dyn_cast<Instruction>(Vec)) {

8807

if (PHINode *PH = dyn_cast<PHINode>(User)) {

8808

for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {

8809

if (PH->getIncomingValue(i) == Scalar) {

8810

Instruction *IncomingTerminator =

8811

PH->getIncomingBlock(i)->getTerminator();

8812

if (isa<CatchSwitchInst>(IncomingTerminator)) {

8813

Builder.SetInsertPoint(VecI->getParent(),

8814

std::next(VecI->getIterator()));

8815

} else {

8816

Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());

8817

}

8818

Value *NewInst = ExtractAndExtendIfNeeded(Vec);

8819

CSEBlocks.insert(PH->getIncomingBlock(i));

8820

PH->setOperand(i, NewInst);

8821

}

8822

}

8823

} else {

8824

Builder.SetInsertPoint(cast<Instruction>(User));

8825

Value *NewInst = ExtractAndExtendIfNeeded(Vec);

8826

CSEBlocks.insert(cast<Instruction>(User)->getParent());

8827

User->replaceUsesOfWith(Scalar, NewInst);

8828

}

8829

} else {

8830

Builder.SetInsertPoint(&F->getEntryBlock().front());

8831

Value *NewInst = ExtractAndExtendIfNeeded(Vec);

8832

CSEBlocks.insert(&F->getEntryBlock());

8833

User->replaceUsesOfWith(Scalar, NewInst);

8834

}

8835

8836

LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Replaced:" << *User <<
".\n"; } } while (false);

8837

}

8838

8839

// Checks if the mask is an identity mask.

8840

auto &&IsIdentityMask = [](ArrayRef<int> Mask, FixedVectorType *VecTy) {

8841

int Limit = Mask.size();

8842

return VecTy->getNumElements() == Mask.size() &&

8843

all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&

8844

ShuffleVectorInst::isIdentityMask(Mask);

8845

};

8846

// Tries to combine 2 different masks into single one.

8847

auto &&CombineMasks = [](SmallVectorImpl<int> &Mask, ArrayRef<int> ExtMask) {

8848

SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem);

8849

for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {

8850

if (ExtMask[I] == UndefMaskElem)

8851

continue;

8852

NewMask[I] = Mask[ExtMask[I]];

8853

}

8854

Mask.swap(NewMask);

8855

};

8856

// Peek through shuffles, trying to simplify the final shuffle code.

8857

auto &&PeekThroughShuffles =

8858

[&IsIdentityMask, &CombineMasks](Value *&V, SmallVectorImpl<int> &Mask,

8859

bool CheckForLengthChange = false) {

8860

while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {

8861

// Exit if not a fixed vector type or changing size shuffle.

8862

if (!isa<FixedVectorType>(SV->getType()) ||

8863

(CheckForLengthChange && SV->changesLength()))

8864

break;

8865

// Exit if the identity or broadcast mask is found.

8866

if (IsIdentityMask(Mask, cast<FixedVectorType>(SV->getType())) ||

8867

SV->isZeroEltSplat())

8868

break;

8869

bool IsOp1Undef = isUndefVector(SV->getOperand(0), Mask);

8870

bool IsOp2Undef = isUndefVector(SV->getOperand(1), Mask);

8871

if (!IsOp1Undef && !IsOp2Undef)

8872

break;

8873

SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),

8874

SV->getShuffleMask().end());

8875

CombineMasks(ShuffleMask, Mask);

8876

Mask.swap(ShuffleMask);

8877

if (IsOp2Undef)

8878

V = SV->getOperand(0);

8879

else

8880

V = SV->getOperand(1);

8881

}

8882

};

8883

// Smart shuffle instruction emission, walks through shuffles trees and

8884

// tries to find the best matching vector for the actual shuffle

8885

// instruction.

8886

auto &&CreateShuffle = [this, &IsIdentityMask, &PeekThroughShuffles,

8887

&CombineMasks](Value *V1, Value *V2,

8888

ArrayRef<int> Mask) -> Value * {

8889

assert(V1 && "Expected at least one vector value.")(static_cast <bool> (V1 && "Expected at least one vector value."
) ? void (0) : __assert_fail ("V1 && \"Expected at least one vector value.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8889, __extension__
__PRETTY_FUNCTION__));

8890

if (V2 && !isUndefVector(V2, Mask)) {

8891

// Peek through shuffles.

8892

Value *Op1 = V1;

8893

Value *Op2 = V2;

8894

int VF =

8895

cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();

8896

SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);

8897

SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);

8898

for (int I = 0, E = Mask.size(); I < E; ++I) {

8899

if (Mask[I] < VF)

8900

CombinedMask1[I] = Mask[I];

8901

else

8902

CombinedMask2[I] = Mask[I] - VF;

8903

}

8904

Value *PrevOp1;

8905

Value *PrevOp2;

8906

do {

8907

PrevOp1 = Op1;

8908

PrevOp2 = Op2;

8909

PeekThroughShuffles(Op1, CombinedMask1, /*CheckForLengthChange=*/true);

8910

PeekThroughShuffles(Op2, CombinedMask2, /*CheckForLengthChange=*/true);

8911

// Check if we have 2 resizing shuffles - need to peek through operands

8912

// again.

8913

if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))

8914

if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2))

8915

if (SV1->getOperand(0)->getType() ==

8916

SV2->getOperand(0)->getType() &&

8917

SV1->getOperand(0)->getType() != SV1->getType() &&

8918

isUndefVector(SV1->getOperand(1), CombinedMask1) &&

8919

isUndefVector(SV2->getOperand(1), CombinedMask2)) {

8920

Op1 = SV1->getOperand(0);

8921

Op2 = SV2->getOperand(0);

8922

SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),

8923

SV1->getShuffleMask().end());

8924

CombineMasks(ShuffleMask1, CombinedMask1);

8925

CombinedMask1.swap(ShuffleMask1);

8926

SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),

8927

SV2->getShuffleMask().end());

8928

CombineMasks(ShuffleMask2, CombinedMask2);

8929

CombinedMask2.swap(ShuffleMask2);

8930

}

8931

} while (PrevOp1 != Op1 || PrevOp2 != Op2);

8932

VF = cast<VectorType>(Op1->getType())

8933

->getElementCount()

8934

.getKnownMinValue();

8935

for (int I = 0, E = Mask.size(); I < E; ++I) {

8936

if (CombinedMask2[I] != UndefMaskElem) {

8937

assert(CombinedMask1[I] == UndefMaskElem &&(static_cast <bool> (CombinedMask1[I] == UndefMaskElem &&
"Expected undefined mask element") ? void (0) : __assert_fail
("CombinedMask1[I] == UndefMaskElem && \"Expected undefined mask element\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8938, __extension__
__PRETTY_FUNCTION__))

8938

"Expected undefined mask element")(static_cast <bool> (CombinedMask1[I] == UndefMaskElem &&
"Expected undefined mask element") ? void (0) : __assert_fail
("CombinedMask1[I] == UndefMaskElem && \"Expected undefined mask element\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 8938, __extension__
__PRETTY_FUNCTION__));

8939

CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);

8940

}

8941

}

8942

Value *Vec = Builder.CreateShuffleVector(

8943

Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,

8944

CombinedMask1);

8945

if (auto *I = dyn_cast<Instruction>(Vec)) {

8946

GatherShuffleSeq.insert(I);

8947

CSEBlocks.insert(I->getParent());

8948

}

8949

return Vec;

8950

}

8951

if (isa<PoisonValue>(V1))

8952

return PoisonValue::get(FixedVectorType::get(

8953

cast<VectorType>(V1->getType())->getElementType(), Mask.size()));

8954

Value *Op = V1;

8955

SmallVector<int> CombinedMask(Mask);

8956

PeekThroughShuffles(Op, CombinedMask);

8957

if (!isa<FixedVectorType>(Op->getType()) ||

8958

!IsIdentityMask(CombinedMask, cast<FixedVectorType>(Op->getType()))) {

8959

Value *Vec = Builder.CreateShuffleVector(Op, CombinedMask);

8960

if (auto *I = dyn_cast<Instruction>(Vec)) {

8961

GatherShuffleSeq.insert(I);

8962

CSEBlocks.insert(I->getParent());

8963

}

8964

return Vec;

8965

}

8966

return Op;

8967

};

8968

8969

auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,

8970

bool ForSingleMask) {

8971

unsigned VF = Mask.size();

8972

unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();

8973

if (VF != VecVF) {

8974

if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {

8975

Vec = CreateShuffle(Vec, nullptr, Mask);

8976

return std::make_pair(Vec, true);

8977

}

8978

if (!ForSingleMask) {

8979

SmallVector<int> ResizeMask(VF, UndefMaskElem);

8980

for (unsigned I = 0; I < VF; ++I) {

8981

if (Mask[I] != UndefMaskElem)

8982

ResizeMask[Mask[I]] = Mask[I];

8983

}

8984

Vec = CreateShuffle(Vec, nullptr, ResizeMask);

8985

}

8986

}

8987

8988

return std::make_pair(Vec, false);

8989

};

8990

// Perform shuffling of the vectorize tree entries for better handling of

8991

// external extracts.

8992

for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {

8993

// Find the first and the last instruction in the list of insertelements.

8994

sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);

8995

InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();

8996

InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();

8997

Builder.SetInsertPoint(LastInsert);

8998

auto Vector = ShuffledInserts[I].ValueMasks.takeVector();

8999

Value *NewInst = performExtractsShuffleAction<Value>(

9000

makeMutableArrayRef(Vector.data(), Vector.size()),

9001

FirstInsert->getOperand(0),

9002

[](Value *Vec) {

9003

return cast<VectorType>(Vec->getType())

9004

->getElementCount()

9005

.getKnownMinValue();

9006

},

9007

ResizeToVF,

9008

[FirstInsert, &CreateShuffle](ArrayRef<int> Mask,

9009

ArrayRef<Value *> Vals) {

9010

assert((Vals.size() == 1 || Vals.size() == 2) &&(static_cast <bool> ((Vals.size() == 1 || Vals.size() ==
2) && "Expected exactly 1 or 2 input values.") ? void
(0) : __assert_fail ("(Vals.size() == 1 || Vals.size() == 2) && \"Expected exactly 1 or 2 input values.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9011, __extension__
__PRETTY_FUNCTION__))

9011

"Expected exactly 1 or 2 input values.")(static_cast <bool> ((Vals.size() == 1 || Vals.size() ==
2) && "Expected exactly 1 or 2 input values.") ? void
(0) : __assert_fail ("(Vals.size() == 1 || Vals.size() == 2) && \"Expected exactly 1 or 2 input values.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9011, __extension__
__PRETTY_FUNCTION__));

9012

if (Vals.size() == 1) {

9013

// Do not create shuffle if the mask is a simple identity

9014

// non-resizing mask.

9015

if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())

9016

->getNumElements() ||

9017

!ShuffleVectorInst::isIdentityMask(Mask))

9018

return CreateShuffle(Vals.front(), nullptr, Mask);

9019

return Vals.front();

9020

}

9021

return CreateShuffle(Vals.front() ? Vals.front()

9022

: FirstInsert->getOperand(0),

9023

Vals.back(), Mask);

9024

});

9025

auto It = ShuffledInserts[I].InsertElements.rbegin();

9026

// Rebuild buildvector chain.

9027

InsertElementInst *II = nullptr;

9028

if (It != ShuffledInserts[I].InsertElements.rend())

9029

II = *It;

9030

SmallVector<Instruction *> Inserts;

9031

while (It != ShuffledInserts[I].InsertElements.rend()) {

9032

assert(II && "Must be an insertelement instruction.")(static_cast <bool> (II && "Must be an insertelement instruction."
) ? void (0) : __assert_fail ("II && \"Must be an insertelement instruction.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9032, __extension__
__PRETTY_FUNCTION__));

9033

if (*It == II)

9034

++It;

9035

else

9036

Inserts.push_back(cast<Instruction>(II));

9037

II = dyn_cast<InsertElementInst>(II->getOperand(0));

9038

}

9039

for (Instruction *II : reverse(Inserts)) {

9040

II->replaceUsesOfWith(II->getOperand(0), NewInst);

9041

if (auto *NewI = dyn_cast<Instruction>(NewInst))

9042

if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))

9043

II->moveAfter(NewI);

9044

NewInst = II;

9045

}

9046

LastInsert->replaceAllUsesWith(NewInst);

9047

for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {

9048

IE->replaceUsesOfWith(IE->getOperand(0),

9049

PoisonValue::get(IE->getOperand(0)->getType()));

9050

IE->replaceUsesOfWith(IE->getOperand(1),

9051

PoisonValue::get(IE->getOperand(1)->getType()));

9052

eraseInstruction(IE);

9053

}

9054

CSEBlocks.insert(LastInsert->getParent());

9055

}

9056

9057

// For each vectorized value:

9058

for (auto &TEPtr : VectorizableTree) {

9059

TreeEntry *Entry = TEPtr.get();

9060

9061

// No need to handle users of gathered values.

9062

if (Entry->State == TreeEntry::NeedToGather)

9063

continue;

9064

9065

assert(Entry->VectorizedValue && "Can't find vectorizable value")(static_cast <bool> (Entry->VectorizedValue &&
"Can't find vectorizable value") ? void (0) : __assert_fail (
"Entry->VectorizedValue && \"Can't find vectorizable value\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9065, __extension__
__PRETTY_FUNCTION__));

9066

9067

// For each lane:

9068

for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {

9069

Value *Scalar = Entry->Scalars[Lane];

9070

9071

if (Entry->getOpcode() == Instruction::GetElementPtr &&

9072

!isa<GetElementPtrInst>(Scalar))

9073

continue;

9074

#ifndef NDEBUG

9075

Type *Ty = Scalar->getType();

9076

if (!Ty->isVoidTy()) {

9077

for (User *U : Scalar->users()) {

9078

LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: \tvalidating user:" <<
*U << ".\n"; } } while (false);

9079

9080

// It is legal to delete users in the ignorelist.

9081

assert((getTreeEntry(U) ||(static_cast <bool> ((getTreeEntry(U) || (UserIgnoreList
&& UserIgnoreList->contains(U)) || (isa_and_nonnull
<Instruction>(U) && isDeleted(cast<Instruction
>(U)))) && "Deleting out-of-tree value") ? void (0
) : __assert_fail ("(getTreeEntry(U) || (UserIgnoreList && UserIgnoreList->contains(U)) || (isa_and_nonnull<Instruction>(U) && isDeleted(cast<Instruction>(U)))) && \"Deleting out-of-tree value\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9085, __extension__
__PRETTY_FUNCTION__))

9082

(UserIgnoreList && UserIgnoreList->contains(U)) ||(static_cast <bool> ((getTreeEntry(U) || (UserIgnoreList
&& UserIgnoreList->contains(U)) || (isa_and_nonnull
<Instruction>(U) && isDeleted(cast<Instruction
>(U)))) && "Deleting out-of-tree value") ? void (0
) : __assert_fail ("(getTreeEntry(U) || (UserIgnoreList && UserIgnoreList->contains(U)) || (isa_and_nonnull<Instruction>(U) && isDeleted(cast<Instruction>(U)))) && \"Deleting out-of-tree value\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9085, __extension__
__PRETTY_FUNCTION__))

9083

(isa_and_nonnull<Instruction>(U) &&(static_cast <bool> ((getTreeEntry(U) || (UserIgnoreList
&& UserIgnoreList->contains(U)) || (isa_and_nonnull
<Instruction>(U) && isDeleted(cast<Instruction
>(U)))) && "Deleting out-of-tree value") ? void (0
) : __assert_fail ("(getTreeEntry(U) || (UserIgnoreList && UserIgnoreList->contains(U)) || (isa_and_nonnull<Instruction>(U) && isDeleted(cast<Instruction>(U)))) && \"Deleting out-of-tree value\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9085, __extension__
__PRETTY_FUNCTION__))

9084

isDeleted(cast<Instruction>(U)))) &&(static_cast <bool> ((getTreeEntry(U) || (UserIgnoreList
&& UserIgnoreList->contains(U)) || (isa_and_nonnull
<Instruction>(U) && isDeleted(cast<Instruction
>(U)))) && "Deleting out-of-tree value") ? void (0
) : __assert_fail ("(getTreeEntry(U) || (UserIgnoreList && UserIgnoreList->contains(U)) || (isa_and_nonnull<Instruction>(U) && isDeleted(cast<Instruction>(U)))) && \"Deleting out-of-tree value\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9085, __extension__
__PRETTY_FUNCTION__))

9085

"Deleting out-of-tree value")(static_cast <bool> ((getTreeEntry(U) || (UserIgnoreList
&& UserIgnoreList->contains(U)) || (isa_and_nonnull
<Instruction>(U) && isDeleted(cast<Instruction
>(U)))) && "Deleting out-of-tree value") ? void (0
) : __assert_fail ("(getTreeEntry(U) || (UserIgnoreList && UserIgnoreList->contains(U)) || (isa_and_nonnull<Instruction>(U) && isDeleted(cast<Instruction>(U)))) && \"Deleting out-of-tree value\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9085, __extension__
__PRETTY_FUNCTION__));

9086

}

9087

}

9088

#endif

9089

LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: \tErasing scalar:" << *
Scalar << ".\n"; } } while (false);

9090

eraseInstruction(cast<Instruction>(Scalar));

9091

}

9092

}

9093

9094

Builder.ClearInsertionPoint();

9095

InstrElementSize.clear();

9096

9097

return VectorizableTree[0]->VectorizedValue;

9098

}

9099

9100

void BoUpSLP::optimizeGatherSequence() {

9101

LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleSeq.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Optimizing " << GatherShuffleSeq
.size() << " gather sequences instructions.\n"; } } while
(false)

9102

<< " gather sequences instructions.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Optimizing " << GatherShuffleSeq
.size() << " gather sequences instructions.\n"; } } while
(false);

9103

// LICM InsertElementInst sequences.

9104

for (Instruction *I : GatherShuffleSeq) {

9105

if (isDeleted(I))

9106

continue;

9107

9108

// Check if this block is inside a loop.

9109

Loop *L = LI->getLoopFor(I->getParent());

9110

if (!L)

9111

continue;

9112

9113

// Check if it has a preheader.

9114

BasicBlock *PreHeader = L->getLoopPreheader();

9115

if (!PreHeader)

9116

continue;

9117

9118

// If the vector or the element that we insert into it are

9119

// instructions that are defined in this basic block then we can't

9120

// hoist this instruction.

9121

if (any_of(I->operands(), [L](Value *V) {

9122

auto *OpI = dyn_cast<Instruction>(V);

9123

return OpI && L->contains(OpI);

9124

}))

9125

continue;

9126

9127

// We can hoist this instruction. Move it to the pre-header.

9128

I->moveBefore(PreHeader->getTerminator());

9129

CSEBlocks.insert(PreHeader);

9130

}

9131

9132

// Make a list of all reachable blocks in our CSE queue.

9133

SmallVector<const DomTreeNode *, 8> CSEWorkList;

9134

CSEWorkList.reserve(CSEBlocks.size());

9135

for (BasicBlock *BB : CSEBlocks)

9136

if (DomTreeNode *N = DT->getNode(BB)) {

9137

assert(DT->isReachableFromEntry(N))(static_cast <bool> (DT->isReachableFromEntry(N)) ? void
(0) : __assert_fail ("DT->isReachableFromEntry(N)", "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp"
, 9137, __extension__ __PRETTY_FUNCTION__));

9138

CSEWorkList.push_back(N);

9139

}

9140

9141

// Sort blocks by domination. This ensures we visit a block after all blocks

9142

// dominating it are visited.

9143

llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {

9144

assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&(static_cast <bool> ((A == B) == (A->getDFSNumIn() ==
B->getDFSNumIn()) && "Different nodes should have different DFS numbers"
) ? void (0) : __assert_fail ("(A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) && \"Different nodes should have different DFS numbers\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9145, __extension__
__PRETTY_FUNCTION__))

9145

"Different nodes should have different DFS numbers")(static_cast <bool> ((A == B) == (A->getDFSNumIn() ==
B->getDFSNumIn()) && "Different nodes should have different DFS numbers"
) ? void (0) : __assert_fail ("(A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) && \"Different nodes should have different DFS numbers\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9145, __extension__
__PRETTY_FUNCTION__));

9146

return A->getDFSNumIn() < B->getDFSNumIn();

9147

});

9148

9149

// Less defined shuffles can be replaced by the more defined copies.

9150

// Between two shuffles one is less defined if it has the same vector operands

9151

// and its mask indeces are the same as in the first one or undefs. E.g.

9152

// shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,

9153

// poison, <0, 0, 0, 0>.

9154

auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,

9155

SmallVectorImpl<int> &NewMask) {

9156

if (I1->getType() != I2->getType())

9157

return false;

9158

auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);

9159

auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);

9160

if (!SI1 || !SI2)

9161

return I1->isIdenticalTo(I2);

9162

if (SI1->isIdenticalTo(SI2))

9163

return true;

9164

for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)

9165

if (SI1->getOperand(I) != SI2->getOperand(I))

9166

return false;

9167

// Check if the second instruction is more defined than the first one.

9168

NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());

9169

ArrayRef<int> SM1 = SI1->getShuffleMask();

9170

// Count trailing undefs in the mask to check the final number of used

9171

// registers.

9172

unsigned LastUndefsCnt = 0;

9173

for (int I = 0, E = NewMask.size(); I < E; ++I) {

9174

if (SM1[I] == UndefMaskElem)

9175

++LastUndefsCnt;

9176

else

9177

LastUndefsCnt = 0;

9178

if (NewMask[I] != UndefMaskElem && SM1[I] != UndefMaskElem &&

9179

NewMask[I] != SM1[I])

9180

return false;

9181

if (NewMask[I] == UndefMaskElem)

9182

NewMask[I] = SM1[I];

9183

}

9184

// Check if the last undefs actually change the final number of used vector

9185

// registers.

9186

return SM1.size() - LastUndefsCnt > 1 &&

9187

TTI->getNumberOfParts(SI1->getType()) ==

9188

TTI->getNumberOfParts(

9189

FixedVectorType::get(SI1->getType()->getElementType(),

9190

SM1.size() - LastUndefsCnt));

9191

};

9192

// Perform O(N^2) search over the gather/shuffle sequences and merge identical

9193

// instructions. TODO: We can further optimize this scan if we split the

9194

// instructions into different buckets based on the insert lane.

9195

SmallVector<Instruction *, 16> Visited;

9196

for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {

9197

assert(*I &&(static_cast <bool> (*I && (I == CSEWorkList.begin
() || !DT->dominates(*I, *std::prev(I))) && "Worklist not sorted properly!"
) ? void (0) : __assert_fail ("*I && (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && \"Worklist not sorted properly!\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9199, __extension__
__PRETTY_FUNCTION__))

9198

(I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&(static_cast <bool> (*I && (I == CSEWorkList.begin
() || !DT->dominates(*I, *std::prev(I))) && "Worklist not sorted properly!"
) ? void (0) : __assert_fail ("*I && (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && \"Worklist not sorted properly!\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9199, __extension__
__PRETTY_FUNCTION__))

9199

"Worklist not sorted properly!")(static_cast <bool> (*I && (I == CSEWorkList.begin
() || !DT->dominates(*I, *std::prev(I))) && "Worklist not sorted properly!"
) ? void (0) : __assert_fail ("*I && (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && \"Worklist not sorted properly!\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9199, __extension__
__PRETTY_FUNCTION__));

9200

BasicBlock *BB = (*I)->getBlock();

9201

// For all instructions in blocks containing gather sequences:

9202

for (Instruction &In : llvm::make_early_inc_range(*BB)) {

9203

if (isDeleted(&In))

9204

continue;

9205

if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&

9206

!GatherShuffleSeq.contains(&In))

9207

continue;

9208

9209

// Check if we can replace this instruction with any of the

9210

// visited instructions.

9211

bool Replaced = false;

9212

for (Instruction *&V : Visited) {

9213

SmallVector<int> NewMask;

9214

if (IsIdenticalOrLessDefined(&In, V, NewMask) &&

9215

DT->dominates(V->getParent(), In.getParent())) {

9216

In.replaceAllUsesWith(V);

9217

eraseInstruction(&In);

9218

if (auto *SI = dyn_cast<ShuffleVectorInst>(V))

9219

if (!NewMask.empty())

9220

SI->setShuffleMask(NewMask);

9221

Replaced = true;

9222

break;

9223

}

9224

if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&

9225

GatherShuffleSeq.contains(V) &&

9226

IsIdenticalOrLessDefined(V, &In, NewMask) &&

9227

DT->dominates(In.getParent(), V->getParent())) {

9228

In.moveAfter(V);

9229

V->replaceAllUsesWith(&In);

9230

eraseInstruction(V);

9231

if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))

9232

if (!NewMask.empty())

9233

SI->setShuffleMask(NewMask);

9234

V = &In;

9235

Replaced = true;

9236

break;

9237

}

9238

}

9239

if (!Replaced) {

9240

assert(!is_contained(Visited, &In))(static_cast <bool> (!is_contained(Visited, &In)) ?
void (0) : __assert_fail ("!is_contained(Visited, &In)",
"llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9240, __extension__
__PRETTY_FUNCTION__));

9241

Visited.push_back(&In);

9242

}

9243

}

9244

}

9245

CSEBlocks.clear();

9246

GatherShuffleSeq.clear();

9247

}

9248

9249

// Links the ScheduleData of every value in VL that needs scheduling into a
// single bundle: members are chained through NextInBundle in VL order and
// each member's FirstInBundle points at the first chained member.
// Returns that first member (the bundle head).
BoUpSLP::ScheduleData *
BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
  // Head of the bundle; set when the first schedulable value is seen.
  ScheduleData *Bundle = nullptr;
  // Most recently chained member, i.e. the current tail of the bundle.
  ScheduleData *Tail = nullptr;

  for (Value *Scalar : VL) {
    // Values that do not need scheduling never become bundle members.
    if (doesNotNeedToBeScheduled(Scalar))
      continue;

    ScheduleData *BundleMember = getScheduleData(Scalar);
    assert(BundleMember &&
           "no ScheduleData for bundle member "
           "(maybe not in same basic block)");
    assert(BundleMember->isSchedulingEntity() &&
           "bundle member already part of other bundle");

    // The first member becomes the head; later ones are appended to the tail.
    if (!Tail)
      Bundle = BundleMember;
    else
      Tail->NextInBundle = BundleMember;

    // Group the instruction into the bundle.
    BundleMember->FirstInBundle = Bundle;
    Tail = BundleMember;
  }

  assert(Bundle && "Failed to find schedule bundle");
  return Bundle;
}

9275

9276

// Groups the instructions to a bundle (which is then a single scheduling entity)

9277

// and schedules instructions until the bundle gets ready.

9278

Optional<BoUpSLP::ScheduleData *>

9279

BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,

9280

const InstructionsState &S) {

9281

// No need to schedule PHIs, insertelement, extractelement and extractvalue

9282

// instructions.

9283

if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||

9284

doesNotNeedToSchedule(VL))

9285

return nullptr;

9286

9287

// Initialize the instruction bundle.

9288

Instruction *OldScheduleEnd = ScheduleEnd;

9289

LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: bundle: " << *S.OpValue
<< "\n"; } } while (false);

9290

9291

auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,

9292

ScheduleData *Bundle) {

9293

// The scheduling region got new instructions at the lower end (or it is a

9294

// new region for the first bundle). This makes it necessary to

9295

// recalculate all dependencies.

9296

// It is seldom that this needs to be done a second time after adding the

9297

// initial bundle to the region.

9298

if (ScheduleEnd != OldScheduleEnd) {

9299

for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())

9300

doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });

9301

ReSchedule = true;

9302

}

9303

if (Bundle) {

9304

LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundledo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: try schedule bundle " <<
*Bundle << " in block " << BB->getName() <<
"\n"; } } while (false)

9305

<< " in block " << BB->getName() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: try schedule bundle " <<
*Bundle << " in block " << BB->getName() <<
"\n"; } } while (false);

9306

calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);

9307

}

9308

9309

if (ReSchedule) {

9310

resetSchedule();

9311

initialFillReadyList(ReadyInsts);

9312

}

9313

9314

// Now try to schedule the new bundle or (if no bundle) just calculate

9315

// dependencies. As soon as the bundle is "ready" it means that there are no

9316

// cyclic dependencies and we can schedule it. Note that's important that we

9317

// don't "schedule" the bundle yet (see cancelScheduling).

9318

while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&

9319

!ReadyInsts.empty()) {

9320

ScheduleData *Picked = ReadyInsts.pop_back_val();

9321

assert(Picked->isSchedulingEntity() && Picked->isReady() &&(static_cast <bool> (Picked->isSchedulingEntity() &&
Picked->isReady() && "must be ready to schedule")
? void (0) : __assert_fail ("Picked->isSchedulingEntity() && Picked->isReady() && \"must be ready to schedule\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9322, __extension__
__PRETTY_FUNCTION__))

9322

"must be ready to schedule")(static_cast <bool> (Picked->isSchedulingEntity() &&
Picked->isReady() && "must be ready to schedule")
? void (0) : __assert_fail ("Picked->isSchedulingEntity() && Picked->isReady() && \"must be ready to schedule\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9322, __extension__
__PRETTY_FUNCTION__));

9323

schedule(Picked, ReadyInsts);

9324

}

9325

};

9326

9327

// Make sure that the scheduling region contains all

9328

// instructions of the bundle.

9329

for (Value *V : VL) {

9330

if (doesNotNeedToBeScheduled(V))

9331

continue;

9332

if (!extendSchedulingRegion(V, S)) {

9333

// If the scheduling region got new instructions at the lower end (or it

9334

// is a new region for the first bundle). This makes it necessary to

9335

// recalculate all dependencies.

9336

// Otherwise the compiler may crash trying to incorrectly calculate

9337

// dependencies and emit instruction in the wrong order at the actual

9338

// scheduling.

9339

TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);

9340

return None;

9341

}

9342

}

9343

9344

bool ReSchedule = false;

9345

for (Value *V : VL) {

9346

if (doesNotNeedToBeScheduled(V))

9347

continue;

9348

ScheduleData *BundleMember = getScheduleData(V);

9349

assert(BundleMember &&(static_cast <bool> (BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"
) ? void (0) : __assert_fail ("BundleMember && \"no ScheduleData for bundle member (maybe not in same basic block)\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9350, __extension__
__PRETTY_FUNCTION__))

9350

"no ScheduleData for bundle member (maybe not in same basic block)")(static_cast <bool> (BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"
) ? void (0) : __assert_fail ("BundleMember && \"no ScheduleData for bundle member (maybe not in same basic block)\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9350, __extension__
__PRETTY_FUNCTION__));

9351

9352

// Make sure we don't leave the pieces of the bundle in the ready list when

9353

// whole bundle might not be ready.

9354

ReadyInsts.remove(BundleMember);

9355

9356

if (!BundleMember->IsScheduled)

9357

continue;

9358

// A bundle member was scheduled as single instruction before and now

9359

// needs to be scheduled as part of the bundle. We just get rid of the

9360

// existing schedule.

9361

LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMemberdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: reset schedule because " <<
*BundleMember << " was already scheduled\n"; } } while
(false)

9362

<< " was already scheduled\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: reset schedule because " <<
*BundleMember << " was already scheduled\n"; } } while
(false);

9363

ReSchedule = true;

9364

}

9365

9366

auto *Bundle = buildBundle(VL);

9367

TryScheduleBundleImpl(ReSchedule, Bundle);

9368

if (!Bundle->isReady()) {

9369

cancelScheduling(VL, S.OpValue);

9370

return None;

9371

}

9372

return Bundle;

9373

}

9374

9375

void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,

9376

Value *OpValue) {

9377

if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||

9378

doesNotNeedToSchedule(VL))

9379

return;

9380

9381

if (doesNotNeedToBeScheduled(OpValue))

9382

OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);

9383

ScheduleData *Bundle = getScheduleData(OpValue);

9384

LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: cancel scheduling of " <<
*Bundle << "\n"; } } while (false);

9385

assert(!Bundle->IsScheduled &&(static_cast <bool> (!Bundle->IsScheduled &&
"Can't cancel bundle which is already scheduled") ? void (0)
: __assert_fail ("!Bundle->IsScheduled && \"Can't cancel bundle which is already scheduled\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9386, __extension__
__PRETTY_FUNCTION__))

9386

"Can't cancel bundle which is already scheduled")(static_cast <bool> (!Bundle->IsScheduled &&
"Can't cancel bundle which is already scheduled") ? void (0)
: __assert_fail ("!Bundle->IsScheduled && \"Can't cancel bundle which is already scheduled\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9386, __extension__
__PRETTY_FUNCTION__));

9387

assert(Bundle->isSchedulingEntity() &&(static_cast <bool> (Bundle->isSchedulingEntity() &&
(Bundle->isPartOfBundle() || needToScheduleSingleInstruction
(VL)) && "tried to unbundle something which is not a bundle"
) ? void (0) : __assert_fail ("Bundle->isSchedulingEntity() && (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) && \"tried to unbundle something which is not a bundle\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9389, __extension__
__PRETTY_FUNCTION__))

9388

(Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&(static_cast <bool> (Bundle->isSchedulingEntity() &&
(Bundle->isPartOfBundle() || needToScheduleSingleInstruction
(VL)) && "tried to unbundle something which is not a bundle"
) ? void (0) : __assert_fail ("Bundle->isSchedulingEntity() && (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) && \"tried to unbundle something which is not a bundle\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9389, __extension__
__PRETTY_FUNCTION__))

9389

"tried to unbundle something which is not a bundle")(static_cast <bool> (Bundle->isSchedulingEntity() &&
(Bundle->isPartOfBundle() || needToScheduleSingleInstruction
(VL)) && "tried to unbundle something which is not a bundle"
) ? void (0) : __assert_fail ("Bundle->isSchedulingEntity() && (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) && \"tried to unbundle something which is not a bundle\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9389, __extension__
__PRETTY_FUNCTION__));

9390

9391

// Remove the bundle from the ready list.

9392

if (Bundle->isReady())

9393

ReadyInsts.remove(Bundle);

9394

9395

// Un-bundle: make single instructions out of the bundle.

9396

ScheduleData *BundleMember = Bundle;

9397

while (BundleMember) {

9398

assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links")(static_cast <bool> (BundleMember->FirstInBundle == Bundle
&& "corrupt bundle links") ? void (0) : __assert_fail
("BundleMember->FirstInBundle == Bundle && \"corrupt bundle links\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9398, __extension__
__PRETTY_FUNCTION__));

9399

BundleMember->FirstInBundle = BundleMember;

9400

ScheduleData *Next = BundleMember->NextInBundle;

9401

BundleMember->NextInBundle = nullptr;

9402

BundleMember->TE = nullptr;

9403

if (BundleMember->unscheduledDepsInBundle() == 0) {

9404

ReadyInsts.insert(BundleMember);

9405

}

9406

BundleMember = Next;

9407

}

9408

}

9409

9410

// Hand out the next unused ScheduleData slot. Slots are carved out of
// arrays of ChunkSize elements; once the current array is used up a new one
// is appended to ScheduleDataChunks and the position restarts at zero.
//
// Returns a pointer to the slot inside the most recently added chunk.
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  const bool NeedNewChunk = ChunkPos >= ChunkSize;
  if (NeedNewChunk) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }

  auto &CurrentChunk = ScheduleDataChunks.back();
  ScheduleData *Slot = &CurrentChunk[ChunkPos];
  ++ChunkPos;
  return Slot;
}

9418

9419

bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,

9420

const InstructionsState &S) {

9421

if (getScheduleData(V, isOneOf(S, V)))

9422

return true;

9423

Instruction *I = dyn_cast<Instruction>(V);

9424

assert(I && "bundle member must be an instruction")(static_cast <bool> (I && "bundle member must be an instruction"
) ? void (0) : __assert_fail ("I && \"bundle member must be an instruction\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9424, __extension__
__PRETTY_FUNCTION__));

9425

assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&(static_cast <bool> (!isa<PHINode>(I) && !
isVectorLikeInstWithConstOps(I) && !doesNotNeedToBeScheduled
(I) && "phi nodes/insertelements/extractelements/extractvalues don't need to "
"be scheduled") ? void (0) : __assert_fail ("!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) && !doesNotNeedToBeScheduled(I) && \"phi nodes/insertelements/extractelements/extractvalues don't need to \" \"be scheduled\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9428, __extension__
__PRETTY_FUNCTION__))

9426

!doesNotNeedToBeScheduled(I) &&(static_cast <bool> (!isa<PHINode>(I) && !
isVectorLikeInstWithConstOps(I) && !doesNotNeedToBeScheduled
(I) && "phi nodes/insertelements/extractelements/extractvalues don't need to "
"be scheduled") ? void (0) : __assert_fail ("!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) && !doesNotNeedToBeScheduled(I) && \"phi nodes/insertelements/extractelements/extractvalues don't need to \" \"be scheduled\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9428, __extension__
__PRETTY_FUNCTION__))

9427

"phi nodes/insertelements/extractelements/extractvalues don't need to "(static_cast <bool> (!isa<PHINode>(I) && !
isVectorLikeInstWithConstOps(I) && !doesNotNeedToBeScheduled
(I) && "phi nodes/insertelements/extractelements/extractvalues don't need to "
"be scheduled") ? void (0) : __assert_fail ("!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) && !doesNotNeedToBeScheduled(I) && \"phi nodes/insertelements/extractelements/extractvalues don't need to \" \"be scheduled\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9428, __extension__
__PRETTY_FUNCTION__))

9428

"be scheduled")(static_cast <bool> (!isa<PHINode>(I) && !
isVectorLikeInstWithConstOps(I) && !doesNotNeedToBeScheduled
(I) && "phi nodes/insertelements/extractelements/extractvalues don't need to "
"be scheduled") ? void (0) : __assert_fail ("!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) && !doesNotNeedToBeScheduled(I) && \"phi nodes/insertelements/extractelements/extractvalues don't need to \" \"be scheduled\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9428, __extension__
__PRETTY_FUNCTION__));

9429

auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {

9430

ScheduleData *ISD = getScheduleData(I);

9431

if (!ISD)

9432

return false;

9433

assert(isInSchedulingRegion(ISD) &&(static_cast <bool> (isInSchedulingRegion(ISD) &&
"ScheduleData not in scheduling region") ? void (0) : __assert_fail
("isInSchedulingRegion(ISD) && \"ScheduleData not in scheduling region\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9434, __extension__
__PRETTY_FUNCTION__))

9434

"ScheduleData not in scheduling region")(static_cast <bool> (isInSchedulingRegion(ISD) &&
"ScheduleData not in scheduling region") ? void (0) : __assert_fail
("isInSchedulingRegion(ISD) && \"ScheduleData not in scheduling region\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9434, __extension__
__PRETTY_FUNCTION__));

9435

ScheduleData *SD = allocateScheduleDataChunks();

9436

SD->Inst = I;

9437

SD->init(SchedulingRegionID, S.OpValue);

9438

ExtraScheduleDataMap[I][S.OpValue] = SD;

9439

return true;

9440

};

9441

if (CheckScheduleForI(I))

9442

return true;

9443

if (!ScheduleStart) {

9444

// It's the first instruction in the new region.

9445

initScheduleData(I, I->getNextNode(), nullptr, nullptr);

9446

ScheduleStart = I;

9447

ScheduleEnd = I->getNextNode();

9448

if (isOneOf(S, I) != I)

9449

CheckScheduleForI(I);

9450

assert(ScheduleEnd && "tried to vectorize a terminator?")(static_cast <bool> (ScheduleEnd && "tried to vectorize a terminator?"
) ? void (0) : __assert_fail ("ScheduleEnd && \"tried to vectorize a terminator?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9450, __extension__
__PRETTY_FUNCTION__));

9451

LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: initialize schedule region to "
<< *I << "\n"; } } while (false);

9452

return true;

9453

}

9454

// Search up and down at the same time, because we don't know if the new

9455

// instruction is above or below the existing scheduling region.

9456

BasicBlock::reverse_iterator UpIter =

9457

++ScheduleStart->getIterator().getReverse();

9458

BasicBlock::reverse_iterator UpperEnd = BB->rend();

9459

BasicBlock::iterator DownIter = ScheduleEnd->getIterator();

9460

BasicBlock::iterator LowerEnd = BB->end();

9461

while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&

9462

&*DownIter != I) {

9463

if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {

9464

LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: exceeded schedule region size limit\n"
; } } while (false);

9465

return false;

9466

}

9467

9468

++UpIter;

9469

++DownIter;

9470

}

9471

if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {

9472

assert(I->getParent() == ScheduleStart->getParent() &&(static_cast <bool> (I->getParent() == ScheduleStart
->getParent() && "Instruction is in wrong basic block."
) ? void (0) : __assert_fail ("I->getParent() == ScheduleStart->getParent() && \"Instruction is in wrong basic block.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9473, __extension__
__PRETTY_FUNCTION__))

9473

"Instruction is in wrong basic block.")(static_cast <bool> (I->getParent() == ScheduleStart
->getParent() && "Instruction is in wrong basic block."
) ? void (0) : __assert_fail ("I->getParent() == ScheduleStart->getParent() && \"Instruction is in wrong basic block.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9473, __extension__
__PRETTY_FUNCTION__));

9474

initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);

9475

ScheduleStart = I;

9476

if (isOneOf(S, I) != I)

9477

CheckScheduleForI(I);

9478

LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *Ido { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: extend schedule region start to "
<< *I << "\n"; } } while (false)

9479

<< "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: extend schedule region start to "
<< *I << "\n"; } } while (false);

9480

return true;

9481

}

9482

assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&(static_cast <bool> ((UpIter == UpperEnd || (DownIter !=
LowerEnd && &*DownIter == I)) && "Expected to reach top of the basic block or instruction down the "
"lower end.") ? void (0) : __assert_fail ("(UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) && \"Expected to reach top of the basic block or instruction down the \" \"lower end.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9484, __extension__
__PRETTY_FUNCTION__))

9483

"Expected to reach top of the basic block or instruction down the "(static_cast <bool> ((UpIter == UpperEnd || (DownIter !=
LowerEnd && &*DownIter == I)) && "Expected to reach top of the basic block or instruction down the "
"lower end.") ? void (0) : __assert_fail ("(UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) && \"Expected to reach top of the basic block or instruction down the \" \"lower end.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9484, __extension__
__PRETTY_FUNCTION__))

9484

"lower end.")(static_cast <bool> ((UpIter == UpperEnd || (DownIter !=
LowerEnd && &*DownIter == I)) && "Expected to reach top of the basic block or instruction down the "
"lower end.") ? void (0) : __assert_fail ("(UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) && \"Expected to reach top of the basic block or instruction down the \" \"lower end.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9484, __extension__
__PRETTY_FUNCTION__));

9485

assert(I->getParent() == ScheduleEnd->getParent() &&(static_cast <bool> (I->getParent() == ScheduleEnd->
getParent() && "Instruction is in wrong basic block."
) ? void (0) : __assert_fail ("I->getParent() == ScheduleEnd->getParent() && \"Instruction is in wrong basic block.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9486, __extension__
__PRETTY_FUNCTION__))

9486

"Instruction is in wrong basic block.")(static_cast <bool> (I->getParent() == ScheduleEnd->
getParent() && "Instruction is in wrong basic block."
) ? void (0) : __assert_fail ("I->getParent() == ScheduleEnd->getParent() && \"Instruction is in wrong basic block.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9486, __extension__
__PRETTY_FUNCTION__));

9487

initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,

9488

nullptr);

9489

ScheduleEnd = I->getNextNode();

9490

if (isOneOf(S, I) != I)

9491

CheckScheduleForI(I);

9492

assert(ScheduleEnd && "tried to vectorize a terminator?")(static_cast <bool> (ScheduleEnd && "tried to vectorize a terminator?"
) ? void (0) : __assert_fail ("ScheduleEnd && \"tried to vectorize a terminator?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9492, __extension__
__PRETTY_FUNCTION__));

9493

LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: extend schedule region end to "
<< *I << "\n"; } } while (false);

9494

return true;

9495

}

9496

9497

void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,

9498

Instruction *ToI,

9499

ScheduleData *PrevLoadStore,

9500

ScheduleData *NextLoadStore) {

9501

ScheduleData *CurrentLoadStore = PrevLoadStore;

9502

for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {

9503

// No need to allocate data for non-schedulable instructions.

9504

if (doesNotNeedToBeScheduled(I))

9505

continue;

9506

ScheduleData *SD = ScheduleDataMap.lookup(I);

9507

if (!SD) {

9508

SD = allocateScheduleDataChunks();

9509

ScheduleDataMap[I] = SD;

9510

SD->Inst = I;

9511

}

9512

assert(!isInSchedulingRegion(SD) &&(static_cast <bool> (!isInSchedulingRegion(SD) &&
"new ScheduleData already in scheduling region") ? void (0) :
__assert_fail ("!isInSchedulingRegion(SD) && \"new ScheduleData already in scheduling region\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9513, __extension__
__PRETTY_FUNCTION__))

9513

"new ScheduleData already in scheduling region")(static_cast <bool> (!isInSchedulingRegion(SD) &&
"new ScheduleData already in scheduling region") ? void (0) :
__assert_fail ("!isInSchedulingRegion(SD) && \"new ScheduleData already in scheduling region\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9513, __extension__
__PRETTY_FUNCTION__));

9514

SD->init(SchedulingRegionID, I);

9515

9516

if (I->mayReadOrWriteMemory() &&

9517

(!isa<IntrinsicInst>(I) ||

9518

(cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&

9519

cast<IntrinsicInst>(I)->getIntrinsicID() !=

9520

Intrinsic::pseudoprobe))) {

9521

// Update the linked list of memory accessing instructions.

9522

if (CurrentLoadStore) {

9523

CurrentLoadStore->NextLoadStore = SD;

9524

} else {

9525

FirstLoadStoreInRegion = SD;

9526

}

9527

CurrentLoadStore = SD;

9528

}

9529

9530

if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||

9531

match(I, m_Intrinsic<Intrinsic::stackrestore>()))

9532

RegionHasStackSave = true;

9533

}

9534

if (NextLoadStore) {

9535

if (CurrentLoadStore)

9536

CurrentLoadStore->NextLoadStore = NextLoadStore;

9537

} else {

9538

LastLoadStoreInRegion = CurrentLoadStore;

9539

}

9540

}

9541

9542

void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,

9543

bool InsertInReadyList,

9544

BoUpSLP *SLP) {

9545

assert(SD->isSchedulingEntity())(static_cast <bool> (SD->isSchedulingEntity()) ? void
(0) : __assert_fail ("SD->isSchedulingEntity()", "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp"
, 9545, __extension__ __PRETTY_FUNCTION__));

9546

9547

SmallVector<ScheduleData *, 10> WorkList;

9548

WorkList.push_back(SD);

9549

9550

while (!WorkList.empty()) {

9551

ScheduleData *SD = WorkList.pop_back_val();

9552

for (ScheduleData *BundleMember = SD; BundleMember;

9553

BundleMember = BundleMember->NextInBundle) {

9554

assert(isInSchedulingRegion(BundleMember))(static_cast <bool> (isInSchedulingRegion(BundleMember)
) ? void (0) : __assert_fail ("isInSchedulingRegion(BundleMember)"
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9554, __extension__
__PRETTY_FUNCTION__));

9555

if (BundleMember->hasValidDependencies())

9556

continue;

9557

9558

LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMemberdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: update deps of " <<
*BundleMember << "\n"; } } while (false)

9559

<< "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: update deps of " <<
*BundleMember << "\n"; } } while (false);

9560

BundleMember->Dependencies = 0;

9561

BundleMember->resetUnscheduledDeps();

9562

9563

// Handle def-use chain dependencies.

9564

if (BundleMember->OpValue != BundleMember->Inst) {

9565

if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {

9566

BundleMember->Dependencies++;

9567

ScheduleData *DestBundle = UseSD->FirstInBundle;

9568

if (!DestBundle->IsScheduled)

9569

BundleMember->incrementUnscheduledDeps(1);

9570

if (!DestBundle->hasValidDependencies())

9571

WorkList.push_back(DestBundle);

9572

}

9573

} else {

9574

for (User *U : BundleMember->Inst->users()) {

9575

if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {

9576

BundleMember->Dependencies++;

9577

ScheduleData *DestBundle = UseSD->FirstInBundle;

9578

if (!DestBundle->IsScheduled)

9579

BundleMember->incrementUnscheduledDeps(1);

9580

if (!DestBundle->hasValidDependencies())

9581

WorkList.push_back(DestBundle);

9582

}

9583

}

9584

}

9585

9586

auto makeControlDependent = [&](Instruction *I) {

9587

auto *DepDest = getScheduleData(I);

9588

assert(DepDest && "must be in schedule window")(static_cast <bool> (DepDest && "must be in schedule window"
) ? void (0) : __assert_fail ("DepDest && \"must be in schedule window\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9588, __extension__
__PRETTY_FUNCTION__));

9589

DepDest->ControlDependencies.push_back(BundleMember);

9590

BundleMember->Dependencies++;

9591

ScheduleData *DestBundle = DepDest->FirstInBundle;

9592

if (!DestBundle->IsScheduled)

9593

BundleMember->incrementUnscheduledDeps(1);

9594

if (!DestBundle->hasValidDependencies())

9595

WorkList.push_back(DestBundle);

9596

};

9597

9598

// Any instruction which isn't safe to speculate at the beginning of the

9599

// block is control dependend on any early exit or non-willreturn call

9600

// which proceeds it.

9601

if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {

9602

for (Instruction *I = BundleMember->Inst->getNextNode();

9603

I != ScheduleEnd; I = I->getNextNode()) {

9604

if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))

9605

continue;

9606

9607

// Add the dependency

9608

makeControlDependent(I);

9609

9610

if (!isGuaranteedToTransferExecutionToSuccessor(I))

9611

// Everything past here must be control dependent on I.

9612

break;

9613

}

9614

}

9615

9616

if (RegionHasStackSave) {

9617

// If we have an inalloc alloca instruction, it needs to be scheduled

9618

// after any preceeding stacksave. We also need to prevent any alloca

9619

// from reordering above a preceeding stackrestore.

9620

if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||

9621

match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {

9622

for (Instruction *I = BundleMember->Inst->getNextNode();

9623

I != ScheduleEnd; I = I->getNextNode()) {

9624

if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||

9625

match(I, m_Intrinsic<Intrinsic::stackrestore>()))

9626

// Any allocas past here must be control dependent on I, and I

9627

// must be memory dependend on BundleMember->Inst.

9628

break;

9629

9630

if (!isa<AllocaInst>(I))

9631

continue;

9632

9633

// Add the dependency

9634

makeControlDependent(I);

9635

}

9636

}

9637

9638

// In addition to the cases handle just above, we need to prevent

9639

// allocas from moving below a stacksave. The stackrestore case

9640

// is currently thought to be conservatism.

9641

if (isa<AllocaInst>(BundleMember->Inst)) {

9642

for (Instruction *I = BundleMember->Inst->getNextNode();

9643

I != ScheduleEnd; I = I->getNextNode()) {

9644

if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&

9645

!match(I, m_Intrinsic<Intrinsic::stackrestore>()))

9646

continue;

9647

9648

// Add the dependency

9649

makeControlDependent(I);

9650

break;

9651

}

9652

}

9653

}

9654

9655

// Handle the memory dependencies (if any).

9656

ScheduleData *DepDest = BundleMember->NextLoadStore;

9657

if (!DepDest)

9658

continue;

9659

Instruction *SrcInst = BundleMember->Inst;

9660

assert(SrcInst->mayReadOrWriteMemory() &&(static_cast <bool> (SrcInst->mayReadOrWriteMemory()
&& "NextLoadStore list for non memory effecting bundle?"
) ? void (0) : __assert_fail ("SrcInst->mayReadOrWriteMemory() && \"NextLoadStore list for non memory effecting bundle?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9661, __extension__
__PRETTY_FUNCTION__))

9661

"NextLoadStore list for non memory effecting bundle?")(static_cast <bool> (SrcInst->mayReadOrWriteMemory()
&& "NextLoadStore list for non memory effecting bundle?"
) ? void (0) : __assert_fail ("SrcInst->mayReadOrWriteMemory() && \"NextLoadStore list for non memory effecting bundle?\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9661, __extension__
__PRETTY_FUNCTION__));

9662

MemoryLocation SrcLoc = getLocation(SrcInst);

9663

bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();

9664

unsigned numAliased = 0;

9665

unsigned DistToSrc = 1;

9666

9667

for ( ; DepDest; DepDest = DepDest->NextLoadStore) {

9668

assert(isInSchedulingRegion(DepDest))(static_cast <bool> (isInSchedulingRegion(DepDest)) ? void
(0) : __assert_fail ("isInSchedulingRegion(DepDest)", "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp"
, 9668, __extension__ __PRETTY_FUNCTION__));

9669

9670

// We have two limits to reduce the complexity:

9671

// 1) AliasedCheckLimit: It's a small limit to reduce calls to

9672

// SLP->isAliased (which is the expensive part in this loop).

9673

// 2) MaxMemDepDistance: It's for very large blocks and it aborts

9674

// the whole loop (even if the loop is fast, it's quadratic).

9675

// It's important for the loop break condition (see below) to

9676

// check this limit even between two read-only instructions.

9677

if (DistToSrc >= MaxMemDepDistance ||

9678

((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&

9679

(numAliased >= AliasedCheckLimit ||

9680

SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {

9681

9682

// We increment the counter only if the locations are aliased

9683

// (instead of counting all alias checks). This gives a better

9684

// balance between reduced runtime and accurate dependencies.

9685

numAliased++;

9686

9687

DepDest->MemoryDependencies.push_back(BundleMember);

9688

BundleMember->Dependencies++;

9689

ScheduleData *DestBundle = DepDest->FirstInBundle;

9690

if (!DestBundle->IsScheduled) {

9691

BundleMember->incrementUnscheduledDeps(1);

9692

}

9693

if (!DestBundle->hasValidDependencies()) {

9694

WorkList.push_back(DestBundle);

9695

}

9696

}

9697

9698

// Example, explaining the loop break condition: Let's assume our

9699

// starting instruction is i0 and MaxMemDepDistance = 3.

9700

//

9701

// +--------v--v--v

9702

// i0,i1,i2,i3,i4,i5,i6,i7,i8

9703

// +--------^--^--^

9704

//

9705

// MaxMemDepDistance let us stop alias-checking at i3 and we add

9706

// dependencies from i0 to i3,i4,.. (even if they are not aliased).

9707

// Previously we already added dependencies from i3 to i6,i7,i8

9708

// (because of MaxMemDepDistance). As we added a dependency from

9709

// i0 to i3, we have transitive dependencies from i0 to i6,i7,i8

9710

// and we can abort this loop at i6.

9711

if (DistToSrc >= 2 * MaxMemDepDistance)

9712

break;

9713

DistToSrc++;

9714

}

9715

}

9716

if (InsertInReadyList && SD->isReady()) {

9717

ReadyInsts.insert(SD);

9718

LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Instdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: gets ready on update: " <<
*SD->Inst << "\n"; } } while (false)

9719

<< "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: gets ready on update: " <<
*SD->Inst << "\n"; } } while (false);

9720

}

9721

}

9722

}

9723

9724

void BoUpSLP::BlockScheduling::resetSchedule() {

9725

assert(ScheduleStart &&(static_cast <bool> (ScheduleStart && "tried to reset schedule on block which has not been scheduled"
) ? void (0) : __assert_fail ("ScheduleStart && \"tried to reset schedule on block which has not been scheduled\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9726, __extension__
__PRETTY_FUNCTION__))

9726

"tried to reset schedule on block which has not been scheduled")(static_cast <bool> (ScheduleStart && "tried to reset schedule on block which has not been scheduled"
) ? void (0) : __assert_fail ("ScheduleStart && \"tried to reset schedule on block which has not been scheduled\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9726, __extension__
__PRETTY_FUNCTION__));

9727

for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {

9728

doForAllOpcodes(I, [&](ScheduleData *SD) {

9729

assert(isInSchedulingRegion(SD) &&(static_cast <bool> (isInSchedulingRegion(SD) &&
"ScheduleData not in scheduling region") ? void (0) : __assert_fail
("isInSchedulingRegion(SD) && \"ScheduleData not in scheduling region\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9730, __extension__
__PRETTY_FUNCTION__))

9730

"ScheduleData not in scheduling region")(static_cast <bool> (isInSchedulingRegion(SD) &&
"ScheduleData not in scheduling region") ? void (0) : __assert_fail
("isInSchedulingRegion(SD) && \"ScheduleData not in scheduling region\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 9730, __extension__
__PRETTY_FUNCTION__));

9731

SD->IsScheduled = false;

9732

SD->resetUnscheduledDeps();

9733

});

9734

}

9735

ReadyInsts.clear();

9736

}

9737

9738

/// Run the final scheduling pass over the scheduling region of \p BS and
/// physically move the instructions into the resulting order.
///
/// Returns immediately when \p BS has no scheduling region (ScheduleStart is
/// null). On exit ScheduleStart is reset to null so that the same block is not
/// scheduled twice.
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // Getting here means pre-scheduling already found a valid schedule for the
  // sub-graph of the scheduling window made up of all vector bundles and
  // their transitive users, so nothing *outside of* that sub-graph has to be
  // rescheduled.
  BS->resetSchedule();

  // The ready-list used for the real scheduling is ordered by
  // SchedulingPriority, which is assigned below from the original position of
  // each instruction; entries with a larger priority sort first. This keeps
  // the final schedule as close as possible to the original order.
  // WARNING: If changing this order causes a correctness issue, that means
  // there is some missing dependence edge in the schedule data graph.
  struct PriorityOrder {
    bool operator()(ScheduleData *LHS, ScheduleData *RHS) const {
      return RHS->SchedulingPriority < LHS->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, PriorityOrder> ReadyList;

  // Number the bundles in region order and bring the dependency data of the
  // nodes in the sub-graph up to date.
  int NextPriority = 0;
  auto PrepareScheduleData = [this, &NextPriority, BS](ScheduleData *SD) {
    TreeEntry *SDTE = getTreeEntry(SD->Inst);
    (void)SDTE;
    assert((isVectorLikeInstWithConstOps(SD->Inst) ||
            SD->isPartOfBundle() ==
                (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
           "scheduler and vectorizer bundle mismatch");
    SD->FirstInBundle->SchedulingPriority = NextPriority++;

    if (SD->isSchedulingEntity() && SD->isPartOfBundle())
      BS->calculateDependencies(SD, false, this);
  };
  for (Instruction *Cur = BS->ScheduleStart; Cur != BS->ScheduleEnd;
       Cur = Cur->getNextNode())
    BS->doForAllOpcodes(Cur, PrepareScheduleData);

  // Seed the ready-list with the initially ready instructions.
  BS->initialFillReadyList(ReadyList);

  // Do the "real" scheduling. Every picked bundle is placed directly in front
  // of the previously placed instruction, starting from the region end.
  Instruction *InsertPoint = BS->ScheduleEnd;
  while (!ReadyList.empty()) {
    auto FrontIt = ReadyList.begin();
    ScheduleData *Bundle = *FrontIt;
    ReadyList.erase(FrontIt);

    // Move each member of the bundle to its dedicated place unless it is
    // already there.
    for (ScheduleData *Member = Bundle; Member;
         Member = Member->NextInBundle) {
      Instruction *MemberInst = Member->Inst;
      if (MemberInst->getNextNode() != InsertPoint)
        MemberInst->moveBefore(InsertPoint);
      InsertPoint = MemberInst;
    }

    BS->schedule(Bundle, ReadyList);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled.
  auto CheckScheduled = [&](ScheduleData *SD) {
    if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
      assert(SD->IsScheduled && "must be scheduled at this point");
    }
  };
  for (Instruction *Cur = BS->ScheduleStart; Cur != BS->ScheduleEnd;
       Cur = Cur->getNextNode())
    BS->doForAllOpcodes(Cur, CheckScheduled);
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}

9822

9823

/// Return the element width, in bits, to use when vectorizing \p V.
///
/// Stores and insertelement instructions are answered directly; any other
/// value is looked up in (and afterwards recorded in) the InstrElementSize
/// cache.
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // Common case: for a store simply report the width of the stored value (or
  // of the value truncated right before the store) without walking the
  // expression tree.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  // For an insertelement the answer is that of the inserted scalar.
  if (auto *IEI = dyn_cast<InsertElementInst>(V))
    return getVectorElementSize(IEI->getOperand(1));

  // Reuse a previously computed answer if there is one.
  auto Cached = InstrElementSize.find(V);
  if (Cached != InstrElementSize.end())
    return Cached->second;

  // Otherwise walk the expression tree looking for the loads that feed V. The
  // type of a loaded value may indicate a more suitable width than V's own
  // type, and we prefer to base the element size on the width of memory
  // operations where possible.
  SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Pending;
  SmallPtrSet<Instruction *, 16> Seen;
  if (auto *RootInst = dyn_cast<Instruction>(V)) {
    Pending.emplace_back(RootInst, RootInst->getParent());
    Seen.insert(RootInst);
  }

  // Bottom-up traversal. The walk stops at the first instruction kind that is
  // not handled; whatever width was found up to that point is kept.
  unsigned Width = 0;
  while (!Pending.empty()) {
    std::pair<Instruction *, BasicBlock *> Item = Pending.pop_back_val();
    Instruction *Inst = Item.first;
    BasicBlock *UserBlock = Item.second;

    // Only scalar instructions are of interest; skip vector-typed ones.
    Type *InstTy = Inst->getType();
    if (isa<VectorType>(InstTy))
      continue;

    // Loads and extracts contribute the width of the value they produce.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Inst)) {
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(InstTy));
      continue;
    }

    // Anything other than the interesting cases from buildTree ends the walk.
    if (!isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
             BinaryOperator, UnaryOperator>(Inst))
      break;

    // Queue every operand that is an instruction not seen before, provided it
    // lives in the same basic block as its user or the user is a PHI node.
    const bool UserIsPHI = isa<PHINode>(Inst);
    for (Use &Op : Inst->operands()) {
      auto *OpInst = dyn_cast<Instruction>(Op.get());
      if (!OpInst)
        continue;
      // The operand is marked as seen even when it is not queued.
      if (!Seen.insert(OpInst).second)
        continue;
      if (UserIsPHI || OpInst->getParent() == UserBlock)
        Pending.emplace_back(OpInst, OpInst->getParent());
    }
  }

  // No width was found in the expression tree: fall back to the width of V
  // itself, or of its first operand when V is a compare.
  if (Width == 0) {
    Value *Sized = V;
    if (auto *Cmp = dyn_cast<CmpInst>(V))
      Sized = Cmp->getOperand(0);
    Width = DL->getTypeSizeInBits(Sized->getType());
  }

  // Remember the result for every instruction encountered during the walk.
  for (Instruction *SeenInst : Seen)
    InstrElementSize[SeenInst] = Width;

  return Width;
}

9897

9898

// Determine if a value V in a vectorizable expression Expr can be demoted to a

9899

// smaller type with a truncation. We collect the values that will be demoted

9900

// in ToDemote and additional roots that require investigating in Roots.

9901

static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,

9902

SmallVectorImpl<Value *> &ToDemote,

9903

SmallVectorImpl<Value *> &Roots) {

9904

// We can always demote constants.

9905

if (isa<Constant>(V)) {

9906

ToDemote.push_back(V);

9907

return true;

9908

}

9909

9910

// If the value is not an instruction in the expression with only one use, it

9911

// cannot be demoted.

9912

auto *I = dyn_cast<Instruction>(V);

9913

if (!I || !I->hasOneUse() || !Expr.count(I))

9914

return false;

9915

9916

switch (I->getOpcode()) {

9917

9918

// We can always demote truncations and extensions. Since truncations can

9919

// seed additional demotion, we save the truncated value.

9920

case Instruction::Trunc:

9921

Roots.push_back(I->getOperand(0));

9922

break;

9923

case Instruction::ZExt:

9924

case Instruction::SExt:

9925

if (isa<ExtractElementInst, InsertElementInst>(I->getOperand(0)))

9926

return false;

9927

break;

9928

9929

// We can demote certain binary operations if we can demote both of their

9930

// operands.

9931

case Instruction::Add:

9932

case Instruction::Sub:

9933

case Instruction::Mul:

9934

case Instruction::And:

9935

case Instruction::Or:

9936

case Instruction::Xor:

9937

if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||

9938

!collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))

9939

return false;

9940

break;

9941

9942

// We can demote selects if we can demote their true and false values.

9943

case Instruction::Select: {

9944

SelectInst *SI = cast<SelectInst>(I);

9945

if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||

9946

!collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))

9947

return false;

9948

break;

9949

}

9950

9951

// We can demote phis if we can demote all their incoming operands. Note that

9952

// we don't need to worry about cycles since we ensure single use above.

9953

case Instruction::PHI: {

9954

PHINode *PN = cast<PHINode>(I);

9955

for (Value *IncValue : PN->incoming_values())

9956

if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))

9957

return false;

9958

break;

9959

}

9960

9961

// Otherwise, conservatively give up.

9962

default:

9963

return false;

9964

}

9965

9966

// Record the value that we can demote.

9967

ToDemote.push_back(V);

9968

return true;

9969

}

9970

9971

void BoUpSLP::computeMinimumValueSizes() {

9972

// If there are no external uses, the expression tree must be rooted by a

9973

// store. We can't demote in-memory values, so there is nothing to do here.

9974

if (ExternalUses.empty())

9975

return;

9976

9977

// We only attempt to truncate integer expressions.

9978

auto &TreeRoot = VectorizableTree[0]->Scalars;

9979

auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());

9980

if (!TreeRootIT)

9981

return;

9982

9983

// If the expression is not rooted by a store, these roots should have

9984

// external uses. We will rely on InstCombine to rewrite the expression in

9985

// the narrower type. However, InstCombine only rewrites single-use values.

9986

// This means that if a tree entry other than a root is used externally, it

9987

// must have multiple uses and InstCombine will not rewrite it. The code

9988

// below ensures that only the roots are used externally.

9989

SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());

9990

for (auto &EU : ExternalUses)

9991

if (!Expr.erase(EU.Scalar))

9992

return;

9993

if (!Expr.empty())

9994

return;

9995

9996

// Collect the scalar values of the vectorizable expression. We will use this

9997

// context to determine which values can be demoted. If we see a truncation,

9998

// we mark it as seeding another demotion.

9999

for (auto &EntryPtr : VectorizableTree)

10000

Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end());

10001

10002

// Ensure the roots of the vectorizable tree don't form a cycle. They must

10003

// have a single external user that is not in the vectorizable tree.

10004

for (auto *Root : TreeRoot)

10005

if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))

10006

return;

10007

10008

// Conservatively determine if we can actually truncate the roots of the

10009

// expression. Collect the values that can be demoted in ToDemote and

10010

// additional roots that require investigating in Roots.

10011

SmallVector<Value *, 32> ToDemote;

10012

SmallVector<Value *, 4> Roots;

10013

for (auto *Root : TreeRoot)

10014

if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))

10015

return;

10016

10017

// The maximum bit width required to represent all the values that can be

10018

// demoted without loss of precision. It would be safe to truncate the roots

10019

// of the expression to this width.

10020

auto MaxBitWidth = 8u;

10021

10022

// We first check if all the bits of the roots are demanded. If they're not,

10023

// we can truncate the roots to this narrower type.

10024

for (auto *Root : TreeRoot) {

10025

auto Mask = DB->getDemandedBits(cast<Instruction>(Root));

10026

MaxBitWidth = std::max<unsigned>(

10027

Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);

10028

}

10029

10030

// True if the roots can be zero-extended back to their original type, rather

10031

// than sign-extended. We know that if the leading bits are not demanded, we

10032

// can safely zero-extend. So we initialize IsKnownPositive to True.

10033

bool IsKnownPositive = true;

10034

10035

// If all the bits of the roots are demanded, we can try a little harder to

10036

// compute a narrower type. This can happen, for example, if the roots are

10037

// getelementptr indices. InstCombine promotes these indices to the pointer

10038

// width. Thus, all their bits are technically demanded even though the

10039

// address computation might be vectorized in a smaller type.

10040

//

10041

// We start by looking at each entry that can be demoted. We compute the

10042

// maximum bit width required to store the scalar by using ValueTracking to

10043

// compute the number of high-order bits we can truncate.

10044

if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&

10045

llvm::all_of(TreeRoot, [](Value *R) {

10046

assert(R->hasOneUse() && "Root should have only one use!")(static_cast <bool> (R->hasOneUse() && "Root should have only one use!"
) ? void (0) : __assert_fail ("R->hasOneUse() && \"Root should have only one use!\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 10046, __extension__
__PRETTY_FUNCTION__));

10047

return isa<GetElementPtrInst>(R->user_back());

10048

})) {

10049

MaxBitWidth = 8u;

10050

10051

// Determine if the sign bit of all the roots is known to be zero. If not,

10052

// IsKnownPositive is set to False.

10053

IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {

10054

KnownBits Known = computeKnownBits(R, *DL);

10055

return Known.isNonNegative();

10056

});

10057

10058

// Determine the maximum number of bits required to store the scalar

10059

// values.

10060

for (auto *Scalar : ToDemote) {

10061

auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);

10062

auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());

10063

MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);

10064

}

10065

10066

// If we can't prove that the sign bit is zero, we must add one to the

10067

// maximum bit width to account for the unknown sign bit. This preserves

10068

// the existing sign bit so we can safely sign-extend the root back to the

10069

// original type. Otherwise, if we know the sign bit is zero, we will

10070

// zero-extend the root instead.

10071

//

10072

// FIXME: This is somewhat suboptimal, as there will be cases where adding

10073

// one to the maximum bit width will yield a larger-than-necessary

10074

// type. In general, we need to add an extra bit only if we can't

10075

// prove that the upper bit of the original type is equal to the

10076

// upper bit of the proposed smaller type. If these two bits are the

10077

// same (either zero or one) we know that sign-extending from the

10078

// smaller type will result in the same value. Here, since we can't

10079

// yet prove this, we are just making the proposed smaller type

10080

// larger to ensure correctness.

10081

if (!IsKnownPositive)

10082

++MaxBitWidth;

10083

}

10084

10085

// Round MaxBitWidth up to the next power-of-two.

10086

if (!isPowerOf2_64(MaxBitWidth))

10087

MaxBitWidth = NextPowerOf2(MaxBitWidth);

10088

10089

// If the maximum bit width we compute is less than the with of the roots'

10090

// type, we can proceed with the narrowing. Otherwise, do nothing.

10091

if (MaxBitWidth >= TreeRootIT->getBitWidth())

10092

return;

10093

10094

// If we can truncate the root, we must collect additional values that might

10095

// be demoted as a result. That is, those seeded by truncations we will

10096

// modify.

10097

while (!Roots.empty())

10098

collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);

10099

10100

// Finally, map the values we can demote to the maximum bit with we computed.

10101

for (auto *Scalar : ToDemote)

10102

MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);

10103

}

10104

10105

namespace {

10106

10107

/// The SLPVectorizer Pass.

10108

struct SLPVectorizer : public FunctionPass {

10109

SLPVectorizerPass Impl;

10110

10111

/// Pass identification, replacement for typeid

10112

static char ID;

10113

10114

explicit SLPVectorizer() : FunctionPass(ID) {

10115

initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());

10116

}

10117

10118

bool doInitialization(Module &M) override { return false; }

10119

10120

bool runOnFunction(Function &F) override {

10121

if (skipFunction(F))

10122

return false;

10123

10124

auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();

10125

auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

10126

auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();

10127

auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;

10128

auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

10129

auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();

10130

auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();

10131

auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

10132

auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();

10133

auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();

10134

10135

return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);

10136

}

10137

10138

void getAnalysisUsage(AnalysisUsage &AU) const override {

10139

FunctionPass::getAnalysisUsage(AU);

10140

AU.addRequired<AssumptionCacheTracker>();

10141

AU.addRequired<ScalarEvolutionWrapperPass>();

10142

AU.addRequired<AAResultsWrapperPass>();

10143

AU.addRequired<TargetTransformInfoWrapperPass>();

10144

AU.addRequired<LoopInfoWrapperPass>();

10145

AU.addRequired<DominatorTreeWrapperPass>();

10146

AU.addRequired<DemandedBitsWrapperPass>();

10147

AU.addRequired<OptimizationRemarkEmitterWrapperPass>();

10148

AU.addRequired<InjectTLIMappingsLegacy>();

10149

AU.addPreserved<LoopInfoWrapperPass>();

10150

AU.addPreserved<DominatorTreeWrapperPass>();

10151

AU.addPreserved<AAResultsWrapperPass>();

10152

AU.addPreserved<GlobalsAAWrapperPass>();

10153

AU.setPreservesCFG();

10154

}

10155

};

10156

10157

} // end anonymous namespace

10158

10159

/// New pass-manager entry point: gather the analyses for \p F from \p AM and
/// run the vectorizer. Everything is reported as preserved when nothing
/// changed; otherwise only the CFG analyses are.
PreservedAnalyses SLPVectorizerPass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
  auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
  // Only a cached target library analysis result is used; it is not computed
  // on demand.
  auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
  auto *AA = &AM.getResult<AAManager>(F);
  auto *LI = &AM.getResult<LoopAnalysis>(F);
  auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
  auto *AC = &AM.getResult<AssumptionAnalysis>(F);
  auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
  auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  if (runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE)) {
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }

  return PreservedAnalyses::all();
}

10178

10179

bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,

10180

TargetTransformInfo *TTI_,

10181

TargetLibraryInfo *TLI_, AAResults *AA_,

10182

LoopInfo *LI_, DominatorTree *DT_,

10183

AssumptionCache *AC_, DemandedBits *DB_,

10184

OptimizationRemarkEmitter *ORE_) {

10185

if (!RunSLPVectorization)

10186

return false;

10187

SE = SE_;

10188

TTI = TTI_;

10189

TLI = TLI_;

10190

AA = AA_;

10191

LI = LI_;

10192

DT = DT_;

10193

AC = AC_;

10194

DB = DB_;

10195

DL = &F.getParent()->getDataLayout();

10196

10197

Stores.clear();

10198

GEPs.clear();

10199

bool Changed = false;

10200

10201

// If the target claims to have no vector registers don't attempt

10202

// vectorization.

10203

if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {

10204

LLVM_DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Didn't find any vector registers for target, abort.\n"
; } } while (false)

10205

dbgs() << "SLP: Didn't find any vector registers for target, abort.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Didn't find any vector registers for target, abort.\n"
; } } while (false);

10206

return false;

10207

}

10208

10209

// Don't vectorize when the attribute NoImplicitFloat is used.

10210

if (F.hasFnAttribute(Attribute::NoImplicitFloat))

10211

return false;

10212

10213

LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Analyzing blocks in " <<
F.getName() << ".\n"; } } while (false);

10214

10215

// Use the bottom up slp vectorizer to construct chains that start with

10216

// store instructions.

10217

BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

10218

10219

// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to

10220

// delete instructions.

10221

10222

// Update DFS numbers now so that we can use them for ordering.

10223

DT->updateDFSNumbers();

10224

10225

// Scan the blocks in the function in post order.

10226

for (auto *BB : post_order(&F.getEntryBlock())) {

10227

// Start new block - clear the list of reduction roots.

10228

R.clearReductionData();

10229

collectSeedInstructions(BB);

10230

10231

// Vectorize trees that end at stores.

10232

if (!Stores.empty()) {

10233

LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Found stores for " << Stores
.size() << " underlying objects.\n"; } } while (false)

10234

<< " underlying objects.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Found stores for " << Stores
.size() << " underlying objects.\n"; } } while (false);

10235

Changed |= vectorizeStoreChains(R);

10236

}

10237

10238

// Vectorize trees that end at reductions.

10239

Changed |= vectorizeChainsInBlock(BB, R);

10240

10241

// Vectorize the index computations of getelementptr instructions. This

10242

// is primarily intended to catch gather-like idioms ending at

10243

// non-consecutive loads.

10244

if (!GEPs.empty()) {

10245

LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Found GEPs for " << GEPs
.size() << " underlying objects.\n"; } } while (false)

10246

<< " underlying objects.\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Found GEPs for " << GEPs
.size() << " underlying objects.\n"; } } while (false);

10247

Changed |= vectorizeGEPIndices(BB, R);

10248

}

10249

}

10250

10251

if (Changed) {

10252

R.optimizeGatherSequence();

10253

LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: vectorized \"" << F.getName
() << "\"\n"; } } while (false);

10254

}

10255

return Changed;

10256

}

10257

10258

/// Try to vectorize the stores in \p Chain as a single tree.
///
/// \param Chain  The stores to vectorize; its length is the vector factor.
/// \param R      The vectorizer used to build, cost and emit the tree.
/// \param Idx    Offset of the chain, used for debug output only.
/// \param MinVF  Smallest vector factor that may be attempted.
/// \returns true if the chain was vectorized.
bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                            unsigned Idx, unsigned MinVF) {
  LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
                    << "\n");
  const unsigned ElemBits = R.getVectorElementSize(Chain[0]);
  const unsigned VF = Chain.size();

  // Both the element size and the vector factor must be powers of two, and
  // the vector factor must be at least 2 and at least MinVF.
  const bool ShapeIsUsable = isPowerOf2_32(ElemBits) && isPowerOf2_32(VF) &&
                             VF >= 2 && VF >= MinVF;
  if (!ShapeIsUsable)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  // Build the tree and reject the shapes that are not worth pursuing.
  R.buildTree(Chain);
  if (R.isTreeTinyAndNotFullyVectorizable() || R.isLoadCombineCandidate())
    return false;

  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  InstructionCost Cost = R.getTreeCost();

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");

  // Vectorize only when the cost is below the negated threshold.
  if (!(Cost < -SLPCostThreshold))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

  using namespace ore;

  R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                      cast<StoreInst>(Chain[0]))
                   << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                   << " and with tree size "
                   << NV("TreeSize", R.getTreeSize()));

  R.vectorizeTree();
  return true;
}

10302

10303

bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,

10304

BoUpSLP &R) {

10305

// We may run into multiple chains that merge into a single chain. We mark the

10306

// stores that we vectorized so that we don't visit the same store twice.

10307

BoUpSLP::ValueSet VectorizedStores;

10308

bool Changed = false;

10309

10310

int E = Stores.size();

10311

SmallBitVector Tails(E, false);

10312

int MaxIter = MaxStoreLookup.getValue();

10313

SmallVector<std::pair<int, int>, 16> ConsecutiveChain(

10314

E, std::make_pair(E, INT_MAX2147483647));

10315

SmallVector<SmallBitVector, 4> CheckedPairs(E, SmallBitVector(E, false));

10316

int IterCnt;

10317

auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter,

10318

&CheckedPairs,

10319

&ConsecutiveChain](int K, int Idx) {

10320

if (IterCnt >= MaxIter)

10321

return true;

10322

if (CheckedPairs[Idx].test(K))

10323

return ConsecutiveChain[K].second == 1 &&

10324

ConsecutiveChain[K].first == Idx;

10325

++IterCnt;

10326

CheckedPairs[Idx].set(K);

10327

CheckedPairs[K].set(Idx);

10328

Optional<int> Diff = getPointersDiff(

10329

Stores[K]->getValueOperand()->getType(), Stores[K]->getPointerOperand(),

10330

Stores[Idx]->getValueOperand()->getType(),

10331

Stores[Idx]->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true);

10332

if (!Diff || *Diff == 0)

10333

return false;

10334

int Val = *Diff;

10335

if (Val < 0) {

10336

if (ConsecutiveChain[Idx].second > -Val) {

10337

Tails.set(K);

10338

ConsecutiveChain[Idx] = std::make_pair(K, -Val);

10339

}

10340

return false;

10341

}

10342

if (ConsecutiveChain[K].second <= Val)

10343

return false;

10344

10345

Tails.set(Idx);

10346

ConsecutiveChain[K] = std::make_pair(Idx, Val);

10347

return Val == 1;

10348

};

10349

// Do a quadratic search on all of the given stores in reverse order and find

10350

// all of the pairs of stores that follow each other.

10351

for (int Idx = E - 1; Idx >= 0; --Idx) {

10352

// If a store has multiple consecutive store candidates, search according

10353

// to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...

10354

// This is because usually pairing with immediate succeeding or preceding

10355

// candidate create the best chance to find slp vectorization opportunity.

10356

const int MaxLookDepth = std::max(E - Idx, Idx + 1);

10357

IterCnt = 0;

10358

for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset)

10359

if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) ||

10360

(Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))

10361

break;

10362

}

10363

10364

// Tracks if we tried to vectorize stores starting from the given tail

10365

// already.

10366

SmallBitVector TriedTails(E, false);

10367

// For stores that start but don't end a link in the chain:

10368

for (int Cnt = E; Cnt > 0; --Cnt) {

10369

int I = Cnt - 1;

10370

if (ConsecutiveChain[I].first == E || Tails.test(I))

10371

continue;

10372

// We found a store instr that starts a chain. Now follow the chain and try

10373

// to vectorize it.

10374

BoUpSLP::ValueList Operands;

10375

// Collect the chain into a list.

10376

while (I != E && !VectorizedStores.count(Stores[I])) {

10377

Operands.push_back(Stores[I]);

10378

Tails.set(I);

10379

if (ConsecutiveChain[I].second != 1) {

10380

// Mark the new end in the chain and go back, if required. It might be

10381

// required if the original stores come in reversed order, for example.

10382

if (ConsecutiveChain[I].first != E &&

10383

Tails.test(ConsecutiveChain[I].first) && !TriedTails.test(I) &&

10384

!VectorizedStores.count(Stores[ConsecutiveChain[I].first])) {

10385

TriedTails.set(I);

10386

Tails.reset(ConsecutiveChain[I].first);

10387

if (Cnt < ConsecutiveChain[I].first + 2)

10388

Cnt = ConsecutiveChain[I].first + 2;

10389

}

10390

break;

10391

}

10392

// Move to the next value in the chain.

10393

I = ConsecutiveChain[I].first;

10394

}

10395

assert(!Operands.empty() && "Expected non-empty list of stores.")(static_cast <bool> (!Operands.empty() && "Expected non-empty list of stores."
) ? void (0) : __assert_fail ("!Operands.empty() && \"Expected non-empty list of stores.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 10395, __extension__
__PRETTY_FUNCTION__));

10396

10397

unsigned MaxVecRegSize = R.getMaxVecRegSize();

10398

unsigned EltSize = R.getVectorElementSize(Operands[0]);

10399

unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize);

10400

10401

unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store),

10402

MaxElts);

10403

auto *Store = cast<StoreInst>(Operands[0]);

10404

Type *StoreTy = Store->getValueOperand()->getType();

10405

Type *ValueTy = StoreTy;

10406

if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))

10407

ValueTy = Trunc->getSrcTy();

10408

unsigned MinVF = TTI->getStoreMinimumVF(

10409

R.getMinVF(DL->getTypeSizeInBits(ValueTy)), StoreTy, ValueTy);

10410

10411

if (MaxVF <= MinVF) {

10412

LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF << ") <= "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Vectorization infeasible as MaxVF ("
<< MaxVF << ") <= " << "MinVF (" <<
MinVF << ")\n"; } } while (false)

10413

<< "MinVF (" << MinVF << ")\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Vectorization infeasible as MaxVF ("
<< MaxVF << ") <= " << "MinVF (" <<
MinVF << ")\n"; } } while (false);

10414

}

10415

10416

// FIXME: Is division-by-2 the correct step? Should we assert that the

10417

// register size is a power-of-2?

10418

unsigned StartIdx = 0;

10419

for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {

10420

for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {

10421

ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size);

10422

if (!VectorizedStores.count(Slice.front()) &&

10423

!VectorizedStores.count(Slice.back()) &&

10424

vectorizeStoreChain(Slice, R, Cnt, MinVF)) {

10425

// Mark the vectorized stores so that we don't vectorize them again.

10426

VectorizedStores.insert(Slice.begin(), Slice.end());

10427

Changed = true;

10428

// If we vectorized initial block, no need to try to vectorize it

10429

// again.

10430

if (Cnt == StartIdx)

10431

StartIdx += Size;

10432

Cnt += Size;

10433

continue;

10434

}

10435

++Cnt;

10436

}

10437

// Check if the whole array was vectorized already - exit.

10438

if (StartIdx >= Operands.size())

10439

break;

10440

}

10441

}

10442

10443

return Changed;

10444

}

10445

10446

/// Collects the seed instructions of \p BB into the Stores and GEPs maps.
///
/// Both maps are cleared first and then filled in a single pass over the
/// block. Stores are keyed by the underlying object of their pointer operand;
/// getelementptr instructions are keyed by their pointer operand.
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();

  for (Instruction &I : *BB) {
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      // Ignore stores that are not simple or whose stored value does not have
      // a valid element type.
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
      continue;
    }

    if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      // Only getelementptr instructions with exactly one index are seeds.
      // The count must be checked before the index is read: a GEP may have
      // no indices at all, in which case idx_begin() == idx_end() and
      // dereferencing it would be invalid.
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      // Ignore a constant index or an index of an unsupported type.
      if (isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      // Ignore getelementptr instructions that produce a vector.
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}

10480

10481

/// Tries to vectorize the two-element list {A, B}.
///
/// Returns false without attempting anything when either value is null or
/// is an insertelement instruction; otherwise forwards to
/// tryToVectorizeList() and returns its result.
bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
  // The null checks come first so that isa<> is never applied to a null
  // pointer.
  const bool Rejected = !A || !B || isa<InsertElementInst>(A) ||
                        isa<InsertElementInst>(B);
  if (Rejected)
    return false;

  Value *Pair[] = {A, B};
  return tryToVectorizeList(Pair, R);
}

10489

10490

bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,

10491

bool LimitForRegisterSize) {

10492

if (VL.size() < 2)

10493

return false;

10494

10495

LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Trying to vectorize a list of length = "
<< VL.size() << ".\n"; } } while (false)

10496

<< VL.size() << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Trying to vectorize a list of length = "
<< VL.size() << ".\n"; } } while (false);

10497

10498

// Check that all of the parts are instructions of the same type,

10499

// we permit an alternate opcode via InstructionsState.

10500

InstructionsState S = getSameOpcode(VL);

10501

if (!S.getOpcode())

10502

return false;

10503

10504

Instruction *I0 = cast<Instruction>(S.OpValue);

10505

// Make sure invalid types (including vector type) are rejected before

10506

// determining vectorization factor for scalar instructions.

10507

for (Value *V : VL) {

10508

Type *Ty = V->getType();

10509

if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {

10510

// NOTE: the following will give user internal llvm type name, which may

10511

// not be useful.

10512

R.getORE()->emit([&]() {

10513

std::string type_str;

10514

llvm::raw_string_ostream rso(type_str);

10515

Ty->print(rso);

10516

return OptimizationRemarkMissed(SV_NAME"slp-vectorizer", "UnsupportedType", I0)

10517

<< "Cannot SLP vectorize list: type "

10518

<< rso.str() + " is unsupported by vectorizer";

10519

});

10520

return false;

10521

}

10522

}

10523

10524

unsigned Sz = R.getVectorElementSize(I0);

10525

unsigned MinVF = R.getMinVF(Sz);

10526

unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);

10527

MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);

10528

if (MaxVF < 2) {

10529

R.getORE()->emit([&]() {

10530

return OptimizationRemarkMissed(SV_NAME"slp-vectorizer", "SmallVF", I0)

10531

<< "Cannot SLP vectorize list: vectorization factor "

10532

<< "less than 2 is not supported";

10533

});

10534

return false;

10535

}

10536

10537

bool Changed = false;

10538

bool CandidateFound = false;

10539

InstructionCost MinCost = SLPCostThreshold.getValue();

10540

Type *ScalarTy = VL[0]->getType();

10541

if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))

10542

ScalarTy = IE->getOperand(1)->getType();

10543

10544

unsigned NextInst = 0, MaxInst = VL.size();

10545

for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {

10546

// No actual vectorization should happen, if number of parts is the same as

10547

// provided vectorization factor (i.e. the scalar type is used for vector

10548

// code during codegen).

10549

auto *VecTy = FixedVectorType::get(ScalarTy, VF);

10550

if (TTI->getNumberOfParts(VecTy) == VF)

10551

continue;

10552

for (unsigned I = NextInst; I < MaxInst; ++I) {

10553

unsigned OpsWidth = 0;

10554

10555

if (I + VF > MaxInst)

10556

OpsWidth = MaxInst - I;

10557

else

10558

OpsWidth = VF;

10559

10560

if (!isPowerOf2_32(OpsWidth))

10561

continue;

10562

10563

if ((LimitForRegisterSize && OpsWidth < MaxVF) ||

10564

(VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2))

10565

break;

10566

10567

ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);

10568

// Check that a previous iteration of this loop did not delete the Value.

10569

if (llvm::any_of(Ops, [&R](Value *V) {

10570

auto *I = dyn_cast<Instruction>(V);

10571

return I && R.isDeleted(I);

10572

}))

10573

continue;

10574

10575

LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Analyzing " << OpsWidth
<< " operations " << "\n"; } } while (false)

10576

<< "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Analyzing " << OpsWidth
<< " operations " << "\n"; } } while (false);

10577

10578

R.buildTree(Ops);

10579

if (R.isTreeTinyAndNotFullyVectorizable())

10580

continue;

10581

R.reorderTopToBottom();

10582

R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));

10583

R.buildExternalUses();

10584

10585

R.computeMinimumValueSizes();

10586

InstructionCost Cost = R.getTreeCost();

10587

CandidateFound = true;

10588

MinCost = std::min(MinCost, Cost);

10589

10590

LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Found cost = " << Cost
<< " for VF=" << VF << "\n"; } } while (false
);

10591

if (Cost < -SLPCostThreshold) {

10592

LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Vectorizing list at cost:" <<
Cost << ".\n"; } } while (false);

10593

R.getORE()->emit(OptimizationRemark(SV_NAME"slp-vectorizer", "VectorizedList",

10594

cast<Instruction>(Ops[0]))

10595

<< "SLP vectorized with cost " << ore::NV("Cost", Cost)

10596

<< " and with tree size "

10597

<< ore::NV("TreeSize", R.getTreeSize()));

10598

10599

R.vectorizeTree();

10600

// Move to the next bundle.

10601

I += VF - 1;

10602

NextInst = I + 1;

10603

Changed = true;

10604

}

10605

}

10606

}

10607

10608

if (!Changed && CandidateFound) {

10609

R.getORE()->emit([&]() {

10610

return OptimizationRemarkMissed(SV_NAME"slp-vectorizer", "NotBeneficial", I0)

10611

<< "List vectorization was possible but not beneficial with cost "

10612

<< ore::NV("Cost", MinCost) << " >= "

10613

<< ore::NV("Treshold", -SLPCostThreshold);

10614

});

10615

} else if (!Changed) {

10616

R.getORE()->emit([&]() {

10617

return OptimizationRemarkMissed(SV_NAME"slp-vectorizer", "NotPossible", I0)

10618

<< "Cannot SLP vectorize list: vectorization was impossible"

10619

<< " with available vectorization factors";

10620

});

10621

}

10622

return Changed;

10623

}

10624

10625

/// Tries to vectorize a pair of values rooted at the operands of \p I.
///
/// \p I must be a non-vector binary operator or compare whose two operands
/// are instructions in the same basic block as \p I. Besides the operand
/// pair itself, pairs obtained by looking through a single-use binary
/// operator operand are considered; when there are several candidates the
/// one chosen by BoUpSLP::findBestRootPair is tried.
///
/// \returns the result of tryToVectorizePair for the chosen pair, or false
/// if no pair qualifies.
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  const bool IsScalarBinOpOrCmp =
      isa<BinaryOperator, CmpInst>(I) && !isa<VectorType>(I->getType());
  if (!IsScalarBinOpOrCmp)
    return false;

  BasicBlock *BB = I->getParent();

  // Vectorize in current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1)
    return false;
  if (Op0->getParent() != BB || Op1->getParent() != BB)
    return false;

  // First collect all possible candidates
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  if (A && B) {
    // Try to skip B: pair A with each binary-operator operand of B that
    // lives in this block.
    if (B->hasOneUse()) {
      for (unsigned OpIdx : {0u, 1u}) {
        auto *BOp = dyn_cast<BinaryOperator>(B->getOperand(OpIdx));
        if (BOp && BOp->getParent() == BB)
          Candidates.emplace_back(A, BOp);
      }
    }
    // Try to skip A: same thing with the roles swapped.
    if (A->hasOneUse()) {
      for (unsigned OpIdx : {0u, 1u}) {
        auto *AOp = dyn_cast<BinaryOperator>(A->getOperand(OpIdx));
        if (AOp && AOp->getParent() == BB)
          Candidates.emplace_back(AOp, B);
      }
    }
  }

  if (Candidates.size() == 1)
    return tryToVectorizePair(Op0, Op1, R);

  // We have multiple options. Try to pick the single best.
  Optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  const std::pair<Value *, Value *> &Best = Candidates[*BestCandidate];
  return tryToVectorizePair(Best.first, Best.second, R);
}

10675

10676

namespace {

10677

10678

/// Model horizontal reductions.
///
/// A horizontal reduction is a tree of reduction instructions that has values
/// that can be put into a vector as its leaves. For example:
///
/// mul mul mul mul
///  \  /    \  /
///   +       +
///    \     /
///       +
/// This tree has "mul" as its leaf values and "+" as its reduction
/// instructions. A reduction can feed into a store or a binary operation
/// feeding a phi.
///    ...
///    \  /
///     +
///     |
///  phi +=
///
///  Or:
///    ...
///    \  /
///     +
///     |
///   *p =
///

10704

class HorizontalReduction {

10705

using ReductionOpsType = SmallVector<Value *, 16>;

10706

using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;

10707

ReductionOpsListType ReductionOps;

10708

/// List of possibly reduced values.

10709

SmallVector<SmallVector<Value *>> ReducedVals;

10710

/// Maps reduced value to the corresponding reduction operation.

10711

DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;

10712

// Use map vector to make stable output.

10713

MapVector<Instruction *, Value *> ExtraArgs;

10714

WeakTrackingVH ReductionRoot;

10715

/// The type of reduction operation.

10716

RecurKind RdxKind;

10717

10718

/// Returns true if \p I is a select whose condition is a compare and whose
/// reduction kind (per getRdxKind) is a min/max recurrence kind.
static bool isCmpSelMinMax(Instruction *I) {
  if (!match(I, m_Select(m_Cmp(), m_Value(), m_Value())))
    return false;
  return RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
}

10722

10723

// And/or are potentially poison-safe logical patterns like:

10724

// select x, y, false

10725

// select x, true, y

10726

static bool isBoolLogicOp(Instruction *I) {

10727

return isa<SelectInst>(I) &&

10728

(match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));

10729

}

10730

10731

/// Checks if instruction is associative and can be vectorized.

10732

static bool isVectorizable(RecurKind Kind, Instruction *I) {

10733

if (Kind == RecurKind::None)

10734

return false;

10735

10736

// Integer ops that map to select instructions or intrinsics are fine.

10737

if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||

10738

isBoolLogicOp(I))

10739

return true;

10740

10741

if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {

10742

// FP min/max are associative except for NaN and -0.0. We do not

10743

// have to rule out -0.0 here because the intrinsic semantics do not

10744

// specify a fixed result for it.

10745

return I->getFastMathFlags().noNaNs();

10746

}

10747

10748

return I->isAssociative();

10749

}

10750

10751

/// Returns the operand of \p I at position \p Index, remapping index 1 to
/// operand 2 for a select-based 'or'.
static Value *getRdxOperand(Instruction *I, unsigned Index) {
  // Poison-safe 'or' takes the form: select X, true, Y
  // To make that work with the normal operand processing, we skip the
  // true value operand.
  // TODO: Change the code and data structures to handle this without a hack.
  const bool SkipTrueValue = Index == 1 && isa<SelectInst>(I) &&
                             getRdxKind(I) == RecurKind::Or;
  return I->getOperand(SkipTrueValue ? 2 : Index);
}

10760

10761

/// Creates reduction operation with the current opcode.

10762

static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS,

10763

Value *RHS, const Twine &Name, bool UseSelect) {

10764

unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);

10765

switch (Kind) {

10766

case RecurKind::Or:

10767

if (UseSelect &&

10768

LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))

10769

return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);

10770

return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,

10771

Name);

10772

case RecurKind::And:

10773

if (UseSelect &&

10774

LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))

10775

return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);

10776

return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,

10777

Name);

10778

case RecurKind::Add:

10779

case RecurKind::Mul:

10780

case RecurKind::Xor:

10781

case RecurKind::FAdd:

10782

case RecurKind::FMul:

10783

return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,

10784

Name);

10785

case RecurKind::FMax:

10786

return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);

10787

case RecurKind::FMin:

10788

return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);

10789

case RecurKind::SMax:

10790

if (UseSelect) {

10791

Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);

10792

return Builder.CreateSelect(Cmp, LHS, RHS, Name);

10793

}

10794

return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);

10795

case RecurKind::SMin:

10796

if (UseSelect) {

10797

Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);

10798

return Builder.CreateSelect(Cmp, LHS, RHS, Name);

10799

}

10800

return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);

10801

case RecurKind::UMax:

10802

if (UseSelect) {

10803

Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);

10804

return Builder.CreateSelect(Cmp, LHS, RHS, Name);

10805

}

10806

return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);

10807

case RecurKind::UMin:

10808

if (UseSelect) {

10809

Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);

10810

return Builder.CreateSelect(Cmp, LHS, RHS, Name);

10811

}

10812

return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);

10813

default:

10814

llvm_unreachable("Unknown reduction operation.")::llvm::llvm_unreachable_internal("Unknown reduction operation."
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 10814);

10815

}

10816

}

10817

10818

/// Creates reduction operation with the current opcode with the IR flags

10819

/// from \p ReductionOps, dropping nuw/nsw flags.

10820

static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,

10821

Value *RHS, const Twine &Name,

10822

const ReductionOpsListType &ReductionOps) {

10823

bool UseSelect = ReductionOps.size() == 2 ||

10824

// Logical or/and.

10825

(ReductionOps.size() == 1 &&

10826

isa<SelectInst>(ReductionOps.front().front()));

10827

assert((!UseSelect || ReductionOps.size() != 2 ||(static_cast <bool> ((!UseSelect || ReductionOps.size()
!= 2 || isa<SelectInst>(ReductionOps[1][0])) &&
"Expected cmp + select pairs for reduction") ? void (0) : __assert_fail
("(!UseSelect || ReductionOps.size() != 2 || isa<SelectInst>(ReductionOps[1][0])) && \"Expected cmp + select pairs for reduction\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 10829, __extension__
__PRETTY_FUNCTION__))

10828

isa<SelectInst>(ReductionOps[1][0])) &&(static_cast <bool> ((!UseSelect || ReductionOps.size()
!= 2 || isa<SelectInst>(ReductionOps[1][0])) &&
"Expected cmp + select pairs for reduction") ? void (0) : __assert_fail
("(!UseSelect || ReductionOps.size() != 2 || isa<SelectInst>(ReductionOps[1][0])) && \"Expected cmp + select pairs for reduction\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 10829, __extension__
__PRETTY_FUNCTION__))

10829

"Expected cmp + select pairs for reduction")(static_cast <bool> ((!UseSelect || ReductionOps.size()
!= 2 || isa<SelectInst>(ReductionOps[1][0])) &&
"Expected cmp + select pairs for reduction") ? void (0) : __assert_fail
("(!UseSelect || ReductionOps.size() != 2 || isa<SelectInst>(ReductionOps[1][0])) && \"Expected cmp + select pairs for reduction\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 10829, __extension__
__PRETTY_FUNCTION__));

10830

Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);

10831

if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {

10832

if (auto *Sel = dyn_cast<SelectInst>(Op)) {

10833

propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,

10834

/*IncludeWrapFlags=*/false);

10835

propagateIRFlags(Op, ReductionOps[1], nullptr,

10836

/*IncludeWrapFlags=*/false);

10837

return Op;

10838

}

10839

}

10840

propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);

10841

return Op;

10842

}

10843

10844

/// Classifies \p V as a reduction kind by pattern matching.
///
/// Returns RecurKind::None when \p V is not an instruction or matches none
/// of the recognised patterns: add, mul, and/or (bitwise or logical), xor,
/// fadd, fmul, maxnum/minnum intrinsics, the integer min/max patterns, or a
/// select over extractelement values that are identical to the operands of
/// its compare.
static RecurKind getRdxKind(Value *V) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return RecurKind::None;

  if (match(I, m_Add(m_Value(), m_Value())))
    return RecurKind::Add;
  if (match(I, m_Mul(m_Value(), m_Value())))
    return RecurKind::Mul;
  if (match(I, m_And(m_Value(), m_Value())) ||
      match(I, m_LogicalAnd(m_Value(), m_Value())))
    return RecurKind::And;
  if (match(I, m_Or(m_Value(), m_Value())) ||
      match(I, m_LogicalOr(m_Value(), m_Value())))
    return RecurKind::Or;
  if (match(I, m_Xor(m_Value(), m_Value())))
    return RecurKind::Xor;
  if (match(I, m_FAdd(m_Value(), m_Value())))
    return RecurKind::FAdd;
  if (match(I, m_FMul(m_Value(), m_Value())))
    return RecurKind::FMul;

  if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
    return RecurKind::FMax;
  if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
    return RecurKind::FMin;

  // This matches either cmp+select or intrinsics. SLP is expected to handle
  // either form.
  // TODO: If we are canonicalizing to intrinsics, we can remove several
  // special-case paths that deal with selects.
  if (match(I, m_SMax(m_Value(), m_Value())))
    return RecurKind::SMax;
  if (match(I, m_SMin(m_Value(), m_Value())))
    return RecurKind::SMin;
  if (match(I, m_UMax(m_Value(), m_Value())))
    return RecurKind::UMax;
  if (match(I, m_UMin(m_Value(), m_Value())))
    return RecurKind::UMin;

  auto *Select = dyn_cast<SelectInst>(I);
  if (!Select)
    return RecurKind::None;

  // Try harder: look for min/max pattern based on instructions producing
  // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
  // During the intermediate stages of SLP, it's very common to have
  // pattern like this (since optimizeGatherSequence is run only once
  // at the end):
  // %1 = extractelement <2 x i32> %a, i32 0
  // %2 = extractelement <2 x i32> %a, i32 1
  // %cond = icmp sgt i32 %1, %2
  // %3 = extractelement <2 x i32> %a, i32 0
  // %4 = extractelement <2 x i32> %a, i32 1
  // %select = select i1 %cond, i32 %3, i32 %4
  CmpInst::Predicate Pred;
  Instruction *CmpOp0;
  Instruction *CmpOp1;

  Value *TrueVal = Select->getTrueValue();
  Value *FalseVal = Select->getFalseValue();
  Value *Cond = Select->getCondition();

  // True when SelOp is an extractelement identical to the compare operand.
  auto IsSameExtract = [](Instruction *CmpOp, Value *SelOp) {
    return isa<ExtractElementInst>(SelOp) &&
           CmpOp->isIdenticalTo(cast<Instruction>(SelOp));
  };

  // TODO: Support inverse predicates.
  if (match(Cond, m_Cmp(Pred, m_Specific(TrueVal), m_Instruction(CmpOp1)))) {
    // The compare's first operand is the true value itself; the second must
    // mirror the false value.
    if (!IsSameExtract(CmpOp1, FalseVal))
      return RecurKind::None;
  } else if (match(Cond,
                   m_Cmp(Pred, m_Instruction(CmpOp0), m_Specific(FalseVal)))) {
    // The compare's second operand is the false value itself; the first
    // must mirror the true value.
    if (!IsSameExtract(CmpOp0, TrueVal))
      return RecurKind::None;
  } else {
    // Neither compare operand is a select operand directly: both must
    // mirror them.
    if (!isa<ExtractElementInst>(TrueVal) ||
        !isa<ExtractElementInst>(FalseVal))
      return RecurKind::None;
    if (!match(Cond,
               m_Cmp(Pred, m_Instruction(CmpOp0), m_Instruction(CmpOp1))))
      return RecurKind::None;
    if (!CmpOp0->isIdenticalTo(cast<Instruction>(TrueVal)) ||
        !CmpOp1->isIdenticalTo(cast<Instruction>(FalseVal)))
      return RecurKind::None;
  }

  switch (Pred) {
  case CmpInst::ICMP_SGT:
  case CmpInst::ICMP_SGE:
    return RecurKind::SMax;
  case CmpInst::ICMP_SLT:
  case CmpInst::ICMP_SLE:
    return RecurKind::SMin;
  case CmpInst::ICMP_UGT:
  case CmpInst::ICMP_UGE:
    return RecurKind::UMax;
  case CmpInst::ICMP_ULT:
  case CmpInst::ICMP_ULE:
    return RecurKind::UMin;
  default:
    return RecurKind::None;
  }
}

10940

10941

/// Get the index of the first operand.

10942

static unsigned getFirstOperandIndex(Instruction *I) {

10943

return isCmpSelMinMax(I) ? 1 : 0;

10944

}

10945

10946

/// Total number of operands in the reduction operation.

10947

static unsigned getNumberOfOperands(Instruction *I) {

10948

return isCmpSelMinMax(I) ? 3 : 2;

10949

}

10950

10951

/// Checks if the instruction is in basic block \p BB.

10952

/// For a cmp+sel min/max reduction check that both ops are in \p BB.

10953

static bool hasSameParent(Instruction *I, BasicBlock *BB) {

10954

if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {

10955

auto *Sel = cast<SelectInst>(I);

10956

auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());

10957

return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;

10958

}

10959

return I->getParent() == BB;

10960

}

10961

10962

/// Expected number of uses for reduction operations/reduced values.

10963

static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {

10964

if (IsCmpSelMinMax) {

10965

// SelectInst must be used twice while the condition op must have single

10966

// use only.

10967

if (auto *Sel = dyn_cast<SelectInst>(I))

10968

return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();

10969

return I->hasNUses(2);

10970

}

10971

10972

// Arithmetic reduction operation must be used once only.

10973

return I->hasOneUse();

10974

}

10975

10976

/// Initializes the list of reduction operations.

10977

void initReductionOps(Instruction *I) {

10978

if (isCmpSelMinMax(I))

10979

ReductionOps.assign(2, ReductionOpsType());

10980

else

10981

ReductionOps.assign(1, ReductionOpsType());

10982

}

10983

10984

/// Add all reduction operations for the reduction instruction \p I.

10985

void addReductionOps(Instruction *I) {

10986

if (isCmpSelMinMax(I)) {

10987

ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());

10988

ReductionOps[1].emplace_back(I);

10989

} else {

10990

ReductionOps[0].emplace_back(I);

10991

}

10992

}

10993

10994

/// Returns the first reduction operand of \p I (the operand at
/// getFirstOperandIndex), or nullptr when \p Kind is RecurKind::None.
static Value *getLHS(RecurKind Kind, Instruction *I) {
  return Kind == RecurKind::None ? nullptr
                                 : I->getOperand(getFirstOperandIndex(I));
}

10999

/// Returns the second reduction operand of \p I (the operand right after
/// getFirstOperandIndex), or nullptr when \p Kind is RecurKind::None.
static Value *getRHS(RecurKind Kind, Instruction *I) {
  return Kind == RecurKind::None
             ? nullptr
             : I->getOperand(getFirstOperandIndex(I) + 1);
}

11004

11005

public:

11006

HorizontalReduction() = default;

11007

11008

/// Try to find a reduction tree.

11009

bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst,

11010

ScalarEvolution &SE, const DataLayout &DL,

11011

const TargetLibraryInfo &TLI) {

11012

assert((!Phi || is_contained(Phi->operands(), Inst)) &&(static_cast <bool> ((!Phi || is_contained(Phi->operands
(), Inst)) && "Phi needs to use the binary operator")
? void (0) : __assert_fail ("(!Phi || is_contained(Phi->operands(), Inst)) && \"Phi needs to use the binary operator\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11013, __extension__
__PRETTY_FUNCTION__))

11013

"Phi needs to use the binary operator")(static_cast <bool> ((!Phi || is_contained(Phi->operands
(), Inst)) && "Phi needs to use the binary operator")
? void (0) : __assert_fail ("(!Phi || is_contained(Phi->operands(), Inst)) && \"Phi needs to use the binary operator\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11013, __extension__
__PRETTY_FUNCTION__));

11014

assert((isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) ||(static_cast <bool> ((isa<BinaryOperator>(Inst) ||
isa<SelectInst>(Inst) || isa<IntrinsicInst>(Inst
)) && "Expected binop, select, or intrinsic for reduction matching"
) ? void (0) : __assert_fail ("(isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) || isa<IntrinsicInst>(Inst)) && \"Expected binop, select, or intrinsic for reduction matching\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11016, __extension__
__PRETTY_FUNCTION__))

11015

isa<IntrinsicInst>(Inst)) &&(static_cast <bool> ((isa<BinaryOperator>(Inst) ||
isa<SelectInst>(Inst) || isa<IntrinsicInst>(Inst
)) && "Expected binop, select, or intrinsic for reduction matching"
) ? void (0) : __assert_fail ("(isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) || isa<IntrinsicInst>(Inst)) && \"Expected binop, select, or intrinsic for reduction matching\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11016, __extension__
__PRETTY_FUNCTION__))

11016

"Expected binop, select, or intrinsic for reduction matching")(static_cast <bool> ((isa<BinaryOperator>(Inst) ||
isa<SelectInst>(Inst) || isa<IntrinsicInst>(Inst
)) && "Expected binop, select, or intrinsic for reduction matching"
) ? void (0) : __assert_fail ("(isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) || isa<IntrinsicInst>(Inst)) && \"Expected binop, select, or intrinsic for reduction matching\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11016, __extension__
__PRETTY_FUNCTION__));

11017

RdxKind = getRdxKind(Inst);

11018

11019

// We could have an initial reduction that is not an add.

11020

// r *= v1 + v2 + v3 + v4

11021

// In such a case start looking for a tree rooted in the first '+'.

11022

if (Phi) {

11023

if (getLHS(RdxKind, Inst) == Phi) {

11024

Phi = nullptr;

11025

Inst = dyn_cast<Instruction>(getRHS(RdxKind, Inst));

11026

if (!Inst)

11027

return false;

11028

RdxKind = getRdxKind(Inst);

11029

} else if (getRHS(RdxKind, Inst) == Phi) {

11030

Phi = nullptr;

11031

Inst = dyn_cast<Instruction>(getLHS(RdxKind, Inst));

11032

if (!Inst)

11033

return false;

11034

RdxKind = getRdxKind(Inst);

11035

}

11036

}

11037

11038

if (!isVectorizable(RdxKind, Inst))

11039

return false;

11040

11041

// Analyze "regular" integer/FP types for reductions - no target-specific

11042

// types or pointers.

11043

Type *Ty = Inst->getType();

11044

if (!isValidElementType(Ty) || Ty->isPointerTy())

11045

return false;

11046

11047

// Though the ultimate reduction may have multiple uses, its condition must

11048

// have only single use.

11049

if (auto *Sel = dyn_cast<SelectInst>(Inst))

11050

if (!Sel->getCondition()->hasOneUse())

11051

return false;

11052

11053

ReductionRoot = Inst;

11054

11055

// Iterate through all the operands of the possible reduction tree and

11056

// gather all the reduced values, sorting them by their value id.

11057

BasicBlock *BB = Inst->getParent();

11058

bool IsCmpSelMinMax = isCmpSelMinMax(Inst);

11059

SmallVector<Instruction *> Worklist(1, Inst);

11060

// Checks if the operands of the \p TreeN instruction are also reduction

11061

// operations or should be treated as reduced values or an extra argument,

11062

// which is not part of the reduction.

11063

auto &&CheckOperands = [this, IsCmpSelMinMax,

11064

BB](Instruction *TreeN,

11065

SmallVectorImpl<Value *> &ExtraArgs,

11066

SmallVectorImpl<Value *> &PossibleReducedVals,

11067

SmallVectorImpl<Instruction *> &ReductionOps) {

11068

for (int I = getFirstOperandIndex(TreeN),

11069

End = getNumberOfOperands(TreeN);

11070

I < End; ++I) {

11071

Value *EdgeVal = getRdxOperand(TreeN, I);

11072

ReducedValsToOps[EdgeVal].push_back(TreeN);

11073

auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);

11074

// Edge has wrong parent - mark as an extra argument.

11075

if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&

11076

!hasSameParent(EdgeInst, BB)) {

11077

ExtraArgs.push_back(EdgeVal);

11078

continue;

11079

}

11080

// If the edge is not an instruction, or it is different from the main

11081

// reduction opcode or has too many uses - possible reduced value.

11082

if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||

11083

IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||

11084

!hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||

11085

!isVectorizable(getRdxKind(EdgeInst), EdgeInst)) {

11086

PossibleReducedVals.push_back(EdgeVal);

11087

continue;

11088

}

11089

ReductionOps.push_back(EdgeInst);

11090

}

11091

};

11092

// Try to regroup reduced values so that it gets more profitable to try to

11093

// reduce them. Values are grouped by their value ids, instructions - by

11094

// instruction op id and/or alternate op id, plus do extra analysis for

11095

// loads (grouping them by the distance between pointers) and cmp

11096

// instructions (grouping them by the predicate).

11097

MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>

11098

PossibleReducedVals;

11099

initReductionOps(Inst);

11100

while (!Worklist.empty()) {

11101

Instruction *TreeN = Worklist.pop_back_val();

11102

SmallVector<Value *> Args;

11103

SmallVector<Value *> PossibleRedVals;

11104

SmallVector<Instruction *> PossibleReductionOps;

11105

CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);

11106

// If too many extra args - mark the instruction itself as a reduction

11107

// value, not a reduction operation.

11108

if (Args.size() < 2) {

11109

addReductionOps(TreeN);

11110

// Add extra args.

11111

if (!Args.empty()) {

11112

assert(Args.size() == 1 && "Expected only single argument.")(static_cast <bool> (Args.size() == 1 && "Expected only single argument."
) ? void (0) : __assert_fail ("Args.size() == 1 && \"Expected only single argument.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11112, __extension__
__PRETTY_FUNCTION__));

11113

ExtraArgs[TreeN] = Args.front();

11114

}

11115

// Add reduction values. The values are sorted for better vectorization

11116

// results.

11117

for (Value *V : PossibleRedVals) {

11118

size_t Key, Idx;

11119

std::tie(Key, Idx) = generateKeySubkey(

11120

V, &TLI,

11121

[&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) {

11122

auto It = PossibleReducedVals.find(Key);

11123

if (It != PossibleReducedVals.end()) {

11124

for (const auto &LoadData : It->second) {

11125

auto *RLI = cast<LoadInst>(LoadData.second.front().first);

11126

if (getPointersDiff(RLI->getType(),

11127

RLI->getPointerOperand(), LI->getType(),

11128

LI->getPointerOperand(), DL, SE,

11129

/*StrictCheck=*/true))

11130

return hash_value(RLI->getPointerOperand());

11131

}

11132

}

11133

return hash_value(LI->getPointerOperand());

11134

},

11135

/*AllowAlternate=*/false);

11136

++PossibleReducedVals[Key][Idx]

11137

.insert(std::make_pair(V, 0))

11138

.first->second;

11139

}

11140

Worklist.append(PossibleReductionOps.rbegin(),

11141

PossibleReductionOps.rend());

11142

} else {

11143

size_t Key, Idx;

11144

std::tie(Key, Idx) = generateKeySubkey(

11145

TreeN, &TLI,

11146

[&PossibleReducedVals, &DL, &SE](size_t Key, LoadInst *LI) {

11147

auto It = PossibleReducedVals.find(Key);

11148

if (It != PossibleReducedVals.end()) {

11149

for (const auto &LoadData : It->second) {

11150

auto *RLI = cast<LoadInst>(LoadData.second.front().first);

11151

if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),

11152

LI->getType(), LI->getPointerOperand(),

11153

DL, SE, /*StrictCheck=*/true))

11154

return hash_value(RLI->getPointerOperand());

11155

}

11156

}

11157

return hash_value(LI->getPointerOperand());

11158

},

11159

/*AllowAlternate=*/false);

11160

++PossibleReducedVals[Key][Idx]

11161

.insert(std::make_pair(TreeN, 0))

11162

.first->second;

11163

}

11164

}

11165

auto PossibleReducedValsVect = PossibleReducedVals.takeVector();

11166

// Sort values by the total number of values kinds to start the reduction

11167

// from the longest possible reduced values sequences.

11168

for (auto &PossibleReducedVals : PossibleReducedValsVect) {

11169

auto PossibleRedVals = PossibleReducedVals.second.takeVector();

11170

SmallVector<SmallVector<Value *>> PossibleRedValsVect;

11171

for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();

11172

It != E; ++It) {

11173

PossibleRedValsVect.emplace_back();

11174

auto RedValsVect = It->second.takeVector();

11175

stable_sort(RedValsVect, llvm::less_second());

11176

for (const std::pair<Value *, unsigned> &Data : RedValsVect)

11177

PossibleRedValsVect.back().append(Data.second, Data.first);

11178

}

11179

stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {

11180

return P1.size() > P2.size();

11181

});

11182

ReducedVals.emplace_back();

11183

for (ArrayRef<Value *> Data : PossibleRedValsVect)

11184

ReducedVals.back().append(Data.rbegin(), Data.rend());

11185

}

11186

// Sort the reduced values by number of same/alternate opcode and/or pointer

11187

// operand.

11188

stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {

11189

return P1.size() > P2.size();

11190

});

11191

return true;

11192

}

11193

11194

/// Attempt to vectorize the tree found by matchAssociativeReduction.

11195

Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {

11196

constexpr int ReductionLimit = 4;

11197

constexpr unsigned RegMaxNumber = 4;

11198

constexpr unsigned RedValsMaxNumber = 128;

11199

// If there are a sufficient number of reduction values, reduce

11200

// to a nearby power-of-2. We can safely generate oversized

11201

// vectors and rely on the backend to split them to legal sizes.

11202

unsigned NumReducedVals = std::accumulate(

11203

ReducedVals.begin(), ReducedVals.end(), 0,

11204

[](int Num, ArrayRef<Value *> Vals) { return Num + Vals.size(); });

11205

if (NumReducedVals < ReductionLimit)

11206

return nullptr;

11207

11208

IRBuilder<> Builder(cast<Instruction>(ReductionRoot));

11209

11210

// Track the reduced values in case if they are replaced by extractelement

11211

// because of the vectorization.

11212

DenseMap<Value *, WeakTrackingVH> TrackedVals;

11213

BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;

11214

// The same extra argument may be used several times, so log each attempt

11215

// to use it.

11216

for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {

11217

assert(Pair.first && "DebugLoc must be set.")(static_cast <bool> (Pair.first && "DebugLoc must be set."
) ? void (0) : __assert_fail ("Pair.first && \"DebugLoc must be set.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11217, __extension__
__PRETTY_FUNCTION__));

11218

ExternallyUsedValues[Pair.second].push_back(Pair.first);

11219

TrackedVals.try_emplace(Pair.second, Pair.second);

11220

}

11221

11222

// The compare instruction of a min/max is the insertion point for new

11223

// instructions and may be replaced with a new compare instruction.

11224

auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {

11225

assert(isa<SelectInst>(RdxRootInst) &&(static_cast <bool> (isa<SelectInst>(RdxRootInst)
&& "Expected min/max reduction to have select root instruction"
) ? void (0) : __assert_fail ("isa<SelectInst>(RdxRootInst) && \"Expected min/max reduction to have select root instruction\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11226, __extension__
__PRETTY_FUNCTION__))

11226

"Expected min/max reduction to have select root instruction")(static_cast <bool> (isa<SelectInst>(RdxRootInst)
&& "Expected min/max reduction to have select root instruction"
) ? void (0) : __assert_fail ("isa<SelectInst>(RdxRootInst) && \"Expected min/max reduction to have select root instruction\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11226, __extension__
__PRETTY_FUNCTION__));

11227

Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();

11228

assert(isa<Instruction>(ScalarCond) &&(static_cast <bool> (isa<Instruction>(ScalarCond)
&& "Expected min/max reduction to have compare condition"
) ? void (0) : __assert_fail ("isa<Instruction>(ScalarCond) && \"Expected min/max reduction to have compare condition\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11229, __extension__
__PRETTY_FUNCTION__))

11229

"Expected min/max reduction to have compare condition")(static_cast <bool> (isa<Instruction>(ScalarCond)
&& "Expected min/max reduction to have compare condition"
) ? void (0) : __assert_fail ("isa<Instruction>(ScalarCond) && \"Expected min/max reduction to have compare condition\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11229, __extension__
__PRETTY_FUNCTION__));

11230

return cast<Instruction>(ScalarCond);

11231

};

11232

11233

// The reduction root is used as the insertion point for new instructions,

11234

// so set it as externally used to prevent it from being deleted.

11235

ExternallyUsedValues[ReductionRoot];

11236

SmallDenseSet<Value *> IgnoreList;

11237

for (ReductionOpsType &RdxOps : ReductionOps)

11238

for (Value *RdxOp : RdxOps) {

11239

if (!RdxOp)

11240

continue;

11241

IgnoreList.insert(RdxOp);

11242

}

11243

bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));

11244

11245

// Need to track reduced vals, they may be changed during vectorization of

11246

// subvectors.

11247

for (ArrayRef<Value *> Candidates : ReducedVals)

11248

for (Value *V : Candidates)

11249

TrackedVals.try_emplace(V, V);

11250

11251

DenseMap<Value *, unsigned> VectorizedVals;

11252

Value *VectorizedTree = nullptr;

11253

bool CheckForReusedReductionOps = false;

11254

// Try to vectorize elements based on their type.

11255

for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {

11256

ArrayRef<Value *> OrigReducedVals = ReducedVals[I];

11257

InstructionsState S = getSameOpcode(OrigReducedVals);

11258

SmallVector<Value *> Candidates;

11259

DenseMap<Value *, Value *> TrackedToOrig;

11260

for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {

11261

Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;

11262

// Check if the reduction value was not overriden by the extractelement

11263

// instruction because of the vectorization and exclude it, if it is not

11264

// compatible with other values.

11265

if (auto *Inst = dyn_cast<Instruction>(RdxVal))

11266

if (isVectorLikeInstWithConstOps(Inst) &&

11267

(!S.getOpcode() || !S.isOpcodeOrAlt(Inst)))

11268

continue;

11269

Candidates.push_back(RdxVal);

11270

TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);

11271

}

11272

bool ShuffledExtracts = false;

11273

// Try to handle shuffled extractelements.

11274

if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&

11275

I + 1 < E) {

11276

InstructionsState NextS = getSameOpcode(ReducedVals[I + 1]);

11277

if (NextS.getOpcode() == Instruction::ExtractElement &&

11278

!NextS.isAltShuffle()) {

11279

SmallVector<Value *> CommonCandidates(Candidates);

11280

for (Value *RV : ReducedVals[I + 1]) {

11281

Value *RdxVal = TrackedVals.find(RV)->second;

11282

// Check if the reduction value was not overriden by the

11283

// extractelement instruction because of the vectorization and

11284

// exclude it, if it is not compatible with other values.

11285

if (auto *Inst = dyn_cast<Instruction>(RdxVal))

11286

if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))

11287

continue;

11288

CommonCandidates.push_back(RdxVal);

11289

TrackedToOrig.try_emplace(RdxVal, RV);

11290

}

11291

SmallVector<int> Mask;

11292

if (isFixedVectorShuffle(CommonCandidates, Mask)) {

11293

++I;

11294

Candidates.swap(CommonCandidates);

11295

ShuffledExtracts = true;

11296

}

11297

}

11298

}

11299

unsigned NumReducedVals = Candidates.size();

11300

if (NumReducedVals < ReductionLimit)

11301

continue;

11302

11303

unsigned MaxVecRegSize = V.getMaxVecRegSize();

11304

unsigned EltSize = V.getVectorElementSize(Candidates[0]);

11305

unsigned MaxElts = RegMaxNumber * PowerOf2Floor(MaxVecRegSize / EltSize);

11306

11307

unsigned ReduxWidth = std::min<unsigned>(

11308

PowerOf2Floor(NumReducedVals), std::max(RedValsMaxNumber, MaxElts));

11309

unsigned Start = 0;

11310

unsigned Pos = Start;

11311

// Restarts vectorization attempt with lower vector factor.

11312

unsigned PrevReduxWidth = ReduxWidth;

11313

bool CheckForReusedReductionOpsLocal = false;

11314

auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,

11315

&CheckForReusedReductionOpsLocal,

11316

&PrevReduxWidth, &V,

11317

&IgnoreList](bool IgnoreVL = false) {

11318

bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);

11319

if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {

11320

// Check if any of the reduction ops are gathered. If so, worth

11321

// trying again with less number of reduction ops.

11322

CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;

11323

}

11324

++Pos;

11325

if (Pos < NumReducedVals - ReduxWidth + 1)

11326

return IsAnyRedOpGathered;

11327

Pos = Start;

11328

ReduxWidth /= 2;

11329

return IsAnyRedOpGathered;

11330

};

11331

while (Pos < NumReducedVals - ReduxWidth + 1 &&

11332

ReduxWidth >= ReductionLimit) {

11333

// Dependency in tree of the reduction ops - drop this attempt, try

11334

// later.

11335

if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&

11336

Start == 0) {

11337

CheckForReusedReductionOps = true;

11338

break;

11339

}

11340

PrevReduxWidth = ReduxWidth;

11341

ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);

11342

// Beeing analyzed already - skip.

11343

if (V.areAnalyzedReductionVals(VL)) {

11344

(void)AdjustReducedVals(/*IgnoreVL=*/true);

11345

continue;

11346

}

11347

// Early exit if any of the reduction values were deleted during

11348

// previous vectorization attempts.

11349

if (any_of(VL, [&V](Value *RedVal) {

11350

auto *RedValI = dyn_cast<Instruction>(RedVal);

11351

if (!RedValI)

11352

return false;

11353

return V.isDeleted(RedValI);

11354

}))

11355

break;

11356

V.buildTree(VL, IgnoreList);

11357

if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {

11358

if (!AdjustReducedVals())

11359

V.analyzedReductionVals(VL);

11360

continue;

11361

}

11362

if (V.isLoadCombineReductionCandidate(RdxKind)) {

11363

if (!AdjustReducedVals())

11364

V.analyzedReductionVals(VL);

11365

continue;

11366

}

11367

V.reorderTopToBottom();

11368

// No need to reorder the root node at all.

11369

V.reorderBottomToTop(/*IgnoreReorder=*/true);

11370

// Keep extracted other reduction values, if they are used in the

11371

// vectorization trees.

11372

BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(

11373

ExternallyUsedValues);

11374

for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {

11375

if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))

11376

continue;

11377

for_each(ReducedVals[Cnt],

11378

[&LocalExternallyUsedValues, &TrackedVals](Value *V) {

11379

if (isa<Instruction>(V))

11380

LocalExternallyUsedValues[TrackedVals[V]];

11381

});

11382

}

11383

// Number of uses of the candidates in the vector of values.

11384

SmallDenseMap<Value *, unsigned> NumUses;

11385

for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {

11386

Value *V = Candidates[Cnt];

11387

if (NumUses.count(V) > 0)

11388

continue;

11389

NumUses[V] = std::count(VL.begin(), VL.end(), V);

11390

}

11391

for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {

11392

Value *V = Candidates[Cnt];

11393

if (NumUses.count(V) > 0)

11394

continue;

11395

NumUses[V] = std::count(VL.begin(), VL.end(), V);

11396

}

11397

// Gather externally used values.

11398

SmallPtrSet<Value *, 4> Visited;

11399

for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {

11400

Value *V = Candidates[Cnt];

11401

if (!Visited.insert(V).second)

11402

continue;

11403

unsigned NumOps = VectorizedVals.lookup(V) + NumUses[V];

11404

if (NumOps != ReducedValsToOps.find(V)->second.size())

11405

LocalExternallyUsedValues[V];

11406

}

11407

for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {

11408

Value *V = Candidates[Cnt];

11409

if (!Visited.insert(V).second)

11410

continue;

11411

unsigned NumOps = VectorizedVals.lookup(V) + NumUses[V];

11412

if (NumOps != ReducedValsToOps.find(V)->second.size())

11413

LocalExternallyUsedValues[V];

11414

}

11415

V.buildExternalUses(LocalExternallyUsedValues);

11416

11417

V.computeMinimumValueSizes();

11418

11419

// Intersect the fast-math-flags from all reduction operations.

11420

FastMathFlags RdxFMF;

11421

RdxFMF.set();

11422

for (Value *U : IgnoreList)

11423

if (auto *FPMO = dyn_cast<FPMathOperator>(U))

11424

RdxFMF &= FPMO->getFastMathFlags();

11425

// Estimate cost.

11426

InstructionCost TreeCost = V.getTreeCost(VL);

11427

InstructionCost ReductionCost =

11428

getReductionCost(TTI, VL, ReduxWidth, RdxFMF);

11429

InstructionCost Cost = TreeCost + ReductionCost;

11430

LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Found cost = " << Cost
<< " for reduction\n"; } } while (false);

11431

if (!Cost.isValid()) {

11432

return nullptr;

11433

}

11434

if (Cost >= -SLPCostThreshold) {

11435

V.getORE()->emit([&]() {

11436

return OptimizationRemarkMissed(

11437

SV_NAME"slp-vectorizer", "HorSLPNotBeneficial",

11438

ReducedValsToOps.find(VL[0])->second.front())

11439

<< "Vectorizing horizontal reduction is possible "

11440

<< "but not beneficial with cost " << ore::NV("Cost", Cost)

11441

<< " and threshold "

11442

<< ore::NV("Threshold", -SLPCostThreshold);

11443

});

11444

if (!AdjustReducedVals())

11445

V.analyzedReductionVals(VL);

11446

continue;

11447

}

11448

11449

LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
<< Cost << ". (HorRdx)\n"; } } while (false)

11450

<< Cost << ". (HorRdx)\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
<< Cost << ". (HorRdx)\n"; } } while (false);

11451

V.getORE()->emit([&]() {

11452

return OptimizationRemark(

11453

SV_NAME"slp-vectorizer", "VectorizedHorizontalReduction",

11454

ReducedValsToOps.find(VL[0])->second.front())

11455

<< "Vectorized horizontal reduction with cost "

11456

<< ore::NV("Cost", Cost) << " and with tree size "

11457

<< ore::NV("TreeSize", V.getTreeSize());

11458

});

11459

11460

Builder.setFastMathFlags(RdxFMF);

11461

11462

// Vectorize a tree.

11463

Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues);

11464

11465

// Emit a reduction. If the root is a select (min/max idiom), the insert

11466

// point is the compare condition of that select.

11467

Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);

11468

if (IsCmpSelMinMax)

11469

Builder.SetInsertPoint(GetCmpForMinMaxReduction(RdxRootInst));

11470

else

11471

Builder.SetInsertPoint(RdxRootInst);

11472

11473

// To prevent poison from leaking across what used to be sequential,

11474

// safe, scalar boolean logic operations, the reduction operand must be

11475

// frozen.

11476

if (isBoolLogicOp(RdxRootInst))

11477

VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);

11478

11479

Value *ReducedSubTree =

11480

emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);

11481

11482

if (!VectorizedTree) {

11483

// Initialize the final value in the reduction.

11484

VectorizedTree = ReducedSubTree;

11485

} else {

11486

// Update the final value in the reduction.

11487

Builder.SetCurrentDebugLocation(

11488

cast<Instruction>(ReductionOps.front().front())->getDebugLoc());

11489

VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,

11490

ReducedSubTree, "op.rdx", ReductionOps);

11491

}

11492

// Count vectorized reduced values to exclude them from final reduction.

11493

for (Value *V : VL)

11494

++VectorizedVals.try_emplace(TrackedToOrig.find(V)->second, 0)

11495

.first->getSecond();

11496

Pos += ReduxWidth;

11497

Start = Pos;

11498

ReduxWidth = PowerOf2Floor(NumReducedVals - Pos);

11499

}

11500

}

11501

if (VectorizedTree) {

11502

// Reorder operands of bool logical op in the natural order to avoid

11503

// possible problem with poison propagation. If not possible to reorder

11504

// (both operands are originally RHS), emit an extra freeze instruction

11505

// for the LHS operand.

11506

//I.e., if we have original code like this:

11507

// RedOp1 = select i1 ?, i1 LHS, i1 false

11508

// RedOp2 = select i1 RHS, i1 ?, i1 false

11509

11510

// Then, we swap LHS/RHS to create a new op that matches the poison

11511

// semantics of the original code.

11512

11513

// If we have original code like this and both values could be poison:

11514

// RedOp1 = select i1 ?, i1 LHS, i1 false

11515

// RedOp2 = select i1 ?, i1 RHS, i1 false

11516

11517

// Then, we must freeze LHS in the new op.

11518

auto &&FixBoolLogicalOps =

11519

[&Builder, VectorizedTree](Value *&LHS, Value *&RHS,

11520

Instruction *RedOp1, Instruction *RedOp2) {

11521

if (!isBoolLogicOp(RedOp1))

11522

return;

11523

if (LHS == VectorizedTree || getRdxOperand(RedOp1, 0) == LHS ||

11524

isGuaranteedNotToBePoison(LHS))

11525

return;

11526

if (!isBoolLogicOp(RedOp2))

11527

return;

11528

if (RHS == VectorizedTree || getRdxOperand(RedOp2, 0) == RHS ||

11529

isGuaranteedNotToBePoison(RHS)) {

11530

std::swap(LHS, RHS);

11531

return;

11532

}

11533

LHS = Builder.CreateFreeze(LHS);

11534

};

11535

// Finish the reduction.

11536

// Need to add extra arguments and not vectorized possible reduction

11537

// values.

11538

// Try to avoid dependencies between the scalar remainders after

11539

// reductions.

11540

auto &&FinalGen =

11541

[this, &Builder, &TrackedVals, &FixBoolLogicalOps](

11542

ArrayRef<std::pair<Instruction *, Value *>> InstVals) {

11543

unsigned Sz = InstVals.size();

11544

SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +

11545

Sz % 2);

11546

for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {

11547

Instruction *RedOp = InstVals[I + 1].first;

11548

Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());

11549

Value *RdxVal1 = InstVals[I].second;

11550

Value *StableRdxVal1 = RdxVal1;

11551

auto It1 = TrackedVals.find(RdxVal1);

11552

if (It1 != TrackedVals.end())

11553

StableRdxVal1 = It1->second;

11554

Value *RdxVal2 = InstVals[I + 1].second;

11555

Value *StableRdxVal2 = RdxVal2;

11556

auto It2 = TrackedVals.find(RdxVal2);

11557

if (It2 != TrackedVals.end())

11558

StableRdxVal2 = It2->second;

11559

// To prevent poison from leaking across what used to be

11560

// sequential, safe, scalar boolean logic operations, the

11561

// reduction operand must be frozen.

11562

FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,

11563

RedOp);

11564

Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,

11565

StableRdxVal2, "op.rdx", ReductionOps);

11566

ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);

11567

}

11568

if (Sz % 2 == 1)

11569

ExtraReds[Sz / 2] = InstVals.back();

11570

return ExtraReds;

11571

};

11572

SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;

11573

ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),

11574

VectorizedTree);

11575

SmallPtrSet<Value *, 8> Visited;

11576

for (ArrayRef<Value *> Candidates : ReducedVals) {

11577

for (Value *RdxVal : Candidates) {

11578

if (!Visited.insert(RdxVal).second)

11579

continue;

11580

unsigned NumOps = VectorizedVals.lookup(RdxVal);

11581

for (Instruction *RedOp :

11582

makeArrayRef(ReducedValsToOps.find(RdxVal)->second)

11583

.drop_back(NumOps))

11584

ExtraReductions.emplace_back(RedOp, RdxVal);

11585

}

11586

}

11587

for (auto &Pair : ExternallyUsedValues) {

11588

// Add each externally used value to the final reduction.

11589

for (auto *I : Pair.second)

11590

ExtraReductions.emplace_back(I, Pair.first);

11591

}

11592

// Iterate through all not-vectorized reduction values/extra arguments.

11593

while (ExtraReductions.size() > 1) {

11594

VectorizedTree = ExtraReductions.front().second;

11595

SmallVector<std::pair<Instruction *, Value *>> NewReds =

11596

FinalGen(ExtraReductions);

11597

ExtraReductions.swap(NewReds);

11598

}

11599

VectorizedTree = ExtraReductions.front().second;

11600

11601

ReductionRoot->replaceAllUsesWith(VectorizedTree);

11602

11603

// The original scalar reduction is expected to have no remaining

11604

// uses outside the reduction tree itself. Assert that we got this

11605

// correct, replace internal uses with undef, and mark for eventual

11606

// deletion.

11607

#ifndef NDEBUG

11608

SmallSet<Value *, 4> IgnoreSet;

11609

for (ArrayRef<Value *> RdxOps : ReductionOps)

11610

IgnoreSet.insert(RdxOps.begin(), RdxOps.end());

11611

#endif

11612

for (ArrayRef<Value *> RdxOps : ReductionOps) {

11613

for (Value *Ignore : RdxOps) {

11614

if (!Ignore)

11615

continue;

11616

#ifndef NDEBUG

11617

for (auto *U : Ignore->users()) {

11618

assert(IgnoreSet.count(U) &&(static_cast <bool> (IgnoreSet.count(U) && "All users must be either in the reduction ops list."
) ? void (0) : __assert_fail ("IgnoreSet.count(U) && \"All users must be either in the reduction ops list.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11619, __extension__
__PRETTY_FUNCTION__))

11619

"All users must be either in the reduction ops list.")(static_cast <bool> (IgnoreSet.count(U) && "All users must be either in the reduction ops list."
) ? void (0) : __assert_fail ("IgnoreSet.count(U) && \"All users must be either in the reduction ops list.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11619, __extension__
__PRETTY_FUNCTION__));

11620

}

11621

#endif

11622

if (!Ignore->use_empty()) {

11623

Value *Undef = UndefValue::get(Ignore->getType());

11624

Ignore->replaceAllUsesWith(Undef);

11625

}

11626

V.eraseInstruction(cast<Instruction>(Ignore));

11627

}

11628

}

11629

} else if (!CheckForReusedReductionOps) {

11630

for (ReductionOpsType &RdxOps : ReductionOps)

11631

for (Value *RdxOp : RdxOps)

11632

V.analyzedReductionRoot(cast<Instruction>(RdxOp));

11633

}

11634

return VectorizedTree;

11635

}

11636

11637

private:

11638

/// Calculate the cost of a reduction.

11639

InstructionCost getReductionCost(TargetTransformInfo *TTI,

11640

ArrayRef<Value *> ReducedVals,

11641

unsigned ReduxWidth, FastMathFlags FMF) {

11642

TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

11643

Value *FirstReducedVal = ReducedVals.front();

11644

Type *ScalarTy = FirstReducedVal->getType();

11645

FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);

11646

InstructionCost VectorCost = 0, ScalarCost;

11647

// If all of the reduced values are constant, the vector cost is 0, since

11648

// the reduction value can be calculated at the compile time.

11649

bool AllConsts = all_of(ReducedVals, isConstant);

11650

switch (RdxKind) {

11651

case RecurKind::Add:

11652

case RecurKind::Mul:

11653

case RecurKind::Or:

11654

case RecurKind::And:

11655

case RecurKind::Xor:

11656

case RecurKind::FAdd:

11657

case RecurKind::FMul: {

11658

unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);

11659

if (!AllConsts)

11660

VectorCost =

11661

TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);

11662

ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);

11663

break;

11664

}

11665

case RecurKind::FMax:

11666

case RecurKind::FMin: {

11667

auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);

11668

if (!AllConsts) {

11669

auto *VecCondTy =

11670

cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));

11671

VectorCost =

11672

TTI->getMinMaxReductionCost(VectorTy, VecCondTy,

11673

/*IsUnsigned=*/false, CostKind);

11674

}

11675

CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);

11676

ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,

11677

SclCondTy, RdxPred, CostKind) +

11678

TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,

11679

SclCondTy, RdxPred, CostKind);

11680

break;

11681

}

11682

case RecurKind::SMax:

11683

case RecurKind::SMin:

11684

case RecurKind::UMax:

11685

case RecurKind::UMin: {

11686

auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);

11687

if (!AllConsts) {

11688

auto *VecCondTy =

11689

cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));

11690

bool IsUnsigned =

11691

RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;

11692

VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,

11693

IsUnsigned, CostKind);

11694

}

11695

CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);

11696

ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy,

11697

SclCondTy, RdxPred, CostKind) +

11698

TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,

11699

SclCondTy, RdxPred, CostKind);

11700

break;

11701

}

11702

default:

11703

llvm_unreachable("Expected arithmetic or min/max reduction operation")::llvm::llvm_unreachable_internal("Expected arithmetic or min/max reduction operation"
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11703);

11704

}

11705

11706

// Scalar cost is repeated for N-1 elements.

11707

ScalarCost *= (ReduxWidth - 1);

11708

LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCostdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost " << VectorCost
- ScalarCost << " for reduction that starts with " <<
*FirstReducedVal << " (It is a splitting reduction)\n"
; } } while (false)

11709

<< " for reduction that starts with " << *FirstReducedValdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost " << VectorCost
- ScalarCost << " for reduction that starts with " <<
*FirstReducedVal << " (It is a splitting reduction)\n"
; } } while (false)

11710

<< " (It is a splitting reduction)\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Adding cost " << VectorCost
- ScalarCost << " for reduction that starts with " <<
*FirstReducedVal << " (It is a splitting reduction)\n"
; } } while (false);

11711

return VectorCost - ScalarCost;

11712

}

11713

11714

/// Emit a horizontal reduction of the vectorized value.

11715

Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,

11716

unsigned ReduxWidth, const TargetTransformInfo *TTI) {

11717

assert(VectorizedValue && "Need to have a vectorized tree node")(static_cast <bool> (VectorizedValue && "Need to have a vectorized tree node"
) ? void (0) : __assert_fail ("VectorizedValue && \"Need to have a vectorized tree node\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11717, __extension__
__PRETTY_FUNCTION__));

11718

assert(isPowerOf2_32(ReduxWidth) &&(static_cast <bool> (isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now") ? void (0)
: __assert_fail ("isPowerOf2_32(ReduxWidth) && \"We only handle power-of-two reductions for now\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11719, __extension__
__PRETTY_FUNCTION__))

11719

"We only handle power-of-two reductions for now")(static_cast <bool> (isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now") ? void (0)
: __assert_fail ("isPowerOf2_32(ReduxWidth) && \"We only handle power-of-two reductions for now\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11719, __extension__
__PRETTY_FUNCTION__));

11720

assert(RdxKind != RecurKind::FMulAdd &&(static_cast <bool> (RdxKind != RecurKind::FMulAdd &&
"A call to the llvm.fmuladd intrinsic is not handled yet") ?
void (0) : __assert_fail ("RdxKind != RecurKind::FMulAdd && \"A call to the llvm.fmuladd intrinsic is not handled yet\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11721, __extension__
__PRETTY_FUNCTION__))

11721

"A call to the llvm.fmuladd intrinsic is not handled yet")(static_cast <bool> (RdxKind != RecurKind::FMulAdd &&
"A call to the llvm.fmuladd intrinsic is not handled yet") ?
void (0) : __assert_fail ("RdxKind != RecurKind::FMulAdd && \"A call to the llvm.fmuladd intrinsic is not handled yet\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11721, __extension__
__PRETTY_FUNCTION__));

11722

11723

++NumVectorInstructions;

11724

return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind);

11725

}

11726

};

11727

11728

} // end anonymous namespace

11729

11730

/// Count the scalar slots of the aggregate (or fixed vector) produced by
/// \p InsertInst.
///
/// For an insertelement this is the element count of its fixed vector type.
/// For an insertvalue the result type is peeled level by level, always
/// descending into the first element, while multiplying the element counts
/// together.
///
/// \returns None when a struct level has differing element types or when the
/// innermost type is neither a fixed vector nor a single-value type.
static Optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  if (auto *InsElt = dyn_cast<InsertElementInst>(InsertInst))
    return cast<FixedVectorType>(InsElt->getType())->getNumElements();

  // Anything else has to be an insertvalue; cast<> enforces that.
  Type *LevelTy = cast<InsertValueInst>(InsertInst)->getType();
  unsigned NumSlots = 1;
  while (true) {
    if (auto *StructTy = dyn_cast<StructType>(LevelTy)) {
      // Every member must have the type of member 0 (homogeneity check).
      bool HasMismatch = llvm::any_of(StructTy->elements(), [StructTy](Type *Elt) {
        return Elt != StructTy->getElementType(0);
      });
      if (HasMismatch)
        return None;
      NumSlots *= StructTy->getNumElements();
      LevelTy = StructTy->getElementType(0);
      continue;
    }
    if (auto *ArrayTy = dyn_cast<ArrayType>(LevelTy)) {
      NumSlots *= ArrayTy->getNumElements();
      LevelTy = ArrayTy->getElementType();
      continue;
    }
    // A fixed vector is a leaf that still contributes its lanes.
    if (auto *VecTy = dyn_cast<FixedVectorType>(LevelTy))
      return NumSlots * VecTy->getNumElements();
    // Any other single-value type is a leaf contributing one slot.
    if (LevelTy->isSingleValueType())
      return NumSlots;
    return None;
  }
}

11757

11758

static void findBuildAggregate_rec(Instruction *LastInsertInst,

11759

TargetTransformInfo *TTI,

11760

SmallVectorImpl<Value *> &BuildVectorOpds,

11761

SmallVectorImpl<Value *> &InsertElts,

11762

unsigned OperandOffset) {

11763

do {

11764

Value *InsertedOperand = LastInsertInst->getOperand(1);

11765

Optional<unsigned> OperandIndex =

11766

getInsertIndex(LastInsertInst, OperandOffset);

11767

if (!OperandIndex)

11768

return;

11769

if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {

11770

findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,

11771

BuildVectorOpds, InsertElts, *OperandIndex);

11772

11773

} else {

11774

BuildVectorOpds[*OperandIndex] = InsertedOperand;

11775

InsertElts[*OperandIndex] = LastInsertInst;

11776

}

11777

LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));

11778

} while (LastInsertInst != nullptr &&

11779

isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&

11780

LastInsertInst->hasOneUse());

11781

}

11782

11783

/// Recognize construction of vectors like

11784

/// %ra = insertelement <4 x float> poison, float %s0, i32 0

11785

/// %rb = insertelement <4 x float> %ra, float %s1, i32 1

11786

/// %rc = insertelement <4 x float> %rb, float %s2, i32 2

11787

/// %rd = insertelement <4 x float> %rc, float %s3, i32 3

11788

/// starting from the last insertelement or insertvalue instruction.

11789

///

11790

/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},

11791

/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.

11792

/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.

11793

///

11794

/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.

11795

///

11796

/// \return true if it matches.

11797

static bool findBuildAggregate(Instruction *LastInsertInst,

11798

TargetTransformInfo *TTI,

11799

SmallVectorImpl<Value *> &BuildVectorOpds,

11800

SmallVectorImpl<Value *> &InsertElts) {

11801

11802

assert((isa<InsertElementInst>(LastInsertInst) ||(static_cast <bool> ((isa<InsertElementInst>(LastInsertInst
) || isa<InsertValueInst>(LastInsertInst)) && "Expected insertelement or insertvalue instruction!"
) ? void (0) : __assert_fail ("(isa<InsertElementInst>(LastInsertInst) || isa<InsertValueInst>(LastInsertInst)) && \"Expected insertelement or insertvalue instruction!\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11804, __extension__
__PRETTY_FUNCTION__))

11803

isa<InsertValueInst>(LastInsertInst)) &&(static_cast <bool> ((isa<InsertElementInst>(LastInsertInst
) || isa<InsertValueInst>(LastInsertInst)) && "Expected insertelement or insertvalue instruction!"
) ? void (0) : __assert_fail ("(isa<InsertElementInst>(LastInsertInst) || isa<InsertValueInst>(LastInsertInst)) && \"Expected insertelement or insertvalue instruction!\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11804, __extension__
__PRETTY_FUNCTION__))

11804

"Expected insertelement or insertvalue instruction!")(static_cast <bool> ((isa<InsertElementInst>(LastInsertInst
) || isa<InsertValueInst>(LastInsertInst)) && "Expected insertelement or insertvalue instruction!"
) ? void (0) : __assert_fail ("(isa<InsertElementInst>(LastInsertInst) || isa<InsertValueInst>(LastInsertInst)) && \"Expected insertelement or insertvalue instruction!\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11804, __extension__
__PRETTY_FUNCTION__));

11805

11806

assert((BuildVectorOpds.empty() && InsertElts.empty()) &&(static_cast <bool> ((BuildVectorOpds.empty() &&
InsertElts.empty()) && "Expected empty result vectors!"
) ? void (0) : __assert_fail ("(BuildVectorOpds.empty() && InsertElts.empty()) && \"Expected empty result vectors!\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11807, __extension__
__PRETTY_FUNCTION__))

11807

"Expected empty result vectors!")(static_cast <bool> ((BuildVectorOpds.empty() &&
InsertElts.empty()) && "Expected empty result vectors!"
) ? void (0) : __assert_fail ("(BuildVectorOpds.empty() && InsertElts.empty()) && \"Expected empty result vectors!\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 11807, __extension__
__PRETTY_FUNCTION__));

11808

11809

Optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);

11810

if (!AggregateSize)

11811

return false;

11812

BuildVectorOpds.resize(*AggregateSize);

11813

InsertElts.resize(*AggregateSize);

11814

11815

findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);

11816

llvm::erase_value(BuildVectorOpds, nullptr);

11817

llvm::erase_value(InsertElts, nullptr);

11818

if (BuildVectorOpds.size() >= 2)

11819

return true;

11820

11821

return false;

11822

}

11823

11824

/// Try and get a reduction value from a phi node.

11825

///

11826

/// Given a phi node \p P in a block \p ParentBB, consider possible reductions

11827

/// if they come from either \p ParentBB or a containing loop latch.

11828

///

11829

/// \returns A candidate reduction value if possible, or \code nullptr \endcode

11830

/// if not possible.

11831

static Value *getReductionValue(const DominatorTree *DT, PHINode *P,

11832

BasicBlock *ParentBB, LoopInfo *LI) {

11833

// There are situations where the reduction value is not dominated by the

11834

// reduction phi. Vectorizing such cases has been reported to cause

11835

// miscompiles. See PR25787.

11836

auto DominatedReduxValue = [&](Value *R) {

11837

return isa<Instruction>(R) &&

11838

DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());

11839

};

11840

11841

Value *Rdx = nullptr;

11842

11843

// Return the incoming value if it comes from the same BB as the phi node.

11844

if (P->getIncomingBlock(0) == ParentBB) {

11845

Rdx = P->getIncomingValue(0);

11846

} else if (P->getIncomingBlock(1) == ParentBB) {

11847

Rdx = P->getIncomingValue(1);

11848

}

11849

11850

if (Rdx && DominatedReduxValue(Rdx))

11851

return Rdx;

11852

11853

// Otherwise, check whether we have a loop latch to look at.

11854

Loop *BBL = LI->getLoopFor(ParentBB);

11855

if (!BBL)

11856

return nullptr;

11857

BasicBlock *BBLatch = BBL->getLoopLatch();

11858

if (!BBLatch)

11859

return nullptr;

11860

11861

// There is a loop latch, return the incoming value if it comes from

11862

// that. This reduction pattern occasionally turns up.

11863

if (P->getIncomingBlock(0) == BBLatch) {

11864

Rdx = P->getIncomingValue(0);

11865

} else if (P->getIncomingBlock(1) == BBLatch) {

11866

Rdx = P->getIncomingValue(1);

11867

}

11868

11869

if (Rdx && DominatedReduxValue(Rdx))

11870

return Rdx;

11871

11872

return nullptr;

11873

}

11874

11875

/// Match \p I as a two-operand reduction operation and bind its operands.
///
/// Accepted forms are any binary operator and calls to the maxnum, minnum,
/// smax, smin, umax and umin intrinsics, tried in that order.
///
/// \param V0 receives the first operand on a match.
/// \param V1 receives the second operand on a match.
/// \returns true if one of the patterns matched.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
  return match(I, m_BinOp(m_Value(V0), m_Value(V1))) ||
         match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))) ||
         match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))) ||
         match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))) ||
         match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))) ||
         match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))) ||
         match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1)));
}

11892

11893

bool SLPVectorizerPass::vectorizeHorReduction(

11894

PHINode *P, Value *V, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI,

11895

SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {

11896

if (!ShouldVectorizeHor)

11897

return false;

11898

11899

auto *Root = dyn_cast_or_null<Instruction>(V);

11900

if (!Root)

11901

return false;

11902

11903

if (!isa<BinaryOperator>(Root))

11904

P = nullptr;

11905

11906

if (Root->getParent() != BB || isa<PHINode>(Root))

11907

return false;

11908

// Start analysis starting from Root instruction. If horizontal reduction is

11909

// found, try to vectorize it. If it is not a horizontal reduction or

11910

// vectorization is not possible or not effective, and currently analyzed

11911

// instruction is a binary operation, try to vectorize the operands, using

11912

// pre-order DFS traversal order. If the operands were not vectorized, repeat

11913

// the same procedure considering each operand as a possible root of the

11914

// horizontal reduction.

11915

// Interrupt the process if the Root instruction itself was vectorized or all

11916

// sub-trees not higher that RecursionMaxDepth were analyzed/vectorized.

11917

// If a horizintal reduction was not matched or vectorized we collect

11918

// instructions for possible later attempts for vectorization.

11919

std::queue<std::pair<Instruction *, unsigned>> Stack;

11920

Stack.emplace(Root, 0);

11921

SmallPtrSet<Value *, 8> VisitedInstrs;

11922

bool Res = false;

11923

auto &&TryToReduce = [this, TTI, &P, &R](Instruction *Inst, Value *&B0,

11924

Value *&B1) -> Value * {

11925

if (R.isAnalyzedReductionRoot(Inst))

11926

return nullptr;

11927

bool IsBinop = matchRdxBop(Inst, B0, B1);

11928

bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));

11929

if (IsBinop || IsSelect) {

11930

HorizontalReduction HorRdx;

11931

if (HorRdx.matchAssociativeReduction(P, Inst, *SE, *DL, *TLI))

11932

return HorRdx.tryToReduce(R, TTI);

11933

}

11934

return nullptr;

11935

};

11936

while (!Stack.empty()) {

11937

Instruction *Inst;

11938

unsigned Level;

11939

std::tie(Inst, Level) = Stack.front();

11940

Stack.pop();

11941

// Do not try to analyze instruction that has already been vectorized.

11942

// This may happen when we vectorize instruction operands on a previous

11943

// iteration while stack was populated before that happened.

11944

if (R.isDeleted(Inst))

11945

continue;

11946

Value *B0 = nullptr, *B1 = nullptr;

11947

if (Value *V = TryToReduce(Inst, B0, B1)) {

11948

Res = true;

11949

// Set P to nullptr to avoid re-analysis of phi node in

11950

// matchAssociativeReduction function unless this is the root node.

11951

P = nullptr;

11952

if (auto *I = dyn_cast<Instruction>(V)) {

11953

// Try to find another reduction.

11954

Stack.emplace(I, Level);

11955

continue;

11956

}

11957

} else {

11958

bool IsBinop = B0 && B1;

11959

if (P && IsBinop) {

11960

Inst = dyn_cast<Instruction>(B0);

11961

if (Inst == P)

11962

Inst = dyn_cast<Instruction>(B1);

11963

if (!Inst) {

11964

// Set P to nullptr to avoid re-analysis of phi node in

11965

// matchAssociativeReduction function unless this is the root node.

11966

P = nullptr;

11967

continue;

11968

}

11969

}

11970

// Set P to nullptr to avoid re-analysis of phi node in

11971

// matchAssociativeReduction function unless this is the root node.

11972

P = nullptr;

11973

// Do not collect CmpInst or InsertElementInst/InsertValueInst as their

11974

// analysis is done separately.

11975

if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Inst))

11976

PostponedInsts.push_back(Inst);

11977

}

11978

11979

// Try to vectorize operands.

11980

// Continue analysis for the instruction from the same basic block only to

11981

// save compile time.

11982

if (++Level < RecursionMaxDepth)

11983

for (auto *Op : Inst->operand_values())

11984

if (VisitedInstrs.insert(Op).second)

11985

if (auto *I = dyn_cast<Instruction>(Op))

11986

// Do not try to vectorize CmpInst operands, this is done

11987

// separately.

11988

if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&

11989

!R.isDeleted(I) && I->getParent() == BB)

11990

Stack.emplace(I, Level);

11991

}

11992

return Res;

11993

}

11994

11995

bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,

11996

BasicBlock *BB, BoUpSLP &R,

11997

TargetTransformInfo *TTI) {

11998

SmallVector<WeakTrackingVH> PostponedInsts;

11999

bool Res = vectorizeHorReduction(P, V, BB, R, TTI, PostponedInsts);

12000

Res |= tryToVectorize(PostponedInsts, R);

12001

return Res;

12002

}

12003

12004

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,

12005

BoUpSLP &R) {

12006

bool Res = false;

12007

for (Value *V : Insts)

12008

if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))

12009

Res |= tryToVectorize(Inst, R);

12010

return Res;

12011

}

12012

12013

bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,

12014

BasicBlock *BB, BoUpSLP &R) {

12015

const DataLayout &DL = BB->getModule()->getDataLayout();

12016

if (!R.canMapToVector(IVI->getType(), DL))

12017

return false;

12018

12019

SmallVector<Value *, 16> BuildVectorOpds;

12020

SmallVector<Value *, 16> BuildVectorInsts;

12021

if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))

12022

return false;

12023

12024

LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: array mappable to vector: " <<
*IVI << "\n"; } } while (false);

12025

// Aggregate value is unlikely to be processed in vector register.

12026

return tryToVectorizeList(BuildVectorOpds, R);

12027

}

12028

12029

bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,

12030

BasicBlock *BB, BoUpSLP &R) {

12031

SmallVector<Value *, 16> BuildVectorInsts;

12032

SmallVector<Value *, 16> BuildVectorOpds;

12033

SmallVector<int> Mask;

12034

if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||

12035

(llvm::all_of(

12036

BuildVectorOpds,

12037

[](Value *V) { return isa<ExtractElementInst, UndefValue>(V); }) &&

12038

isFixedVectorShuffle(BuildVectorOpds, Mask)))

12039

return false;

12040

12041

LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: array mappable to vector: " <<
*IEI << "\n"; } } while (false);

12042

return tryToVectorizeList(BuildVectorInsts, R);

12043

}

12044

12045

template <typename T>

12046

static bool

12047

tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,

12048

function_ref<unsigned(T *)> Limit,

12049

function_ref<bool(T *, T *)> Comparator,

12050

function_ref<bool(T *, T *)> AreCompatible,

12051

function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,

12052

bool LimitForRegisterSize) {

12053

bool Changed = false;

12054

// Sort by type, parent, operands.

12055

stable_sort(Incoming, Comparator);

12056

12057

// Try to vectorize elements base on their type.

12058

SmallVector<T *> Candidates;

12059

for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) {

12060

// Look for the next elements with the same type, parent and operand

12061

// kinds.

12062

auto *SameTypeIt = IncIt;

12063

while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))

12064

++SameTypeIt;

12065

12066

// Try to vectorize them.

12067

unsigned NumElts = (SameTypeIt - IncIt);

12068

LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Trying to vectorize starting at nodes ("
<< NumElts << ")\n"; } } while (false)

12069

<< NumElts << ")\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("SLP")) { dbgs() << "SLP: Trying to vectorize starting at nodes ("
<< NumElts << ")\n"; } } while (false);

12070

// The vectorization is a 3-state attempt:

12071

// 1. Try to vectorize instructions with the same/alternate opcodes with the

12072

// size of maximal register at first.

12073

// 2. Try to vectorize remaining instructions with the same type, if

12074

// possible. This may result in the better vectorization results rather than

12075

// if we try just to vectorize instructions with the same/alternate opcodes.

12076

// 3. Final attempt to try to vectorize all instructions with the

12077

// same/alternate ops only, this may result in some extra final

12078

// vectorization.

12079

if (NumElts > 1 &&

12080

TryToVectorizeHelper(makeArrayRef(IncIt, NumElts), LimitForRegisterSize)) {

12081

// Success start over because instructions might have been changed.

12082

Changed = true;

12083

} else if (NumElts < Limit(*IncIt) &&

12084

(Candidates.empty() ||

12085

Candidates.front()->getType() == (*IncIt)->getType())) {

12086

Candidates.append(IncIt, std::next(IncIt, NumElts));

12087

}

12088

// Final attempt to vectorize instructions with the same types.

12089

if (Candidates.size() > 1 &&

12090

(SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {

12091

if (TryToVectorizeHelper(Candidates, /*LimitForRegisterSize=*/false)) {

12092

// Success start over because instructions might have been changed.

12093

Changed = true;

12094

} else if (LimitForRegisterSize) {

12095

// Try to vectorize using small vectors.

12096

for (auto *It = Candidates.begin(), *End = Candidates.end();

12097

It != End;) {

12098

auto *SameTypeIt = It;

12099

while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))

12100

++SameTypeIt;

12101

unsigned NumElts = (SameTypeIt - It);

12102

if (NumElts > 1 && TryToVectorizeHelper(makeArrayRef(It, NumElts),

12103

/*LimitForRegisterSize=*/false))

12104

Changed = true;

12105

It = SameTypeIt;

12106

}

12107

}

12108

Candidates.clear();

12109

}

12110

12111

// Start over at the next instruction of a different type (or the end).

12112

IncIt = SameTypeIt;

12113

}

12114

return Changed;

12115

}

12116

12117

/// Compare two cmp instructions. If IsCompatibility is true, function returns

12118

/// true if 2 cmps have same/swapped predicates and mos compatible corresponding

12119

/// operands. If IsCompatibility is false, function implements strict weak

12120

/// ordering relation between two cmp instructions, returning true if the first

12121

/// instruction is "less" than the second, i.e. its predicate is less than the

12122

/// predicate of the second or the operands IDs are less than the operands IDs

12123

/// of the second cmp instruction.

12124

template <bool IsCompatibility>

12125

static bool compareCmp(Value *V, Value *V2,

12126

function_ref<bool(Instruction *)> IsDeleted) {

12127

auto *CI1 = cast<CmpInst>(V);

12128

auto *CI2 = cast<CmpInst>(V2);

12129

if (IsDeleted(CI2) || !isValidElementType(CI2->getType()))

12130

return false;

12131

if (CI1->getOperand(0)->getType()->getTypeID() <

12132

CI2->getOperand(0)->getType()->getTypeID())

12133

return !IsCompatibility;

12134

if (CI1->getOperand(0)->getType()->getTypeID() >

12135

CI2->getOperand(0)->getType()->getTypeID())

12136

return false;

12137

CmpInst::Predicate Pred1 = CI1->getPredicate();

12138

CmpInst::Predicate Pred2 = CI2->getPredicate();

12139

CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);

12140

CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);

12141

CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);

12142

CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);

12143

if (BasePred1 < BasePred2)

12144

return !IsCompatibility;

12145

if (BasePred1 > BasePred2)

12146

return false;

12147

// Compare operands.

12148

bool LEPreds = Pred1 <= Pred2;

12149

bool GEPreds = Pred1 >= Pred2;

12150

for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {

12151

auto *Op1 = CI1->getOperand(LEPreds ? I : E - I - 1);

12152

auto *Op2 = CI2->getOperand(GEPreds ? I : E - I - 1);

12153

if (Op1->getValueID() < Op2->getValueID())

12154

return !IsCompatibility;

12155

if (Op1->getValueID() > Op2->getValueID())

12156

return false;

12157

if (auto *I1 = dyn_cast<Instruction>(Op1))

12158

if (auto *I2 = dyn_cast<Instruction>(Op2)) {

12159

if (I1->getParent() != I2->getParent())

12160

return false;

12161

InstructionsState S = getSameOpcode({I1, I2});

12162

if (S.getOpcode())

12163

continue;

12164

return false;

12165

}

12166

}

12167

return IsCompatibility;

12168

}

12169

12170

bool SLPVectorizerPass::vectorizeSimpleInstructions(InstSetVector &Instructions,

12171

BasicBlock *BB, BoUpSLP &R,

12172

bool AtTerminator) {

12173

bool OpsChanged = false;

12174

SmallVector<Instruction *, 4> PostponedCmps;

12175

SmallVector<WeakTrackingVH> PostponedInsts;

12176

// pass1 - try to vectorize reductions only

12177

for (auto *I : reverse(Instructions)) {

12178

if (R.isDeleted(I))

12179

continue;

12180

if (isa<CmpInst>(I)) {

12181

PostponedCmps.push_back(I);

12182

continue;

12183

}

12184

OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);

12185

}

12186

// pass2 - try to match and vectorize a buildvector sequence.

12187

for (auto *I : reverse(Instructions)) {

12188

if (R.isDeleted(I) || isa<CmpInst>(I))

12189

continue;

12190

if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {

12191

OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);

12192

} else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {

12193

OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);

12194

}

12195

}

12196

// Now try to vectorize postponed instructions.

12197

OpsChanged |= tryToVectorize(PostponedInsts, R);

12198

12199

if (AtTerminator) {

12200

// Try to find reductions first.

12201

for (Instruction *I : PostponedCmps) {

12202

if (R.isDeleted(I))

12203

continue;

12204

for (Value *Op : I->operands())

12205

OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI);

12206

}

12207

// Try to vectorize operands as vector bundles.

12208

for (Instruction *I : PostponedCmps) {

12209

if (R.isDeleted(I))

12210

continue;

12211

OpsChanged |= tryToVectorize(I, R);

12212

}

12213

// Try to vectorize list of compares.

12214

// Sort by type, compare predicate, etc.

12215

auto &&CompareSorter = [&R](Value *V, Value *V2) {

12216

return compareCmp<false>(V, V2,

12217

[&R](Instruction *I) { return R.isDeleted(I); });

12218

};

12219

12220

auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) {

12221

if (V1 == V2)

12222

return true;

12223

return compareCmp<true>(V1, V2,

12224

[&R](Instruction *I) { return R.isDeleted(I); });

12225

};

12226

auto Limit = [&R](Value *V) {

12227

unsigned EltSize = R.getVectorElementSize(V);

12228

return std::max(2U, R.getMaxVecRegSize() / EltSize);

12229

};

12230

12231

SmallVector<Value *> Vals(PostponedCmps.begin(), PostponedCmps.end());

12232

OpsChanged |= tryToVectorizeSequence<Value>(

12233

Vals, Limit, CompareSorter, AreCompatibleCompares,

12234

[this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) {

12235

// Exclude possible reductions from other blocks.

12236

bool ArePossiblyReducedInOtherBlock =

12237

any_of(Candidates, [](Value *V) {

12238

return any_of(V->users(), [V](User *U) {

12239

return isa<SelectInst>(U) &&

12240

cast<SelectInst>(U)->getParent() !=

12241

cast<Instruction>(V)->getParent();

12242

});

12243

});

12244

if (ArePossiblyReducedInOtherBlock)

12245

return false;

12246

return tryToVectorizeList(Candidates, R, LimitForRegisterSize);

12247

},

12248

/*LimitForRegisterSize=*/true);

12249

Instructions.clear();

12250

} else {

12251

Instructions.clear();

12252

// Insert in reverse order since the PostponedCmps vector was filled in

12253

// reverse order.

12254

Instructions.insert(PostponedCmps.rbegin(), PostponedCmps.rend());

12255

}

12256

return OpsChanged;

12257

}

12258

12259

bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {

12260

bool Changed = false;

12261

SmallVector<Value *, 4> Incoming;

12262

SmallPtrSet<Value *, 16> VisitedInstrs;

12263

// Maps phi nodes to the non-phi nodes found in the use tree for each phi

12264

// node. Allows better to identify the chains that can be vectorized in the

12265

// better way.

12266

DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;

12267

auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {

12268

assert(isValidElementType(V1->getType()) &&(static_cast <bool> (isValidElementType(V1->getType(
)) && isValidElementType(V2->getType()) &&
"Expected vectorizable types only.") ? void (0) : __assert_fail
("isValidElementType(V1->getType()) && isValidElementType(V2->getType()) && \"Expected vectorizable types only.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 12270, __extension__
__PRETTY_FUNCTION__))

12269

isValidElementType(V2->getType()) &&(static_cast <bool> (isValidElementType(V1->getType(
)) && isValidElementType(V2->getType()) &&
"Expected vectorizable types only.") ? void (0) : __assert_fail
("isValidElementType(V1->getType()) && isValidElementType(V2->getType()) && \"Expected vectorizable types only.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 12270, __extension__
__PRETTY_FUNCTION__))

12270

"Expected vectorizable types only.")(static_cast <bool> (isValidElementType(V1->getType(
)) && isValidElementType(V2->getType()) &&
"Expected vectorizable types only.") ? void (0) : __assert_fail
("isValidElementType(V1->getType()) && isValidElementType(V2->getType()) && \"Expected vectorizable types only.\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 12270, __extension__
__PRETTY_FUNCTION__));

12271

// It is fine to compare type IDs here, since we expect only vectorizable

12272

// types, like ints, floats and pointers, we don't care about other type.

12273

if (V1->getType()->getTypeID() < V2->getType()->getTypeID())

12274

return true;

12275

if (V1->getType()->getTypeID() > V2->getType()->getTypeID())

12276

return false;

12277

ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];

12278

ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];

12279

if (Opcodes1.size() < Opcodes2.size())

12280

return true;

12281

if (Opcodes1.size() > Opcodes2.size())

12282

return false;

12283

Optional<bool> ConstOrder;

12284

for (int I = 0, E = Opcodes1.size(); I < E; ++I) {

12285

// Undefs are compatible with any other value.

12286

if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) {

12287

if (!ConstOrder)

12288

ConstOrder =

12289

!isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I]);

12290

continue;

12291

}

12292

if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))

12293

if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {

12294

DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());

12295

DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());

12296

if (!NodeI1)

12297

return NodeI2 != nullptr;

12298

if (!NodeI2)

12299

return false;

12300

assert((NodeI1 == NodeI2) ==(static_cast <bool> ((NodeI1 == NodeI2) == (NodeI1->
getDFSNumIn() == NodeI2->getDFSNumIn()) && "Different nodes should have different DFS numbers"
) ? void (0) : __assert_fail ("(NodeI1 == NodeI2) == (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) && \"Different nodes should have different DFS numbers\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 12302, __extension__
__PRETTY_FUNCTION__))

12301

(NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&(static_cast <bool> ((NodeI1 == NodeI2) == (NodeI1->
getDFSNumIn() == NodeI2->getDFSNumIn()) && "Different nodes should have different DFS numbers"
) ? void (0) : __assert_fail ("(NodeI1 == NodeI2) == (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) && \"Different nodes should have different DFS numbers\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 12302, __extension__
__PRETTY_FUNCTION__))

12302

"Different nodes should have different DFS numbers")(static_cast <bool> ((NodeI1 == NodeI2) == (NodeI1->
getDFSNumIn() == NodeI2->getDFSNumIn()) && "Different nodes should have different DFS numbers"
) ? void (0) : __assert_fail ("(NodeI1 == NodeI2) == (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) && \"Different nodes should have different DFS numbers\""
, "llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp", 12302, __extension__
__PRETTY_FUNCTION__));

12303

if (NodeI1 != NodeI2)

12304

return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();

12305

InstructionsState S = getSameOpcode({I1, I2});

12306

if (S.getOpcode())

12307

continue;

12308

return I1->getOpcode() < I2->getOpcode();

12309

}

12310

if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) {

12311

if (!ConstOrder)

12312

ConstOrder = Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID();

12313

continue;

12314

}

12315

if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID())

12316

return true;

12317

if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID())

12318

return false;

12319

}

12320

return ConstOrder && *ConstOrder;

12321

};

12322

auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) {

12323

if (V1 == V2)

12324

return true;

12325

if (V1->getType() != V2->getType())

12326

return false;

12327

ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];

12328

ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];

12329

if (Opcodes1.size() != Opcodes2.size())

12330

return false;

12331

for (int I = 0, E = Opcodes1.size(); I < E; ++I) {

12332

// Undefs are compatible with any other value.

12333

if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))

12334

continue;

12335

if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))

12336

if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {

12337

if (I1->getParent() != I2->getParent())

12338

return false;

12339

InstructionsState S = getSameOpcode({I1, I2});

12340

if (S.getOpcode())

12341

continue;

12342

return false;

12343

}

12344

if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))

12345

continue;

12346

if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())

12347

return false;

12348

}

12349

return true;

12350

};

12351

auto Limit = [&R](Value *V) {

12352

unsigned EltSize = R.getVectorElementSize(V);

12353

return std::max(2U, R.getMaxVecRegSize() / EltSize);

12354

};

12355

12356

bool HaveVectorizedPhiNodes = false;

12357

do {

12358

// Collect the incoming values from the PHIs.

12359

Incoming.clear();

12360

for (Instruction &I : *BB) {

12361

PHINode *P = dyn_cast<PHINode>(&I);

12362

if (!P)

12363

break;

12364

12365

// No need to analyze deleted, vectorized and non-vectorizable

12366

// instructions.

12367

if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&

12368

isValidElementType(P->getType()))

12369

Incoming.push_back(P);

12370

}

12371

12372

// Find the corresponding non-phi nodes for better matching when trying to

12373

// build the tree.

12374

for (Value *V : Incoming) {

12375

SmallVectorImpl<Value *> &Opcodes =

12376

PHIToOpcodes.try_emplace(V).first->getSecond();

12377

if (!Opcodes.empty())

12378

continue;

12379

SmallVector<Value *, 4> Nodes(1, V);

12380

SmallPtrSet<Value *, 4> Visited;

12381

while (!Nodes.empty()) {

12382

auto *PHI = cast<PHINode>(Nodes.pop_back_val());

12383

if (!Visited.insert(PHI).second)

12384

continue;

12385

for (Value *V : PHI->incoming_values()) {

12386

if (auto *PHI1 = dyn_cast<PHINode>((V))) {

12387

Nodes.push_back(PHI1);

12388

continue;

12389

}

12390

Opcodes.emplace_back(V);

12391

}

12392

}

12393

}

12394

12395

HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(

12396

Incoming, Limit, PHICompare, AreCompatiblePHIs,

12397

[this, &R](ArrayRef<Value *> Candidates, bool LimitForRegisterSize) {

12398

return tryToVectorizeList(Candidates, R, LimitForRegisterSize);

12399

},

12400

/*LimitForRegisterSize=*/true);

12401

Changed |= HaveVectorizedPhiNodes;

12402

VisitedInstrs.insert(Incoming.begin(), Incoming.end());

12403

} while (HaveVectorizedPhiNodes);

12404

12405

VisitedInstrs.clear();

12406

12407

InstSetVector PostProcessInstructions;

12408

SmallDenseSet<Instruction *, 4> KeyNodes;

12409

for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {

12410

// Skip instructions with scalable type. The num of elements is unknown at

12411

// compile-time for scalable type.

12412

if (isa<ScalableVectorType>(it->getType()))

12413

continue;

12414

12415

// Skip instructions marked for the deletion.

12416

if (R.isDeleted(&*it))

12417

continue;

12418

// We may go through BB multiple times so skip the one we have checked.

12419

if (!VisitedInstrs.insert(&*it).second) {

12420

if (it->use_empty() && KeyNodes.contains(&*it) &&

12421

vectorizeSimpleInstructions(PostProcessInstructions, BB, R,

12422

it->isTerminator())) {

12423

// We would like to start over since some instructions are deleted

12424

// and the iterator may become invalid value.

12425

Changed = true;

12426

it = BB->begin();

12427

e = BB->end();

12428

}

12429

continue;

12430

}

12431

12432

if (isa<DbgInfoIntrinsic>(it))

12433

continue;

12434

12435

// Try to vectorize reductions that use PHINodes.

12436

if (PHINode *P = dyn_cast<PHINode>(it)) {

12437

// Check that the PHI is a reduction PHI.

12438

if (P->getNumIncomingValues() == 2) {

12439

// Try to match and vectorize a horizontal reduction.

12440

if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,

12441

TTI)) {

12442

Changed = true;

12443

it = BB->begin();

12444

e = BB->end();

12445

continue;

12446

}

12447

}

12448

// Try to vectorize the incoming values of the PHI, to catch reductions

12449

// that feed into PHIs.

12450

for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {

12451

// Skip if the incoming block is the current BB for now. Also, bypass

12452

// unreachable IR for efficiency and to avoid crashing.

12453

// TODO: Collect the skipped incoming values and try to vectorize them

12454

// after processing BB.

12455

if (BB == P->getIncomingBlock(I) ||

12456

!DT->isReachableFromEntry(P->getIncomingBlock(I)))

12457

continue;

12458

12459

// Postponed instructions should not be vectorized here, delay their

12460

// vectorization.

12461

if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));

12462

PI && !PostProcessInstructions.contains(PI))

12463

Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I),

12464

P->getIncomingBlock(I), R, TTI);

12465

}

12466

continue;

12467

}

12468

12469

// Ran into an instruction without users, like terminator, or function call

12470

// with ignored return value, store. Ignore unused instructions (basing on

12471

// instruction type, except for CallInst and InvokeInst).

12472

if (it->use_empty() &&

12473

(it->getType()->isVoidTy() || isa<CallInst, InvokeInst>(it))) {

12474

KeyNodes.insert(&*it);

12475

bool OpsChanged = false;

12476

auto *SI = dyn_cast<StoreInst>(it);

12477

bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;

12478

if (SI) {

12479

auto I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));

12480

// Try to vectorize chain in store, if this is the only store to the

12481

// address in the block.

12482

// TODO: This is just a temporarily solution to save compile time. Need

12483

// to investigate if we can safely turn on slp-vectorize-hor-store

12484

// instead to allow lookup for reduction chains in all non-vectorized

12485

// stores (need to check side effects and compile time).

12486

TryToVectorizeRoot = (I == Stores.end() || I->second.size() == 1) &&

12487

SI->getValueOperand()->hasOneUse();

12488

}

12489

if (TryToVectorizeRoot) {

12490

for (auto *V : it->operand_values()) {

12491

// Postponed instructions should not be vectorized here, delay their

12492

// vectorization.

12493

if (auto *VI = dyn_cast<Instruction>(V);

12494

VI && !PostProcessInstructions.contains(VI))

12495

// Try to match and vectorize a horizontal reduction.

12496

OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);

12497

}

12498

}

12499

// Start vectorization of post-process list of instructions from the

12500

// top-tree instructions to try to vectorize as many instructions as

12501

// possible.

12502

OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R,

12503

it->isTerminator());

12504

if (OpsChanged) {

12505

// We would like to start over since some instructions are deleted

12506

// and the iterator may become invalid value.

12507

Changed = true;

12508

it = BB->begin();

12509

e = BB->end();

12510

continue;

12511

}

12512

}

12513

12514

if (isa<CmpInst, InsertElementInst, InsertValueInst>(it))

12515

PostProcessInstructions.insert(&*it);

12516

}

12517

12518

return Changed;

12519

}

12520

12521

/// Attempt to vectorize the index computations of the getelementptrs
/// recorded in GEPs.
///
/// Every list stored in GEPs is walked in chunks sized to the maximum vector
/// register width. Within a chunk, getelementptrs that are already marked as
/// deleted, that differ from another one by a constant, or that repeat an
/// index already seen are dropped; the first index of each survivor is then
/// bundled and handed to tryToVectorizeList.
///
/// \param BB not referenced by this function.
/// \param R  the SLP tree builder used for size queries, deletion checks and
///           the vectorization attempt itself.
/// \returns true if any call to tryToVectorizeList reported a change.
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  for (auto &Entry : GEPs) {
    auto &GEPGroup = Entry.second;

    // A list with fewer than two getelementptrs offers nothing to bundle.
    if (GEPGroup.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << GEPGroup.size() << ".\n");

    // Chunk the list by how many *index* elements fit in a vector register.
    // The element size comes from the index expression, not from the GEP
    // itself (whose size would be the target's pointer size). If the register
    // cannot hold even one element there is nothing to do for this list.
    const unsigned MaxVecRegSize = R.getMaxVecRegSize();
    const unsigned EltSize = R.getVectorElementSize(*GEPGroup[0]->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    const unsigned MaxElts = MaxVecRegSize / EltSize;
    const unsigned NumGEPs = GEPGroup.size();
    for (unsigned Begin = 0; Begin < NumGEPs; Begin += MaxElts) {
      const unsigned Len = std::min<unsigned>(NumGEPs - Begin, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&GEPGroup[Begin], Len);

      // A SetVector keeps the candidates in program order. Should the index
      // computations be vectorizable and start with loads, this lowers the
      // chance of having to reorder them afterwards.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Candidates vectorized since they were collected are marked as
      // deleted; discard those first.
      Candidates.remove_if(
          [&R](Value *I) { return R.isDeleted(cast<Instruction>(I)); });

      // Drop every pair of getelementptrs whose difference is a constant:
      // one can be computed from the other, so they are unlikely to be good
      // bottom-up candidates. Also keep candidate indices unique. Both loops
      // stop early once fewer than two candidates remain.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        GetElementPtrInst *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPI);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          GetElementPtrInst *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPJ);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
            continue;
          }
          if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get())
            Candidates.remove(GEPJ);
        }
      }

      // The pruning above bails out as soon as fewer than two candidates are
      // left, so re-check before building a bundle.
      if (Candidates.size() < 2)
        continue;

      // Gather the single, non-constant index of each remaining candidate.
      // These constraints were established when the getelementptrs were
      // originally collected.
      SmallVector<Value *, 16> Bundle;
      Bundle.reserve(Candidates.size());
      for (Value *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        Value *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
        Bundle.push_back(GEPIdx);
      }

      // Try to vectorize the indices. The pattern of interest is the
      // gather-like form
      //
      //   ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b" and the subtractions can all
      // run in parallel. Catching this bottom-up is likely simpler and
      // cheaper than a full top-down phase starting at the consecutive loads.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}

12612

12613

/// Sort and try to vectorize every store group recorded in Stores.
///
/// Groups with fewer than two stores, or whose first store writes a value of
/// a type rejected by isValidElementType, are skipped. The remaining groups
/// are passed to tryToVectorizeSequence together with an ordering predicate,
/// a compatibility predicate and a per-store limit derived from the stored
/// value's bit width; candidate runs are vectorized through vectorizeStores.
///
/// \param R the SLP tree builder forwarded to vectorizeStores and queried for
///          the minimum vectorization factor.
/// \returns true if any group reported a change.
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;

  // Ordering used to sort a store group: by pointer operand type ID first,
  // then by the stored value. Stored values need to be compatible (same
  // opcode, same parent) for vectorization to have any chance of being
  // profitable, so compatible stores compare as equivalent here.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    const auto PtrTyID1 = V->getPointerOperandType()->getTypeID();
    const auto PtrTyID2 = V2->getPointerOperandType()->getTypeID();
    if (PtrTyID1 != PtrTyID2)
      return PtrTyID1 < PtrTyID2;

    Value *Val1 = V->getValueOperand();
    Value *Val2 = V2->getValueOperand();

    // An undef on either side is compatible with any other value, so neither
    // store is ordered before the other.
    if (isa<UndefValue>(Val1) || isa<UndefValue>(Val2))
      return false;

    auto *I1 = dyn_cast<Instruction>(Val1);
    auto *I2 = dyn_cast<Instruction>(Val2);
    if (I1 && I2) {
      DomTreeNodeBase<llvm::BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
      DomTreeNodeBase<llvm::BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
      assert(NodeI1 && "Should only process reachable instructions");
      assert(NodeI2 && "Should only process reachable instructions");
      assert((NodeI1 == NodeI2) ==
                 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      // Instructions from different blocks are ordered by the DFS-in number
      // of their dominator tree nodes.
      if (NodeI1 != NodeI2)
        return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
      // Within one block, instructions sharing an opcode state are
      // equivalent; otherwise fall back to the raw opcode.
      InstructionsState S = getSameOpcode({I1, I2});
      if (S.getOpcode())
        return false;
      return I1->getOpcode() < I2->getOpcode();
    }

    // Two constants never order one store before the other.
    if (isa<Constant>(Val1) && isa<Constant>(Val2))
      return false;
    return Val1->getValueID() < Val2->getValueID();
  };

  // Decides whether two stores may take part in the same vectorization
  // attempt.
  auto &&AreCompatibleStores = [](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;

    Value *Val1 = V1->getValueOperand();
    Value *Val2 = V2->getValueOperand();

    // Undefs are compatible with any other value.
    if (isa<UndefValue>(Val1) || isa<UndefValue>(Val2))
      return true;

    auto *I1 = dyn_cast<Instruction>(Val1);
    auto *I2 = dyn_cast<Instruction>(Val2);
    if (I1 && I2) {
      // Instruction operands must share both the parent block and an opcode
      // state.
      if (I1->getParent() != I2->getParent())
        return false;
      InstructionsState S = getSameOpcode({I1, I2});
      return S.getOpcode() > 0;
    }

    if (isa<Constant>(Val1) && isa<Constant>(Val2))
      return true;
    return Val1->getValueID() == Val2->getValueID();
  };

  // Per-store limit: the minimum VF for the bit width of the stored value.
  auto Limit = [&R, this](StoreInst *SI) {
    const unsigned EltSize =
        DL->getTypeSizeInBits(SI->getValueOperand()->getType());
    return R.getMinVF(EltSize);
  };

  // Attempt to sort and vectorize each of the store groups.
  for (auto &Pair : Stores) {
    auto &Chain = Pair.second;
    if (Chain.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Chain.size() << ".\n");

    if (!isValidElementType(Chain.front()->getValueOperand()->getType()))
      continue;

    Changed |= tryToVectorizeSequence<StoreInst>(
        Chain, Limit, StoreSorter, AreCompatibleStores,
        [this, &R](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R);
        },
        /*LimitForRegisterSize=*/false);
  }
  return Changed;
}

12701

12702

// Storage for the legacy pass identifier; its address is handed to PassInfo
// by the registration code below.
char SLPVectorizer::ID = 0;

12703

12704

// Descriptive pass name, passed as the first argument to PassInfo below.
static const char lv_name[] = "SLP Vectorizer";

12705

12706

// Legacy pass registration. As the inlined expansion shows, the macros define
// initializeSLPVectorizerPassOnce(), which initializes each listed dependency
// and registers a PassInfo for "slp-vectorizer", and
// llvm::initializeSLPVectorizerPass(), which runs it through llvm::call_once.
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)static void *initializeSLPVectorizerPassOnce(PassRegistry &
Registry) {

12707

INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)initializeAAResultsWrapperPassPass(Registry);

12708

INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)initializeTargetTransformInfoWrapperPassPass(Registry);

12709

INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)initializeAssumptionCacheTrackerPass(Registry);

12710

INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)initializeScalarEvolutionWrapperPassPass(Registry);

12711

INITIALIZE_PASS_DEPENDENCY(LoopSimplify)initializeLoopSimplifyPass(Registry);

12712

INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)initializeDemandedBitsWrapperPassPass(Registry);

12713

INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)initializeOptimizationRemarkEmitterWrapperPassPass(Registry);

12714

INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)initializeInjectTLIMappingsLegacyPass(Registry);

12715

INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)PassInfo *PI = new PassInfo( lv_name, "slp-vectorizer", &
SLPVectorizer::ID, PassInfo::NormalCtor_t(callDefaultCtor<
SLPVectorizer>), false, false); Registry.registerPass(*PI,
true); return PI; } static llvm::once_flag InitializeSLPVectorizerPassFlag
; void llvm::initializeSLPVectorizerPass(PassRegistry &Registry
) { llvm::call_once(InitializeSLPVectorizerPassFlag, initializeSLPVectorizerPassOnce
, std::ref(Registry)); }

12716

12717

// Factory: returns a newly allocated SLPVectorizer legacy pass.
Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }

File:	build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Warning:	line 11594, column 9 Value stored to 'VectorizedTree' is never read

Bug Summary

Annotated Source Code