37#define DEBUG_TYPE "vplan-slp"
45 CompletelySLP =
false;
51 return cast<VPInstruction>(V)->getUnderlyingInstr();
53 unsigned BundleSize = 0;
55 Type *
T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
56 assert(!
T->isVectorTy() &&
"Only scalar types supported for now");
57 BundleSize +=
T->getScalarSizeInBits();
59 WidestBundleBits = std::max(WidestBundleBits, BundleSize);
62 auto Res = BundleToCombined.try_emplace(to_vector<4>(
Operands), New);
64 "Already created a combined instruction for the operand bundle");
71 return Op && isa<VPInstruction>(
Op) &&
72 cast<VPInstruction>(
Op)->getUnderlyingInstr();
74 LLVM_DEBUG(
dbgs() <<
"VPSLP: not all operands are VPInstructions\n");
83 cast<VPInstruction>(
Operands[0])->getUnderlyingInstr();
84 unsigned Opcode = OriginalInstr->
getOpcode();
87 const Instruction *
I = cast<VPInstruction>(
Op)->getUnderlyingInstr();
88 return I->getOpcode() == Opcode &&
89 I->getType()->getPrimitiveSizeInBits() == Width;
97 return cast<VPInstruction>(
Op)->getParent() != &this->BB;
104 [](
VPValue *
Op) {
return Op->hasMoreThanOneUniqueUser(); })) {
105 LLVM_DEBUG(
dbgs() <<
"VPSLP: Some operands have multiple users.\n");
113 if (Opcode == Instruction::Load) {
114 unsigned LoadsSeen = 0;
116 for (
auto &
I : *Parent) {
117 auto *VPI = dyn_cast<VPInstruction>(&
I);
120 if (VPI->getOpcode() == Instruction::Load &&
126 if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
128 dbgs() <<
"VPSLP: instruction modifying memory between loads\n");
134 return cast<LoadInst>(cast<VPInstruction>(
Op)->getUnderlyingInstr())
137 LLVM_DEBUG(
dbgs() <<
"VPSLP: only simple loads are supported.\n");
142 if (Opcode == Instruction::Store)
144 return cast<StoreInst>(cast<VPInstruction>(
Op)->getUnderlyingInstr())
147 LLVM_DEBUG(
dbgs() <<
"VPSLP: only simple stores are supported.\n");
155 unsigned OperandIndex) {
159 auto *U = cast<VPInstruction>(V);
160 Operands.push_back(U->getOperand(OperandIndex));
167 cast<VPInstruction>(Values[0])->
getOpcode());
173 auto *VPI = cast<VPInstruction>(Values[0]);
175 switch (VPI->getOpcode()) {
176 case Instruction::Load:
178 case Instruction::Store:
182 for (
unsigned I = 0, NumOps = VPI->getNumOperands();
I < NumOps; ++
I)
192 unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
194 return cast<VPInstruction>(V)->getOpcode() != Opcode;
204 if (
A->getOpcode() !=
B->getOpcode())
207 if (
A->getOpcode() != Instruction::Load &&
208 A->getOpcode() != Instruction::Store)
213 return GA && GB && GA == GB && GA->getIndex(
A) + 1 == GB->getIndex(
B);
220 auto *I1 = dyn_cast<VPInstruction>(V1);
221 auto *I2 = dyn_cast<VPInstruction>(V2);
230 for (
unsigned I = 0, EV1 = I1->getNumOperands();
I < EV1; ++
I)
231 for (
unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J)
233 getLAScore(I1->getOperand(
I), I2->getOperand(J), MaxLevel - 1, IAI);
237std::pair<VPlanSlp::OpMode, VPValue *>
241 assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
242 "Currently we only handle load and commutative opcodes");
247 << *cast<VPInstruction>(
Last)->getUnderlyingInstr() <<
" ");
248 for (
auto *Candidate : Candidates) {
249 auto *LastI = cast<VPInstruction>(
Last);
250 auto *CandidateI = cast<VPInstruction>(Candidate);
252 LLVM_DEBUG(
dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
259 if (BestCandidates.
empty())
260 return {OpMode::Failed,
nullptr};
262 if (BestCandidates.
size() == 1)
263 return {
Mode, BestCandidates[0]};
266 unsigned BestScore = 0;
268 unsigned PrevScore = ~0
u;
272 for (
auto *Candidate : BestCandidates) {
274 if (PrevScore == ~0u)
276 if (PrevScore != Score)
280 if (Score > BestScore) {
289 << *cast<VPInstruction>(Best)->getUnderlyingInstr()
291 Candidates.erase(Best);
304 for (
auto &
Operands : MultiNodeOps) {
306 if (cast<VPInstruction>(
Operands.second[0])->getOpcode() ==
308 Mode.push_back(OpMode::Load);
310 Mode.push_back(OpMode::Opcode);
313 for (
unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
314 LLVM_DEBUG(
dbgs() <<
" Finding best value for lane " << Lane <<
"\n");
317 for (
auto Ops : MultiNodeOps) {
319 dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
321 Candidates.
insert(Ops.second[Lane]);
325 for (
unsigned Op = 0, E = MultiNodeOps.size();
Op < E; ++
Op) {
327 if (Mode[
Op] == OpMode::Failed)
331 std::pair<OpMode, VPValue *> Res =
332 getBest(Mode[
Op],
Last, Candidates, IAI);
344#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
347 for (
auto *
Op : Values) {
348 if (
auto *VPInstr = cast_or_null<VPInstruction>(
Op))
349 if (
auto *Instr = VPInstr->getUnderlyingInstr()) {
353 dbgs() <<
" nullptr | ";
363 auto I = BundleToCombined.find(to_vector<4>(Values));
364 if (
I != BundleToCombined.end()) {
369 for (
auto *V : Values) {
370 auto UI = V->user_begin();
371 auto *FirstUser = *UI++;
372 while (UI != V->user_end()) {
373 assert(*UI == FirstUser &&
"Currently we only support SLP trees.");
383 dbgs() <<
"buildGraph: ";
387 if (!areVectorizable(Values))
391 unsigned ValuesOpcode = *
getOpcode(Values);
395 bool MultiNodeRoot = !MultiNodeActive;
396 MultiNodeActive =
true;
399 dbgs() <<
" Visiting Commutative";
404 if (OperandsOpcode && OperandsOpcode ==
getOpcode(Values)) {
419 MultiNodeActive =
false;
421 auto FinalOrder = reorderMultiNodeOps();
423 MultiNodeOps.clear();
424 for (
auto &Ops : FinalOrder) {
426 Ops.first->replaceAllUsesWith(NewOp);
427 for (
unsigned i = 0; i < CombinedOperands.
size(); i++)
428 if (CombinedOperands[i] == Ops.first)
429 CombinedOperands[i] = NewOp;
437 if (ValuesOpcode == Instruction::Load)
439 CombinedOperands.
push_back(cast<VPInstruction>(V)->getOperand(0));
446 switch (ValuesOpcode) {
447 case Instruction::Load:
450 case Instruction::Store:
454 Opcode = ValuesOpcode;
461 assert(CombinedOperands.
size() > 0 &&
"Need more some operands");
462 auto *Inst = cast<VPInstruction>(Values[0])->getUnderlyingInstr();
463 auto *VPI =
new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc());
466 << *cast<VPInstruction>(Values[0]) <<
"\n");
467 addCombined(Values, VPI);
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
This file defines the DenseMap class.
mir Rename Register Operands
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallVector class.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static unsigned LookaheadMaxDepth
static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B, VPInterleavedAccessInfo &IAI)
Returns true if A and B access sequential memory if they are loads or stores or if they have identica...
static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel, VPInterleavedAccessInfo &IAI)
Implements getLAScore from Listing 7 in the paper.
static bool areCommutative(ArrayRef< VPValue * > Values)
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
This file contains the declarations of the entities induced by Vectorization Plans,...
This file contains the declarations of the Vectorization Plan base classes:
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool empty() const
empty - Check if the array is empty.
This class represents an Operation in the Expression.
bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
void reserve(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
The instances of the Type class are immutable: once they are created, they are never changed.
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
VPRegionBlock * getParent()
This is a concrete Recipe that models a single VPlan-level instruction.
InterleaveGroup< VPInstruction > * getInterleaveGroup(VPInstruction *Instr) const
Get the interleave group that Instr belongs to.
VPInstruction * buildGraph(ArrayRef< VPValue * > Operands)
Tries to build an SLP tree rooted at Operands and returns a VPInstruction combining Operands,...
Type * getType() const
All values are typed, get the type of this value.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
NodeAddr< InstrNode * > Instr
This is an optimization pass for GlobalISel generic memory operations.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
DWARFExpression::Operation Op
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.