72#define LDIST_NAME "loop-distribute"
73#define DEBUG_TYPE LDIST_NAME
78 "llvm.loop.distribute.followup_all";
80 "llvm.loop.distribute.followup_coincident";
82 "llvm.loop.distribute.followup_sequential";
84 "llvm.loop.distribute.followup_fallback";
89 cl::desc(
"Turn on DominatorTree and LoopInfo verification "
90 "after Loop Distribution"),
94 "loop-distribute-non-if-convertible",
cl::Hidden,
95 cl::desc(
"Whether to distribute into a loop that may not be "
96 "if-convertible by the loop vectorizer"),
101 cl::desc(
"The maximum number of SCEV checks allowed for Loop "
105 "loop-distribute-scev-check-threshold-with-pragma",
cl::init(128),
108 "The maximum number of SCEV checks allowed for Loop "
109 "Distribution for loop marked with #pragma loop distribute(enable)"));
113 cl::desc(
"Enable the new, experimental LoopDistribution Pass"),
116STATISTIC(NumLoopsDistributed,
"Number of loops distributed");
127 : DepCycle(DepCycle), OrigLoop(L) {
132 bool hasDepCycle()
const {
return DepCycle; }
142 bool empty()
const {
return Set.empty(); }
146 void moveTo(InstPartition &
Other) {
147 Other.Set.insert(Set.begin(), Set.end());
149 Other.DepCycle |= DepCycle;
154 void populateUsedSet() {
158 for (
auto *
B : OrigLoop->getBlocks())
159 Set.insert(
B->getTerminator());
164 while (!Worklist.empty()) {
167 for (
Value *V :
I->operand_values()) {
168 auto *
I = dyn_cast<Instruction>(V);
169 if (
I && OrigLoop->contains(
I->getParent()) && Set.insert(
I).second)
170 Worklist.push_back(
I);
184 LI, DT, ClonedLoopBlocks);
190 const Loop *getClonedLoop()
const {
return ClonedLoop; }
195 Loop *getDistributedLoop()
const {
196 return ClonedLoop ? ClonedLoop : OrigLoop;
204 void remapInstructions() {
210 void removeUnusedInsts() {
213 for (
auto *Block : OrigLoop->getBlocks())
214 for (
auto &Inst : *Block)
215 if (!Set.count(&Inst)) {
218 NewInst = cast<Instruction>(VMap[NewInst]);
220 assert(!isa<BranchInst>(NewInst) &&
221 "Branches are marked used early on");
222 Unused.push_back(NewInst);
227 for (
auto *Inst :
reverse(Unused)) {
228 if (!Inst->use_empty())
230 Inst->eraseFromParent();
236 dbgs() <<
" (cycle)\n";
239 dbgs() <<
" " <<
I->getParent()->getName() <<
":" << *
I <<
"\n";
242 void printBlocks()
const {
243 for (
auto *BB : getDistributedLoop()->getBlocks())
259 Loop *ClonedLoop =
nullptr;
273class InstPartitionContainer {
278 :
L(
L), LI(LI), DT(DT) {}
281 unsigned getSize()
const {
return PartitionContainer.size(); }
287 if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
288 PartitionContainer.emplace_back(Inst, L,
true);
290 PartitionContainer.back().add(Inst);
298 void addToNewNonCyclicPartition(
Instruction *Inst) {
299 PartitionContainer.emplace_back(Inst, L);
307 void mergeAdjacentNonCyclic() {
308 mergeAdjacentPartitionsIf(
309 [](
const InstPartition *
P) {
return !
P->hasDepCycle(); });
314 void mergeNonIfConvertible() {
315 mergeAdjacentPartitionsIf([&](
const InstPartition *Partition) {
316 if (Partition->hasDepCycle())
320 bool seenStore =
false;
322 for (
auto *Inst : *Partition)
323 if (isa<StoreInst>(Inst)) {
333 void mergeBeforePopulating() {
334 mergeAdjacentNonCyclic();
336 mergeNonIfConvertible();
346 bool mergeToAvoidDuplicatedLoads() {
350 LoadToPartitionT LoadToPartition;
351 ToBeMergedT ToBeMerged;
356 for (PartitionContainerT::iterator
I = PartitionContainer.begin(),
357 E = PartitionContainer.end();
364 if (isa<LoadInst>(Inst)) {
366 LoadToPartitionT::iterator LoadToPart;
368 std::tie(LoadToPart, NewElt) =
369 LoadToPartition.insert(std::make_pair(Inst, PartI));
372 <<
"Merging partitions due to this load in multiple "
373 <<
"partitions: " << PartI <<
", " << LoadToPart->second
380 ToBeMerged.unionSets(PartI, &*PartJ);
381 }
while (&*PartJ != LoadToPart->second);
385 if (ToBeMerged.empty())
390 for (ToBeMergedT::iterator
I = ToBeMerged.begin(),
E = ToBeMerged.end();
395 auto PartI =
I->getData();
396 for (
auto *PartJ :
make_range(std::next(ToBeMerged.member_begin(
I)),
397 ToBeMerged.member_end())) {
398 PartJ->moveTo(*PartI);
403 PartitionContainer.remove_if(
404 [](
const InstPartition &
P) {
return P.empty(); });
411 void setupPartitionIdOnInstructions() {
413 for (
const auto &Partition : PartitionContainer) {
418 std::tie(Iter, NewElt) =
419 InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
429 void populateUsedSet() {
430 for (
auto &
P : PartitionContainer)
441 assert(Pred &&
"Preheader does not have a single predecessor");
443 assert(ExitBlock &&
"No single exit block");
446 assert(!PartitionContainer.empty() &&
"at least two partitions expected");
450 "preheader not empty");
453 MDNode *OrigLoopID =
L->getLoopID();
461 NewLoop = Part.cloneLoopWithPreheader(TopPH, Pred,
Index, LI, DT);
463 Part.getVMap()[ExitBlock] = TopPH;
464 Part.remapInstructions();
465 setNewLoopID(OrigLoopID, &Part);
472 setNewLoopID(OrigLoopID, &PartitionContainer.back());
477 for (
auto Curr = PartitionContainer.cbegin(),
478 Next = std::next(PartitionContainer.cbegin()),
479 E = PartitionContainer.cend();
480 Next !=
E; ++Curr, ++Next)
482 Next->getDistributedLoop()->getLoopPreheader(),
483 Curr->getDistributedLoop()->getExitingBlock());
487 void removeUnusedInsts() {
488 for (
auto &Partition : PartitionContainer)
489 Partition.removeUnusedInsts();
502 unsigned N = RtPtrCheck->
Pointers.size();
504 for (
unsigned I = 0;
I <
N; ++
I) {
509 int &Partition = PtrToPartitions[
I];
515 int ThisPartition = this->InstToPartitionId[Inst];
517 Partition = ThisPartition;
519 else if (Partition == -1)
521 else if (Partition != (
int)ThisPartition)
524 assert(Partition != -2 &&
"Pointer not belonging to any partition");
527 return PtrToPartitions;
532 for (
const auto &
P : PartitionContainer) {
533 OS <<
"Partition " <<
Index++ <<
" (" << &
P <<
"):\n";
542 const InstPartitionContainer &Partitions) {
543 Partitions.print(
OS);
548 void printBlocks()
const {
550 for (
const auto &
P : PartitionContainer) {
551 dbgs() <<
"\nPartition " <<
Index++ <<
" (" << &
P <<
"):\n";
557 using PartitionContainerT = std::list<InstPartition>;
560 PartitionContainerT PartitionContainer;
564 InstToPartitionIdT InstToPartitionId;
572 template <
class UnaryPredicate>
573 void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
574 InstPartition *PrevMatch =
nullptr;
575 for (
auto I = PartitionContainer.begin();
I != PartitionContainer.end();) {
577 if (PrevMatch ==
nullptr && DoesMatch) {
580 }
else if (PrevMatch !=
nullptr && DoesMatch) {
581 I->moveTo(*PrevMatch);
582 I = PartitionContainer.erase(
I);
591 void setNewLoopID(
MDNode *OrigLoopID, InstPartition *Part) {
598 Loop *NewLoop = Part->getDistributedLoop();
610class MemoryInstructionDependences {
616 unsigned NumUnsafeDependencesStartOrEnd = 0;
626 MemoryInstructionDependences(
632 for (
const auto &Dep : Dependences)
633 if (Dep.isPossiblyBackward()) {
637 ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
638 --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;
645 AccessesType Accesses;
649class LoopDistributeForLoop {
654 :
L(
L),
F(
F), LI(LI), DT(DT), SE(SE), LAIs(LAIs), ORE(ORE) {
660 assert(
L->isInnermost() &&
"Only process inner loops.");
663 <<
L->getHeader()->getParent()->getName()
664 <<
"\" checking " << *L <<
"\n");
667 if (!
L->getExitBlock())
668 return fail(
"MultipleExitBlocks",
"multiple exit blocks");
669 if (!
L->isLoopSimplifyForm())
670 return fail(
"NotLoopSimplifyForm",
671 "loop is not in loop-simplify form");
672 if (!
L->isRotatedForm())
673 return fail(
"NotBottomTested",
"loop is not bottom tested");
677 LAI = &LAIs.getInfo(*L);
682 return fail(
"MemOpsCanBeVectorized",
683 "memory operations are safe for vectorization");
686 if (!Dependences || Dependences->empty())
687 return fail(
"NoUnsafeDeps",
"no unsafe dependences to isolate");
689 InstPartitionContainer Partitions(L, LI, DT);
714 int NumUnsafeDependencesActive = 0;
715 for (
const auto &InstDep : MID) {
719 if (NumUnsafeDependencesActive ||
720 InstDep.NumUnsafeDependencesStartOrEnd > 0)
721 Partitions.addToCyclicPartition(
I);
723 Partitions.addToNewNonCyclicPartition(
I);
724 NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
725 assert(NumUnsafeDependencesActive >= 0 &&
726 "Negative number of dependences active");
735 for (
auto *Inst : DefsUsedOutside)
736 Partitions.addToNewNonCyclicPartition(Inst);
739 if (Partitions.getSize() < 2)
740 return fail(
"CantIsolateUnsafeDeps",
741 "cannot isolate unsafe dependencies");
745 Partitions.mergeBeforePopulating();
747 if (Partitions.getSize() < 2)
748 return fail(
"CantIsolateUnsafeDeps",
749 "cannot isolate unsafe dependencies");
752 Partitions.populateUsedSet();
757 if (Partitions.mergeToAvoidDuplicatedLoads()) {
758 LLVM_DEBUG(
dbgs() <<
"\nPartitions merged to ensure unique loads:\n"
760 if (Partitions.getSize() < 2)
761 return fail(
"CantIsolateUnsafeDeps",
762 "cannot isolate unsafe dependencies");
769 return fail(
"RuntimeCheckWithConvergent",
770 "may not insert runtime check with convergent operation");
776 return fail(
"TooManySCEVRuntimeChecks",
777 "too many SCEV run-time checks needed.\n");
780 return fail(
"HeuristicDisabled",
"distribution heuristic disabled");
785 Partitions.setupPartitionIdOnInstructions();
788 auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
790 const auto &AllChecks = RtPtrChecking->
getChecks();
791 auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
795 return fail(
"RuntimeCheckWithConvergent",
796 "may not insert runtime check with convergent operation");
808 MDNode *OrigLoopID =
L->getLoopID();
813 LVer.versionLoop(DefsUsedOutside);
814 LVer.annotateLoopWithNoAlias();
822 "llvm.loop.distribute.",
true);
823 LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID);
828 Partitions.cloneLoops();
832 Partitions.removeUnusedInsts();
838 assert(DT->
verify(DominatorTree::VerificationLevel::Fast));
841 ++NumLoopsDistributed;
846 <<
"distributed loop";
854 bool Forced = isForced().value_or(
false);
861 L->getStartLoc(),
L->getHeader())
862 <<
"loop not distributed: use -Rpass-analysis=loop-distribute for "
871 RemarkName,
L->getStartLoc(),
L->getHeader())
872 <<
"loop not distributed: " << Message);
878 *F,
L->getStartLoc(),
"loop not distributed: failed "
879 "explicitly specified loop distribution"));
889 const std::optional<bool> &isForced()
const {
return IsForced; }
903 copy_if(AllChecks, std::back_inserter(Checks),
905 for (
unsigned PtrIdx1 :
Check.first->Members)
906 for (
unsigned PtrIdx2 :
Check.second->Members)
922 PtrToPartition, PtrIdx1, PtrIdx2))
933 std::optional<const MDOperand *>
Value =
939 assert(Op && mdconst::hasa<ConstantInt>(*Op) &&
"invalid metadata");
940 IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
960 std::optional<bool> IsForced;
974 for (
Loop *TopLevelLoop : *LI)
977 if (L->isInnermost())
981 bool Changed =
false;
982 for (
Loop *L : Worklist) {
983 LoopDistributeForLoop LDL(L, &
F, LI, DT, SE, LAIs, ORE);
988 Changed |= LDL.processLoop();
1003 bool Changed =
runImpl(
F, &LI, &DT, &SE, &ORE, LAIs);
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static void fail(const SDLoc &DL, SelectionDAG &DAG, const Twine &Msg)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file defines the DenseMap class.
This file builds on the ADT/GraphTraits.h file to build generic depth first graph iterator.
std::optional< std::vector< StOtherPiece > > Other
Generic implementation of equivalence classes through the use Tarjan's efficient union-find algorithm...
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
This header provides classes for managing per-loop analyses.
static const char *const LLVMLoopDistributeFollowupCoincident
static cl::opt< bool > DistributeNonIfConvertible("loop-distribute-non-if-convertible", cl::Hidden, cl::desc("Whether to distribute into a loop that may not be " "if-convertible by the loop vectorizer"), cl::init(false))
static cl::opt< bool > EnableLoopDistribute("enable-loop-distribute", cl::Hidden, cl::desc("Enable the new, experimental LoopDistribution Pass"), cl::init(false))
static cl::opt< unsigned > DistributeSCEVCheckThreshold("loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden, cl::desc("The maximum number of SCEV checks allowed for Loop " "Distribution"))
static const char *const LLVMLoopDistributeFollowupSequential
static cl::opt< unsigned > PragmaDistributeSCEVCheckThreshold("loop-distribute-scev-check-threshold-with-pragma", cl::init(128), cl::Hidden, cl::desc("The maximum number of SCEV checks allowed for Loop " "Distribution for loop marked with #pragma loop distribute(enable)"))
static const char *const LLVMLoopDistributeFollowupAll
static const char *const LLVMLoopDistributeFollowupFallback
static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, LoopAccessInfoManager &LAIs)
Shared implementation between new and old PMs.
static cl::opt< bool > LDistVerify("loop-distribute-verify", cl::Hidden, cl::desc("Turn on DominatorTree and LoopInfo verification " "after Loop Distribution"), cl::init(false))
This header defines various interfaces for pass management in LLVM.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static unsigned getSize(unsigned Kind)
A container for analyses that lazily runs them and caches their results.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
DenseMapIterator< KeyT, ValueT, KeyInfoT, BucketT > iterator
Dependence - This class represents a dependence between two memory memory references in a function.
Diagnostic information for optimization failures.
Analysis pass which computes a DominatorTree.
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
EquivalenceClasses - This represents a collection of equivalence classes and supports three efficient...
const BasicBlock * getParent() const
This is an important class for using LLVM in a threaded context.
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const MemoryDepChecker & getDepChecker() const
the Memory Dependence Checker which can determine the loop-independent and loop-carried dependences b...
const RuntimePointerChecking * getRuntimePointerChecking() const
bool canVectorizeMemory() const
Return true we can analyze the memory accesses in the loop and there are no memory dependence cycles.
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
static bool blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, DominatorTree *DT)
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
SmallVector< Instruction *, 4 > getInstructionsForAccess(Value *Ptr, bool isWrite) const
Return the list of instructions that use Ptr to read or write memory.
bool hasConvergentOp() const
Return true if there is a convergent operation in the loop.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
void verify(const DominatorTreeBase< BlockT, false > &DomTree) const
This class emits a version of the loop where run-time checks ensure that may-alias pointers can't ove...
Represents a single loop in the control flow graph.
void setLoopID(MDNode *LoopID) const
Set the llvm.loop loop id metadata for this loop.
Tracking metadata reference owned by Metadata.
Checks memory dependences among accesses to the same underlying object to determine whether there vec...
const SmallVectorImpl< Instruction * > & getMemoryInstructions() const
The vector of memory access instructions.
const SmallVectorImpl< Dependence > * getDependences() const
Returns the memory dependences.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
const SCEVPredicate & getPredicate() const
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserve()
Mark an analysis as preserved.
Holds information about the memory runtime legality checks to verify that a group of pointers do not ...
void printChecks(raw_ostream &OS, const SmallVectorImpl< RuntimePointerCheck > &Checks, unsigned Depth=0) const
Print Checks.
bool needsChecking(const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const
Decide if we need to add a check between two groups of pointers, according to needsChecking.
static bool arePointersInSamePartition(const SmallVectorImpl< int > &PtrToPartition, unsigned PtrIdx1, unsigned PtrIdx2)
Check if pointers are in the same partition.
SmallVector< PointerInfo, 2 > Pointers
Information about the pointers that may require checking.
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class represents an assumption made using SCEV expressions which can be checked at run-time.
virtual unsigned getComplexity() const
Returns the estimated complexity of this predicate.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
SmallPtrSetIterator< PtrType > const_iterator
SmallPtrSetIterator< PtrType > iterator
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
LLVM Value Representation.
This class implements an extremely fast bulk output stream that can only output to a stream.
friend const_iterator begin(StringRef path, Style style)
Get begin iterator over path.
friend const_iterator end(StringRef path)
Get end iterator over path.
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
Predicate
Predicate - These are "(BI << 5) | BO" for various predicates.
initializer< Ty > init(const Ty &Val)
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
const_iterator end(StringRef path)
Get end iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
std::pair< const RuntimeCheckingPtrGroup *, const RuntimeCheckingPtrGroup * > RuntimePointerCheck
A memcheck which made up of a pair of grouped pointers.
std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
auto reverse(ContainerTy &&C)
Loop * cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB, Loop *OrigLoop, ValueToValueMapTy &VMap, const Twine &NameSuffix, LoopInfo *LI, DominatorTree *DT, SmallVectorImpl< BasicBlock * > &Blocks)
Clones a loop OrigLoop.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool hasDisableAllTransformsHint(const Loop *L)
Look for the loop attribute that disables all transformation heuristic.
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
BasicBlock * SplitBlock(BasicBlock *Old, Instruction *SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
iterator_range< df_iterator< T > > depth_first(const T &G)
Dependece between memory access instructions.