70#define DEBUG_TYPE "loop-fusion"
73STATISTIC(NumFusionCandidates,
"Number of candidates for loop fusion");
74STATISTIC(InvalidLoopStructure,
"Loop has invalid structure");
75STATISTIC(AddressTakenBB,
"Basic block has address taken");
76STATISTIC(MayThrowException,
"Loop may throw an exception");
77STATISTIC(ContainsVolatileAccess,
"Loop contains a volatile access");
78STATISTIC(ContainsAtomicAccess,
"Loop contains an atomic access");
79STATISTIC(NotSimplifiedForm,
"Loop is not in simplified form");
80STATISTIC(InvalidDependencies,
"Dependencies prevent fusion");
81STATISTIC(UnknownTripCount,
"Loop has unknown trip count");
82STATISTIC(UncomputableTripCount,
"SCEV cannot compute trip count of loop");
83STATISTIC(NonEqualTripCount,
"Loop trip counts are not the same");
86 "Loop has a non-empty preheader with instructions that cannot be moved");
87STATISTIC(FusionNotBeneficial,
"Fusion is not beneficial");
88STATISTIC(NonIdenticalGuards,
"Candidates have different guards");
89STATISTIC(NonEmptyExitBlock,
"Candidate has a non-empty exit block with "
90 "instructions that cannot be moved");
91STATISTIC(NonEmptyGuardBlock,
"Candidate has a non-empty guard block with "
92 "instructions that cannot be moved");
95 "The second candidate is guarded while the first one is not");
96STATISTIC(NumHoistedInsts,
"Number of hoisted preheader instructions.");
97STATISTIC(NumSunkInsts,
"Number of sunk preheader instructions.");
102 cl::desc(
"Max number of iterations to be peeled from a loop, such that "
103 "fusion can take place"));
108 cl::desc(
"Enable verbose debugging for Loop Fusion"),
123struct FusionCandidate {
162 : Preheader(L->getLoopPreheader()), Header(L->getHeader()),
163 ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),
164 Latch(L->getLoopLatch()), L(L), Valid(
true),
165 GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(
canPeel(L)),
166 Peeled(
false), DT(DT), PDT(PDT), ORE(ORE) {
173 if (BB->hasAddressTaken()) {
176 reportInvalidCandidate(
"AddressTakenBB",
177 "Basic block has address taken");
186 "Loop may throw an exception");
189 if (
I.isVolatile()) {
191 ++ContainsVolatileAccess;
193 "Loop contains a volatile access");
201 ++ContainsAtomicAccess;
203 "Loop contains an atomic access");
206 if (
I.mayWriteToMemory())
207 MemWrites.push_back(&
I);
208 if (
I.mayReadFromMemory())
209 MemReads.push_back(&
I);
216 return Preheader && ExitingBlock && ExitBlock && Latch &&
L &&
223 assert(!
L->isInvalid() &&
"Loop is invalid!");
224 assert(Preheader ==
L->getLoopPreheader() &&
"Preheader is out of sync");
225 assert(Header ==
L->getHeader() &&
"Header is out of sync");
226 assert(ExitingBlock ==
L->getExitingBlock() &&
227 "Exiting Blocks is out of sync");
228 assert(ExitBlock ==
L->getExitBlock() &&
"Exit block is out of sync");
229 assert(Latch ==
L->getLoopLatch() &&
"Latch is out of sync");
239 return GuardBranch->getParent();
245 void updateAfterPeeling() {
246 Preheader =
L->getLoopPreheader();
247 Header =
L->getHeader();
248 ExitingBlock =
L->getExitingBlock();
249 ExitBlock =
L->getExitBlock();
250 Latch =
L->getLoopLatch();
262 assert(GuardBranch &&
"Only valid on guarded loops.");
270#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
272 dbgs() <<
"\tGuardBranch: ";
274 dbgs() << *GuardBranch;
278 << (GuardBranch ? GuardBranch->getName() :
"nullptr") <<
"\n"
279 <<
"\tPreheader: " << (Preheader ? Preheader->
getName() :
"nullptr")
281 <<
"\tHeader: " << (Header ? Header->getName() :
"nullptr") <<
"\n"
283 << (ExitingBlock ? ExitingBlock->
getName() :
"nullptr") <<
"\n"
284 <<
"\tExitBB: " << (ExitBlock ? ExitBlock->
getName() :
"nullptr")
286 <<
"\tLatch: " << (Latch ? Latch->
getName() :
"nullptr") <<
"\n"
288 << (getEntryBlock() ? getEntryBlock()->getName() :
"nullptr")
299 assert(Header &&
"Header should be guaranteed to exist!");
300 ++InvalidLoopStructure;
307 <<
" trip count not computable!\n");
310 "Loop has unknown trip count");
313 if (!
L->isLoopSimplifyForm()) {
315 <<
" is not in simplified form!\n");
318 "Loop is not in simplified form");
321 if (!
L->isRotatedForm()) {
350 L->getStartLoc(),
L->getHeader())
351 <<
"Loop is not a candidate for fusion");
354 L->getStartLoc(),
L->getHeader())
355 <<
"[" <<
L->getHeader()->getParent()->getName() <<
"]: "
356 <<
"Loop is not a candidate for fusion: " << RemarkMsg);
372 dbgs() <<
"****************************\n";
373 for (
const Loop *L : LV)
375 dbgs() <<
"****************************\n";
380 OS << FC.Preheader->getName();
389 for (
const FusionCandidate &FC : CandList)
397 dbgs() <<
"Fusion Candidates: \n";
398 for (
const auto &CandidateList : FusionCandidates) {
399 dbgs() <<
"*** Fusion Candidate List ***\n";
400 dbgs() << CandidateList;
401 dbgs() <<
"****************************\n";
414struct LoopDepthTree {
415 using LoopsOnLevelTy = SmallVector<LoopVector, 4>;
419 LoopDepthTree(LoopInfo &LI) : Depth(1) {
426 bool isRemovedLoop(
const Loop *L)
const {
return RemovedLoops.count(L); }
430 void removeLoop(
const Loop *L) { RemovedLoops.insert(L); }
434 LoopsOnLevelTy LoopsOnNextLevel;
438 if (!isRemovedLoop(L) &&
L->begin() !=
L->end())
439 LoopsOnNextLevel.emplace_back(
LoopVector(
L->begin(),
L->end()));
441 LoopsOnLevel = LoopsOnNextLevel;
442 RemovedLoops.clear();
446 bool empty()
const {
return size() == 0; }
447 size_t size()
const {
return LoopsOnLevel.size() - RemovedLoops.size(); }
448 unsigned getDepth()
const {
return Depth; }
450 iterator
begin() {
return LoopsOnLevel.begin(); }
451 iterator
end() {
return LoopsOnLevel.end(); }
452 const_iterator
begin()
const {
return LoopsOnLevel.begin(); }
453 const_iterator
end()
const {
return LoopsOnLevel.end(); }
458 SmallPtrSet<const Loop *, 8> RemovedLoops;
464 LoopsOnLevelTy LoopsOnLevel;
479 PostDominatorTree &PDT;
480 OptimizationRemarkEmitter &ORE;
482 const TargetTransformInfo &TTI;
485 LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI,
486 ScalarEvolution &SE, PostDominatorTree &PDT,
487 OptimizationRemarkEmitter &ORE, AssumptionCache &AC,
488 const TargetTransformInfo &TTI)
489 : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI),
490 DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {}
495 bool fuseLoops(Function &
F) {
502 LLVM_DEBUG(
dbgs() <<
"Performing Loop Fusion on function " <<
F.getName()
506 while (!LDT.empty()) {
507 LLVM_DEBUG(
dbgs() <<
"Got " << LDT.size() <<
" loop sets for depth "
508 << LDT.getDepth() <<
"\n";);
511 assert(LV.size() > 0 &&
"Empty loop set was build!");
520 dbgs() <<
" Visit loop set (#" << LV.size() <<
"):\n";
526 collectFusionCandidates(LV);
531 FusionCandidates.clear();
557 void collectFusionCandidates(
const LoopVector &LV) {
561 FusionCandidate CurrCand(L, DT, &PDT, ORE, PP);
562 if (!CurrCand.isEligibleForFusion(SE))
570 bool FoundAdjacent =
false;
571 for (
auto &CurrCandList : FusionCandidates) {
572 if (isStrictlyAdjacent(CurrCandList.back(), CurrCand)) {
573 CurrCandList.push_back(CurrCand);
574 FoundAdjacent =
true;
575 NumFusionCandidates++;
579 <<
" to existing candidate list\n");
584 if (!FoundAdjacent) {
591 NewCandList.push_back(CurrCand);
592 FusionCandidates.push_back(NewCandList);
602 bool isBeneficialFusion(
const FusionCandidate &FC0,
603 const FusionCandidate &FC1) {
612 std::optional<int64_t>
613 calculateTripCountDiff(
const FusionCandidate &FC0,
614 const FusionCandidate &FC1)
const {
615 const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L);
617 UncomputableTripCount++;
618 LLVM_DEBUG(
dbgs() <<
"Trip count of first loop could not be computed!");
622 const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L);
624 UncomputableTripCount++;
625 LLVM_DEBUG(
dbgs() <<
"Trip count of second loop could not be computed!");
630 << *TripCount1 <<
" are "
631 << (TripCount0 == TripCount1 ?
"identical" :
"different")
634 if (TripCount0 == TripCount1)
638 "determining the difference between trip counts\n");
645 static_cast<int64_t
>(SE.getSmallConstantTripCount(FC0.L));
647 static_cast<int64_t
>(SE.getSmallConstantTripCount(FC1.L));
651 if (TC0 == 0 || TC1 == 0) {
652 LLVM_DEBUG(
dbgs() <<
"Loop(s) do not have a single exit point or do not "
653 "have a constant number of iterations. Peeling "
654 "is not benefical\n");
661 void peelFusionCandidate(FusionCandidate &FC0,
const FusionCandidate &FC1,
662 unsigned PeelCount) {
663 assert(FC0.AbleToPeel &&
"Should be able to peel loop");
666 <<
" iterations of the first loop. \n");
672 peelLoop(FC0.L, PeelCount,
false, &LI, &SE, DT, &AC,
678 auto TCDiff = calculateTripCountDiff(FC0, FC1);
680 assert(TCDiff && *TCDiff == 0 &&
681 "Loops should have identical trip counts after peeling");
687 PDT.recalculate(*FC0.Preheader->
getParent());
689 FC0.updateAfterPeeling();
703 SmallVector<Instruction *, 8> WorkList;
705 if (Pred != FC0.ExitBlock) {
708 DominatorTree::UpdateType(DominatorTree::Delete, Pred, BB));
713 for (Instruction *CurrentBranch : WorkList) {
714 BasicBlock *Succ = CurrentBranch->getSuccessor(0);
716 Succ = CurrentBranch->getSuccessor(1);
720 DTU.applyUpdates(TreeUpdates);
725 <<
" iterations from the first loop.\n"
726 "Both Loops have the same number of iterations now.\n");
736 bool fuseCandidates() {
739 for (
auto &CandidateList : FusionCandidates) {
740 if (CandidateList.size() < 2)
744 << CandidateList <<
"\n");
746 for (
auto It = CandidateList.begin(), NextIt = std::next(It);
747 NextIt != CandidateList.end(); It = NextIt, NextIt = std::next(It)) {
752 assert(!LDT.isRemovedLoop(FC0.L) &&
753 "Should not have removed loops in CandidateList!");
754 assert(!LDT.isRemovedLoop(FC1.L) &&
755 "Should not have removed loops in CandidateList!");
757 LLVM_DEBUG(
dbgs() <<
"Attempting to fuse candidate \n"; FC0.dump();
758 dbgs() <<
" with\n"; FC1.dump();
dbgs() <<
"\n");
763 std::optional<int64_t> TCDifference = calculateTripCountDiff(FC0, FC1);
769 FC0.AbleToPeel && TCDifference && *TCDifference > 0 &&
772 if (!WillPeel && (!TCDifference || *TCDifference != 0)) {
773 LLVM_DEBUG(
dbgs() <<
"Fusion candidates do not have identical trip "
774 "counts and peeling is not supported for this "
775 "case. Not fusing.\n");
777 reportLoopFusion<OptimizationRemarkMissed>(
778 FC0, FC1,
"NonEqualTripCount",
779 "Loop trip counts are not the same");
783 if ((!FC0.GuardBranch && FC1.GuardBranch) ||
784 (FC0.GuardBranch && !FC1.GuardBranch)) {
786 "another one is not. Not fusing.\n");
787 ++OnlySecondCandidateIsGuarded;
788 reportLoopFusion<OptimizationRemarkMissed>(
789 FC0, FC1,
"OnlySecondCandidateIsGuarded",
790 "The second candidate is guarded while the first one is not");
797 if (!TCDifference || *TCDifference == 0) {
798 if (FC0.GuardBranch && FC1.GuardBranch &&
799 !haveIdenticalGuards(FC0, FC1)) {
801 "guards. Not Fusing.\n");
802 ++NonIdenticalGuards;
803 reportLoopFusion<OptimizationRemarkMissed>(
804 FC0, FC1,
"NonIdenticalGuards",
805 "Candidates have different guards");
810 if (FC0.GuardBranch) {
811 assert(FC1.GuardBranch &&
"Expecting valid FC1 guard branch");
817 "instructions in exit block. Not fusing.\n");
819 reportLoopFusion<OptimizationRemarkMissed>(
820 FC0, FC1,
"NonEmptyExitBlock",
821 "Candidate has a non-empty exit block with "
822 "instructions that cannot be moved");
827 *FC1.GuardBranch->getParent(),
828 *FC0.GuardBranch->getParent()->getTerminator(), DT, &PDT,
831 "instructions in guard block. Not fusing.\n");
832 ++NonEmptyGuardBlock;
833 reportLoopFusion<OptimizationRemarkMissed>(
834 FC0, FC1,
"NonEmptyGuardBlock",
835 "Candidate has a non-empty guard block with "
836 "instructions that cannot be moved");
843 if (!dependencesAllowFusion(FC0, FC1)) {
844 LLVM_DEBUG(
dbgs() <<
"Memory dependencies do not allow fusion!\n");
845 ++InvalidDependencies;
846 reportLoopFusion<OptimizationRemarkMissed>(
847 FC0, FC1,
"InvalidDependencies",
"Dependencies prevent fusion");
854 SmallVector<Instruction *, 4> SafeToHoist;
855 SmallVector<Instruction *, 4> SafeToSink;
859 if (!isEmptyPreheader(FC1)) {
865 if (!collectMovablePreheaderInsts(FC0, FC1, SafeToHoist,
868 "Fusion Candidate Pre-header.\n"
871 reportLoopFusion<OptimizationRemarkMissed>(
872 FC0, FC1,
"NonEmptyPreheader",
873 "Loop has a non-empty preheader with instructions that "
879 bool BeneficialToFuse = isBeneficialFusion(FC0, FC1);
881 << (BeneficialToFuse ?
"" :
"un") <<
"profitable!\n");
882 if (!BeneficialToFuse) {
883 ++FusionNotBeneficial;
884 reportLoopFusion<OptimizationRemarkMissed>(
885 FC0, FC1,
"FusionNotBeneficial",
"Fusion is not beneficial");
893 movePreheaderInsts(FC0, FC1, SafeToHoist, SafeToSink);
895 LLVM_DEBUG(
dbgs() <<
"\tFusion is performed: " << FC0 <<
" and " << FC1
898 FusionCandidate FC0Copy = FC0;
901 bool Peel = TCDifference && *TCDifference > 0;
903 peelFusionCandidate(FC0Copy, FC1, *TCDifference);
910 reportLoopFusion<OptimizationRemark>((Peel ? FC0Copy : FC0), FC1,
911 "FuseCounter",
"Loops fused");
913 FusionCandidate FusedCand(performFusion((Peel ? FC0Copy : FC0), FC1),
914 DT, &PDT, ORE, FC0Copy.PP);
916 assert(FusedCand.isEligibleForFusion(SE) &&
917 "Fused candidate should be eligible for fusion!");
920 LDT.removeLoop(FC1.L);
923 It = CandidateList.erase(It);
924 It = CandidateList.erase(It);
925 It = CandidateList.insert(It, FusedCand);
930 LLVM_DEBUG(
dbgs() <<
"Candidate List (after fusion): " << CandidateList
944 bool canHoistInst(Instruction &
I,
945 const SmallVector<Instruction *, 4> &SafeToHoist,
946 const SmallVector<Instruction *, 4> &NotHoisting,
947 const FusionCandidate &FC0)
const {
949 assert(FC0PreheaderTarget &&
950 "Expected single successor for loop preheader.");
952 for (Use &
Op :
I.operands()) {
957 if (!(OpHoisted || DT.dominates(OpInst, FC0PreheaderTarget))) {
969 if (!
I.mayReadOrWriteMemory())
972 LLVM_DEBUG(
dbgs() <<
"Checking if this mem inst can be hoisted.\n");
973 for (Instruction *NotHoistedInst : NotHoisting) {
974 if (
auto D = DI.depends(&
I, NotHoistedInst)) {
977 if (
D->isFlow() ||
D->isAnti() ||
D->isOutput()) {
979 "preheader that is not being hoisted.\n");
985 for (Instruction *ReadInst : FC0.MemReads) {
986 if (
auto D = DI.depends(ReadInst, &
I)) {
989 LLVM_DEBUG(
dbgs() <<
"Inst depends on a read instruction in FC0.\n");
995 for (Instruction *WriteInst : FC0.MemWrites) {
996 if (
auto D = DI.depends(WriteInst, &
I)) {
998 if (
D->isFlow() ||
D->isOutput()) {
999 LLVM_DEBUG(
dbgs() <<
"Inst depends on a write instruction in FC0.\n");
1010 bool canSinkInst(Instruction &
I,
const FusionCandidate &FC1)
const {
1011 for (User *U :
I.users()) {
1024 if (!
I.mayReadOrWriteMemory())
1027 for (Instruction *ReadInst : FC1.MemReads) {
1028 if (
auto D = DI.depends(&
I, ReadInst)) {
1031 LLVM_DEBUG(
dbgs() <<
"Inst depends on a read instruction in FC1.\n");
1037 for (Instruction *WriteInst : FC1.MemWrites) {
1038 if (
auto D = DI.depends(&
I, WriteInst)) {
1040 if (
D->isOutput() ||
D->isAnti()) {
1041 LLVM_DEBUG(
dbgs() <<
"Inst depends on a write instruction in FC1.\n");
1052 bool collectMovablePreheaderInsts(
1053 const FusionCandidate &FC0,
const FusionCandidate &FC1,
1054 SmallVector<Instruction *, 4> &SafeToHoist,
1055 SmallVector<Instruction *, 4> &SafeToSink)
const {
1059 SmallVector<Instruction *, 4> NotHoisting;
1061 for (Instruction &
I : *FC1Preheader) {
1063 if (&
I == FC1Preheader->getTerminator())
1069 if (
I.mayThrow() || !
I.willReturn()) {
1070 LLVM_DEBUG(
dbgs() <<
"Inst: " <<
I <<
" may throw or won't return.\n");
1076 if (
I.isAtomic() ||
I.isVolatile()) {
1078 dbgs() <<
"\tInstruction is volatile or atomic. Cannot move it.\n");
1082 if (canHoistInst(
I, SafeToHoist, NotHoisting, FC0)) {
1089 if (canSinkInst(
I, FC1)) {
1099 dbgs() <<
"All preheader instructions could be sunk or hoisted!\n");
1105 bool dependencesAllowFusion(
const FusionCandidate &FC0,
1106 const FusionCandidate &FC1, Instruction &I0,
1110 LLVM_DEBUG(
dbgs() <<
"Check dep: " << I0 <<
" vs " << I1 <<
"\n");
1113 auto DepResult = DI.depends(&I0, &I1);
1119 dbgs() <<
" [#l: " << DepResult->getLevels() <<
"][Ordered: "
1120 << (DepResult->isOrdered() ?
"true" :
"false")
1122 LLVM_DEBUG(
dbgs() <<
"DepResult Levels: " << DepResult->getLevels()
1126 unsigned Levels = DepResult->getLevels();
1127 unsigned SameSDLevels = DepResult->getSameSDLevels();
1131 if (CurLoopLevel > Levels + SameSDLevels)
1135 for (
unsigned Level = 1;
Level <= std::min(CurLoopLevel - 1, Levels);
1137 unsigned Direction = DepResult->getDirection(Level,
false);
1143 LLVM_DEBUG(
dbgs() <<
"Safe to fuse due to non-equal acceses in the "
1150 assert(CurLoopLevel > Levels &&
"Fusion candidates are not separated");
1152 if (DepResult->isScalar(CurLoopLevel,
true)) {
1153 if (DepResult->isInput() || DepResult->isOutput()) {
1155 << (DepResult->isInput() ?
"input" :
"output")
1156 <<
" dependency\n");
1161 dbgs() <<
"Not safe to fuse due to a scalar flow dependency\n");
1165 unsigned CurDir = DepResult->getDirection(CurLoopLevel,
true);
1175 LLVM_DEBUG(
dbgs() <<
"Safe to fuse with no backward loop-carried "
1181 if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor())
1182 LLVM_DEBUG(
dbgs() <<
"TODO: Implement pred/succ dependence handling!\n");
1188 bool dependencesAllowFusion(
const FusionCandidate &FC0,
1189 const FusionCandidate &FC1) {
1190 LLVM_DEBUG(
dbgs() <<
"Check if " << FC0 <<
" can be fused with " << FC1
1193 assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock()));
1195 for (Instruction *WriteL0 : FC0.MemWrites) {
1196 for (Instruction *WriteL1 : FC1.MemWrites)
1197 if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1)) {
1200 for (Instruction *ReadL1 : FC1.MemReads)
1201 if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1)) {
1208 for (Instruction *ReadL0 : FC0.MemReads)
1209 for (Instruction *WriteL1 : FC1.MemWrites)
1210 if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1)) {
1216 for (BasicBlock *BB : FC1.L->
blocks())
1217 for (Instruction &
I : *BB)
1218 for (
auto &
Op :
I.operands())
1239 bool isStrictlyAdjacent(
const FusionCandidate &FC0,
1240 const FusionCandidate &FC1)
const {
1242 if (FC0.GuardBranch)
1243 return DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock()) &&
1245 return FC0.ExitBlock == FC1.getEntryBlock();
1248 bool isEmptyPreheader(
const FusionCandidate &FC)
const {
1249 return FC.Preheader->size() == 1;
1254 void movePreheaderInsts(
const FusionCandidate &FC0,
1255 const FusionCandidate &FC1,
1256 SmallVector<Instruction *, 4> &HoistInsts,
1257 SmallVector<Instruction *, 4> &SinkInsts)
const {
1260 "Attempting to sink and hoist preheader instructions, but not all "
1261 "the preheader instructions are accounted for.");
1263 NumHoistedInsts += HoistInsts.
size();
1264 NumSunkInsts += SinkInsts.
size();
1267 if (!HoistInsts.
empty())
1268 dbgs() <<
"Hoisting: \n";
1269 for (Instruction *
I : HoistInsts)
1270 dbgs() << *
I <<
"\n";
1271 if (!SinkInsts.
empty())
1272 dbgs() <<
"Sinking: \n";
1273 for (Instruction *
I : SinkInsts)
1274 dbgs() << *
I <<
"\n";
1277 for (Instruction *
I : HoistInsts) {
1278 assert(
I->getParent() == FC1.Preheader);
1279 I->moveBefore(*FC0.Preheader,
1283 for (Instruction *
I :
reverse(SinkInsts)) {
1284 assert(
I->getParent() == FC1.Preheader);
1292 "Expected the sunk PHI node to have 1 incoming value.");
1293 I->replaceAllUsesWith(
I->getOperand(0));
1294 I->eraseFromParent();
1312 bool haveIdenticalGuards(
const FusionCandidate &FC0,
1313 const FusionCandidate &FC1)
const {
1314 assert(FC0.GuardBranch && FC1.GuardBranch &&
1315 "Expecting FC0 and FC1 to be guarded loops.");
1319 if ((!FC0CmpInst || !FC1CmpInst) &&
1323 if (FC0CmpInst && FC1CmpInst && !FC0CmpInst->isIdenticalTo(FC1CmpInst))
1330 return (FC1.GuardBranch->
getSuccessor(0) == FC1.Preheader);
1331 return (FC1.GuardBranch->
getSuccessor(1) == FC1.Preheader);
1336 void simplifyLatchBranch(
const FusionCandidate &FC)
const {
1338 if (FCLatchBranch) {
1340 "Expecting the two successors of FCLatchBranch to be the same");
1341 UncondBrInst *NewBranch =
1349 void mergeLatch(
const FusionCandidate &FC0,
const FusionCandidate &FC1) {
1386 Loop *performFusion(
const FusionCandidate &FC0,
const FusionCandidate &FC1) {
1387 assert(FC0.isValid() && FC1.isValid() &&
1388 "Expecting valid fusion candidates");
1391 dbgs() <<
"Fusion Candidate 1: \n"; FC1.dump(););
1400 if (FC0.GuardBranch)
1401 return fuseGuardedLoops(FC0, FC1);
1418 if (FC0.ExitingBlock != FC0.Latch)
1419 for (PHINode &
PHI : FC0.Header->
phis())
1450 DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader));
1452 DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
1455 DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader));
1461 DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
1464 DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
1465 new UnreachableInst(FC0.ExitBlock->
getContext(), FC0.ExitBlock);
1471 new UnreachableInst(FC1.Preheader->
getContext(), FC1.Preheader);
1473 DominatorTree::Delete, FC1.Preheader, FC1.Header));
1477 if (SE.isSCEVable(
PHI->getType()))
1478 SE.forgetValue(
PHI);
1479 if (
PHI->hasNUsesOrMore(1))
1482 PHI->eraseFromParent();
1490 for (PHINode *LCPHI : OriginalFC0PHIs) {
1491 int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
1492 assert(L1LatchBBIdx >= 0 &&
1493 "Expected loop carried value to be rewired at this point!");
1495 Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
1497 PHINode *L1HeaderPHI =
1504 LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
1513 simplifyLatchBranch(FC0);
1517 if (FC0.Latch != FC0.ExitingBlock)
1519 DominatorTree::Insert, FC0.Latch, FC1.Header));
1521 TreeUpdates.
emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
1522 FC0.Latch, FC0.Header));
1523 TreeUpdates.
emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
1524 FC1.Latch, FC0.Header));
1525 TreeUpdates.
emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
1526 FC1.Latch, FC1.Header));
1529 DTU.applyUpdates(TreeUpdates);
1531 LI.removeBlock(FC1.Preheader);
1532 DTU.deleteBB(FC1.Preheader);
1534 LI.removeBlock(FC0.ExitBlock);
1535 DTU.deleteBB(FC0.ExitBlock);
1544 SE.forgetLoop(FC1.L);
1545 SE.forgetLoop(FC0.L);
1548 SmallVector<BasicBlock *, 8> Blocks(FC1.L->
blocks());
1549 for (BasicBlock *BB : Blocks) {
1552 if (LI.getLoopFor(BB) != FC1.L)
1554 LI.changeLoopFor(BB, FC0.L);
1557 const auto &ChildLoopIt = FC1.L->
begin();
1558 Loop *ChildLoop = *ChildLoopIt;
1569 SE.forgetBlockAndLoopDispositions();
1573 mergeLatch(FC0, FC1);
1577 assert(DT.verify(DominatorTree::VerificationLevel::Fast));
1600 template <
typename RemarkKind>
1601 void reportLoopFusion(
const FusionCandidate &FC0,
const FusionCandidate &FC1,
1602 StringRef RemarkName, StringRef RemarkMsg) {
1603 assert(FC0.Preheader && FC1.Preheader &&
1604 "Expecting valid fusion candidates");
1605 using namespace ore;
1609 <<
"]: " <<
NV(
"Cand1", StringRef(FC0.Preheader->
getName())) <<
" and "
1610 <<
NV(
"Cand2", StringRef(FC1.Preheader->
getName())) <<
": "
1629 Loop *fuseGuardedLoops(
const FusionCandidate &FC0,
1630 const FusionCandidate &FC1) {
1631 assert(FC0.GuardBranch && FC1.GuardBranch &&
"Expecting guarded loops");
1633 BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent();
1634 BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent();
1635 BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();
1636 BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();
1644 (FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), *FC1.ExitBlock,
1651 assert(FC0NonLoopBlock == FC1GuardBlock &&
"Loops are not adjacent");
1664 FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock);
1666 BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock;
1670 FC1.GuardBranch->eraseFromParent();
1671 new UnreachableInst(FC1GuardBlock->
getContext(), FC1GuardBlock);
1674 DominatorTree::Delete, FC1GuardBlock, FC1.Preheader));
1676 DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock));
1678 DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock));
1680 DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock));
1684 DominatorTree::Delete, FC0.ExitBlock, FC0ExitBlockSuccessor));
1687 DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock));
1689 new UnreachableInst(FC0ExitBlockSuccessor->
getContext(),
1690 FC0ExitBlockSuccessor);
1694 "Expecting guard block to have no predecessors");
1696 "Expecting guard block to have no successors");
1711 if (FC0.ExitingBlock != FC0.Latch)
1712 for (PHINode &
PHI : FC0.Header->
phis())
1715 assert(OriginalFC0PHIs.
empty() &&
"Expecting OriginalFC0PHIs to be empty!");
1738 DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
1740 DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
1751 new UnreachableInst(FC0.ExitBlock->
getContext(), FC0.ExitBlock);
1757 new UnreachableInst(FC1.Preheader->
getContext(), FC1.Preheader);
1759 DominatorTree::Delete, FC1.Preheader, FC1.Header));
1763 if (SE.isSCEVable(
PHI->getType()))
1764 SE.forgetValue(
PHI);
1765 if (
PHI->hasNUsesOrMore(1))
1768 PHI->eraseFromParent();
1776 for (PHINode *LCPHI : OriginalFC0PHIs) {
1777 int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
1778 assert(L1LatchBBIdx >= 0 &&
1779 "Expected loop carried value to be rewired at this point!");
1781 Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
1783 PHINode *L1HeaderPHI =
1790 LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
1801 simplifyLatchBranch(FC0);
1805 if (FC0.Latch != FC0.ExitingBlock)
1807 DominatorTree::Insert, FC0.Latch, FC1.Header));
1809 TreeUpdates.
emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
1810 FC0.Latch, FC0.Header));
1811 TreeUpdates.
emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
1812 FC1.Latch, FC0.Header));
1813 TreeUpdates.
emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
1814 FC1.Latch, FC1.Header));
1823 DTU.applyUpdates(TreeUpdates);
1825 LI.removeBlock(FC1GuardBlock);
1826 LI.removeBlock(FC1.Preheader);
1827 LI.removeBlock(FC0.ExitBlock);
1829 LI.removeBlock(FC0ExitBlockSuccessor);
1830 DTU.deleteBB(FC0ExitBlockSuccessor);
1832 DTU.deleteBB(FC1GuardBlock);
1833 DTU.deleteBB(FC1.Preheader);
1834 DTU.deleteBB(FC0.ExitBlock);
1841 SE.forgetLoop(FC1.L);
1842 SE.forgetLoop(FC0.L);
1845 SmallVector<BasicBlock *, 8> Blocks(FC1.L->
blocks());
1846 for (BasicBlock *BB : Blocks) {
1849 if (LI.getLoopFor(BB) != FC1.L)
1851 LI.changeLoopFor(BB, FC0.L);
1854 const auto &ChildLoopIt = FC1.L->
begin();
1855 Loop *ChildLoop = *ChildLoopIt;
1866 SE.forgetBlockAndLoopDispositions();
1870 mergeLatch(FC0, FC1);
1874 assert(DT.verify(DominatorTree::VerificationLevel::Fast));
1901 for (
auto &L : LI) {
1908 LoopFuser LF(LI, DT, DI, SE, PDT, ORE, AC,
TTI);
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static bool reportInvalidCandidate(const Instruction &I, llvm::Statistic &Stat)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
static cl::opt< uint32_t > FusionPeelMaxCount("loop-fusion-peel-max-count", cl::init(0), cl::Hidden, cl::desc("Max number of iterations to be peeled from a loop, such that " "fusion can take place"))
static void printFusionCandidates(const FusionCandidateCollection &FusionCandidates)
std::list< FusionCandidate > FusionCandidateList
SmallVector< FusionCandidateList, 4 > FusionCandidateCollection
static void printLoopVector(const LoopVector &LV)
SmallVector< Loop *, 4 > LoopVector
static cl::opt< bool > VerboseFusionDebugging("loop-fusion-verbose-debug", cl::desc("Enable verbose debugging for Loop Fusion"), cl::Hidden, cl::init(false))
This file implements the Loop Fusion pass.
Loop::LoopBounds::Direction Direction
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
A function analysis which provides an AssumptionCache.
LLVM Basic Block Representation.
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator begin()
Instruction iterator methods.
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Function * getParent() const
Return the enclosing method, or null if none.
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
const Instruction & front() const
LLVM_ABI void replacePhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block to refer to basic block New instead of basic block Old.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
InstListType::iterator iterator
Instruction iterators...
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Conditional Branch instruction.
Value * getCondition() const
BasicBlock * getSuccessor(unsigned i) const
AnalysisPass to compute dependence information in a function.
Analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
LLVM_ABI void insertBefore(InstListType::iterator InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified position.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Analysis pass that exposes the LoopInfo for a function.
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void removeBlockFromLoop(BlockT *BB)
This removes the specified basic block from the current loop, updating the Blocks as appropriate.
unsigned getLoopDepth() const
Return the nesting level of this loop.
iterator_range< block_iterator > blocks() const
void addChildLoop(LoopT *NewChild)
Add the specified loop to be a child of this loop.
void addBlockEntry(BlockT *BB)
This adds a basic block directly to the basic block list.
LoopT * removeChildLoop(iterator I)
This removes the specified child from being a subloop of this loop.
LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
reverse_iterator rend() const
reverse_iterator rbegin() const
Represents a single loop in the control flow graph.
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Analysis pass which computes a PostDominatorTree.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e.
Analysis pass providing the TargetTransformInfo.
static UncondBrInst * Create(BasicBlock *Target, InsertPosition InsertBefore=nullptr)
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Type * getType() const
All values are typed, get the type of this value.
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
self_iterator getIterator()
This class implements an extremely fast bulk output stream that can only output to a stream.
@ BasicBlock
Various leaf nodes.
@ Valid
The data is already valid.
initializer< Ty > init(const Ty &Val)
Add a small namespace to avoid name clashes with the classes used in the streaming interface.
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< DefNode * > Def
LLVM_ABI iterator begin() const
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
LLVM_ABI void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
FunctionAddr VTableAddr Value
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool succ_empty(const Instruction *I)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
LLVM_ABI void moveInstructionsToTheEnd(BasicBlock &FromBB, BasicBlock &ToBB, DominatorTree &DT, const PostDominatorTree &PDT, DependenceInfo &DI, ScalarEvolution &SE)
Move instructions, in an order-preserving manner, from FromBB to the end of ToBB when proven safe.
LLVM_ABI void moveInstructionsToTheBeginning(BasicBlock &FromBB, BasicBlock &ToBB, DominatorTree &DT, const PostDominatorTree &PDT, DependenceInfo &DI, ScalarEvolution &SE)
Move instructions, in an order-preserving manner, from FromBB to the beginning of ToBB when proven sa...
LLVM_ABI bool canPeel(const Loop *L)
auto reverse(ContainerTy &&C)
LLVM_ABI TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
LLVM_ABI void peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI, ScalarEvolution *SE, DominatorTree &DT, AssumptionCache *AC, bool PreserveLCSSA, ValueToValueMapTy &VMap)
VMap is the value-map that maps instructions from the original loop to instructions in the last peele...
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
LLVM_ABI void printLoop(const Loop &L, raw_ostream &OS, const std::string &Banner="")
Function to print a loop's contents as LLVM's text IR assembly.
DWARFExpression::Operation Op
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
bool pred_empty(const BasicBlock *BB)
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI bool isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint, DominatorTree &DT, const PostDominatorTree *PDT=nullptr, DependenceInfo *DI=nullptr, bool CheckForEntireBlock=false)
Return true if I can be safely moved before InsertPoint.