55 #define DEBUG_TYPE "loop-interchange"
57 STATISTIC(LoopsInterchanged,
"Number of loops interchanged");
61 cl::desc(
"Interchange if you gain more than this number"));
68 using CharMatrix = std::vector<std::vector<char>>;
78 #ifdef DUMP_DEP_MATRICIES
79 static void printDepMatrix(CharMatrix &DepMatrix) {
80 for (
auto &Row : DepMatrix) {
98 if (!isa<Instruction>(
I))
100 if (
auto *Ld = dyn_cast<LoadInst>(&
I)) {
103 MemInstr.push_back(&
I);
104 }
else if (
auto *St = dyn_cast<StoreInst>(&
I)) {
107 MemInstr.push_back(&
I);
113 <<
" Loads and Stores to analyze\n");
115 ValueVector::iterator
I,
IE, J, JE;
117 for (
I = MemInstr.begin(),
IE = MemInstr.end();
I !=
IE; ++
I) {
118 for (J =
I, JE = MemInstr.end(); J != JE; ++J) {
119 std::vector<char> Dep;
125 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
128 if (
auto D = DI->
depends(Src, Dst,
true)) {
129 assert(
D->isOrdered() &&
"Expected an output, flow or anti dep.");
131 D->isFlow() ?
"flow" :
D->isAnti() ?
"anti" :
"output";
132 dbgs() <<
"Found " << DepType
133 <<
" dependency between Src and Dst\n"
134 <<
" Src:" << *Src <<
"\n Dst:" << *Dst <<
'\n');
135 unsigned Levels =
D->getLevels();
137 for (
unsigned II = 1; II <= Levels; ++II) {
138 const SCEV *Distance =
D->getDistance(II);
140 dyn_cast_or_null<SCEVConstant>(Distance);
150 }
else if (
D->isScalar(II)) {
154 unsigned Dir =
D->getDirection(II);
168 while (Dep.size() !=
Level) {
172 DepMatrix.push_back(Dep);
175 <<
" dependencies inside loop\n");
189 for (
unsigned I = 0,
E = DepMatrix.size();
I <
E; ++
I)
190 std::swap(DepMatrix[
I][ToIndx], DepMatrix[
I][FromIndx]);
197 for (
unsigned i = 0;
i <= Column; ++
i) {
198 if (DepMatrix[Row][
i] ==
'<')
200 if (DepMatrix[Row][
i] ==
'>')
210 for (
unsigned i = 0;
i < Column; ++
i) {
211 if (DepMatrix[Row][
i] !=
'=' && DepMatrix[Row][
i] !=
'S' &&
212 DepMatrix[Row][
i] !=
'I')
219 unsigned OuterLoopId,
char InnerDep,
224 if (InnerDep == OuterDep)
230 if (InnerDep ==
'=' || InnerDep ==
'S' || InnerDep ==
'I')
236 if (InnerDep ==
'>') {
239 if (OuterLoopId == 0)
257 unsigned InnerLoopId,
258 unsigned OuterLoopId) {
259 unsigned NumRows = DepMatrix.size();
261 for (
unsigned Row = 0; Row < NumRows; ++Row) {
262 char InnerDep = DepMatrix[Row][InnerLoopId];
263 char OuterDep = DepMatrix[Row][OuterLoopId];
264 if (InnerDep ==
'*' || OuterDep ==
'*')
277 Loop *CurrentLoop = &L;
278 const std::vector<Loop *> *Vec = &CurrentLoop->
getSubLoops();
279 while (!Vec->empty()) {
283 if (Vec->size() != 1)
286 LoopList.push_back(CurrentLoop);
287 CurrentLoop = Vec->front();
290 LoopList.push_back(CurrentLoop);
297 return InnerIndexVar;
307 dyn_cast<SCEVAddRecExpr>(SE->
getSCEV(PhiVar));
311 if (!isa<SCEVConstant>(Step))
324 class LoopInterchangeLegality {
328 : OuterLoop(
Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
331 bool canInterchangeLoops(
unsigned InnerLoopId,
unsigned OuterLoopId,
332 CharMatrix &DepMatrix);
336 bool isLoopStructureUnderstood(
PHINode *InnerInductionVar);
338 bool currentLimitations();
341 return OuterInnerReductions;
345 bool tightlyNested(
Loop *Outer,
Loop *Inner);
352 bool findInductionAndReductions(
Loop *L,
371 class LoopInterchangeProfitability {
375 : OuterLoop(
Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
378 bool isProfitable(
unsigned InnerLoopId,
unsigned OuterLoopId,
379 CharMatrix &DepMatrix);
382 int getInstrOrderCost();
395 class LoopInterchangeTransform {
400 const LoopInterchangeLegality &LIL)
401 : OuterLoop(
Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
402 LoopExit(LoopNestExit), LIL(LIL) {}
406 void restructureLoops(
Loop *NewInner,
Loop *NewOuter,
409 void removeChildLoop(
Loop *OuterLoop,
Loop *InnerLoop);
412 bool adjustLoopLinks();
413 bool adjustLoopBranches();
425 const LoopInterchangeLegality &LIL;
428 struct LoopInterchange {
439 : SE(SE), LI(LI), DI(DI), DT(DT), ORE(ORE) {}
449 const auto &LoopList = LN.
getLoops();
450 for (
unsigned I = 1;
I < LoopList.size(); ++
I)
451 if (LoopList[
I]->getParentLoop() != LoopList[
I - 1])
453 return processLoopList(LoopList);
457 for (
Loop *L : LoopList) {
459 if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
478 return LoopList.
size() - 1;
482 bool Changed =
false;
483 unsigned LoopNestDepth = LoopList.
size();
484 if (LoopNestDepth < 2) {
485 LLVM_DEBUG(
dbgs() <<
"Loop doesn't contain minimum nesting level.\n");
493 if (!isComputableLoopNest(LoopList)) {
494 LLVM_DEBUG(
dbgs() <<
"Not valid loop candidate for interchange\n");
498 LLVM_DEBUG(
dbgs() <<
"Processing LoopList of size = " << LoopNestDepth
501 CharMatrix DependencyMatrix;
502 Loop *OuterMostLoop = *(LoopList.
begin());
504 OuterMostLoop, DI)) {
508 #ifdef DUMP_DEP_MATRICIES
510 printDepMatrix(DependencyMatrix);
520 unsigned SelecLoopId = selectLoopForInterchange(LoopList);
522 Loop *LoopToBeInterchanged = LoopList[SelecLoopId];
523 for (
unsigned i = SelecLoopId;
i > 0;
i--) {
524 bool Interchanged = processLoop(LoopToBeInterchanged, LoopList[
i - 1],
i,
525 i - 1, LoopNestExit, DependencyMatrix);
530 #ifdef DUMP_DEP_MATRICIES
532 printDepMatrix(DependencyMatrix);
534 Changed |= Interchanged;
539 bool processLoop(
Loop *InnerLoop,
Loop *OuterLoop,
unsigned InnerLoopId,
540 unsigned OuterLoopId,
BasicBlock *LoopNestExit,
541 std::vector<std::vector<char>> &DependencyMatrix) {
543 <<
" and OuterLoopId = " << OuterLoopId <<
"\n");
544 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
545 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
546 LLVM_DEBUG(
dbgs() <<
"Not interchanging loops. Cannot prove legality.\n");
550 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
551 if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
560 <<
"Loop interchanged with enclosing loop.";
563 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LoopNestExit,
570 "Inner loop not left in LCSSA form after loop interchange!");
572 "Outer loop not left in LCSSA form after loop interchange!");
580 bool LoopInterchangeLegality::containsUnsafeInstructions(
BasicBlock *
BB) {
582 return I.mayHaveSideEffects() ||
I.mayReadFromMemory();
586 bool LoopInterchangeLegality::tightlyNested(
Loop *OuterLoop,
Loop *InnerLoop) {
598 if (!OuterLoopHeaderBI)
602 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->
getHeader() &&
603 Succ != OuterLoopLatch)
606 LLVM_DEBUG(
dbgs() <<
"Checking instructions in Loop header and Loop latch\n");
609 if (containsUnsafeInstructions(OuterLoopHeader) ||
610 containsUnsafeInstructions(OuterLoopLatch))
616 if (InnerLoopPreHeader != OuterLoopHeader &&
617 containsUnsafeInstructions(InnerLoopPreHeader))
625 bool LoopInterchangeLegality::isLoopStructureUnderstood(
629 for (
unsigned i = 0;
i < Num; ++
i) {
631 if (isa<Constant>(Val))
641 InnerLoopPreheader &&
652 PHINode *PHI = dyn_cast<PHINode>(SV);
664 if (isa<Constant>(V))
669 if (PHI->getNumIncomingValues() == 1)
681 bool LoopInterchangeLegality::findInductionAndReductions(
689 Inductions.push_back(&PHI);
694 if (!OuterInnerReductions.count(&PHI)) {
696 "across the outer loop.\n");
700 assert(PHI.getNumIncomingValues() == 2 &&
701 "Phis in loop header should have exactly 2 incoming values");
710 <<
"Failed to recognize PHI as an induction or reduction.\n");
713 OuterInnerReductions.insert(&PHI);
714 OuterInnerReductions.insert(InnerRedPhi);
723 bool LoopInterchangeLegality::currentLimitations() {
734 dbgs() <<
"Loops where the latch is not the exiting block are not"
735 <<
" supported currently.\n");
740 <<
"Loops where the latch is not the exiting block cannot be"
741 " interchange currently.";
748 if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
750 dbgs() <<
"Only outer loops with induction or reduction PHI nodes "
751 <<
"are supported currently.\n");
756 <<
"Only outer loops with induction or reduction PHI nodes can be"
757 " interchanged currently.";
763 if (Inductions.size() != 1) {
764 LLVM_DEBUG(
dbgs() <<
"Loops with more than 1 induction variables are not "
765 <<
"supported currently.\n");
770 <<
"Only outer loops with 1 induction variable can be "
771 "interchanged currently.";
777 if (!findInductionAndReductions(InnerLoop, Inductions,
nullptr)) {
779 dbgs() <<
"Only inner loops with induction or reduction PHI nodes "
780 <<
"are supported currently.\n");
785 <<
"Only inner loops with induction or reduction PHI nodes can be"
786 " interchange currently.";
792 if (Inductions.size() != 1) {
794 dbgs() <<
"We currently only support loops with 1 induction variable."
795 <<
"Failed to interchange due to current limitation\n");
800 <<
"Only inner loops with 1 induction variable can be "
801 "interchanged currently.";
808 if (!isLoopStructureUnderstood(InnerInductionVar)) {
814 <<
"Inner loop structure not understood currently.";
837 if (!InnerIndexVarInc) {
839 dbgs() <<
"Did not find an instruction to increment the induction "
845 <<
"The inner loop does not increment the induction variable.";
854 bool FoundInduction =
false;
857 if (isa<BranchInst>(
I) || isa<CmpInst>(
I) || isa<TruncInst>(
I) ||
863 if (!
I.isIdenticalTo(InnerIndexVarInc)) {
864 LLVM_DEBUG(
dbgs() <<
"Found unsupported instructions between induction "
865 <<
"variable increment and branch.\n");
870 <<
"Found unsupported instruction between induction variable "
871 "increment and branch.";
876 FoundInduction =
true;
881 if (!FoundInduction) {
887 <<
"Did not find the induction variable.";
903 if (PHI.getNumIncomingValues() > 1)
905 if (
any_of(PHI.users(), [&Reductions, OuterL](
User *U) {
906 PHINode *PN = dyn_cast<PHINode>(U);
908 (!Reductions.count(PN) && OuterL->contains(PN->getParent()));
929 if (PHI.getType()->isFloatingPointTy())
931 for (
unsigned i = 0;
i < PHI.getNumIncomingValues();
i++) {
932 Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(
i));
953 bool LoopInterchangeLegality::canInterchangeLoops(
unsigned InnerLoopId,
954 unsigned OuterLoopId,
955 CharMatrix &DepMatrix) {
957 LLVM_DEBUG(
dbgs() <<
"Failed interchange InnerLoopId = " << InnerLoopId
958 <<
" and OuterLoopId = " << OuterLoopId
959 <<
" due to dependence\n");
964 <<
"Cannot interchange loops due to dependences.";
969 for (
auto *
BB : OuterLoop->
blocks())
971 if (
CallInst *CI = dyn_cast<CallInst>(&
I)) {
973 if (CI->doesNotReadMemory())
976 dbgs() <<
"Loops with call instructions cannot be interchanged "
982 <<
"Cannot interchange loops due to call instruction.";
990 if (currentLimitations()) {
991 LLVM_DEBUG(
dbgs() <<
"Not legal because of current transform limitation\n");
996 if (!tightlyNested(OuterLoop, InnerLoop)) {
1002 <<
"Cannot interchange loops because they are not tightly "
1009 OuterInnerReductions)) {
1010 LLVM_DEBUG(
dbgs() <<
"Found unsupported PHI nodes in inner loop exit.\n");
1015 <<
"Found unsupported PHI node in loop exit.";
1021 LLVM_DEBUG(
dbgs() <<
"Found unsupported PHI nodes in outer loop exit.\n");
1026 <<
"Found unsupported PHI node in loop exit.";
1034 int LoopInterchangeProfitability::getInstrOrderCost() {
1035 unsigned GoodOrder, BadOrder;
1036 BadOrder = GoodOrder = 0;
1040 unsigned NumOp =
GEP->getNumOperands();
1041 bool FoundInnerInduction =
false;
1042 bool FoundOuterInduction =
false;
1043 for (
unsigned i = 0;
i < NumOp; ++
i) {
1058 if (AR->
getLoop() == InnerLoop) {
1061 FoundInnerInduction =
true;
1062 if (FoundOuterInduction) {
1072 if (AR->
getLoop() == OuterLoop) {
1075 FoundOuterInduction =
true;
1076 if (FoundInnerInduction) {
1085 return GoodOrder - BadOrder;
1089 unsigned OuterLoopId,
1090 CharMatrix &DepMatrix) {
1094 for (
auto &Row : DepMatrix) {
1095 if (Row[InnerLoopId] !=
'S' && Row[InnerLoopId] !=
'I')
1098 if (Row[OuterLoopId] !=
'=')
1104 return !DepMatrix.empty();
1107 bool LoopInterchangeProfitability::isProfitable(
unsigned InnerLoopId,
1108 unsigned OuterLoopId,
1109 CharMatrix &DepMatrix) {
1118 int Cost = getInstrOrderCost();
1132 <<
"Interchanging loops is too costly (cost="
1133 <<
ore::NV(
"Cost", Cost) <<
", threshold="
1135 <<
") and it does not improve parallelism.";
1140 void LoopInterchangeTransform::removeChildLoop(
Loop *OuterLoop,
1142 for (
Loop *L : *OuterLoop)
1143 if (L == InnerLoop) {
1144 OuterLoop->removeChildLoop(L);
1173 void LoopInterchangeTransform::restructureLoops(
1183 if (OuterLoopParent) {
1185 removeChildLoop(OuterLoopParent, NewInner);
1186 removeChildLoop(NewInner, NewOuter);
1189 removeChildLoop(NewInner, NewOuter);
1214 if (
BB == OuterHeader ||
BB == OuterLatch)
1231 bool Transformed =
false;
1238 if (!InductionPHI) {
1239 LLVM_DEBUG(
dbgs() <<
"Failed to find the point to split loop latch \n");
1262 auto MoveInstructions = [&
i, &WorkList,
this, InductionPHI, NewLatch]() {
1263 for (;
i < WorkList.size();
i++) {
1269 "Moving instructions with side-effects may change behavior of "
1272 Instruction *UserI = cast<Instruction>(U.getUser());
1274 UserI->
getParent() == NewLatch || UserI == InductionPHI)
1279 for (
Value *
Op : WorkList[
i]->operands()) {
1283 OpI == InductionPHI)
1285 WorkList.insert(OpI);
1295 WorkList.insert(CondI);
1297 WorkList.insert(cast<Instruction>(InnerIndexVar));
1312 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1313 if (InnerLoopPreHeader != OuterLoopHeader) {
1317 std::prev(InnerLoopPreHeader->
end()))))
1321 Transformed |= adjustLoopLinks();
1336 ToList.splice(InsertBefore->
getIterator(), FromList, FromList.begin(),
1347 I->removeFromParent();
1362 std::vector<DominatorTree::UpdateType> &DTUpdates,
1363 bool MustUpdateOnce =
true) {
1364 assert((!MustUpdateOnce ||
1368 }) == 1) &&
"BI must jump to OldBB exactly once.");
1369 bool Changed =
false;
1377 DTUpdates.push_back(
1379 DTUpdates.push_back(
1382 assert(Changed &&
"Expected a successor to be updated");
1399 assert(
P.getNumIncomingValues() == 1 &&
1400 "Only loops with a single exit are supported!");
1403 auto IncI = cast<Instruction>(
P.getIncomingValueForBlock(InnerLatch));
1406 if (IncI->getParent() != InnerLatch && IncI->
getParent() != InnerHeader)
1410 [OuterHeader, OuterExit, IncI, InnerHeader](
User *U) {
1411 return (cast<PHINode>(U)->getParent() == OuterHeader &&
1412 IncI->getParent() == InnerHeader) ||
1413 cast<PHINode>(U)->getParent() == OuterExit;
1415 "Can only replace phis iff the uses are in the loop nest exit or "
1416 "the incoming value is defined in the inner header (it will "
1417 "dominate all loop blocks after interchanging)");
1418 P.replaceAllUsesWith(IncI);
1419 P.eraseFromParent();
1424 LcssaInnerExit.push_back(&
P);
1428 LcssaInnerLatch.push_back(&
P);
1449 if (
P.getNumIncomingValues() != 1)
1453 auto I = dyn_cast<Instruction>(
P.getIncomingValue(0));
1457 PHINode *NewPhi = dyn_cast<PHINode>(
P.clone());
1461 P.setIncomingValue(0, NewPhi);
1471 bool LoopInterchangeTransform::adjustLoopBranches() {
1473 std::vector<DominatorTree::UpdateType> DTUpdates;
1475 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1478 assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
1479 InnerLoopPreHeader != InnerLoop->
getHeader() && OuterLoopPreHeader &&
1480 InnerLoopPreHeader &&
"Guaranteed by loop-simplify form");
1485 if (isa<PHINode>(OuterLoopPreHeader->
begin()) ||
1487 OuterLoopPreHeader =
1489 if (InnerLoopPreHeader == OuterLoop->getHeader())
1490 InnerLoopPreHeader =
1495 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1497 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
1513 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
1514 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
1519 dyn_cast<BranchInst>(InnerLoopLatchPredecessor->
getTerminator());
1521 dyn_cast<BranchInst>(OuterLoopPredecessor->
getTerminator());
1523 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
1526 if (!InnerLoopHeaderSuccessor)
1534 InnerLoopPreHeader, DTUpdates,
false);
1538 updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates,
1541 InnerLoopHeaderSuccessor, DTUpdates,
1549 OuterLoopPreHeader, DTUpdates);
1552 if (InnerLoopLatchBI->
getSuccessor(0) == InnerLoopHeader)
1553 InnerLoopLatchSuccessor = InnerLoopLatchBI->
getSuccessor(1);
1555 InnerLoopLatchSuccessor = InnerLoopLatchBI->
getSuccessor(0);
1558 InnerLoopLatchSuccessor, DTUpdates);
1561 if (OuterLoopLatchBI->
getSuccessor(0) == OuterLoopHeader)
1562 OuterLoopLatchSuccessor = OuterLoopLatchBI->
getSuccessor(1);
1564 OuterLoopLatchSuccessor = OuterLoopLatchBI->
getSuccessor(0);
1567 OuterLoopLatchSuccessor, DTUpdates);
1568 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
1572 restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
1573 OuterLoopPreHeader);
1575 moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
1576 OuterLoopHeader, OuterLoopLatch, InnerLoop->
getExitBlock(),
1585 InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
1587 OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
1589 auto &OuterInnerReductions = LIL.getOuterInnerReductions();
1590 (void)OuterInnerReductions;
1595 for (
PHINode *PHI : OuterLoopPHIs) {
1597 assert(OuterInnerReductions.count(PHI) &&
"Expected a reduction PHI node");
1599 for (
PHINode *PHI : InnerLoopPHIs) {
1601 assert(OuterInnerReductions.count(PHI) &&
"Expected a reduction PHI node");
1618 MayNeedLCSSAPhis.push_back(&
I);
1624 bool LoopInterchangeTransform::adjustLoopLinks() {
1626 bool Changed = adjustLoopBranches();
1631 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1657 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1658 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1659 auto *DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
1660 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1661 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1663 return LoopInterchange(SE, LI, DI, DT, ORE).run(L);
1670 "Interchanges loops for cache reuse",
false,
false)
1690 if (!LoopInterchange(&AR.
SE, &AR.
LI, &DI, &AR.
DT, &ORE).run(LN))