LCOV - code coverage report
Current view: top level - lib/CodeGen - InterleavedAccessPass.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 140 140 100.0 %
Date: 2017-09-14 15:23:50 Functions: 15 16 93.8 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===--------------------- InterleavedAccessPass.cpp ----------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : // This file implements the Interleaved Access pass, which identifies
      11             : // interleaved memory accesses and transforms them into target specific
      12             : // intrinsics.
      13             : //
      14             : // An interleaved load reads data from memory into several vectors, with
      15             : // DE-interleaving the data on a factor. An interleaved store writes several
      16             : // vectors to memory with RE-interleaving the data on a factor.
      17             : //
      18             : // As interleaved accesses are difficult to identified in CodeGen (mainly
      19             : // because the VECTOR_SHUFFLE DAG node is quite different from the shufflevector
      20             : // IR), we identify and transform them to intrinsics in this pass so the
      21             : // intrinsics can be easily matched into target specific instructions later in
      22             : // CodeGen.
      23             : //
      24             : // E.g. An interleaved load (Factor = 2):
      25             : //        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
      26             : //        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
      27             : //        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
      28             : //
      29             : // It could be transformed into a ld2 intrinsic in AArch64 backend or a vld2
      30             : // intrinsic in ARM backend.
      31             : //
      32             : // In X86, this can be further optimized into a set of target
      33             : // specific loads followed by an optimized sequence of shuffles.
      34             : //
      35             : // E.g. An interleaved store (Factor = 3):
      36             : //        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
      37             : //                                    <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
      38             : //        store <12 x i32> %i.vec, <12 x i32>* %ptr
      39             : //
      40             : // It could be transformed into a st3 intrinsic in AArch64 backend or a vst3
      41             : // intrinsic in ARM backend.
      42             : //
      43             : // Similarly, a set of interleaved stores can be transformed into an optimized
      44             : // sequence of shuffles followed by a set of target specific stores for X86.
      45             : //===----------------------------------------------------------------------===//
      46             : 
      47             : #include "llvm/CodeGen/Passes.h"
      48             : #include "llvm/CodeGen/TargetPassConfig.h"
      49             : #include "llvm/IR/Dominators.h"
      50             : #include "llvm/IR/InstIterator.h"
      51             : #include "llvm/Support/Debug.h"
      52             : #include "llvm/Support/MathExtras.h"
      53             : #include "llvm/Support/raw_ostream.h"
      54             : #include "llvm/Target/TargetLowering.h"
      55             : #include "llvm/Target/TargetSubtargetInfo.h"
      56             : 
      57             : using namespace llvm;
      58             : 
      59             : #define DEBUG_TYPE "interleaved-access"
      60             : 
      61       72306 : static cl::opt<bool> LowerInterleavedAccesses(
      62             :     "lower-interleaved-accesses",
      63      216918 :     cl::desc("Enable lowering interleaved accesses to intrinsics"),
      64      289224 :     cl::init(true), cl::Hidden);
      65             : 
      66             : namespace {
      67             : 
      68       18746 : class InterleavedAccess : public FunctionPass {
      69             : 
      70             : public:
      71             :   static char ID;
      72       18856 :   InterleavedAccess() : FunctionPass(ID), DT(nullptr), TLI(nullptr) {
      73        9428 :     initializeInterleavedAccessPass(*PassRegistry::getPassRegistry());
      74        9428 :   }
      75             : 
      76          15 :   StringRef getPassName() const override { return "Interleaved Access Pass"; }
      77             : 
      78             :   bool runOnFunction(Function &F) override;
      79             : 
      80        9414 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
      81        9414 :     AU.addRequired<DominatorTreeWrapperPass>();
      82        9414 :     AU.addPreserved<DominatorTreeWrapperPass>();
      83        9414 :   }
      84             : 
      85             : private:
      86             :   DominatorTree *DT;
      87             :   const TargetLowering *TLI;
      88             : 
      89             :   /// The maximum supported interleave factor.
      90             :   unsigned MaxFactor;
      91             : 
      92             :   /// \brief Transform an interleaved load into target specific intrinsics.
      93             :   bool lowerInterleavedLoad(LoadInst *LI,
      94             :                             SmallVector<Instruction *, 32> &DeadInsts);
      95             : 
      96             :   /// \brief Transform an interleaved store into target specific intrinsics.
      97             :   bool lowerInterleavedStore(StoreInst *SI,
      98             :                              SmallVector<Instruction *, 32> &DeadInsts);
      99             : 
     100             :   /// \brief Returns true if the uses of an interleaved load by the
     101             :   /// extractelement instructions in \p Extracts can be replaced by uses of the
     102             :   /// shufflevector instructions in \p Shuffles instead. If so, the necessary
     103             :   /// replacements are also performed.
     104             :   bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
     105             :                           ArrayRef<ShuffleVectorInst *> Shuffles);
     106             : };
     107             : } // end anonymous namespace.
     108             : 
     109             : char InterleavedAccess::ID = 0;
     110       26583 : INITIALIZE_PASS_BEGIN(InterleavedAccess, DEBUG_TYPE,
     111             :     "Lower interleaved memory accesses to target specific intrinsics", false,
     112             :     false)
     113       26583 : INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
     114      219669 : INITIALIZE_PASS_END(InterleavedAccess, DEBUG_TYPE,
     115             :     "Lower interleaved memory accesses to target specific intrinsics", false,
     116             :     false)
     117             : 
     118        9418 : FunctionPass *llvm::createInterleavedAccessPass() {
     119        9418 :   return new InterleavedAccess();
     120             : }
     121             : 
     122             : /// \brief Check if the mask is a DE-interleave mask of the given factor
     123             : /// \p Factor like:
     124             : ///     <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>
     125             : static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor,
     126             :                                        unsigned &Index) {
     127             :   // Check all potential start indices from 0 to (Factor - 1).
     128       26350 :   for (Index = 0; Index < Factor; Index++) {
     129             :     unsigned i = 0;
     130             : 
     131             :     // Check that elements are in ascending order by Factor. Ignore undef
     132             :     // elements.
     133       41425 :     for (; i < Mask.size(); i++)
     134       60192 :       if (Mask[i] >= 0 && static_cast<unsigned>(Mask[i]) != Index + i * Factor)
     135             :         break;
     136             : 
     137       20027 :     if (i == Mask.size())
     138             :       return true;
     139             :   }
     140             : 
     141             :   return false;
     142             : }
     143             : 
     144             : /// \brief Check if the mask is a DE-interleave mask for an interleaved load.
     145             : ///
     146             : /// E.g. DE-interleave masks (Factor = 2) could be:
     147             : ///     <0, 2, 4, 6>    (mask of index 0 to extract even elements)
     148             : ///     <1, 3, 5, 7>    (mask of index 1 to extract odd elements)
     149        2361 : static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
     150             :                                unsigned &Index, unsigned MaxFactor) {
     151        2361 :   if (Mask.size() < 2)
     152             :     return false;
     153             : 
     154             :   // Check potential Factors.
     155        8684 :   for (Factor = 2; Factor <= MaxFactor; Factor++)
     156        6719 :     if (isDeInterleaveMaskOfFactor(Mask, Factor, Index))
     157             :       return true;
     158             : 
     159             :   return false;
     160             : }
     161             : 
     162             : /// \brief Check if the mask can be used in an interleaved store.
     163             : //
     164             : /// It checks for a more general pattern than the RE-interleave mask.
     165             : /// I.e. <x, y, ... z, x+1, y+1, ...z+1, x+2, y+2, ...z+2, ...>
     166             : /// E.g. For a Factor of 2 (LaneLen=4): <4, 32, 5, 33, 6, 34, 7, 35>
     167             : /// E.g. For a Factor of 3 (LaneLen=4): <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
     168             : /// E.g. For a Factor of 4 (LaneLen=2): <8, 2, 12, 4, 9, 3, 13, 5>
     169             : ///
     170             : /// The particular case of an RE-interleave mask is:
     171             : /// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
     172             : /// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
     173         937 : static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
     174             :                                unsigned MaxFactor, unsigned OpNumElts) {
     175         937 :   unsigned NumElts = Mask.size();
     176         937 :   if (NumElts < 4)
     177             :     return false;
     178             : 
     179             :   // Check potential Factors.
     180        2329 :   for (Factor = 2; Factor <= MaxFactor; Factor++) {
     181        2022 :     if (NumElts % Factor)
     182         652 :       continue;
     183             : 
     184        1370 :     unsigned LaneLen = NumElts / Factor;
     185          96 :     if (!isPowerOf2_32(LaneLen))
     186          96 :       continue;
     187             : 
     188             :     // Check whether each element matches the general interleaved rule.
     189             :     // Ignore undef elements, as long as the defined elements match the rule.
     190             :     // Outer loop processes all factors (x, y, z in the above example)
     191             :     unsigned I = 0, J;
     192        4318 :     for (; I < Factor; I++) {
     193             :       unsigned SavedLaneValue;
     194             :       unsigned SavedNoUndefs = 0;
     195             : 
     196             :       // Inner loop processes consecutive accesses (x, x+1... in the example)
     197        9691 :       for (J = 0; J < LaneLen - 1; J++) {
     198             :         // Lane computes x's position in the Mask
     199        4504 :         unsigned Lane = J * Factor + I;
     200        4504 :         unsigned NextLane = Lane + Factor;
     201        9008 :         int LaneValue = Mask[Lane];
     202        9008 :         int NextLaneValue = Mask[NextLane];
     203             : 
     204             :         // If both are defined, values must be sequential
     205        8794 :         if (LaneValue >= 0 && NextLaneValue >= 0 &&
     206        4290 :             LaneValue + 1 != NextLaneValue)
     207             :           break;
     208             : 
     209             :         // If the next value is undef, save the current one as reference
     210        3661 :         if (LaneValue >= 0 && NextLaneValue < 0) {
     211          68 :           SavedLaneValue = LaneValue;
     212          68 :           SavedNoUndefs = 1;
     213             :         }
     214             : 
     215             :         // Undefs are allowed, but defined elements must still be consecutive:
     216             :         // i.e.: x,..., undef,..., x + 2,..., undef,..., undef,..., x + 5, ....
     217             :         // Verify this by storing the last non-undef followed by an undef
     218             :         // Check that following non-undef masks are incremented with the
     219             :         // corresponding distance.
     220        3661 :         if (SavedNoUndefs > 0 && LaneValue < 0) {
     221          52 :           SavedNoUndefs++;
     222          72 :           if (NextLaneValue >= 0 &&
     223          20 :               SavedLaneValue + SavedNoUndefs != (unsigned)NextLaneValue)
     224             :             break;
     225             :         }
     226             :       }
     227             : 
     228        2385 :       if (J < LaneLen - 1)
     229             :         break;
     230             : 
     231        1534 :       int StartMask = 0;
     232        3068 :       if (Mask[I] >= 0) {
     233             :         // Check that the start of the I range (J=0) is greater than 0
     234             :         StartMask = Mask[I];
     235         118 :       } else if (Mask[(LaneLen - 1) * Factor + I] >= 0) {
     236             :         // StartMask defined by the last value in lane
     237          31 :         StartMask = Mask[(LaneLen - 1) * Factor + I] - J;
     238          28 :       } else if (SavedNoUndefs > 0) {
     239             :         // StartMask defined by some non-zero value in the j loop
     240           4 :         StartMask = SavedLaneValue - (LaneLen - 1 - SavedNoUndefs);
     241             :       }
     242             :       // else StartMask remains set to 0, i.e. all elements are undefs
     243             : 
     244        1510 :       if (StartMask < 0)
     245             :         break;
     246             :       // We must stay within the vectors; This case can happen with undefs.
     247        1528 :       if (StartMask + LaneLen > OpNumElts*2)
     248             :         break;
     249             :     }
     250             : 
     251             :     // Found an interleaved mask of current factor.
     252        1274 :     if (I == Factor)
     253             :       return true;
     254             :   }
     255             : 
     256             :   return false;
     257             : }
     258             : 
     259      241633 : bool InterleavedAccess::lowerInterleavedLoad(
     260             :     LoadInst *LI, SmallVector<Instruction *, 32> &DeadInsts) {
     261      240337 :   if (!LI->isSimple())
     262             :     return false;
     263             : 
     264      240337 :   SmallVector<ShuffleVectorInst *, 4> Shuffles;
     265      480674 :   SmallVector<ExtractElementInst *, 4> Extracts;
     266             : 
     267             :   // Check if all users of this load are shufflevectors. If we encounter any
     268             :   // users that are extractelement instructions, we save them to later check if
     269             :   // they can be modifed to extract from one of the shufflevectors instead of
     270             :   // the load.
     271      967042 :   for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) {
     272      481096 :     auto *Extract = dyn_cast<ExtractElementInst>(*UI);
     273      241379 :     if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
     274         191 :       Extracts.push_back(Extract);
     275         191 :       continue;
     276             :     }
     277      480714 :     ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(*UI);
     278      248035 :     if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
     279      237701 :       return false;
     280             : 
     281        2656 :     Shuffles.push_back(SVI);
     282             :   }
     283             : 
     284        2636 :   if (Shuffles.empty())
     285             :     return false;
     286             : 
     287             :   unsigned Factor, Index;
     288             : 
     289             :   // Check if the first shufflevector is DE-interleave shuffle.
     290       11805 :   if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index,
     291             :                           MaxFactor))
     292             :     return false;
     293             : 
     294             :   // Holds the corresponding index for each DE-interleave shuffle.
     295         396 :   SmallVector<unsigned, 4> Indices;
     296         396 :   Indices.push_back(Index);
     297             : 
     298         792 :   Type *VecTy = Shuffles[0]->getType();
     299             : 
     300             :   // Check if other shufflevectors are also DE-interleaved of the same type
     301             :   // and factor as the first shufflevector.
     302        1260 :   for (unsigned i = 1; i < Shuffles.size(); i++) {
     303         702 :     if (Shuffles[i]->getType() != VecTy)
     304             :       return false;
     305             : 
     306        1404 :     if (!isDeInterleaveMaskOfFactor(Shuffles[i]->getShuffleMask(), Factor,
     307             :                                     Index))
     308             :       return false;
     309             : 
     310         234 :     Indices.push_back(Index);
     311             :   }
     312             : 
     313             :   // Try and modify users of the load that are extractelement instructions to
     314             :   // use the shufflevector instructions instead of the load.
     315         792 :   if (!tryReplaceExtracts(Extracts, Shuffles))
     316             :     return false;
     317             : 
     318             :   DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
     319             : 
     320             :   // Try to create target specific intrinsics to replace the load and shuffles.
     321        1176 :   if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor))
     322             :     return false;
     323             : 
     324         348 :   for (auto SVI : Shuffles)
     325         165 :     DeadInsts.push_back(SVI);
     326             : 
     327          61 :   DeadInsts.push_back(LI);
     328          61 :   return true;
     329             : }
     330             : 
     331         396 : bool InterleavedAccess::tryReplaceExtracts(
     332             :     ArrayRef<ExtractElementInst *> Extracts,
     333             :     ArrayRef<ShuffleVectorInst *> Shuffles) {
     334             : 
     335             :   // If there aren't any extractelement instructions to modify, there's nothing
     336             :   // to do.
     337         396 :   if (Extracts.empty())
     338             :     return true;
     339             : 
     340             :   // Maps extractelement instructions to vector-index pairs. The extractlement
     341             :   // instructions will be modified to use the new vector and index operands.
     342           8 :   DenseMap<ExtractElementInst *, std::pair<Value *, int>> ReplacementMap;
     343             : 
     344          24 :   for (auto *Extract : Extracts) {
     345             : 
     346             :     // The vector index that is extracted.
     347          36 :     auto *IndexOperand = cast<ConstantInt>(Extract->getIndexOperand());
     348          12 :     auto Index = IndexOperand->getSExtValue();
     349             : 
     350             :     // Look for a suitable shufflevector instruction. The goal is to modify the
     351             :     // extractelement instruction (which uses an interleaved load) to use one
     352             :     // of the shufflevector instructions instead of the load.
     353          28 :     for (auto *Shuffle : Shuffles) {
     354             : 
     355             :       // If the shufflevector instruction doesn't dominate the extract, we
     356             :       // can't create a use of it.
     357          12 :       if (!DT->dominates(Shuffle, Extract))
     358           2 :         continue;
     359             : 
     360             :       // Inspect the indices of the shufflevector instruction. If the shuffle
     361             :       // selects the same index that is extracted, we can modify the
     362             :       // extractelement instruction.
     363          12 :       SmallVector<int, 4> Indices;
     364          20 :       Shuffle->getShuffleMask(Indices);
     365          48 :       for (unsigned I = 0; I < Indices.size(); ++I)
     366          44 :         if (Indices[I] == Index) {
     367             :           assert(Extract->getOperand(0) == Shuffle->getOperand(0) &&
     368             :                  "Vector operations do not match");
     369          32 :           ReplacementMap[Extract] = std::make_pair(Shuffle, I);
     370             :           break;
     371             :         }
     372             : 
     373             :       // If we found a suitable shufflevector instruction, stop looking.
     374          20 :       if (ReplacementMap.count(Extract))
     375             :         break;
     376             :     }
     377             : 
     378             :     // If we did not find a suitable shufflevector instruction, the
     379             :     // extractelement instruction cannot be modified, so we must give up.
     380          24 :     if (!ReplacementMap.count(Extract))
     381           4 :       return false;
     382             :   }
     383             : 
     384             :   // Finally, perform the replacements.
     385           8 :   IRBuilder<> Builder(Extracts[0]->getContext());
     386          18 :   for (auto &Replacement : ReplacementMap) {
     387           6 :     auto *Extract = Replacement.first;
     388           6 :     auto *Vector = Replacement.second.first;
     389           6 :     auto Index = Replacement.second.second;
     390           6 :     Builder.SetInsertPoint(Extract);
     391          18 :     Extract->replaceAllUsesWith(Builder.CreateExtractElement(Vector, Index));
     392           6 :     Extract->eraseFromParent();
     393             :   }
     394             : 
     395           4 :   return true;
     396             : }
     397             : 
     398      245561 : bool InterleavedAccess::lowerInterleavedStore(
     399             :     StoreInst *SI, SmallVector<Instruction *, 32> &DeadInsts) {
     400      243918 :   if (!SI->isSimple())
     401             :     return false;
     402             : 
     403      244860 :   ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
     404        1884 :   if (!SVI || !SVI->hasOneUse())
     405             :     return false;
     406             : 
     407             :   // Check if the shufflevector is RE-interleave shuffle.
     408             :   unsigned Factor;
     409        1874 :   unsigned OpNumElts = SVI->getOperand(0)->getType()->getVectorNumElements();
     410        3748 :   if (!isReInterleaveMask(SVI->getShuffleMask(), Factor, MaxFactor, OpNumElts))
     411             :     return false;
     412             : 
     413             :   DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");
     414             : 
     415             :   // Try to create target specific intrinsics to replace the store and shuffle.
     416         411 :   if (!TLI->lowerInterleavedStore(SI, SVI, Factor))
     417             :     return false;
     418             : 
     419             :   // Already have a new target specific interleaved store. Erase the old store.
     420          61 :   DeadInsts.push_back(SI);
     421          61 :   DeadInsts.push_back(SVI);
     422             :   return true;
     423             : }
     424             : 
     425       91166 : bool InterleavedAccess::runOnFunction(Function &F) {
     426       91166 :   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
     427      182332 :   if (!TPC || !LowerInterleavedAccesses)
     428             :     return false;
     429             : 
     430             :   DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
     431             : 
     432      182224 :   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     433       91112 :   auto &TM = TPC->getTM<TargetMachine>();
     434       91112 :   TLI = TM.getSubtargetImpl(F)->getTargetLowering();
     435       91112 :   MaxFactor = TLI->getMaxSupportedInterleaveFactor();
     436             : 
     437             :   // Holds dead instructions that will be erased later.
     438       91112 :   SmallVector<Instruction *, 32> DeadInsts;
     439       91112 :   bool Changed = false;
     440             : 
     441     3419878 :   for (auto &I : instructions(F)) {
     442      241633 :     if (LoadInst *LI = dyn_cast<LoadInst>(&I))
     443      241633 :       Changed |= lowerInterleavedLoad(LI, DeadInsts);
     444             : 
     445      245561 :     if (StoreInst *SI = dyn_cast<StoreInst>(&I))
     446      245561 :       Changed |= lowerInterleavedStore(SI, DeadInsts);
     447             :   }
     448             : 
     449      273684 :   for (auto I : DeadInsts)
     450         348 :     I->eraseFromParent();
     451             : 
     452       91112 :   return Changed;
     453      216918 : }

Generated by: LCOV version 1.13