LCOV - code coverage report
Current view: top level - lib/CodeGen - InterleavedAccessPass.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 118 118 100.0 %
Date: 2018-02-25 19:55:18 Functions: 15 16 93.8 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===- InterleavedAccessPass.cpp ------------------------------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : // This file implements the Interleaved Access pass, which identifies
      11             : // interleaved memory accesses and transforms them into target specific
      12             : // intrinsics.
      13             : //
      14             : // An interleaved load reads data from memory into several vectors, with
      15             : // DE-interleaving the data on a factor. An interleaved store writes several
      16             : // vectors to memory with RE-interleaving the data on a factor.
      17             : //
      18             : // As interleaved accesses are difficult to identified in CodeGen (mainly
      19             : // because the VECTOR_SHUFFLE DAG node is quite different from the shufflevector
      20             : // IR), we identify and transform them to intrinsics in this pass so the
      21             : // intrinsics can be easily matched into target specific instructions later in
      22             : // CodeGen.
      23             : //
      24             : // E.g. An interleaved load (Factor = 2):
      25             : //        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
      26             : //        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
      27             : //        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
      28             : //
      29             : // It could be transformed into a ld2 intrinsic in AArch64 backend or a vld2
      30             : // intrinsic in ARM backend.
      31             : //
      32             : // In X86, this can be further optimized into a set of target
      33             : // specific loads followed by an optimized sequence of shuffles.
      34             : //
      35             : // E.g. An interleaved store (Factor = 3):
      36             : //        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
      37             : //                                    <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
      38             : //        store <12 x i32> %i.vec, <12 x i32>* %ptr
      39             : //
      40             : // It could be transformed into a st3 intrinsic in AArch64 backend or a vst3
      41             : // intrinsic in ARM backend.
      42             : //
      43             : // Similarly, a set of interleaved stores can be transformed into an optimized
      44             : // sequence of shuffles followed by a set of target specific stores for X86.
      45             : //
      46             : //===----------------------------------------------------------------------===//
      47             : 
      48             : #include "llvm/ADT/ArrayRef.h"
      49             : #include "llvm/ADT/DenseMap.h"
      50             : #include "llvm/ADT/SmallVector.h"
      51             : #include "llvm/CodeGen/TargetLowering.h"
      52             : #include "llvm/CodeGen/TargetPassConfig.h"
      53             : #include "llvm/CodeGen/TargetSubtargetInfo.h"
      54             : #include "llvm/IR/Constants.h"
      55             : #include "llvm/IR/Dominators.h"
      56             : #include "llvm/IR/Function.h"
      57             : #include "llvm/IR/IRBuilder.h"
      58             : #include "llvm/IR/InstIterator.h"
      59             : #include "llvm/IR/Instruction.h"
      60             : #include "llvm/IR/Instructions.h"
      61             : #include "llvm/IR/Type.h"
      62             : #include "llvm/Pass.h"
      63             : #include "llvm/Support/Casting.h"
      64             : #include "llvm/Support/CommandLine.h"
      65             : #include "llvm/Support/Debug.h"
      66             : #include "llvm/Support/MathExtras.h"
      67             : #include "llvm/Support/raw_ostream.h"
      68             : #include "llvm/Target/TargetMachine.h"
      69             : #include <cassert>
      70             : #include <utility>
      71             : 
      72             : using namespace llvm;
      73             : 
      74             : #define DEBUG_TYPE "interleaved-access"
      75             : 
      76       81774 : static cl::opt<bool> LowerInterleavedAccesses(
      77             :     "lower-interleaved-accesses",
      78       81774 :     cl::desc("Enable lowering interleaved accesses to intrinsics"),
      79      245322 :     cl::init(true), cl::Hidden);
      80             : 
      81             : namespace {
      82             : 
      83       21054 : class InterleavedAccess : public FunctionPass {
      84             : public:
      85             :   static char ID;
      86             : 
      87       10585 :   InterleavedAccess() : FunctionPass(ID) {
      88       10585 :     initializeInterleavedAccessPass(*PassRegistry::getPassRegistry());
      89       10585 :   }
      90             : 
      91          16 :   StringRef getPassName() const override { return "Interleaved Access Pass"; }
      92             : 
      93             :   bool runOnFunction(Function &F) override;
      94             : 
      95       10566 :   void getAnalysisUsage(AnalysisUsage &AU) const override {
      96             :     AU.addRequired<DominatorTreeWrapperPass>();
      97             :     AU.addPreserved<DominatorTreeWrapperPass>();
      98       10566 :   }
      99             : 
     100             : private:
     101             :   DominatorTree *DT = nullptr;
     102             :   const TargetLowering *TLI = nullptr;
     103             : 
     104             :   /// The maximum supported interleave factor.
     105             :   unsigned MaxFactor;
     106             : 
     107             :   /// \brief Transform an interleaved load into target specific intrinsics.
     108             :   bool lowerInterleavedLoad(LoadInst *LI,
     109             :                             SmallVector<Instruction *, 32> &DeadInsts);
     110             : 
     111             :   /// \brief Transform an interleaved store into target specific intrinsics.
     112             :   bool lowerInterleavedStore(StoreInst *SI,
     113             :                              SmallVector<Instruction *, 32> &DeadInsts);
     114             : 
     115             :   /// \brief Returns true if the uses of an interleaved load by the
     116             :   /// extractelement instructions in \p Extracts can be replaced by uses of the
     117             :   /// shufflevector instructions in \p Shuffles instead. If so, the necessary
     118             :   /// replacements are also performed.
     119             :   bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
     120             :                           ArrayRef<ShuffleVectorInst *> Shuffles);
     121             : };
     122             : 
     123             : } // end anonymous namespace.
     124             : 
     125             : char InterleavedAccess::ID = 0;
     126             : 
     127       29067 : INITIALIZE_PASS_BEGIN(InterleavedAccess, DEBUG_TYPE,
     128             :     "Lower interleaved memory accesses to target specific intrinsics", false,
     129             :     false)
     130       29067 : INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
     131      180624 : INITIALIZE_PASS_END(InterleavedAccess, DEBUG_TYPE,
     132             :     "Lower interleaved memory accesses to target specific intrinsics", false,
     133             :     false)
     134             : 
     135       10575 : FunctionPass *llvm::createInterleavedAccessPass() {
     136       10575 :   return new InterleavedAccess();
     137             : }
     138             : 
     139             : /// \brief Check if the mask is a DE-interleave mask of the given factor
     140             : /// \p Factor like:
     141             : ///     <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>
     142             : static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor,
     143             :                                        unsigned &Index) {
     144             :   // Check all potential start indices from 0 to (Factor - 1).
     145       35338 :   for (Index = 0; Index < Factor; Index++) {
     146             :     unsigned i = 0;
     147             : 
     148             :     // Check that elements are in ascending order by Factor. Ignore undef
     149             :     // elements.
     150       52107 :     for (; i < Mask.size(); i++)
     151       38749 :       if (Mask[i] >= 0 && static_cast<unsigned>(Mask[i]) != Index + i * Factor)
     152             :         break;
     153             : 
     154       26789 :     if (i == Mask.size())
     155             :       return true;
     156             :   }
     157             : 
     158             :   return false;
     159             : }
     160             : 
     161             : /// \brief Check if the mask is a DE-interleave mask for an interleaved load.
     162             : ///
     163             : /// E.g. DE-interleave masks (Factor = 2) could be:
     164             : ///     <0, 2, 4, 6>    (mask of index 0 to extract even elements)
     165             : ///     <1, 3, 5, 7>    (mask of index 1 to extract odd elements)
     166        3167 : static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
     167             :                                unsigned &Index, unsigned MaxFactor) {
     168        3167 :   if (Mask.size() < 2)
     169             :     return false;
     170             : 
     171             :   // Check potential Factors.
     172       11690 :   for (Factor = 2; Factor <= MaxFactor; Factor++)
     173        9002 :     if (isDeInterleaveMaskOfFactor(Mask, Factor, Index))
     174             :       return true;
     175             : 
     176             :   return false;
     177             : }
     178             : 
     179             : /// \brief Check if the mask can be used in an interleaved store.
     180             : //
     181             : /// It checks for a more general pattern than the RE-interleave mask.
     182             : /// I.e. <x, y, ... z, x+1, y+1, ...z+1, x+2, y+2, ...z+2, ...>
     183             : /// E.g. For a Factor of 2 (LaneLen=4): <4, 32, 5, 33, 6, 34, 7, 35>
     184             : /// E.g. For a Factor of 3 (LaneLen=4): <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
     185             : /// E.g. For a Factor of 4 (LaneLen=2): <8, 2, 12, 4, 9, 3, 13, 5>
     186             : ///
     187             : /// The particular case of an RE-interleave mask is:
     188             : /// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
     189             : /// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
     190        1170 : static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
     191             :                                unsigned MaxFactor, unsigned OpNumElts) {
     192        1170 :   unsigned NumElts = Mask.size();
     193        1170 :   if (NumElts < 4)
     194             :     return false;
     195             : 
     196             :   // Check potential Factors.
     197        2893 :   for (Factor = 2; Factor <= MaxFactor; Factor++) {
     198        2490 :     if (NumElts % Factor)
     199         822 :       continue;
     200             : 
     201        1668 :     unsigned LaneLen = NumElts / Factor;
     202         110 :     if (!isPowerOf2_32(LaneLen))
     203         110 :       continue;
     204             : 
     205             :     // Check whether each element matches the general interleaved rule.
     206             :     // Ignore undef elements, as long as the defined elements match the rule.
     207             :     // Outer loop processes all factors (x, y, z in the above example)
     208             :     unsigned I = 0, J;
     209        5094 :     for (; I < Factor; I++) {
     210             :       unsigned SavedLaneValue;
     211             :       unsigned SavedNoUndefs = 0;
     212             : 
     213             :       // Inner loop processes consecutive accesses (x, x+1... in the example)
     214       12575 :       for (J = 0; J < LaneLen - 1; J++) {
     215             :         // Lane computes x's position in the Mask
     216        5932 :         unsigned Lane = J * Factor + I;
     217        5932 :         unsigned NextLane = Lane + Factor;
     218       11864 :         int LaneValue = Mask[Lane];
     219       11864 :         int NextLaneValue = Mask[NextLane];
     220             : 
     221             :         // If both are defined, values must be sequential
     222       11646 :         if (LaneValue >= 0 && NextLaneValue >= 0 &&
     223        5714 :             LaneValue + 1 != NextLaneValue)
     224             :           break;
     225             : 
     226             :         // If the next value is undef, save the current one as reference
     227        4871 :         if (LaneValue >= 0 && NextLaneValue < 0) {
     228          72 :           SavedLaneValue = LaneValue;
     229             :           SavedNoUndefs = 1;
     230             :         }
     231             : 
     232             :         // Undefs are allowed, but defined elements must still be consecutive:
     233             :         // i.e.: x,..., undef,..., x + 2,..., undef,..., undef,..., x + 5, ....
     234             :         // Verify this by storing the last non-undef followed by an undef
     235             :         // Check that following non-undef masks are incremented with the
     236             :         // corresponding distance.
     237        4871 :         if (SavedNoUndefs > 0 && LaneValue < 0) {
     238          52 :           SavedNoUndefs++;
     239          72 :           if (NextLaneValue >= 0 &&
     240          20 :               SavedLaneValue + SavedNoUndefs != (unsigned)NextLaneValue)
     241             :             break;
     242             :         }
     243             :       }
     244             : 
     245        2849 :       if (J < LaneLen - 1)
     246             :         break;
     247             : 
     248             :       int StartMask = 0;
     249        3560 :       if (Mask[I] >= 0) {
     250             :         // Check that the start of the I range (J=0) is greater than 0
     251             :         StartMask = Mask[I];
     252         118 :       } else if (Mask[(LaneLen - 1) * Factor + I] >= 0) {
     253             :         // StartMask defined by the last value in lane
     254          31 :         StartMask = Mask[(LaneLen - 1) * Factor + I] - J;
     255          28 :       } else if (SavedNoUndefs > 0) {
     256             :         // StartMask defined by some non-zero value in the j loop
     257           4 :         StartMask = SavedLaneValue - (LaneLen - 1 - SavedNoUndefs);
     258             :       }
     259             :       // else StartMask remains set to 0, i.e. all elements are undefs
     260             : 
     261        1756 :       if (StartMask < 0)
     262             :         break;
     263             :       // We must stay within the vectors; This case can happen with undefs.
     264        1774 :       if (StartMask + LaneLen > OpNumElts*2)
     265             :         break;
     266             :     }
     267             : 
     268             :     // Found an interleaved mask of current factor.
     269        1558 :     if (I == Factor)
     270             :       return true;
     271             :   }
     272             : 
     273             :   return false;
     274             : }
     275             : 
     276      255461 : bool InterleavedAccess::lowerInterleavedLoad(
     277             :     LoadInst *LI, SmallVector<Instruction *, 32> &DeadInsts) {
     278             :   if (!LI->isSimple())
     279             :     return false;
     280             : 
     281             :   SmallVector<ShuffleVectorInst *, 4> Shuffles;
     282             :   SmallVector<ExtractElementInst *, 4> Extracts;
     283             : 
     284             :   // Check if all users of this load are shufflevectors. If we encounter any
     285             :   // users that are extractelement instructions, we save them to later check if
     286             :   // they can be modifed to extract from one of the shufflevectors instead of
     287             :   // the load.
     288      257833 :   for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) {
     289      254362 :     auto *Extract = dyn_cast<ExtractElementInst>(*UI);
     290      254965 :     if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
     291         237 :       Extracts.push_back(Extract);
     292         237 :       continue;
     293             :     }
     294      254125 :     ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(*UI);
     295      259270 :     if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
     296      250651 :       return false;
     297             : 
     298        3474 :     Shuffles.push_back(SVI);
     299             :   }
     300             : 
     301        3471 :   if (Shuffles.empty())
     302             :     return false;
     303             : 
     304             :   unsigned Factor, Index;
     305             : 
     306             :   // Check if the first shufflevector is DE-interleave shuffle.
     307        9501 :   if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index,
     308             :                           MaxFactor))
     309             :     return false;
     310             : 
     311             :   // Holds the corresponding index for each DE-interleave shuffle.
     312             :   SmallVector<unsigned, 4> Indices;
     313         453 :   Indices.push_back(Index);
     314             : 
     315         453 :   Type *VecTy = Shuffles[0]->getType();
     316             : 
     317             :   // Check if other shufflevectors are also DE-interleaved of the same type
     318             :   // and factor as the first shufflevector.
     319        1644 :   for (unsigned i = 1; i < Shuffles.size(); i++) {
     320         492 :     if (Shuffles[i]->getType() != VecTy)
     321             :       return false;
     322             : 
     323         492 :     if (!isDeInterleaveMaskOfFactor(Shuffles[i]->getShuffleMask(), Factor,
     324             :                                     Index))
     325             :       return false;
     326             : 
     327         246 :     Indices.push_back(Index);
     328             :   }
     329             : 
     330             :   // Try and modify users of the load that are extractelement instructions to
     331             :   // use the shufflevector instructions instead of the load.
     332         453 :   if (!tryReplaceExtracts(Extracts, Shuffles))
     333             :     return false;
     334             : 
     335             :   DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
     336             : 
     337             :   // Try to create target specific intrinsics to replace the load and shuffles.
     338         898 :   if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor))
     339             :     return false;
     340             : 
     341         426 :   for (auto SVI : Shuffles)
     342         180 :     DeadInsts.push_back(SVI);
     343             : 
     344          66 :   DeadInsts.push_back(LI);
     345          66 :   return true;
     346             : }
     347             : 
     348         453 : bool InterleavedAccess::tryReplaceExtracts(
     349             :     ArrayRef<ExtractElementInst *> Extracts,
     350             :     ArrayRef<ShuffleVectorInst *> Shuffles) {
     351             :   // If there aren't any extractelement instructions to modify, there's nothing
     352             :   // to do.
     353         453 :   if (Extracts.empty())
     354             :     return true;
     355             : 
     356             :   // Maps extractelement instructions to vector-index pairs. The extractlement
     357             :   // instructions will be modified to use the new vector and index operands.
     358             :   DenseMap<ExtractElementInst *, std::pair<Value *, int>> ReplacementMap;
     359             : 
     360          24 :   for (auto *Extract : Extracts) {
     361             :     // The vector index that is extracted.
     362             :     auto *IndexOperand = cast<ConstantInt>(Extract->getIndexOperand());
     363             :     auto Index = IndexOperand->getSExtValue();
     364             : 
     365             :     // Look for a suitable shufflevector instruction. The goal is to modify the
     366             :     // extractelement instruction (which uses an interleaved load) to use one
     367             :     // of the shufflevector instructions instead of the load.
     368          20 :     for (auto *Shuffle : Shuffles) {
     369             :       // If the shufflevector instruction doesn't dominate the extract, we
     370             :       // can't create a use of it.
     371          12 :       if (!DT->dominates(Shuffle, Extract))
     372           2 :         continue;
     373             : 
     374             :       // Inspect the indices of the shufflevector instruction. If the shuffle
     375             :       // selects the same index that is extracted, we can modify the
     376             :       // extractelement instruction.
     377             :       SmallVector<int, 4> Indices;
     378             :       Shuffle->getShuffleMask(Indices);
     379          62 :       for (unsigned I = 0; I < Indices.size(); ++I)
     380          22 :         if (Indices[I] == Index) {
     381             :           assert(Extract->getOperand(0) == Shuffle->getOperand(0) &&
     382             :                  "Vector operations do not match");
     383             :           ReplacementMap[Extract] = std::make_pair(Shuffle, I);
     384             :           break;
     385             :         }
     386             : 
     387             :       // If we found a suitable shufflevector instruction, stop looking.
     388          10 :       if (ReplacementMap.count(Extract))
     389             :         break;
     390             :     }
     391             : 
     392             :     // If we did not find a suitable shufflevector instruction, the
     393             :     // extractelement instruction cannot be modified, so we must give up.
     394          12 :     if (!ReplacementMap.count(Extract))
     395           4 :       return false;
     396             :   }
     397             : 
     398             :   // Finally, perform the replacements.
     399           4 :   IRBuilder<> Builder(Extracts[0]->getContext());
     400          14 :   for (auto &Replacement : ReplacementMap) {
     401           6 :     auto *Extract = Replacement.first;
     402           6 :     auto *Vector = Replacement.second.first;
     403           6 :     auto Index = Replacement.second.second;
     404           6 :     Builder.SetInsertPoint(Extract);
     405          18 :     Extract->replaceAllUsesWith(Builder.CreateExtractElement(Vector, Index));
     406           6 :     Extract->eraseFromParent();
     407             :   }
     408             : 
     409             :   return true;
     410             : }
     411             : 
     412      255366 : bool InterleavedAccess::lowerInterleavedStore(
     413             :     StoreInst *SI, SmallVector<Instruction *, 32> &DeadInsts) {
     414             :   if (!SI->isSimple())
     415             :     return false;
     416             : 
     417             :   ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
     418        1180 :   if (!SVI || !SVI->hasOneUse())
     419             :     return false;
     420             : 
     421             :   // Check if the shufflevector is RE-interleave shuffle.
     422             :   unsigned Factor;
     423        1170 :   unsigned OpNumElts = SVI->getOperand(0)->getType()->getVectorNumElements();
     424        3510 :   if (!isReInterleaveMask(SVI->getShuffleMask(), Factor, MaxFactor, OpNumElts))
     425             :     return false;
     426             : 
     427             :   DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");
     428             : 
     429             :   // Try to create target specific intrinsics to replace the store and shuffle.
     430         477 :   if (!TLI->lowerInterleavedStore(SI, SVI, Factor))
     431             :     return false;
     432             : 
     433             :   // Already have a new target specific interleaved store. Erase the old store.
     434          81 :   DeadInsts.push_back(SI);
     435          81 :   DeadInsts.push_back(SVI);
     436             :   return true;
     437             : }
     438             : 
     439      111318 : bool InterleavedAccess::runOnFunction(Function &F) {
     440      111318 :   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
     441      222636 :   if (!TPC || !LowerInterleavedAccesses)
     442             :     return false;
     443             : 
     444             :   DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
     445             : 
     446      222528 :   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     447      111264 :   auto &TM = TPC->getTM<TargetMachine>();
     448      111264 :   TLI = TM.getSubtargetImpl(F)->getTargetLowering();
     449      111264 :   MaxFactor = TLI->getMaxSupportedInterleaveFactor();
     450             : 
     451             :   // Holds dead instructions that will be erased later.
     452             :   SmallVector<Instruction *, 32> DeadInsts;
     453             :   bool Changed = false;
     454             : 
     455     1771098 :   for (auto &I : instructions(F)) {
     456             :     if (LoadInst *LI = dyn_cast<LoadInst>(&I))
     457      255461 :       Changed |= lowerInterleavedLoad(LI, DeadInsts);
     458             : 
     459             :     if (StoreInst *SI = dyn_cast<StoreInst>(&I))
     460      255366 :       Changed |= lowerInterleavedStore(SI, DeadInsts);
     461             :   }
     462             : 
     463      112080 :   for (auto I : DeadInsts)
     464         408 :     I->eraseFromParent();
     465             : 
     466             :   return Changed;
     467      245322 : }

Generated by: LCOV version 1.13