docs/doxygen/MVELaneInterleavingPass_8cpp_source.html

//===- MVELaneInterleaving.cpp - Interleave for MVE instructions ----------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

// This pass interleaves around sext/zext/trunc instructions. MVE does not have

// a single sext/zext or trunc instruction that takes the bottom half of a

// vector and extends to a full width, like NEON has with MOVL. Instead it is

// expected that this happens through top/bottom instructions. So the MVE

// equivalent VMOVLT/B instructions take either the even or odd elements of the

// input and extend them to the larger type, producing a vector with half the

// number of elements each of double the bitwidth. As there is no simple

// instruction, we often have to turn sext/zext/trunc into a series of lane

// moves (or stack loads/stores, which we do not do yet).

//

// This pass takes vector code that starts at truncs, looks for interconnected

// blobs of operations that end with sext/zext (or constants/splats) of the

// form:

//   %sa = sext v8i16 %a to v8i32

//   %sb = sext v8i16 %b to v8i32

//   %add = add v8i32 %sa, %sb

//   %r = trunc %add to v8i16

// And adds shuffles to allow the use of VMOVL/VMOVN instructions:

//   %sha = shuffle v8i16 %a, undef, <0, 2, 4, 6, 1, 3, 5, 7>

//   %sa = sext v8i16 %sha to v8i32

//   %shb = shuffle v8i16 %b, undef, <0, 2, 4, 6, 1, 3, 5, 7>

//   %sb = sext v8i16 %shb to v8i32

//   %add = add v8i32 %sa, %sb

//   %r = trunc %add to v8i16

//   %shr = shuffle v8i16 %r, undef, <0, 4, 1, 5, 2, 6, 3, 7>

// Which can then be split and lowered to MVE instructions efficiently:

//   %sa_b = VMOVLB.s16 %a

//   %sa_t = VMOVLT.s16 %a

//   %sb_b = VMOVLB.s16 %b

//   %sb_t = VMOVLT.s16 %b

//   %add_b = VADD.i32 %sa_b, %sb_b

//   %add_t = VADD.i32 %sa_t, %sb_t

//   %r = VMOVNT.i16 %add_b, %add_t

//

//===----------------------------------------------------------------------===//


#include "ARM.h"

#include "ARMBaseInstrInfo.h"

#include "ARMSubtarget.h"

#include "llvm/ADT/SetVector.h"

#include "llvm/Analysis/TargetTransformInfo.h"

#include "llvm/CodeGen/TargetLowering.h"

#include "llvm/CodeGen/TargetPassConfig.h"

#include "llvm/IR/BasicBlock.h"

#include "llvm/IR/DerivedTypes.h"

#include "llvm/IR/Function.h"

#include "llvm/IR/IRBuilder.h"

#include "llvm/IR/InstIterator.h"

#include "llvm/IR/InstrTypes.h"

#include "llvm/IR/Instruction.h"

#include "llvm/IR/Instructions.h"

#include "llvm/IR/IntrinsicInst.h"

#include "llvm/IR/Intrinsics.h"

#include "llvm/IR/Type.h"

#include "llvm/IR/Value.h"

#include "llvm/InitializePasses.h"

#include "llvm/Pass.h"

#include "llvm/Support/Casting.h"

#include <cassert>


using namespace llvm;


#define DEBUG_TYPE "mve-laneinterleave"


/// Hidden command-line switch "enable-mve-interleave" (initialised to true).
/// MVELaneInterleaving::runOnFunction returns early without touching the
/// function when this option is false.
static cl::opt<bool> EnableInterleave(

    "enable-mve-interleave", cl::Hidden, cl::init(true),

    cl::desc("Enable interleave MVE vector operation lowering"));


namespace {


/// FunctionPass that looks for groups of vector operations bracketed by
/// extends and truncs (or add reductions) and inserts shuffles around them;
/// the actual work is done by runOnFunction.
class MVELaneInterleaving : public FunctionPass {

public:

  static char ID; // Pass identification, replacement for typeid


  /// Constructs the pass and registers it with the global PassRegistry.
  explicit MVELaneInterleaving() : FunctionPass(ID) {

    initializeMVELaneInterleavingPass(*PassRegistry::getPassRegistry());

  }


  /// Runs the transformation on \p F; returns true if the IR was changed.
  bool runOnFunction(Function &F) override;


  /// Human-readable name of the pass.
  StringRef getPassName() const override { return "MVE lane interleaving"; }


  /// Declares that the pass preserves the CFG and requires TargetPassConfig
  /// (used in runOnFunction to reach the target machine / subtarget), then
  /// defers to the base class for the remaining defaults.
  void getAnalysisUsage(AnalysisUsage &AU) const override {

    AU.setPreservesCFG();

    AU.addRequired<TargetPassConfig>();

    FunctionPass::getAnalysisUsage(AU);

  }

};


} // end anonymous namespace


// Definition of the static pass ID declared in the class; the FunctionPass
// base is constructed with it.
char MVELaneInterleaving::ID = 0;


// Defines the pass registration under the DEBUG_TYPE argument string with the
// display name "MVE lane interleaving". The two trailing `false` arguments are
// the macro's `cfg` and `analysis` parameters.
INITIALIZE_PASS(MVELaneInterleaving, DEBUG_TYPE, "MVE lane interleaving", false,

                false)


/// Factory for the pass: returns a newly heap-allocated MVELaneInterleaving
/// instance as a generic Pass pointer.
/// NOTE(review): ownership presumably passes to the pass manager the result
/// is added to -- confirm at the call site.
Pass *llvm::createMVELaneInterleavingPass() {

  return new MVELaneInterleaving();

}


/// Heuristic deciding whether interleaving the group described by \p Exts and
/// \p Truncs is expected to pay off.
///
/// \param Exts   The sext/zext/fpext leaves of the group.
/// \param Truncs The trunc/fptrunc roots of the group.
/// \returns true if the transform looks beneficial, false otherwise.
static bool isProfitableToInterleave(SmallSetVector<Instruction *, 4> &Exts,
                                     SmallSetVector<Instruction *, 4> &Truncs) {
  // The transform is not a guaranteed win: an extend can be absorbed into the
  // load feeding it and a trunc can be absorbed into the store consuming it.
  // For truncs the instruction count is usually unchanged,
  //  VSTRH.32(A);VSTRH.32(B) vs VSTRH.16(VMOVNT A, B) with interleaving
  // whereas for extends the interleaved form is generally longer:
  //  A=VLDRH.32; B=VLDRH.32;
  // vs with interleaving:
  //  T=VLDRH.16; A=VMOVLB T; B=VMOVLT T
  // although those VMOVL may in turn be folded into a VMULL.

  // An extend that cannot fold into a load is expensive and always worth
  // removing. FPExts always need extra VCVTs, so they count as expensive too.
  auto IsExpensiveExt = [](Instruction *Ext) {
    return isa<FPExtInst>(Ext) || !isa<LoadInst>(Ext->getOperand(0));
  };
  for (Instruction *Ext : Exts) {
    if (!IsExpensiveExt(Ext))
      continue;
    LLVM_DEBUG(dbgs() << "Beneficial due to " << *Ext << "\n");
    return true;
  }

  // Likewise a single-use trunc whose only user is not a store cannot be
  // folded away, so it is worth removing.
  for (Instruction *Trunc : Truncs) {
    if (!Trunc->hasOneUse() || isa<StoreInst>(*Trunc->user_begin()))
      continue;
    LLVM_DEBUG(dbgs() << "Beneficial due to " << *Trunc << "\n");
    return true;
  }

  // From here on every extend is an ext(load). Only accept the group if each
  // extend has exactly one user and that user is a mul (a vmull candidate).
  // This is a simple heuristic and certainly not perfect.
  for (Instruction *Ext : Exts) {
    const bool FeedsSingleMul =
        Ext->hasOneUse() &&
        cast<Instruction>(*Ext->user_begin())->getOpcode() == Instruction::Mul;
    if (FeedsSingleMul)
      continue;
    LLVM_DEBUG(dbgs() << "Not beneficial due to " << *Ext << "\n");
    return false;
  }
  return true;
}


/// Try to interleave the group of operations connected to \p Start.
///
/// Starting from \p Start and its first operand, a worklist walk collects the
/// connected instructions and classifies them as truncs, add reductions,
/// extends, intermediate operations, or non-instruction vector leaves. If an
/// unsupported instruction is reached, the types do not line up, or the
/// transform does not look profitable, nothing is changed. Otherwise shuffles
/// are inserted in front of every extend and leaf and after every trunc, so
/// the operations in between work on lane-interleaved data.
///
/// \param Start   Instruction to start from; runOnFunction passes vector
///                trunc/fptrunc instructions and vector_reduce_add calls. Its
///                operand 0 must be an Instruction, otherwise the function
///                returns false immediately.
/// \param Visited Receives every trunc and add reduction reached by the walk
///                (also when the function ends up returning false), so the
///                caller can avoid restarting from them.
/// \returns true if the IR was modified. All `return false` paths are taken
///          before any instruction is created or rewritten.
static bool tryInterleave(Instruction *Start,
                          SmallPtrSetImpl<Instruction *> &Visited) {
  LLVM_DEBUG(dbgs() << "tryInterleave from " << *Start << "\n");

  if (!isa<Instruction>(Start->getOperand(0)))
    return false;

  // Look for connected operations starting from Ext's, terminating at Truncs.
  std::vector<Instruction *> Worklist;
  Worklist.push_back(Start);
  Worklist.push_back(cast<Instruction>(Start->getOperand(0)));

  SmallSetVector<Instruction *, 4> Truncs;  // trunc/fptrunc roots
  SmallSetVector<Instruction *, 4> Reducts; // vector_reduce_add roots
  SmallSetVector<Instruction *, 4> Exts;    // sext/zext/fpext leaves
  SmallSetVector<Use *, 4> OtherLeafs;      // non-instruction vector operands
  SmallSetVector<Instruction *, 4> Ops;     // operations in between

  while (!Worklist.empty()) {
    Instruction *I = Worklist.back();
    Worklist.pop_back();

    // Inside the switch, `continue` skips an instruction that was already
    // classified (or needs no handling), `break` finishes a newly classified
    // one, and `return false` abandons the whole group.
    switch (I->getOpcode()) {
    // Truncs
    case Instruction::Trunc:
    case Instruction::FPTrunc:
      if (!Truncs.insert(I))
        continue;
      Visited.insert(I);
      break;

    // Extend leafs
    case Instruction::SExt:
    case Instruction::ZExt:
    case Instruction::FPExt:
      if (Exts.count(I))
        continue;
      // Every user of the extend has to become part of the group as well.
      for (auto *Use : I->users())
        Worklist.push_back(cast<Instruction>(Use));
      Exts.insert(I);
      break;

    case Instruction::Call: {
      // Only intrinsic calls are supported.
      IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
      if (!II)
        return false;

      // Add reductions terminate the group in the same way truncs do.
      if (II->getIntrinsicID() == Intrinsic::vector_reduce_add) {
        if (!Reducts.insert(I))
          continue;
        Visited.insert(I);
        break;
      }

      // Whitelist of intrinsics that are handled like ordinary operators.
      switch (II->getIntrinsicID()) {
      case Intrinsic::abs:
      case Intrinsic::smin:
      case Intrinsic::smax:
      case Intrinsic::umin:
      case Intrinsic::umax:
      case Intrinsic::sadd_sat:
      case Intrinsic::ssub_sat:
      case Intrinsic::uadd_sat:
      case Intrinsic::usub_sat:
      case Intrinsic::minnum:
      case Intrinsic::maxnum:
      case Intrinsic::fabs:
      case Intrinsic::fma:
      case Intrinsic::ceil:
      case Intrinsic::floor:
      case Intrinsic::rint:
      case Intrinsic::round:
      case Intrinsic::trunc:
        break;
      default:
        return false;
      }
      [[fallthrough]]; // Fall through to treating these like an operator below.
    }
    // Binary/tertiary ops
    case Instruction::Add:
    case Instruction::Sub:
    case Instruction::Mul:
    case Instruction::AShr:
    case Instruction::LShr:
    case Instruction::Shl:
    case Instruction::ICmp:
    case Instruction::FCmp:
    case Instruction::FAdd:
    case Instruction::FMul:
    case Instruction::Select:
      if (!Ops.insert(I))
        continue;

      // Fixed-vector operands are either followed (instructions) or recorded
      // as leaves that need their own shuffle (anything else). Operands of
      // any other type are ignored.
      for (Use &Op : I->operands()) {
        if (!isa<FixedVectorType>(Op->getType()))
          continue;
        if (isa<Instruction>(Op))
          Worklist.push_back(cast<Instruction>(&Op));
        else
          OtherLeafs.insert(&Op);
      }

      for (auto *Use : I->users())
        Worklist.push_back(cast<Instruction>(Use));
      break;

    case Instruction::ShuffleVector:
      // A shuffle of a splat is a splat.
      if (cast<ShuffleVectorInst>(I)->isZeroEltSplat())
        continue;
      [[fallthrough]];

    default:
      LLVM_DEBUG(dbgs() << "  Unhandled instruction: " << *I << "\n");
      return false;
    }
  }

  if (Exts.empty() && OtherLeafs.empty())
    return false;

  LLVM_DEBUG({
    dbgs() << "Found group:\n  Exts:\n";
    for (auto *I : Exts)
      dbgs() << "  " << *I << "\n";
    dbgs() << "  Ops:\n";
    for (auto *I : Ops)
      dbgs() << "  " << *I << "\n";
    dbgs() << "  OtherLeafs:\n";
    for (auto *I : OtherLeafs)
      dbgs() << "  " << *I->get() << " of " << *I->getUser() << "\n";
    dbgs() << "  Truncs:\n";
    for (auto *I : Truncs)
      dbgs() << "  " << *I << "\n";
    dbgs() << "  Reducts:\n";
    for (auto *I : Reducts)
      dbgs() << "  " << *I << "\n";
  });

  assert((!Truncs.empty() || !Reducts.empty()) &&
         "Expected some truncs or reductions");
  // Without a trunc or an extend there is no instruction to take the narrow
  // vector type from.
  if (Truncs.empty() && Exts.empty())
    return false;

  // The narrow vector type: the result type of a trunc if there is one,
  // otherwise the source type of an extend.
  auto *VT = !Truncs.empty()
                 ? cast<FixedVectorType>(Truncs[0]->getType())
                 : cast<FixedVectorType>(Exts[0]->getOperand(0)->getType());
  LLVM_DEBUG(dbgs() << "Using VT:" << *VT << "\n");

  // Check types
  unsigned NumElts = VT->getNumElements();
  // Lanes per interleaving chunk: 8 x 16-bit or 16 x 8-bit (128 bits per
  // chunk); every other element width is rejected.
  unsigned BaseElts = VT->getScalarSizeInBits() == 16
                          ? 8
                          : (VT->getScalarSizeInBits() == 8 ? 16 : 0);
  if (BaseElts == 0 || NumElts % BaseElts != 0) {
    LLVM_DEBUG(dbgs() << "  Type is unsupported\n");
    return false;
  }
  // The wide type operated on must have exactly twice the element width.
  if (Start->getOperand(0)->getType()->getScalarSizeInBits() !=
      VT->getScalarSizeInBits() * 2) {
    LLVM_DEBUG(dbgs() << "  Type not double sized\n");
    return false;
  }
  // Every extend must start from VT and every trunc must produce VT.
  for (Instruction *I : Exts)
    if (I->getOperand(0)->getType() != VT) {
      LLVM_DEBUG(dbgs() << "  Wrong type on " << *I << "\n");
      return false;
    }
  for (Instruction *I : Truncs)
    if (I->getType() != VT) {
      LLVM_DEBUG(dbgs() << "  Wrong type on " << *I << "\n");
      return false;
    }

  // Check that it looks beneficial
  if (!isProfitableToInterleave(Exts, Truncs))
    return false;
  // A group ending in a reduction is only transformed when it contains at
  // least one operation other than mul/select/icmp.
  if (!Reducts.empty() && (Ops.empty() || all_of(Ops, [](Instruction *I) {
                             return I->getOpcode() == Instruction::Mul ||
                                    I->getOpcode() == Instruction::Select ||
                                    I->getOpcode() == Instruction::ICmp;
                           }))) {
    LLVM_DEBUG(dbgs() << "Reduction does not look profitable\n");
    return false;
  }

  // Create new shuffles around the extends / truncs / other leaves.
  IRBuilder<> Builder(Start);

  // LeafMask gathers, within each chunk of BaseElts lanes, the even lanes
  // followed by the odd lanes. TruncMask is the inverse permutation and
  // restores the original lane order after the trunc.
  SmallVector<int, 16> LeafMask;
  SmallVector<int, 16> TruncMask;
  // LeafMask : 0, 2, 4, 6, 1, 3, 5, 7   8, 10, 12, 14,  9, 11, 13, 15
  // TruncMask: 0, 4, 1, 5, 2, 6, 3, 7   8, 12,  9, 13, 10, 14, 11, 15
  for (unsigned Base = 0; Base < NumElts; Base += BaseElts) {
    for (unsigned i = 0; i < BaseElts / 2; i++)
      LeafMask.push_back(Base + i * 2);
    for (unsigned i = 0; i < BaseElts / 2; i++)
      LeafMask.push_back(Base + i * 2 + 1);
  }
  for (unsigned Base = 0; Base < NumElts; Base += BaseElts) {
    for (unsigned i = 0; i < BaseElts / 2; i++) {
      TruncMask.push_back(Base + i);
      TruncMask.push_back(Base + i + BaseElts / 2);
    }
  }

  // Extends: shuffle the narrow input, re-create the same kind of extend on
  // the shuffled value and redirect all users of the old extend to it.
  for (Instruction *I : Exts) {
    LLVM_DEBUG(dbgs() << "Replacing ext " << *I << "\n");
    Builder.SetInsertPoint(I);
    Value *Shuffle = Builder.CreateShuffleVector(I->getOperand(0), LeafMask);
    bool FPext = isa<FPExtInst>(I);
    bool Sext = isa<SExtInst>(I);
    Value *Ext = FPext ? Builder.CreateFPExt(Shuffle, I->getType())
                       : Sext ? Builder.CreateSExt(Shuffle, I->getType())
                              : Builder.CreateZExt(Shuffle, I->getType());
    I->replaceAllUsesWith(Ext);
    LLVM_DEBUG(dbgs() << "  with " << *Shuffle << "\n");
  }

  // Other leaves: shuffle the operand value right before its user and rewire
  // only that one use.
  for (Use *I : OtherLeafs) {
    // Print the leaf value itself; streaming the Use directly would not
    // print the value.
    LLVM_DEBUG(dbgs() << "Replacing leaf " << *I->get() << "\n");
    Builder.SetInsertPoint(cast<Instruction>(I->getUser()));
    Value *Shuffle = Builder.CreateShuffleVector(I->get(), LeafMask);
    I->getUser()->setOperand(I->getOperandNo(), Shuffle);
    LLVM_DEBUG(dbgs() << "  with " << *Shuffle << "\n");
  }

  // Truncs: insert the restoring shuffle directly after the trunc.
  for (Instruction *I : Truncs) {
    LLVM_DEBUG(dbgs() << "Replacing trunc " << *I << "\n");

    Builder.SetInsertPoint(I->getParent(), ++I->getIterator());
    Value *Shuf = Builder.CreateShuffleVector(I, TruncMask);
    I->replaceAllUsesWith(Shuf);
    // The RAUW above also redirected the new shuffle's own use of the trunc,
    // so point its first operand back at the trunc.
    cast<Instruction>(Shuf)->setOperand(0, I);

    LLVM_DEBUG(dbgs() << "  with " << *Shuf << "\n");
  }

  return true;
}


/// Returns true if \p I is a call to the vector_reduce_add intrinsic.
///
/// Add reductions are fairly common and associative, meaning we can start the
/// interleaving from them and don't need to emit a shuffle.
static bool isAddReduction(Instruction &I) {
  const auto *Intrin = dyn_cast<IntrinsicInst>(&I);
  return Intrin && Intrin->getIntrinsicID() == Intrinsic::vector_reduce_add;
}


/// Pass entry point. Does nothing when the pass is disabled on the command
/// line or the subtarget lacks MVE integer operations. Otherwise walks the
/// instructions of \p F in reverse order and calls tryInterleave on every
/// vector trunc/fptrunc and every add reduction that an earlier attempt has
/// not already recorded as visited.
///
/// \returns true if any tryInterleave call modified the function.
bool MVELaneInterleaving::runOnFunction(Function &F) {
  if (!EnableInterleave)
    return false;

  auto &Machine = getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
  const auto &Subtarget = Machine.getSubtarget<ARMSubtarget>(F);
  if (!Subtarget.hasMVEIntegerOps())
    return false;

  bool MadeChange = false;

  // Roots reached by earlier walks; they are not used as start points again.
  SmallPtrSet<Instruction *, 16> Visited;
  for (Instruction &Inst : reverse(instructions(F))) {
    const bool IsVectorTrunc = Inst.getType()->isVectorTy() &&
                               (isa<TruncInst>(Inst) || isa<FPTruncInst>(Inst));
    if (!IsVectorTrunc && !isAddReduction(Inst))
      continue;
    if (Visited.count(&Inst))
      continue;
    if (tryInterleave(&Inst, Visited))
      MadeChange = true;
  }

  return MadeChange;
}

assert
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

ARMBaseInstrInfo.h

ARMSubtarget.h

ARM.h

instructions
Expand Atomic instructions
Definition AtomicExpandPass.cpp:183

E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")

Casting.h

DerivedTypes.h

runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition EntryExitInstrumenter.cpp:103

DEBUG_TYPE
#define DEBUG_TYPE
Definition GenericCycleImpl.h:31

IRBuilder.h

BasicBlock.h

Function.h

Instruction.h

IntrinsicInst.h

Type.h

Value.h

InitializePasses.h

InstIterator.h

InstrTypes.h

Instructions.h

Intrinsics.h

Ops
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Definition ItaniumDemangle.h:3368

F
#define F(x, y, z)
Definition MD5.cpp:55

I
#define I(x, y, z)
Definition MD5.cpp:58

isProfitableToInterleave
static bool isProfitableToInterleave(SmallSetVector< Instruction *, 4 > &Exts, SmallSetVector< Instruction *, 4 > &Truncs)
Definition MVELaneInterleavingPass.cpp:109

tryInterleave
static bool tryInterleave(Instruction *Start, SmallPtrSetImpl< Instruction * > &Visited)
Definition MVELaneInterleavingPass.cpp:148

EnableInterleave
static cl::opt< bool > EnableInterleave("enable-mve-interleave", cl::Hidden, cl::init(true), cl::desc("Enable interleave MVE vector operation lowering"))

isAddReduction
static bool isAddReduction(Instruction &I)
Definition MVELaneInterleavingPass.cpp:392

T
#define T
Definition Mips16ISelLowering.cpp:353

II
uint64_t IntrinsicInst * II
Definition NVVMIntrRange.cpp:46

INITIALIZE_PASS
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56

Pass.h

SetVector.h
This file implements a set that has insertion order iteration characteristics.

LLVM_DEBUG
#define LLVM_DEBUG(...)
Definition Debug.h:114

getType
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39

TargetLowering.h
This file describes how to lower LLVM code to machine code.

TargetPassConfig.h
Target-Independent Code Generator Pass Configuration Options pass.

TargetTransformInfo.h
This pass exposes codegen information to IR-level passes.

llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition PassAnalysisSupport.h:76

llvm::AnalysisUsage::setPreservesCFG
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270

llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314

llvm::Function
Definition Function.h:64

llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788

llvm::Instruction
Definition Instruction.h:69

llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition IntrinsicInst.h:49

llvm::PassRegistry::getPassRegistry
static LLVM_ABI PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition PassRegistry.cpp:24

llvm::Pass
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99

llvm::SetVector::count
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition SetVector.h:261

llvm::SetVector::empty
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:99

llvm::SetVector::insert
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:150

llvm::SmallPtrSetImpl
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition SmallPtrSet.h:368

llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition SmallPtrSet.h:455

llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition SmallPtrSet.h:389

llvm::SmallSetVector
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:338

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition SmallVector.h:416

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition SmallVector.h:1196

llvm::Use
A Use represents the edge between a Value definition and its users.
Definition Use.h:35

llvm::Value
LLVM Value Representation.
Definition Value.h:75

llvm::cl::opt
Definition CommandLine.h:1455

Changed
Changed
Definition ObjCARCOpts.cpp:2369

llvm::ARM_MB::ST
@ ST
Definition ARMBaseInfo.h:73

llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24

llvm::SystemZISD::TM
@ TM
Definition SystemZISelLowering.h:66

llvm::cl::Hidden
@ Hidden
Definition CommandLine.h:139

llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition CommandLine.h:445

llvm::sampleprof::Base
@ Base
Definition Discriminator.h:58

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition AddressRanges.h:18

llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725

llvm::createMVELaneInterleavingPass
Pass * createMVELaneInterleavingPass()

llvm::dyn_cast
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643

llvm::initializeMVELaneInterleavingPass
void initializeMVELaneInterleavingPass(PassRegistry &)

llvm::reverse
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406

llvm::dbgs
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207

llvm::isa
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547

llvm::Op
DWARFExpression::Operation Op
Definition DWARFExpressionPrinter.cpp:22

llvm::cast
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559

llvm::cl::desc
Definition CommandLine.h:411