#include "llvm/IR/IntrinsicsARM.h"

#define DEBUG_TYPE "arm-parallel-dsp"
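// This pass recognises integer reductions of the form:
//
//   acc += sext(load i16, A[i])   * sext(load i16, B[i])
//        + sext(load i16, A[i+1]) * sext(load i16, B[i+1])
//
// and rewrites them to use the ARM DSP intrinsics (smlad and friends),
// which perform two 16 x 16 -> 32 multiplies plus an accumulate in a
// single instruction. The sketch above is illustrative of the pattern,
// not lifted verbatim from the file's header comment.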
STATISTIC(NumSMLAD, "Number of smlad instructions generated");
static cl::opt<bool>
DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),
                   cl::desc("Disable the ARM Parallel DSP pass"));

static cl::opt<unsigned>
NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16),
             cl::desc("Limit the number of loads analysed"));
namespace {
  /// A candidate mul: the multiply instruction itself plus the two narrow
  /// values feeding it.
  struct MulCandidate {
    Instruction *Root;
    Value *LHS;
    Value *RHS;
    bool Exchange = false;
    bool Paired = false;
    SmallVector<LoadInst*, 2> VecLd; // Container for loads to widen.

    MulCandidate(Instruction *I, Value *LHS, Value *RHS)
        : Root(I), LHS(LHS), RHS(RHS) {}

    bool HasTwoLoadInputs() const {
      return isa<LoadInst>(LHS) && isa<LoadInst>(RHS);
    }

    LoadInst *getBaseLoad() const { return VecLd.front(); }
  };

  using MulCandList = SmallVector<std::unique_ptr<MulCandidate>, 8>;
  using MemInstList = SmallVectorImpl<LoadInst*>;
  using MulPairList = SmallVector<std::pair<MulCandidate*, MulCandidate*>, 8>;
  /// Represents a chain of adds, rooted at Root, that accumulates products
  /// of sign-extended narrow loads.
  class Reduction {
    Instruction *Root = nullptr;
    Value *Acc = nullptr;
    MulCandList Muls;
    MulPairList MulPairs;
    SetVector<Instruction*> Adds;

  public:
    Reduction() = delete;
    Reduction(Instruction *Add) : Root(Add) {}

    /// Record an Add instruction that is a part of this reduction.
    void InsertAdd(Instruction *I) { Adds.insert(I); }
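    /// Create a MulCandidate for each multiply feeding the reduction,
    /// looking through the sext that may sit between an add operand and
    /// the mul itself (hence the two shapes handled by GetMulOperand
    /// below).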
    void InsertMuls() {
      auto GetMulOperand = [](Value *V) -> Instruction* {
        if (auto *SExt = dyn_cast<SExtInst>(V)) {
          if (auto *I = dyn_cast<Instruction>(SExt->getOperand(0)))
            if (I->getOpcode() == Instruction::Mul)
              return I;
        } else if (auto *I = dyn_cast<Instruction>(V)) {
          if (I->getOpcode() == Instruction::Mul)
            return I;
        }
        return nullptr;
      };

      auto InsertMul = [this](Instruction *I) {
        Value *LHS = cast<Instruction>(I->getOperand(0))->getOperand(0);
        Value *RHS = cast<Instruction>(I->getOperand(1))->getOperand(0);
        Muls.push_back(std::make_unique<MulCandidate>(I, LHS, RHS));
      };

      for (auto *Add : Adds) {
        if (Add == Acc)
          continue;
        if (auto *Mul = GetMulOperand(Add->getOperand(0)))
          InsertMul(Mul);
        if (auto *Mul = GetMulOperand(Add->getOperand(1)))
          InsertMul(Mul);
      }
    }
    /// Add the incoming accumulator value; returns false if a value has
    /// already been recorded.
    bool InsertAcc(Value *V) {
      if (Acc)
        return false;
      Acc = V;
      return true;
    }
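    // When Exchange is set, the second mul reads its pair of 16-bit values
    // in swapped order, which later selects the exchanging intrinsic forms
    // (smladx/smlaldx) instead of smlad/smlald.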
    /// Record two MulCandidates that can be executed as a single parallel
    /// operation.
    void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1,
                    bool Exchange = false) {
      LLVM_DEBUG(dbgs() << "Pairing:\n"
                        << *Mul0->Root << "\n"
                        << *Mul1->Root << "\n");
      Mul0->Paired = true;
      Mul1->Paired = true;
      if (Exchange)
        Mul1->Exchange = true;
      MulPairs.push_back(std::make_pair(Mul0, Mul1));
    }
    /// Return the add instruction which is the root of the reduction.
    Instruction *getRoot() { return Root; }

    bool is64Bit() const { return Root->getType()->isIntegerTy(64); }

    Type *getType() const { return Root->getType(); }

    /// Return the incoming value to be accumulated; this may be null.
    Value *getAccumulator() { return Acc; }

    /// Return the set of adds that comprise the reduction.
    SetVector<Instruction*> &getAdds() { return Adds; }

    /// Return the MulCandidates that comprise the reduction.
    MulCandList &getMuls() { return Muls; }

    /// Return the MulCandidates that have been paired for parallel
    /// execution.
    MulPairList &getMulPairs() { return MulPairs; }
    /// To finalise, replace the uses of the root with the intrinsic call.
    void UpdateRoot(Instruction *SMLAD) {
      Root->replaceAllUsesWith(SMLAD);
    }
    void dump() {
      for (auto *Add : Adds)
        LLVM_DEBUG(dbgs() << *Add << "\n");
      for (auto &Mul : Muls)
        LLVM_DEBUG(dbgs() << *Mul->Root << "\n"
                          << "  " << *Mul->LHS << "\n"
                          << "  " << *Mul->RHS << "\n");
    }
  };
  /// A wide load that replaces two adjacent narrow loads.
  class WidenedLoad {
    LoadInst *NewLd = nullptr;
    SmallVector<LoadInst*, 4> Loads;

  public:
    WidenedLoad(SmallVectorImpl<LoadInst*> &Lds, LoadInst *Wide)
        : NewLd(Wide) {
      append_range(Loads, Lds);
    }

    LoadInst *getLoad() { return NewLd; }
  };
  class ARMParallelDSP : public FunctionPass {
    ScalarEvolution *SE;
    AliasAnalysis *AA;
    TargetLibraryInfo *TLI;
    DominatorTree *DT;
    const DataLayout *DL;
    Module *M;
    std::map<LoadInst*, LoadInst*> LoadPairs;
    SmallPtrSet<LoadInst*, 4> OffsetLoads;
    std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;
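    // LoadPairs maps a base load to the consecutive load that follows it;
    // OffsetLoads remembers every load already claimed as the second half
    // of a pair; WideLoads caches the widened load created for each base,
    // so pairs sharing a base load are only widened once.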
    template<unsigned MaxBitWidth>
    bool IsNarrowSequence(Value *V);
    bool Search(Value *V, BasicBlock *BB, Reduction &R);
    bool RecordMemoryOps(BasicBlock *BB);
    void InsertParallelMACs(Reduction &R);
    bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
    LoadInst* CreateWideLoad(MemInstList &Loads, IntegerType *LoadTy);
    bool CreateParallelPairs(Reduction &R);
    bool MatchSMLAD(Function &F);
  public:
    static char ID;

    ARMParallelDSP() : FunctionPass(ID) { }

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      FunctionPass::getAnalysisUsage(AU);
      AU.addRequired<ScalarEvolutionWrapperPass>();
      AU.addRequired<AAResultsWrapperPass>();
      AU.addRequired<TargetLibraryInfoWrapperPass>();
      AU.addRequired<DominatorTreeWrapperPass>();
      AU.addRequired<TargetPassConfig>();
      AU.addPreserved<ScalarEvolutionWrapperPass>();
      AU.addPreserved<GlobalsAAWrapperPass>();
      AU.setPreservesCFG();
    }
    bool runOnFunction(Function &F) override {
      if (DisableParallelDSP)
        return false;

      if (skipFunction(F))
        return false;

      SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
      AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
      TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
      DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
      auto &TPC = getAnalysis<TargetPassConfig>();

      M = F.getParent();
      DL = &M->getDataLayout();

      auto &TM = TPC.getTM<TargetMachine>();
      auto *ST = &TM.getSubtarget<ARMSubtarget>(F);

      if (!ST->allowsUnalignedMem()) {
        LLVM_DEBUG(dbgs() << "Unaligned memory access not supported: not "
                             "running pass ARMParallelDSP\n");
        return false;
      }

      if (!ST->hasDSP()) {
        LLVM_DEBUG(dbgs() << "DSP extension not enabled: not running pass "
                             "ARMParallelDSP\n");
        return false;
      }

      if (!ST->isLittle()) {
        LLVM_DEBUG(dbgs() << "Only supporting little endian: not running pass "
                          << "ARMParallelDSP\n");
        return false;
      }

      bool Changes = MatchSMLAD(F);
      return Changes;
    }
  };
} // end anonymous namespace
bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
                                        MemInstList &VecMem) {
  if (!Ld0 || !Ld1)
    return false;

  // Only treat the loads as sequential if RecordMemoryOps paired them.
  auto It = LoadPairs.find(Ld0);
  if (It == LoadPairs.end() || It->second != Ld1)
    return false;

  VecMem.clear();
  VecMem.push_back(Ld0);
  VecMem.push_back(Ld1);
  return true;
}
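// A "narrow sequence" is a value known to fit in MaxBitWidth bits: here, a
// sign extension from exactly MaxBitWidth bits whose source is a load that
// RecordMemoryOps has already recorded as pairable.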
template<unsigned MaxBitWidth>
bool ARMParallelDSP::IsNarrowSequence(Value *V) {
  if (auto *SExt = dyn_cast<SExtInst>(V)) {
    if (SExt->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
      return false;

    if (auto *Ld = dyn_cast<LoadInst>(SExt->getOperand(0)))
      return LoadPairs.count(Ld) || OffsetLoads.count(Ld);
  }
  return false;
}
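// Scan a basic block for loads that could be widened: collect candidate
// loads and memory writes, record which writes a load depends on (RAW),
// then pair up consecutive loads that no intervening write would make
// unsafe to merge.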
bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
  SmallVector<LoadInst*, 8> Loads;
  SmallVector<Instruction*, 8> Writes;
  LoadPairs.clear();
  WideLoads.clear();

  // Collect loads and instructions that may write to memory. For now we
  // only record loads which are simple, sign-extended and have a single
  // user.
  for (auto &I : *BB) {
    if (I.mayWriteToMemory())
      Writes.push_back(&I);
    auto *Ld = dyn_cast<LoadInst>(&I);
    if (!Ld || !Ld->isSimple() ||
        !Ld->hasOneUse() || !isa<SExtInst>(Ld->user_back()))
      continue;
    Loads.push_back(Ld);
  }

  if (Loads.empty() || Loads.size() > NumLoadLimit)
    return false;

  using InstSet = std::set<Instruction*>;
  using DepMap = std::map<Instruction*, InstSet>;
  DepMap RAWDeps;

  // Record any writes that may alias a load.
  const auto Size = LocationSize::beforeOrAfterPointer();
  for (auto *Write : Writes) {
    for (auto *Read : Loads) {
      MemoryLocation ReadLoc =
          MemoryLocation(Read->getPointerOperand(), Size);

      if (!isModOrRefSet(AA->getModRefInfo(Write, ReadLoc)))
        continue;
      if (Write->comesBefore(Read))
        RAWDeps[Read].insert(Write);
    }
  }

  // Check whether a write lies between the two loads, which would prevent
  // them from being safely merged.
  auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
    bool BaseFirst = Base->comesBefore(Offset);
    LoadInst *Dominator = BaseFirst ? Base : Offset;
    LoadInst *Dominated = BaseFirst ? Offset : Base;

    if (auto It = RAWDeps.find(Dominated); It != RAWDeps.end()) {
      InstSet &WritesBefore = It->second;

      for (auto *Before : WritesBefore) {
        // We can't move the second load backward, past a write, to merge
        // with the first load.
        if (Dominator->comesBefore(Before))
          return false;
      }
    }
    return true;
  };

  // Record base, offset load pairs.
  for (auto *Base : Loads) {
    for (auto *Offset : Loads) {
      if (Base == Offset || OffsetLoads.count(Offset))
        continue;

      if (isConsecutiveAccess(Base, Offset, *DL, *SE) &&
          SafeToPair(Base, Offset)) {
        LoadPairs[Base] = Offset;
        OffsetLoads.insert(Offset);
        break;
      }
    }
  }

  LLVM_DEBUG(if (!LoadPairs.empty()) {
               dbgs() << "Consecutive load pairs:\n";
               for (auto &MapIt : LoadPairs)
                 dbgs() << *MapIt.first << ", " << *MapIt.second << "\n";
             });
  return LoadPairs.size() > 1;
}
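// Search recursively walks the operands of V (within BB) to decide whether
// it forms part of a reduction: adds extend the chain, muls must multiply
// two narrow (16-bit) sequences, sexts are looked through, and anything
// else becomes a candidate for the incoming accumulator.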
bool ARMParallelDSP::Search(Value *V, BasicBlock *BB, Reduction &R) {
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return R.InsertAcc(V);

  if (I->getParent() != BB)
    return false;

  switch (I->getOpcode()) {
  default:
    break;
  case Instruction::PHI:
    // Could be the incoming accumulator value.
    return R.InsertAcc(V);
  case Instruction::Add: {
    R.InsertAdd(I);
    Value *LHS = I->getOperand(0);
    Value *RHS = I->getOperand(1);
    bool ValidLHS = Search(LHS, BB, R);
    bool ValidRHS = Search(RHS, BB, R);

    if (ValidLHS && ValidRHS)
      return true;

    // Ensure we don't add the root as the incoming accumulator.
    if (R.getRoot() == I)
      return false;

    return R.InsertAcc(I);
  }
  case Instruction::Mul: {
    Value *MulOp0 = I->getOperand(0);
    Value *MulOp1 = I->getOperand(1);
    return IsNarrowSequence<16>(MulOp0) && IsNarrowSequence<16>(MulOp1);
  }
  case Instruction::SExt:
    return Search(I->getOperand(0), BB, R);
  }
  return false;
}
bool ARMParallelDSP::MatchSMLAD(Function &F) {
  bool Changed = false;

  for (auto &BB : F) {
    SmallPtrSet<Instruction*, 4> AllAdds;
    if (!RecordMemoryOps(&BB))
      continue;

    for (Instruction &I : reverse(BB)) {
      if (I.getOpcode() != Instruction::Add)
        continue;

      if (AllAdds.count(&I))
        continue;

      const auto *Ty = I.getType();
      if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
        continue;

      Reduction R(&I);
      if (!Search(&I, &BB, R))
        continue;

      R.InsertMuls();
      LLVM_DEBUG(dbgs() << "After search, Reduction:\n"; R.dump());

      if (!CreateParallelPairs(R))
        continue;

      InsertParallelMACs(R);
      Changed = true;
      AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
      LLVM_DEBUG(dbgs() << "BB after inserting parallel MACs:\n" << BB);
    }
  }

  return Changed;
}
bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {
  // Not enough mul operations to make a pair.
  if (R.getMuls().size() < 2)
    return false;

  // Check that the muls operate directly on sign-extended loads.
  for (auto &MulCand : R.getMuls()) {
    if (!MulCand->HasTwoLoadInputs())
      return false;
  }

  auto CanPair = [&](Reduction &R, MulCandidate *PMul0, MulCandidate *PMul1) {
    // The operands of each mul should be loads with sexts. If we find two
    // pairs of consecutive loads, they can be transformed into two wider
    // loads and the users replaced with DSP intrinsics.
    auto Ld0 = static_cast<LoadInst*>(PMul0->LHS);
    auto Ld1 = static_cast<LoadInst*>(PMul1->LHS);
    auto Ld2 = static_cast<LoadInst*>(PMul0->RHS);
    auto Ld3 = static_cast<LoadInst*>(PMul1->RHS);

    // Check that each mul is operating on two different loads.
    if (Ld0 == Ld2 || Ld1 == Ld3)
      return false;

    if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
      if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
        LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
        R.AddMulPair(PMul0, PMul1);
        return true;
      } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
        LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
        LLVM_DEBUG(dbgs() << "    exchanging Ld2 and Ld3\n");
        R.AddMulPair(PMul0, PMul1, true);
        return true;
      }
    } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
               AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
      LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
      LLVM_DEBUG(dbgs() << "    exchanging Ld0 and Ld1\n");
      LLVM_DEBUG(dbgs() << "    and swapping muls\n");
      // Only the second operand can be exchanged, so swap the muls.
      R.AddMulPair(PMul1, PMul0, true);
      return true;
    }
    return false;
  };

  MulCandList &Muls = R.getMuls();
  const unsigned Elems = Muls.size();
  for (unsigned i = 0; i < Elems; ++i) {
    MulCandidate *PMul0 = static_cast<MulCandidate*>(Muls[i].get());
    if (PMul0->Paired)
      continue;

    for (unsigned j = 0; j < Elems; ++j) {
      if (i == j)
        continue;

      MulCandidate *PMul1 = static_cast<MulCandidate*>(Muls[j].get());
      if (PMul1->Paired)
        continue;

      assert(PMul0 != PMul1 && "expected different chains");

      if (CanPair(R, PMul0, PMul1))
        break;
    }
  }

  return !R.getMulPairs().empty();
}
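// InsertParallelMACs materialises the chosen pairs. The intrinsic is
// selected by accumulator width and operand order: smlad/smladx accumulate
// into 32 bits, smlald/smlaldx into 64 bits, with the 'x' forms exchanging
// the halfwords of the second operand.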
void ARMParallelDSP::InsertParallelMACs(Reduction &R) {

  auto CreateSMLAD = [&](LoadInst *WideLd0, LoadInst *WideLd1,
                         Value *Acc, bool Exchange,
                         Instruction *InsertAfter) {
    // Replace the reduction chain with an intrinsic call.
    Value *Args[] = { WideLd0, WideLd1, Acc };
    Function *SMLAD = nullptr;
    if (Exchange)
      SMLAD = Acc->getType()->isIntegerTy(32)
                  ? Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_smladx)
                  : Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_smlaldx);
    else
      SMLAD = Acc->getType()->isIntegerTy(32)
                  ? Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_smlad)
                  : Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_smlald);

    IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
                                BasicBlock::iterator(InsertAfter));
    Instruction *Call = Builder.CreateCall(SMLAD, Args);
    NumSMLAD++;
    return Call;
  };

  // Return the instruction after the dominated instruction.
  auto GetInsertPoint = [this](Value *A, Value *B) {
    assert((isa<Instruction>(A) || isa<Instruction>(B)) &&
           "expected at least one instruction");

    Value *V = nullptr;
    if (!isa<Instruction>(A))
      V = B;
    else if (!isa<Instruction>(B))
      V = A;
    else
      V = DT->dominates(cast<Instruction>(A), cast<Instruction>(B)) ? B : A;

    return &*++BasicBlock::iterator(cast<Instruction>(V));
  };

  Value *Acc = R.getAccumulator();

  // For any muls that were discovered but not paired, accumulate their
  // values as before.
  IRBuilder<NoFolder> Builder(R.getRoot()->getParent());
  MulCandList &MulCands = R.getMuls();
  for (auto &MulCand : MulCands) {
    if (MulCand->Paired)
      continue;

    Instruction *Mul = cast<Instruction>(MulCand->Root);
    if (!Acc) {
      Acc = Mul;
      continue;
    }

    if (Acc->getType() != Mul->getType()) {
      assert(R.is64Bit() && "expected 64-bit result");
      Builder.SetInsertPoint(&*++BasicBlock::iterator(Mul));
      Mul = cast<Instruction>(Builder.CreateSExt(Mul, R.getRoot()->getType()));
    }

    Builder.SetInsertPoint(GetInsertPoint(Mul, Acc));
    Acc = Builder.CreateAdd(Mul, Acc);
  }

  if (!Acc) {
    Acc = R.is64Bit()
              ? ConstantInt::get(IntegerType::get(M->getContext(), 64), 0)
              : ConstantInt::get(IntegerType::get(M->getContext(), 32), 0);
  } else if (Acc->getType() != R.getType()) {
    Builder.SetInsertPoint(R.getRoot());
    Acc = Builder.CreateSExt(Acc, R.getType());
  }

  // Roughly sort the mul pairs into their program order.
  llvm::sort(R.getMulPairs(), [](auto &PairA, auto &PairB) {
    const Instruction *A = PairA.first->Root;
    const Instruction *B = PairB.first->Root;
    return A->comesBefore(B);
  });

  IntegerType *Ty = IntegerType::get(M->getContext(), 32);
  for (auto &Pair : R.getMulPairs()) {
    MulCandidate *LHSMul = Pair.first;
    MulCandidate *RHSMul = Pair.second;
    LoadInst *BaseLHS = LHSMul->getBaseLoad();
    LoadInst *BaseRHS = RHSMul->getBaseLoad();
    auto LIt = WideLoads.find(BaseLHS);
    LoadInst *WideLHS = LIt != WideLoads.end()
                            ? LIt->second->getLoad()
                            : CreateWideLoad(LHSMul->VecLd, Ty);
    auto RIt = WideLoads.find(BaseRHS);
    LoadInst *WideRHS = RIt != WideLoads.end()
                            ? RIt->second->getLoad()
                            : CreateWideLoad(RHSMul->VecLd, Ty);

    Instruction *InsertAfter = GetInsertPoint(WideLHS, WideRHS);
    InsertAfter = GetInsertPoint(InsertAfter, Acc);
    Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter);
  }
  R.UpdateRoot(cast<Instruction>(Acc));
}
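// CreateWideLoad replaces two adjacent narrow loads with a single wide
// load: the bottom half is recovered with a trunc and the top half with a
// lshr+trunc, each feeding a new sext that takes over the users of the
// original extensions.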
LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads,
                                         IntegerType *LoadTy) {
  assert(Loads.size() == 2 && "currently only support widening two loads");

  LoadInst *Base = Loads[0];
  LoadInst *Offset = Loads[1];

  Instruction *BaseSExt = dyn_cast<SExtInst>(Base->user_back());
  Instruction *OffsetSExt = dyn_cast<SExtInst>(Offset->user_back());

  assert((BaseSExt && OffsetSExt) &&
         "Loads should have a single, extending, user");

  std::function<void(Value*, Value*)> MoveBefore =
      [&](Value *A, Value *B) -> void {
    if (!isa<Instruction>(A) || !isa<Instruction>(B))
      return;

    auto *Source = cast<Instruction>(A);
    auto *Sink = cast<Instruction>(B);

    if (DT->dominates(Source, Sink) ||
        Source->getParent() != Sink->getParent() ||
        isa<PHINode>(Source) || isa<PHINode>(Sink))
      return;

    Source->moveBefore(Sink);
    for (auto &Op : Source->operands())
      MoveBefore(Op, Source);
  };

  // Insert the wide load at the point of the original dominating load.
  LoadInst *DomLoad = DT->dominates(Base, Offset) ? Base : Offset;
  IRBuilder<NoFolder> IRB(DomLoad->getParent(),
                          ++BasicBlock::iterator(DomLoad));

  // Create the wide load, while making sure to maintain the original
  // alignment as this prevents ldrd from being generated when it could be
  // illegal due to memory alignment.
  Value *VecPtr = Base->getPointerOperand();
  LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr, Base->getAlign());

  // Make sure everything is in the correct order in the basic block.
  MoveBefore(Base->getPointerOperand(), VecPtr);
  MoveBefore(VecPtr, WideLoad);

  // From the wide load, create two values that equal the original two
  // loads: Loads[0] needs a trunc while Loads[1] needs a lshr and trunc.
  Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
  Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType());
  BaseSExt->replaceAllUsesWith(NewBaseSExt);

  IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
  Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
  Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
  Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
  Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType());
  OffsetSExt->replaceAllUsesWith(NewOffsetSExt);

  LLVM_DEBUG(dbgs() << "From Base and Offset:\n"
                    << *Base << "\n" << *Offset << "\n"
                    << "Created Wide Load:\n"
                    << *WideLoad << "\n"
                    << *Bottom << "\n"
                    << *NewBaseSExt << "\n"
                    << *Top << "\n"
                    << *Trunc << "\n"
                    << *NewOffsetSExt << "\n");
  WideLoads.emplace(std::make_pair(
      Base, std::make_unique<WidenedLoad>(Loads, WideLoad)));
  return WideLoad;
}
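// The factory below is what the ARM backend invokes from its pass-pipeline
// setup (typically at higher optimisation levels) to schedule this pass;
// the INITIALIZE_PASS macros register it with the legacy pass manager
// under the "arm-parallel-dsp" name.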
Pass *llvm::createARMParallelDSPPass() {
  return new ARMParallelDSP();
}

char ARMParallelDSP::ID = 0;

INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
                      "Transform functions to use DSP intrinsics",
                      false, false)
INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
                    "Transform functions to use DSP intrinsics",
                    false, false)