50#define DEBUG_TYPE "interleaved-load-combine"
55STATISTIC(NumInterleavedLoadCombine,
"Number of combined loads");
60 cl::desc(
"Disable combining of interleaved loads"));
64struct InterleavedLoadCombineImpl {
69 :
F(
F), DT(DT), MSSA(MSSA),
70 TLI(*TM.getSubtargetImpl(
F)->getTargetLowering()),
TTI(
TTI) {}
94 LoadInst *findFirstLoad(
const std::set<LoadInst *> &LIs);
99 bool combine(std::list<VectorInfo> &InterleavedLoad,
104 bool findPattern(std::list<VectorInfo> &Candidates,
105 std::list<VectorInfo> &InterleavedLoad,
unsigned Factor,
187 Polynomial(
Value *V) : V(V) {
188 IntegerType *Ty = dyn_cast<IntegerType>(V->getType());
196 Polynomial(
const APInt &A,
unsigned ErrorMSBs = 0)
197 : ErrorMSBs(ErrorMSBs), A(A) {}
200 : ErrorMSBs(ErrorMSBs), A(
BitWidth, A) {}
202 Polynomial() =
default;
205 void incErrorMSBs(
unsigned amt) {
206 if (ErrorMSBs == (
unsigned)-1)
210 if (ErrorMSBs > A.getBitWidth())
211 ErrorMSBs = A.getBitWidth();
215 void decErrorMSBs(
unsigned amt) {
216 if (ErrorMSBs == (
unsigned)-1)
226 Polynomial &add(
const APInt &
C) {
243 if (
C.getBitWidth() != A.getBitWidth()) {
253 Polynomial &mul(
const APInt &
C) {
304 if (
C.getBitWidth() != A.getBitWidth()) {
322 decErrorMSBs(
C.countr_zero());
325 pushBOperation(
Mul,
C);
330 Polynomial &lshr(
const APInt &
C) {
461 if (
C.getBitWidth() != A.getBitWidth()) {
470 unsigned shiftAmt =
C.getZExtValue();
471 if (shiftAmt >=
C.getBitWidth())
472 return mul(
APInt(
C.getBitWidth(), 0));
479 if (A.countr_zero() < shiftAmt)
480 ErrorMSBs = A.getBitWidth();
482 incErrorMSBs(shiftAmt);
485 pushBOperation(LShr,
C);
486 A = A.lshr(shiftAmt);
492 Polynomial &sextOrTrunc(
unsigned n) {
493 if (n < A.getBitWidth()) {
496 decErrorMSBs(A.getBitWidth() - n);
498 pushBOperation(Trunc,
APInt(
sizeof(n) * 8, n));
500 if (n > A.getBitWidth()) {
503 incErrorMSBs(n - A.getBitWidth());
505 pushBOperation(SExt,
APInt(
sizeof(n) * 8, n));
512 bool isFirstOrder()
const {
return V !=
nullptr; }
515 bool isCompatibleTo(
const Polynomial &o)
const {
517 if (A.getBitWidth() != o.A.getBitWidth())
521 if (!isFirstOrder() && !o.isFirstOrder())
529 if (B.size() != o.B.size())
532 auto *ob = o.B.begin();
533 for (
const auto &b : B) {
544 Polynomial
operator-(
const Polynomial &o)
const {
546 if (!isCompatibleTo(o))
552 return Polynomial(A - o.A, std::max(ErrorMSBs, o.ErrorMSBs));
557 Polynomial Result(*
this);
564 Polynomial Result(*
this);
570 bool isProvenEqualTo(
const Polynomial &o) {
572 Polynomial r = *
this - o;
573 return (r.ErrorMSBs == 0) && (!r.isFirstOrder()) && (r.A.isZero());
578 OS <<
"[{#ErrBits:" << ErrorMSBs <<
"} ";
583 OS <<
"(" << *V <<
") ";
601 OS << b.second <<
") ";
605 OS <<
"+ " << A <<
"]";
614 void pushBOperation(
const BOps
Op,
const APInt &
C) {
615 if (isFirstOrder()) {
616 B.push_back(std::make_pair(
Op,
C));
638 VectorInfo(
const VectorInfo &c) : VTy(c.VTy) {
640 "Copying VectorInfo is neither implemented nor necessary,");
653 ElementInfo(Polynomial
Offset = Polynomial(),
LoadInst *LI =
nullptr)
664 std::set<LoadInst *> LIs;
667 std::set<Instruction *> Is;
682 VectorInfo &operator=(
const VectorInfo &other) =
delete;
684 virtual ~VectorInfo() {
delete[] EI; }
695 bool isInterleaved(
unsigned Factor,
const DataLayout &
DL)
const {
697 for (
unsigned i = 1; i < getDimension(); i++) {
698 if (!EI[i].Ofs.isProvenEqualTo(EI[0].Ofs + i * Factor *
Size)) {
716 return computeFromSVI(SVI, Result,
DL);
717 LoadInst *LI = dyn_cast<LoadInst>(V);
719 return computeFromLI(LI, Result,
DL);
722 return computeFromBCI(BCI, Result,
DL);
732 static bool computeFromBCI(
BitCastInst *BCI, VectorInfo &Result,
747 unsigned Factor = Result.VTy->getNumElements() / VTy->
getNumElements();
748 unsigned NewSize =
DL.getTypeAllocSize(Result.VTy->getElementType());
751 if (NewSize * Factor != OldSize)
755 if (!compute(
Op, Old,
DL))
758 for (
unsigned i = 0; i < Result.VTy->getNumElements(); i += Factor) {
759 for (
unsigned j = 0; j < Factor; j++) {
761 ElementInfo(Old.EI[i / Factor].Ofs + j * NewSize,
762 j == 0 ? Old.EI[i / Factor].LI :
nullptr);
768 Result.LIs.insert(Old.LIs.begin(), Old.LIs.end());
769 Result.Is.insert(Old.Is.begin(), Old.Is.end());
770 Result.Is.insert(BCI);
771 Result.SVI =
nullptr;
793 VectorInfo
LHS(ArgTy);
798 VectorInfo
RHS(ArgTy);
827 Result.LIs.insert(
LHS.LIs.begin(),
LHS.LIs.end());
828 Result.Is.insert(
LHS.Is.begin(),
LHS.Is.end());
831 Result.LIs.insert(
RHS.LIs.begin(),
RHS.LIs.end());
832 Result.Is.insert(
RHS.Is.begin(),
RHS.Is.end());
834 Result.Is.insert(SVI);
840 "Invalid ShuffleVectorInst (index out of bounds)");
843 Result.EI[j] = ElementInfo();
846 Result.EI[j] =
LHS.EI[i];
848 Result.EI[j] = ElementInfo();
853 Result.EI[j] = ElementInfo();
869 static bool computeFromLI(
LoadInst *LI, VectorInfo &Result,
880 if (!
DL.typeSizeEqualsStoreSize(Result.VTy->getElementType()))
888 Result.LIs.insert(LI);
889 Result.Is.insert(LI);
891 for (
unsigned i = 0; i < Result.getDimension(); i++) {
896 int64_t Ofs =
DL.getIndexedOffsetInType(Result.VTy,
Idx);
897 Result.EI[i] = ElementInfo(
Offset + Ofs, i == 0 ? LI :
nullptr);
907 static void computePolynomialBinOp(
BinaryOperator &BO, Polynomial &Result) {
914 C = dyn_cast<ConstantInt>(
LHS);
920 case Instruction::Add:
924 computePolynomial(*
LHS, Result);
925 Result.add(
C->getValue());
928 case Instruction::LShr:
932 computePolynomial(*
LHS, Result);
933 Result.lshr(
C->getValue());
940 Result = Polynomial(&BO);
947 static void computePolynomial(
Value &V, Polynomial &Result) {
948 if (
auto *BO = dyn_cast<BinaryOperator>(&V))
949 computePolynomialBinOp(*BO, Result);
951 Result = Polynomial(&V);
960 static void computePolynomialFromPointer(
Value &
Ptr, Polynomial &Result,
966 Result = Polynomial();
970 unsigned PointerBits =
971 DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace());
974 if (isa<CastInst>(&
Ptr)) {
977 case Instruction::BitCast:
978 computePolynomialFromPointer(*CI.
getOperand(0), Result, BasePtr,
DL);
982 Polynomial(PointerBits, 0);
987 else if (isa<GetElementPtrInst>(&
Ptr)) {
990 APInt BaseOffset(PointerBits, 0);
993 if (
GEP.accumulateConstantOffset(
DL, BaseOffset)) {
994 Result = Polynomial(BaseOffset);
995 BasePtr =
GEP.getPointerOperand();
1000 unsigned idxOperand, e;
1002 for (idxOperand = 1, e =
GEP.getNumOperands(); idxOperand < e;
1004 ConstantInt *IDX = dyn_cast<ConstantInt>(
GEP.getOperand(idxOperand));
1011 if (idxOperand + 1 != e) {
1012 Result = Polynomial();
1018 computePolynomial(*
GEP.getOperand(idxOperand), Result);
1023 DL.getIndexedOffsetInType(
GEP.getSourceElementType(), Indices);
1026 unsigned ResultSize =
DL.getTypeAllocSize(
GEP.getResultElementType());
1027 Result.sextOrTrunc(PointerBits);
1028 Result.mul(
APInt(PointerBits, ResultSize));
1029 Result.add(BaseOffset);
1030 BasePtr =
GEP.getPointerOperand();
1037 Polynomial(
DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace()), 0);
1048 for (
unsigned i = 0; i < getDimension(); i++)
1049 OS << ((i == 0) ?
"[" :
", ") << EI[i].Ofs;
1057bool InterleavedLoadCombineImpl::findPattern(
1058 std::list<VectorInfo> &Candidates, std::list<VectorInfo> &InterleavedLoad,
1060 for (
auto C0 = Candidates.begin(), E0 = Candidates.end(); C0 != E0; ++C0) {
1063 unsigned Size =
DL.getTypeAllocSize(C0->VTy->getElementType());
1066 std::vector<std::list<VectorInfo>::iterator> Res(Factor, Candidates.end());
1068 for (
auto C = Candidates.begin(), E = Candidates.end();
C != E;
C++) {
1069 if (
C->VTy != C0->VTy)
1071 if (
C->BB != C0->BB)
1073 if (
C->PV != C0->PV)
1077 for (i = 1; i < Factor; i++) {
1078 if (
C->EI[0].Ofs.isProvenEqualTo(C0->EI[0].Ofs + i *
Size)) {
1083 for (i = 1; i < Factor; i++) {
1084 if (Res[i] == Candidates.end())
1093 if (Res[0] != Candidates.end()) {
1095 for (
unsigned i = 0; i < Factor; i++) {
1096 InterleavedLoad.splice(InterleavedLoad.end(), Candidates, Res[i]);
1106InterleavedLoadCombineImpl::findFirstLoad(
const std::set<LoadInst *> &LIs) {
1107 assert(!LIs.empty() &&
"No load instructions given.");
1115 return cast<LoadInst>(FLI);
1118bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
1125 LoadInst *InsertionPoint = InterleavedLoad.front().EI[0].LI;
1128 if (!InsertionPoint)
1131 std::set<LoadInst *> LIs;
1132 std::set<Instruction *> Is;
1133 std::set<Instruction *> SVIs;
1140 unsigned Factor = InterleavedLoad.size();
1143 for (
auto &VI : InterleavedLoad) {
1145 LIs.insert(
VI.LIs.begin(),
VI.LIs.end());
1150 Is.insert(
VI.Is.begin(),
VI.Is.end());
1153 SVIs.insert(
VI.SVI);
1163 for (
const auto &
I : Is) {
1168 if (SVIs.find(
I) != SVIs.end())
1173 for (
auto *U :
I->users()) {
1174 if (Is.find(dyn_cast<Instruction>(U)) == Is.end())
1190 auto FMA = MSSA.getMemoryAccess(
First);
1191 for (
auto *LI : LIs) {
1192 auto MADef = MSSA.getMemoryAccess(LI)->getDefiningAccess();
1193 if (!MSSA.dominates(MADef, FMA))
1196 assert(!LIs.empty() &&
"There are no LoadInst to combine");
1199 for (
auto &VI : InterleavedLoad) {
1200 if (!DT.dominates(InsertionPoint,
VI.SVI))
1207 Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType();
1208 unsigned ElementsPerSVI =
1209 cast<FixedVectorType>(InterleavedLoad.front().SVI->getType())
1213 auto Indices = llvm::to_vector<4>(llvm::seq<unsigned>(0, Factor));
1215 Instruction::Load, ILTy, Factor, Indices, InsertionPoint->
getAlign(),
1224 auto LI = Builder.CreateAlignedLoad(ILTy,
Ptr, InsertionPoint->
getAlign(),
1225 "interleaved.wide.load");
1227 MemoryUse *MSSALoad = cast<MemoryUse>(MSSAU.createMemoryAccessBefore(
1228 LI,
nullptr, MSSA.getMemoryAccess(InsertionPoint)));
1229 MSSAU.insertUse(MSSALoad,
true);
1233 for (
auto &VI : InterleavedLoad) {
1235 for (
unsigned j = 0;
j < ElementsPerSVI;
j++)
1236 Mask.push_back(i + j * Factor);
1238 Builder.SetInsertPoint(
VI.SVI);
1239 auto SVI = Builder.CreateShuffleVector(LI, Mask,
"interleaved.shuffle");
1240 VI.SVI->replaceAllUsesWith(SVI);
1244 NumInterleavedLoadCombine++;
1247 <<
"Load interleaved combined with factor "
1254bool InterleavedLoadCombineImpl::run() {
1256 bool changed =
false;
1257 unsigned MaxFactor = TLI.getMaxSupportedInterleaveFactor();
1259 auto &
DL =
F.getDataLayout();
1262 for (
unsigned Factor = MaxFactor; Factor >= 2; Factor--) {
1263 std::list<VectorInfo> Candidates;
1267 if (
auto SVI = dyn_cast<ShuffleVectorInst>(&
I)) {
1269 if (isa<ScalableVectorType>(SVI->getType()))
1272 Candidates.emplace_back(cast<FixedVectorType>(SVI->getType()));
1274 if (!VectorInfo::computeFromSVI(SVI, Candidates.back(),
DL)) {
1275 Candidates.pop_back();
1279 if (!Candidates.back().isInterleaved(Factor,
DL)) {
1280 Candidates.pop_back();
1286 std::list<VectorInfo> InterleavedLoad;
1287 while (findPattern(Candidates, InterleavedLoad, Factor,
DL)) {
1288 if (combine(InterleavedLoad, ORE)) {
1293 Candidates.splice(Candidates.begin(), InterleavedLoad,
1294 std::next(InterleavedLoad.begin()),
1295 InterleavedLoad.end());
1297 InterleavedLoad.clear();
1315 return "Interleaved Load Combine Pass";
1319 if (DisableInterleavedLoadCombine)
1322 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
1329 return InterleavedLoadCombineImpl(
1330 F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
1331 getAnalysis<MemorySSAWrapperPass>().getMSSA(),
1332 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
F),
1354 bool Changed = InterleavedLoadCombineImpl(
F, DT, MemSSA,
TTI, *TM).
run();
1358char InterleavedLoadCombine::ID = 0;
1362 "Combine interleaved loads into wide loads and shufflevector instructions",
1374 auto P =
new InterleavedLoadCombine();
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
Expand Atomic instructions
static const Function * getParent(const Value *V)
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
Module.h This file contains the declarations for the Module class.
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
This file describes how to lower LLVM code to machine code.
Target-Independent Code Generator Pass Configuration Options pass.
Class for arbitrary precision integers.
A container for analyses that lazily runs them and caches their results.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
LLVM Basic Block Representation.
InstListType::iterator iterator
Instruction iterators...
BinaryOps getOpcode() const
This class represents a no-op cast from one type to another.
This is the base class for all instructions that perform data casts.
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
This is the shared class of boolean and integer constants.
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Analysis pass which computes a DominatorTree.
Legacy analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
FunctionPass class - This class is used to implement most global optimizations.
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overriden by subclasses to do the per-function processing of the pass.
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isAtomic() const LLVM_READONLY
Return true if this instruction has an AtomicOrdering of unordered or higher.
Class to represent integer types.
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM)
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Align getAlign() const
Return the alignment of the access that is being performed.
An analysis that produces MemorySSA for a function.
Legacy analysis pass which computes MemorySSA.
Encapsulates MemorySSA, including all data associated with memory accesses.
Represents read-only accesses to memory.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
This instruction constructs a fixed permutation of two input vectors.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Analysis pass providing the TargetTransformInfo.
Result run(const Function &F, FunctionAnalysisManager &)
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
Primary interface to the complete machine description for the target machine.
The instances of the Type class are immutable: once they are created, they are never changed.
static IntegerType * getInt32Ty(LLVMContext &C)
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
LLVMContext & getContext() const
All values hold a context through their type.
Type * getElementType() const
const ParentTy * getParent() const
This class implements an extremely fast bulk output stream that can only output to a stream.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
void initializeInterleavedLoadCombinePass(PassRegistry &)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
constexpr unsigned BitWidth
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
APInt operator+(APInt a, const APInt &b)
FunctionPass * createInterleavedLoadCombinePass()
InterleavedLoadCombines Pass - This pass identifies interleaved loads and combines them into wide loa...
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.