50#define DEBUG_TYPE "interleaved-load-combine"
55STATISTIC(NumInterleavedLoadCombine,
"Number of combined loads");
60 cl::desc(
"Disable combining of interleaved loads"));
64struct InterleavedLoadCombineImpl {
68 :
F(
F), DT(DT), MSSA(MSSA),
69 TLI(*
TM.getSubtargetImpl(
F)->getTargetLowering()),
70 TTI(
TM.getTargetTransformInfo(
F)) {}
94 LoadInst *findFirstLoad(
const std::set<LoadInst *> &LIs);
99 bool combine(std::list<VectorInfo> &InterleavedLoad,
104 bool findPattern(std::list<VectorInfo> &Candidates,
105 std::list<VectorInfo> &InterleavedLoad,
unsigned Factor,
187 Polynomial(
Value *V) : V(V) {
188 IntegerType *Ty = dyn_cast<IntegerType>(V->getType());
196 Polynomial(
const APInt &A,
unsigned ErrorMSBs = 0)
197 : ErrorMSBs(ErrorMSBs), A(A) {}
200 : ErrorMSBs(ErrorMSBs), A(
BitWidth, A) {}
202 Polynomial() =
default;
205 void incErrorMSBs(
unsigned amt) {
206 if (ErrorMSBs == (
unsigned)-1)
210 if (ErrorMSBs > A.getBitWidth())
211 ErrorMSBs = A.getBitWidth();
215 void decErrorMSBs(
unsigned amt) {
216 if (ErrorMSBs == (
unsigned)-1)
226 Polynomial &add(
const APInt &
C) {
243 if (
C.getBitWidth() != A.getBitWidth()) {
253 Polynomial &mul(
const APInt &
C) {
304 if (
C.getBitWidth() != A.getBitWidth()) {
322 decErrorMSBs(
C.countr_zero());
325 pushBOperation(
Mul,
C);
330 Polynomial &lshr(
const APInt &
C) {
461 if (
C.getBitWidth() != A.getBitWidth()) {
470 unsigned shiftAmt =
C.getZExtValue();
471 if (shiftAmt >=
C.getBitWidth())
472 return mul(
APInt(
C.getBitWidth(), 0));
479 if (A.countr_zero() < shiftAmt)
480 ErrorMSBs = A.getBitWidth();
482 incErrorMSBs(shiftAmt);
485 pushBOperation(LShr,
C);
486 A = A.lshr(shiftAmt);
492 Polynomial &sextOrTrunc(
unsigned n) {
493 if (n < A.getBitWidth()) {
496 decErrorMSBs(A.getBitWidth() - n);
498 pushBOperation(Trunc,
APInt(
sizeof(n) * 8, n));
500 if (n > A.getBitWidth()) {
503 incErrorMSBs(n - A.getBitWidth());
505 pushBOperation(SExt,
APInt(
sizeof(n) * 8, n));
512 bool isFirstOrder()
const {
return V !=
nullptr; }
515 bool isCompatibleTo(
const Polynomial &o)
const {
517 if (A.getBitWidth() != o.A.getBitWidth())
521 if (!isFirstOrder() && !o.isFirstOrder())
529 if (B.size() != o.B.size())
532 auto *ob = o.B.begin();
533 for (
const auto &b : B) {
544 Polynomial
operator-(
const Polynomial &o)
const {
546 if (!isCompatibleTo(o))
552 return Polynomial(A - o.A, std::max(ErrorMSBs, o.ErrorMSBs));
557 Polynomial Result(*
this);
564 Polynomial Result(*
this);
570 bool isProvenEqualTo(
const Polynomial &o) {
572 Polynomial r = *
this - o;
573 return (r.ErrorMSBs == 0) && (!r.isFirstOrder()) && (r.A.isZero());
578 OS <<
"[{#ErrBits:" << ErrorMSBs <<
"} ";
583 OS <<
"(" << *V <<
") ";
601 OS << b.second <<
") ";
605 OS <<
"+ " << A <<
"]";
614 void pushBOperation(
const BOps
Op,
const APInt &
C) {
615 if (isFirstOrder()) {
616 B.push_back(std::make_pair(
Op,
C));
638 VectorInfo(
const VectorInfo &c) : VTy(c.VTy) {
640 "Copying VectorInfo is neither implemented nor necessary,");
653 ElementInfo(Polynomial
Offset = Polynomial(),
LoadInst *LI =
nullptr)
664 std::set<LoadInst *> LIs;
667 std::set<Instruction *> Is;
682 VectorInfo &operator=(
const VectorInfo &other) =
delete;
684 virtual ~VectorInfo() {
delete[] EI; }
695 bool isInterleaved(
unsigned Factor,
const DataLayout &
DL)
const {
697 for (
unsigned i = 1; i < getDimension(); i++) {
698 if (!EI[i].Ofs.isProvenEqualTo(EI[0].Ofs + i * Factor *
Size)) {
716 return computeFromSVI(SVI, Result,
DL);
717 LoadInst *LI = dyn_cast<LoadInst>(V);
719 return computeFromLI(LI, Result,
DL);
722 return computeFromBCI(BCI, Result,
DL);
732 static bool computeFromBCI(
BitCastInst *BCI, VectorInfo &Result,
747 unsigned Factor = Result.VTy->getNumElements() / VTy->
getNumElements();
748 unsigned NewSize =
DL.getTypeAllocSize(Result.VTy->getElementType());
751 if (NewSize * Factor != OldSize)
755 if (!compute(
Op, Old,
DL))
758 for (
unsigned i = 0; i < Result.VTy->getNumElements(); i += Factor) {
759 for (
unsigned j = 0; j < Factor; j++) {
761 ElementInfo(Old.EI[i / Factor].Ofs + j * NewSize,
762 j == 0 ? Old.EI[i / Factor].LI :
nullptr);
768 Result.LIs.insert(Old.LIs.begin(), Old.LIs.end());
769 Result.Is.insert(Old.Is.begin(), Old.Is.end());
770 Result.Is.insert(BCI);
771 Result.SVI =
nullptr;
793 VectorInfo
LHS(ArgTy);
798 VectorInfo
RHS(ArgTy);
827 Result.LIs.insert(
LHS.LIs.begin(),
LHS.LIs.end());
828 Result.Is.insert(
LHS.Is.begin(),
LHS.Is.end());
831 Result.LIs.insert(
RHS.LIs.begin(),
RHS.LIs.end());
832 Result.Is.insert(
RHS.Is.begin(),
RHS.Is.end());
834 Result.Is.insert(SVI);
840 "Invalid ShuffleVectorInst (index out of bounds)");
843 Result.EI[j] = ElementInfo();
846 Result.EI[j] =
LHS.EI[i];
848 Result.EI[j] = ElementInfo();
853 Result.EI[j] = ElementInfo();
869 static bool computeFromLI(
LoadInst *LI, VectorInfo &Result,
885 Result.LIs.insert(LI);
886 Result.Is.insert(LI);
888 for (
unsigned i = 0; i < Result.getDimension(); i++) {
893 int64_t Ofs =
DL.getIndexedOffsetInType(Result.VTy,
ArrayRef(
Idx, 2));
894 Result.EI[i] = ElementInfo(
Offset + Ofs, i == 0 ? LI :
nullptr);
904 static void computePolynomialBinOp(
BinaryOperator &BO, Polynomial &Result) {
911 C = dyn_cast<ConstantInt>(
LHS);
917 case Instruction::Add:
921 computePolynomial(*
LHS, Result);
922 Result.add(
C->getValue());
925 case Instruction::LShr:
929 computePolynomial(*
LHS, Result);
930 Result.lshr(
C->getValue());
937 Result = Polynomial(&BO);
944 static void computePolynomial(
Value &V, Polynomial &Result) {
945 if (
auto *BO = dyn_cast<BinaryOperator>(&V))
946 computePolynomialBinOp(*BO, Result);
948 Result = Polynomial(&V);
957 static void computePolynomialFromPointer(
Value &
Ptr, Polynomial &Result,
963 Result = Polynomial();
967 unsigned PointerBits =
968 DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace());
971 if (isa<CastInst>(&
Ptr)) {
974 case Instruction::BitCast:
975 computePolynomialFromPointer(*CI.
getOperand(0), Result, BasePtr,
DL);
979 Polynomial(PointerBits, 0);
984 else if (isa<GetElementPtrInst>(&
Ptr)) {
987 APInt BaseOffset(PointerBits, 0);
990 if (
GEP.accumulateConstantOffset(
DL, BaseOffset)) {
991 Result = Polynomial(BaseOffset);
992 BasePtr =
GEP.getPointerOperand();
997 unsigned idxOperand, e;
999 for (idxOperand = 1, e =
GEP.getNumOperands(); idxOperand < e;
1001 ConstantInt *IDX = dyn_cast<ConstantInt>(
GEP.getOperand(idxOperand));
1008 if (idxOperand + 1 != e) {
1009 Result = Polynomial();
1015 computePolynomial(*
GEP.getOperand(idxOperand), Result);
1020 DL.getIndexedOffsetInType(
GEP.getSourceElementType(), Indices);
1023 unsigned ResultSize =
DL.getTypeAllocSize(
GEP.getResultElementType());
1024 Result.sextOrTrunc(PointerBits);
1025 Result.mul(
APInt(PointerBits, ResultSize));
1026 Result.add(BaseOffset);
1027 BasePtr =
GEP.getPointerOperand();
1034 Polynomial(
DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace()), 0);
1045 for (
unsigned i = 0; i < getDimension(); i++)
1046 OS << ((i == 0) ?
"[" :
", ") << EI[i].Ofs;
1054bool InterleavedLoadCombineImpl::findPattern(
1055 std::list<VectorInfo> &Candidates, std::list<VectorInfo> &InterleavedLoad,
1057 for (
auto C0 = Candidates.begin(), E0 = Candidates.end(); C0 != E0; ++C0) {
1060 unsigned Size =
DL.getTypeAllocSize(C0->VTy->getElementType());
1063 std::vector<std::list<VectorInfo>::iterator> Res(Factor, Candidates.end());
1065 for (
auto C = Candidates.begin(), E = Candidates.end();
C != E;
C++) {
1066 if (
C->VTy != C0->VTy)
1068 if (
C->BB != C0->BB)
1070 if (
C->PV != C0->PV)
1074 for (i = 1; i < Factor; i++) {
1075 if (
C->EI[0].Ofs.isProvenEqualTo(C0->EI[0].Ofs + i *
Size)) {
1080 for (i = 1; i < Factor; i++) {
1081 if (Res[i] == Candidates.end())
1090 if (Res[0] != Candidates.end()) {
1092 for (
unsigned i = 0; i < Factor; i++) {
1093 InterleavedLoad.splice(InterleavedLoad.end(), Candidates, Res[i]);
1103InterleavedLoadCombineImpl::findFirstLoad(
const std::set<LoadInst *> &LIs) {
1104 assert(!LIs.empty() &&
"No load instructions given.");
1112 return cast<LoadInst>(FLI);
1115bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
1122 LoadInst *InsertionPoint = InterleavedLoad.front().EI[0].LI;
1125 if (!InsertionPoint)
1128 std::set<LoadInst *> LIs;
1129 std::set<Instruction *> Is;
1130 std::set<Instruction *> SVIs;
1137 unsigned Factor = InterleavedLoad.size();
1140 for (
auto &VI : InterleavedLoad) {
1142 LIs.insert(
VI.LIs.begin(),
VI.LIs.end());
1147 Is.insert(
VI.Is.begin(),
VI.Is.end());
1150 SVIs.insert(
VI.SVI);
1160 for (
const auto &
I : Is) {
1165 if (SVIs.find(
I) != SVIs.end())
1170 for (
auto *U :
I->users()) {
1171 if (Is.find(dyn_cast<Instruction>(U)) == Is.end())
1187 auto FMA = MSSA.getMemoryAccess(
First);
1188 for (
auto *LI : LIs) {
1189 auto MADef = MSSA.getMemoryAccess(LI)->getDefiningAccess();
1190 if (!MSSA.dominates(MADef, FMA))
1193 assert(!LIs.empty() &&
"There are no LoadInst to combine");
1196 for (
auto &VI : InterleavedLoad) {
1197 if (!DT.dominates(InsertionPoint,
VI.SVI))
1204 Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType();
1205 unsigned ElementsPerSVI =
1206 cast<FixedVectorType>(InterleavedLoad.front().SVI->getType())
1210 auto Indices = llvm::to_vector<4>(llvm::seq<unsigned>(0, Factor));
1212 Instruction::Load, ILTy, Factor, Indices, InsertionPoint->
getAlign(),
1221 auto LI = Builder.CreateAlignedLoad(ILTy,
Ptr, InsertionPoint->
getAlign(),
1222 "interleaved.wide.load");
1224 MemoryUse *MSSALoad = cast<MemoryUse>(MSSAU.createMemoryAccessBefore(
1225 LI,
nullptr, MSSA.getMemoryAccess(InsertionPoint)));
1226 MSSAU.insertUse(MSSALoad,
true);
1230 for (
auto &VI : InterleavedLoad) {
1232 for (
unsigned j = 0;
j < ElementsPerSVI;
j++)
1233 Mask.push_back(i + j * Factor);
1235 Builder.SetInsertPoint(
VI.SVI);
1236 auto SVI = Builder.CreateShuffleVector(LI, Mask,
"interleaved.shuffle");
1237 VI.SVI->replaceAllUsesWith(SVI);
1241 NumInterleavedLoadCombine++;
1244 <<
"Load interleaved combined with factor "
1251bool InterleavedLoadCombineImpl::run() {
1253 bool changed =
false;
1254 unsigned MaxFactor = TLI.getMaxSupportedInterleaveFactor();
1256 auto &
DL =
F.getParent()->getDataLayout();
1259 for (
unsigned Factor = MaxFactor; Factor >= 2; Factor--) {
1260 std::list<VectorInfo> Candidates;
1264 if (
auto SVI = dyn_cast<ShuffleVectorInst>(&
I)) {
1266 if (isa<ScalableVectorType>(SVI->getType()))
1269 Candidates.emplace_back(cast<FixedVectorType>(SVI->getType()));
1271 if (!VectorInfo::computeFromSVI(SVI, Candidates.back(),
DL)) {
1272 Candidates.pop_back();
1276 if (!Candidates.back().isInterleaved(Factor,
DL)) {
1277 Candidates.pop_back();
1283 std::list<VectorInfo> InterleavedLoad;
1284 while (findPattern(Candidates, InterleavedLoad, Factor,
DL)) {
1285 if (combine(InterleavedLoad, ORE)) {
1290 Candidates.splice(Candidates.begin(), InterleavedLoad,
1291 std::next(InterleavedLoad.begin()),
1292 InterleavedLoad.end());
1294 InterleavedLoad.clear();
1312 return "Interleaved Load Combine Pass";
1316 if (DisableInterleavedLoadCombine)
1319 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
1326 return InterleavedLoadCombineImpl(
1327 F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
1328 getAnalysis<MemorySSAWrapperPass>().getMSSA(),
1348 bool Changed = InterleavedLoadCombineImpl(
F, DT, MemSSA, *TM).run();
1352char InterleavedLoadCombine::ID = 0;
1356 "Combine interleaved loads into wide loads and shufflevector instructions",
1367 auto P =
new InterleavedLoadCombine();
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
Expand Atomic instructions
static const Function * getParent(const Value *V)
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Returns the sub type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx.
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/def graph.
Module.h This file contains the declarations for the Module class.
FunctionAnalysisManager FAM
const char LLVMTargetMachineRef TM
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metrics from passes.
#define STATISTIC(VARNAME, DESC)
This file describes how to lower LLVM code to machine code.
Target-Independent Code Generator Pass Configuration Options pass.
Class for arbitrary precision integers.
A container for analyses that lazily runs them and caches their results.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
LLVM Basic Block Representation.
InstListType::iterator iterator
Instruction iterators...
BinaryOps getOpcode() const
This class represents a no-op cast from one type to another.
This is the base class for all instructions that perform data casts.
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
This is the shared class of boolean and integer constants.
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Analysis pass which computes a DominatorTree.
Legacy analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
FunctionPass class - This class is used to implement most global optimizations.
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overriden by subclasses to do the per-function processing of the pass.
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isAtomic() const LLVM_READONLY
Return true if this instruction has an AtomicOrdering of unordered or higher.
const BasicBlock * getParent() const
Class to represent integer types.
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM)
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Align getAlign() const
Return the alignment of the access that is being performed.
An analysis that produces MemorySSA for a function.
Legacy analysis pass which computes MemorySSA.
Encapsulates MemorySSA, including all data associated with memory accesses.
Represents read-only accesses to memory.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at application-load time.
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
This instruction constructs a fixed permutation of two input vectors.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
Primary interface to the complete machine description for the target machine.
The instances of the Type class are immutable: once they are created, they are never changed.
static IntegerType * getInt32Ty(LLVMContext &C)
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
LLVMContext & getContext() const
All values hold a context through their type.
Type * getElementType() const
This class implements an extremely fast bulk output stream that can only output to a stream.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
void initializeInterleavedLoadCombinePass(PassRegistry &)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
constexpr unsigned BitWidth
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
APInt operator+(APInt a, const APInt &b)
FunctionPass * createInterleavedLoadCombinePass()
InterleavedLoadCombines Pass - This pass identifies interleaved loads and combines them into wide loads detectable by InterleavedAccessPass.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.