#define DEBUG_TYPE "interleaved-load-combine"

/// Statistic counter
STATISTIC(NumInterleavedLoadCombine, "Number of combined loads");

/// Option to disable the pass
static cl::opt<bool> DisableInterleavedLoadCombine(
    "disable-" DEBUG_TYPE, cl::init(false), cl::Hidden,
    cl::desc("Disable combining of interleaved loads"));
struct VectorInfo;

struct InterleavedLoadCombineImpl {
public:
  InterleavedLoadCombineImpl(Function &F, DominatorTree &DT, MemorySSA &MSSA,
                             TargetTransformInfo &TTI,
                             const TargetMachine &TM)
      : F(F), DT(DT), MSSA(MSSA),
        TLI(*TM.getSubtargetImpl(F)->getTargetLowering()), TTI(TTI) {}
  /// Scan the function for interleaved loads and combine them when
  /// profitable. Returns true if the function was changed.
  bool run();

private:
  /// Find the instruction-order first load of a set of loads.
  LoadInst *findFirstLoad(const std::set<LoadInst *> &LIs);

  /// Combine a detected pattern of interleaved loads into a single wide load
  /// plus shufflevector instructions.
  bool combine(std::list<VectorInfo> &InterleavedLoad,
               OptimizationRemarkEmitter &ORE);

  /// Search Candidates for a set of VectorInfo that forms a 'Factor'-way
  /// interleaved load; on success the set is moved to InterleavedLoad.
  bool findPattern(std::list<VectorInfo> &Candidates,
                   std::list<VectorInfo> &InterleavedLoad, unsigned Factor,
                   const DataLayout &DL);

  /// Function this pass operates on.
  Function &F;

  /// Dominator tree analysis.
  DominatorTree &DT;

  /// Memory SSA analysis.
  MemorySSA &MSSA;

  /// Target lowering information.
  const TargetLowering &TLI;

  /// Target transform information.
  TargetTransformInfo &TTI;
};
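// To illustrate the pattern this pass looks for, consider a factor-2
// interleaved load (the IR below is illustrative, not taken from this file).
// Starting from two narrow loads whose elements are re-sorted by shuffles,
//
//   %a = load <4 x float>, ptr %ptr
//   %b = load <4 x float>, ptr %gep        ; %gep is %ptr advanced 4 floats
//   %even = shufflevector <4 x float> %a, <4 x float> %b,
//           <4 x i32> <i32 0, i32 2, i32 4, i32 6>
//   %odd  = shufflevector <4 x float> %a, <4 x float> %b,
//           <4 x i32> <i32 1, i32 3, i32 5, i32 7>
//
// the pass emits one wide load plus de-interleaving shuffles, a shape that
// InterleavedAccessPass can lower to target instructions (e.g. LD2 on
// AArch64):
//
//   %wide = load <8 x float>, ptr %ptr
//   %even = shufflevector %wide, poison, <i32 0, i32 2, i32 4, i32 6>
//   %odd  = shufflevector %wide, poison, <i32 1, i32 3, i32 5, i32 7>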
/// Models a pointer offset as a polynomial over an IR value V, i.e. as
/// B(V) + A, where B is a chain of recorded operations applied to V and A is
/// a constant. ErrorMSBs counts how many of the most significant bits of A
/// are undefined.
class Polynomial {
  /// Operations that may be recorded on the variable part V.
  enum BOps { LShr, Mul, SExt, Trunc };

  /// Number of undefined most significant bits; (unsigned)-1 marks a fully
  /// undefined polynomial.
  unsigned ErrorMSBs = (unsigned)-1;

  /// The variable part V; nullptr for constant polynomials.
  Value *V = nullptr;

  /// The operations applied to V, in order.
  SmallVector<std::pair<BOps, APInt>, 4> B;

  /// The constant part A.
  APInt A;

public:
  Polynomial(Value *V) : V(V) {
    IntegerType *Ty = dyn_cast<IntegerType>(V->getType());
    if (Ty) {
      ErrorMSBs = 0;
      A = APInt(Ty->getBitWidth(), 0);
    }
  }

  Polynomial(const APInt &A, unsigned ErrorMSBs = 0)
      : ErrorMSBs(ErrorMSBs), A(A) {}

  Polynomial(unsigned BitWidth, uint64_t A, unsigned ErrorMSBs = 0)
      : ErrorMSBs(ErrorMSBs), A(BitWidth, A) {}

  Polynomial() = default;
  /// Increment the number of undefined most significant bits, clamping at
  /// the bit width.
  void incErrorMSBs(unsigned amt) {
    if (ErrorMSBs == (unsigned)-1)
      return;

    ErrorMSBs += amt;
    if (ErrorMSBs > A.getBitWidth())
      ErrorMSBs = A.getBitWidth();
  }
  /// Decrement the number of undefined most significant bits, clamping at
  /// zero.
  void decErrorMSBs(unsigned amt) {
    if (ErrorMSBs == (unsigned)-1)
      return;

    if (ErrorMSBs > amt)
      ErrorMSBs -= amt;
    else
      ErrorMSBs = 0;
  }
  /// Apply an add to the polynomial. Adding a constant does not change the
  /// error term: error bits only propagate towards more significant bits.
  Polynomial &add(const APInt &C) {
    if (C.getBitWidth() != A.getBitWidth()) {
      ErrorMSBs = (unsigned)-1;
      return *this;
    }

    A += C;
    return *this;
  }
  /// Apply a multiplication to the polynomial.
  Polynomial &mul(const APInt &C) {
    if (C.getBitWidth() != A.getBitWidth()) {
      ErrorMSBs = (unsigned)-1;
      return *this;
    }

    // Multiplying by a power of two moves the undefined bits towards the
    // most significant end, so the error shrinks by the number of trailing
    // zeros of C.
    decErrorMSBs(C.countr_zero());

    A *= C;
    pushBOperation(Mul, C);
    return *this;
  }
  /// Apply a logical shift right to the polynomial.
  Polynomial &lshr(const APInt &C) {
    if (C.getBitWidth() != A.getBitWidth()) {
      ErrorMSBs = (unsigned)-1;
      return *this;
    }

    unsigned shiftAmt = C.getZExtValue();
    // A shift by the bit width or more is undefined in IR; model the result
    // as a multiplication by zero.
    if (shiftAmt >= C.getBitWidth())
      return mul(APInt(C.getBitWidth(), 0));

    // If set bits of A would be shifted out, the polynomial transformation
    // is no longer valid and everything becomes undefined. Otherwise the
    // error term grows by the shift amount.
    if (A.countr_zero() < shiftAmt)
      ErrorMSBs = A.getBitWidth();
    else
      incErrorMSBs(shiftAmt);

    pushBOperation(LShr, C);
    A = A.lshr(shiftAmt);

    return *this;
  }
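  // Worked example of the error-bit bookkeeping above (values illustrative):
  // starting from the 32-bit polynomial p = {%x + 4, #ErrBits = 0},
  //
  //   p.lshr(2):  {(%x LShr 2) + 1, #ErrBits = 2}   (incErrorMSBs(2))
  //   p.mul(4):   {((%x LShr 2) Mul 4) + 4, #ErrBits = 0}   (decErrorMSBs(2))
  //
  // The two bits made undefined by the shift are pushed back out of the
  // value by the multiplication with 2^2, so offsets built through this
  // chain can still be compared exactly against each other.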
  /// Apply a sext or trunc, resizing the polynomial to n bits.
  Polynomial &sextOrTrunc(unsigned n) {
    if (n < A.getBitWidth()) {
      // Truncation drops high bits, so the error term shrinks with them.
      decErrorMSBs(A.getBitWidth() - n);
      A = A.trunc(n);
      pushBOperation(Trunc, APInt(sizeof(n) * 8, n));
    }
    if (n > A.getBitWidth()) {
      // Sign extension replicates the (possibly undefined) sign bit, so the
      // error term grows by the number of new bits.
      incErrorMSBs(n - A.getBitWidth());
      A = A.sext(n);
      pushBOperation(SExt, APInt(sizeof(n) * 8, n));
    }

    return *this;
  }
  bool isFirstOrder() const { return V != nullptr; }
  /// Test if the variable parts of two polynomials match, so that their
  /// difference is a plain constant.
  bool isCompatibleTo(const Polynomial &o) const {
    // The bit widths have to match.
    if (A.getBitWidth() != o.A.getBitWidth())
      return false;

    // Two constant polynomials are always compatible.
    if (!isFirstOrder() && !o.isFirstOrder())
      return true;

    // Otherwise both must be based on the same value with the same chain of
    // recorded operations.
    if (V != o.V || B.size() != o.B.size())
      return false;

    auto *ob = o.B.begin();
    for (const auto &b : B) {
      if (b != *ob)
        return false;
      ob++;
    }

    return true;
  }
  /// Subtract two polynomials; return an undefined polynomial if this is
  /// not possible.
  Polynomial operator-(const Polynomial &o) const {
    if (!isCompatibleTo(o))
      return Polynomial();

    // When the polynomials are compatible, the variable part B cancels out,
    // leaving a polynomial that is a plain constant.
    return Polynomial(A - o.A, std::max(ErrorMSBs, o.ErrorMSBs));
  }
  /// Add an integer constant.
  Polynomial operator+(uint64_t C) const {
    Polynomial Result(*this);
    Result.A += C;
    return Result;
  }

  /// Multiply with an integer constant.
  Polynomial operator*(uint64_t C) const {
    Polynomial Result(*this);
    Result.A *= C;
    return Result;
  }
  /// Check if two polynomials are provably equal.
  bool isProvenEqualTo(const Polynomial &o) {
    // Subtract both polynomials and check that the difference is a fully
    // defined constant zero.
    Polynomial r = *this - o;
    return (r.ErrorMSBs == 0) && (!r.isFirstOrder()) && (r.A.isZero());
  }
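  // For instance (illustrative), the first-element offsets of two loads
  // based on the same pointer,
  //
  //   Ofs1 = {(%idx Mul 16) + 4, #ErrBits = 0}
  //   Ofs2 = {(%idx Mul 16) + 0, #ErrBits = 0}
  //
  // satisfy Ofs1.isProvenEqualTo(Ofs2 + 4): the variable parts cancel in
  // operator- and the remaining constant difference is a fully defined zero.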
  /// Print the polynomial into a stream.
  void print(raw_ostream &OS) const {
    OS << "[{#ErrBits:" << ErrorMSBs << "} ";

    if (V) {
      for (const auto &b : B)
        OS << "(";
      OS << "(" << *V << ") ";

      for (const auto &b : B) {
        switch (b.first) {
        case LShr:
          OS << "LShr ";
          break;
        case Mul:
          OS << "Mul ";
          break;
        case SExt:
          OS << "SExt ";
          break;
        case Trunc:
          OS << "Trunc ";
          break;
        }
        OS << b.second << ") ";
      }
    }

    OS << "+ " << A << "]";
  }
private:
  /// Record an operation applied to the variable part V, so that compatible
  /// polynomials can be compared operation by operation.
  void pushBOperation(const BOps Op, const APInt &C) {
    if (isFirstOrder()) {
      B.push_back(std::make_pair(Op, C));
      return;
    }
  }
};
/// Describes the contents of a vector value: which load instructions produce
/// its elements and at which polynomial offsets they live.
struct VectorInfo {
  VectorInfo(const VectorInfo &c) : VTy(c.VTy) {
    llvm_unreachable(
        "Copying VectorInfo is neither implemented nor necessary,");
  }
  /// Information about a single vector element.
  struct ElementInfo {
    /// Offset polynomial of the element.
    Polynomial Ofs;

    /// The load instruction providing this element; nullptr if the load's
    /// pointer does not point directly at it.
    LoadInst *LI;

    ElementInfo(Polynomial Offset = Polynomial(), LoadInst *LI = nullptr)
        : Ofs(Offset), LI(LI) {}
  };
  /// Basic block the load instructions are in.
  BasicBlock *BB = nullptr;

  /// Pointer value common to all participating load instructions.
  Value *PV = nullptr;

  /// Participating load instructions.
  std::set<LoadInst *> LIs;

  /// All instructions participating in the interleaved load.
  std::set<Instruction *> Is;

  /// Final shuffle-vector instruction.
  ShuffleVectorInst *SVI = nullptr;

  /// Offset information for each vector element.
  ElementInfo *EI;

  /// The type of the vector being described.
  FixedVectorType *const VTy;

  VectorInfo(FixedVectorType *VTy) : VTy(VTy) {
    EI = new ElementInfo[VTy->getNumElements()];
  }

  VectorInfo &operator=(const VectorInfo &other) = delete;

  virtual ~VectorInfo() { delete[] EI; }

  unsigned getDimension() const { return VTy->getNumElements(); }
  /// Test if this vector is the i-th line of an interleaved load with the
  /// given factor: element i must sit Factor * ElementSize bytes after
  /// element i-1.
  bool isInterleaved(unsigned Factor, const DataLayout &DL) const {
    unsigned Size = DL.getTypeAllocSize(VTy->getElementType());
    for (unsigned i = 1; i < getDimension(); i++) {
      if (!EI[i].Ofs.isProvenEqualTo(EI[0].Ofs + i * Factor * Size)) {
        return false;
      }
    }
    return true;
  }
  /// Recursively compute the vector information of a value.
  static bool compute(Value *V, VectorInfo &Result, const DataLayout &DL) {
    ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V);
    if (SVI)
      return computeFromSVI(SVI, Result, DL);
    LoadInst *LI = dyn_cast<LoadInst>(V);
    if (LI)
      return computeFromLI(LI, Result, DL);
    BitCastInst *BCI = dyn_cast<BitCastInst>(V);
    if (BCI)
      return computeFromBCI(BCI, Result, DL);
    return false;
  }
  /// BitCastInst specialization to compute the vector information.
  static bool computeFromBCI(BitCastInst *BCI, VectorInfo &Result,
                             const DataLayout &DL) {
    Instruction *Op = dyn_cast<Instruction>(BCI->getOperand(0));
    if (!Op)
      return false;

    FixedVectorType *VTy = dyn_cast<FixedVectorType>(Op->getType());
    if (!VTy)
      return false;

    // We can only cast from larger to smaller element types.
    if (Result.VTy->getNumElements() % VTy->getNumElements())
      return false;

    unsigned Factor = Result.VTy->getNumElements() / VTy->getNumElements();
    unsigned NewSize = DL.getTypeAllocSize(Result.VTy->getElementType());
    unsigned OldSize = DL.getTypeAllocSize(VTy->getElementType());

    if (NewSize * Factor != OldSize)
      return false;

    VectorInfo Old(VTy);
    if (!compute(Op, Old, DL))
      return false;

    // Each old element splits into Factor new elements; only the first of
    // each group inherits the originating load instruction.
    for (unsigned i = 0; i < Result.VTy->getNumElements(); i += Factor) {
      for (unsigned j = 0; j < Factor; j++) {
        Result.EI[i + j] =
            ElementInfo(Old.EI[i / Factor].Ofs + j * NewSize,
                        j == 0 ? Old.EI[i / Factor].LI : nullptr);
      }
    }

    Result.BB = Old.BB;
    Result.PV = Old.PV;
    Result.LIs.insert(Old.LIs.begin(), Old.LIs.end());
    Result.Is.insert(Old.Is.begin(), Old.Is.end());
    Result.Is.insert(BCI);
    Result.SVI = nullptr;

    return true;
  }
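  // Example for the element splitting above (illustrative): a bitcast from
  // <2 x i64> to <4 x i32> has Factor = 2 and NewSize = 4, so an old element
  // at offset Ofs becomes two new elements at offsets Ofs and Ofs + 4, and
  // only the first of each pair keeps the associated LoadInst.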
  /// ShuffleVectorInst specialization to compute the vector information.
  static bool computeFromSVI(ShuffleVectorInst *SVI, VectorInfo &Result,
                             const DataLayout &DL) {
    FixedVectorType *ArgTy =
        cast<FixedVectorType>(SVI->getOperand(0)->getType());

    // Compute the vector information of both operands; an undef operand
    // contributes no element information and its info stays empty.
    VectorInfo LHS(ArgTy);
    if (!isa<UndefValue>(SVI->getOperand(0)) &&
        !compute(SVI->getOperand(0), LHS, DL))
      return false;

    VectorInfo RHS(ArgTy);
    if (!isa<UndefValue>(SVI->getOperand(1)) &&
        !compute(SVI->getOperand(1), RHS, DL))
      return false;

    // Both defined operands must be in the same block and based on the same
    // pointer value.
    if (LHS.BB && RHS.BB && (LHS.BB != RHS.BB || LHS.PV != RHS.PV))
      return false;

    if (LHS.BB) {
      Result.BB = LHS.BB;
      Result.PV = LHS.PV;
      Result.LIs.insert(LHS.LIs.begin(), LHS.LIs.end());
      Result.Is.insert(LHS.Is.begin(), LHS.Is.end());
    }
    if (RHS.BB) {
      Result.BB = RHS.BB;
      Result.PV = RHS.PV;
      Result.LIs.insert(RHS.LIs.begin(), RHS.LIs.end());
      Result.Is.insert(RHS.Is.begin(), RHS.Is.end());
    }
    Result.Is.insert(SVI);
    Result.SVI = SVI;

    // Resolve the shuffle mask into per-element information.
    int j = 0;
    for (int i : SVI->getShuffleMask()) {
      assert((i < 2 * (signed)ArgTy->getNumElements()) &&
             "Invalid ShuffleVectorInst (index out of bounds)");

      if (i < 0) // Undefined element.
        Result.EI[j] = ElementInfo();
      else if (i < (signed)ArgTy->getNumElements()) // Element from LHS.
        Result.EI[j] = LHS.BB ? LHS.EI[i] : ElementInfo();
      else // Element from RHS.
        Result.EI[j] = RHS.BB ? RHS.EI[i - ArgTy->getNumElements()]
                              : ElementInfo();
      j++;
    }

    return true;
  }
  /// LoadInst specialization to compute the vector information.
  static bool computeFromLI(LoadInst *LI, VectorInfo &Result,
                            const DataLayout &DL) {
    Value *BasePtr;
    Polynomial Offset;

    if (LI->isVolatile() || LI->isAtomic())
      return false;

    if (!DL.typeSizeEqualsStoreSize(Result.VTy->getElementType()))
      return false;

    // Compute the polynomial of the load's pointer operand.
    computePolynomialFromPointer(*LI->getPointerOperand(), Offset, BasePtr,
                                 DL);

    Result.BB = LI->getParent();
    Result.PV = BasePtr;
    Result.LIs.insert(LI);
    Result.Is.insert(LI);

    // Element i lives at Offset plus the in-type offset of element i.
    for (unsigned i = 0; i < Result.getDimension(); i++) {
      Value *Idx[2] = {
          ConstantInt::get(Type::getInt32Ty(LI->getContext()), 0),
          ConstantInt::get(Type::getInt32Ty(LI->getContext()), i),
      };
      int64_t Ofs = DL.getIndexedOffsetInType(Result.VTy, Idx);
      Result.EI[i] = ElementInfo(Offset + Ofs, i == 0 ? LI : nullptr);
    }

    return true;
  }
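  // Example (illustrative): for 'load <4 x i32>, ptr %p' with pointer
  // polynomial Offset = {%x + 0}, the elements are assigned
  //
  //   EI[0] = {%x + 0, LI},   EI[1] = {%x + 4, nullptr},
  //   EI[2] = {%x + 8, nullptr},   EI[3] = {%x + 12, nullptr}
  //
  // since getIndexedOffsetInType yields i * 4 bytes for element i.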
  /// Compute the polynomial of a binary operator.
  static void computePolynomialBinOp(BinaryOperator &BO, Polynomial &Result) {
    Value *LHS = BO.getOperand(0);
    Value *RHS = BO.getOperand(1);

    // Find the constant operand; for commutative operations it may be on
    // either side.
    ConstantInt *C = dyn_cast<ConstantInt>(RHS);
    if (!C && BO.isCommutative()) {
      C = dyn_cast<ConstantInt>(LHS);
      if (C)
        std::swap(LHS, RHS);
    }

    if (C) {
      switch (BO.getOpcode()) {
      case Instruction::Add:
        computePolynomial(*LHS, Result);
        Result.add(C->getValue());
        return;

      case Instruction::LShr:
        computePolynomial(*LHS, Result);
        Result.lshr(C->getValue());
        return;

      default:
        break;
      }
    }

    // Anything else terminates the recursion: the operation itself becomes
    // the variable part of a fresh polynomial.
    Result = Polynomial(&BO);
  }
  /// Compute the polynomial representation of a value.
  static void computePolynomial(Value &V, Polynomial &Result) {
    if (auto *BO = dyn_cast<BinaryOperator>(&V))
      computePolynomialBinOp(*BO, Result);
    else
      Result = Polynomial(&V);
  }
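  // Example of the recursion (illustrative): for '%i = lshr i64 %n, 2'
  // followed by '%j = add i64 %i, 1', computePolynomial(%j) descends through
  // the add and the lshr and returns {(%n LShr 2) + 1, #ErrBits = 2}. An
  // opcode not handled in computePolynomialBinOp, e.g. a mul of two
  // variables, simply becomes the opaque variable part of a new polynomial.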
  /// Compute the polynomial of a pointer, splitting it into a base pointer
  /// and a byte-offset polynomial relative to that base.
  static void computePolynomialFromPointer(Value &Ptr, Polynomial &Result,
                                           Value *&BasePtr,
                                           const DataLayout &DL) {
    // Not a pointer: bail out with an undefined polynomial.
    PointerType *PtrTy = dyn_cast<PointerType>(Ptr.getType());
    if (!PtrTy) {
      Result = Polynomial();
      BasePtr = nullptr;
      return;
    }

    unsigned PointerBits =
        DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace());

    // Look through pointer bitcasts; any other cast terminates the search.
    if (isa<CastInst>(&Ptr)) {
      CastInst &CI = *cast<CastInst>(&Ptr);
      switch (CI.getOpcode()) {
      case Instruction::BitCast:
        computePolynomialFromPointer(*CI.getOperand(0), Result, BasePtr, DL);
        break;
      default:
        BasePtr = &Ptr;
        Result = Polynomial(PointerBits, 0);
        break;
      }
    }
    // Resolve a GetElementPtrInst.
    else if (isa<GetElementPtrInst>(&Ptr)) {
      GetElementPtrInst &GEP = *cast<GetElementPtrInst>(&Ptr);

      APInt BaseOffset(PointerBits, 0);

      // If all GEP indices are constant, the offset is a plain constant.
      if (GEP.accumulateConstantOffset(DL, BaseOffset)) {
        Result = Polynomial(BaseOffset);
        BasePtr = GEP.getPointerOperand();
        return;
      }

      // Otherwise only the last operand may be variable; collect the
      // leading constant indices.
      unsigned idxOperand, e;
      SmallVector<Value *, 4> Indices;
      for (idxOperand = 1, e = GEP.getNumOperands(); idxOperand < e;
           idxOperand++) {
        ConstantInt *IDX = dyn_cast<ConstantInt>(GEP.getOperand(idxOperand));
        if (!IDX)
          break;
        Indices.push_back(IDX);
      }

      if (idxOperand + 1 != e) {
        Result = Polynomial();
        BasePtr = nullptr;
        return;
      }

      // Compute the polynomial of the variable index.
      computePolynomial(*GEP.getOperand(idxOperand), Result);

      // Compute the constant offset contributed by the leading indices.
      BaseOffset =
          DL.getIndexedOffsetInType(GEP.getSourceElementType(), Indices);

      // Apply the GEP arithmetic: scale the variable index by the element
      // size and add the constant offset.
      unsigned ResultSize = DL.getTypeAllocSize(GEP.getResultElementType());
      Result.sextOrTrunc(PointerBits);
      Result.mul(APInt(PointerBits, ResultSize));
      Result.add(BaseOffset);
      BasePtr = GEP.getPointerOperand();
    }
    // Any other value becomes the base pointer with a zero offset.
    else {
      BasePtr = &Ptr;
      Result = Polynomial(PointerBits, 0);
    }
  }
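  // Example (illustrative): for 'getelementptr float, ptr %base, i64 %i'
  // with a 64-bit index space, the pointer decomposes into
  //
  //   BasePtr = %base,   Result = {(%i Mul 4) + 0, #ErrBits = 0}
  //
  // i.e. the variable index sign-extended to pointer width, multiplied by
  // the element size, plus the constant offset of any leading indices.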
  /// Print the vector information: base pointer plus element offsets.
  void print(raw_ostream &OS) const {
    if (PV)
      OS << *PV;
    else
      OS << "(none)";
    OS << " + ";
    for (unsigned i = 0; i < getDimension(); i++)
      OS << ((i == 0) ? "[" : ", ") << EI[i].Ofs;
    OS << "]";
  }
};
bool InterleavedLoadCombineImpl::findPattern(
    std::list<VectorInfo> &Candidates, std::list<VectorInfo> &InterleavedLoad,
    unsigned Factor, const DataLayout &DL) {
  for (auto C0 = Candidates.begin(), E0 = Candidates.end(); C0 != E0; ++C0) {
    unsigned i;
    // Try to find an interleaved load using C0 as the first line.
    unsigned Size = DL.getTypeAllocSize(C0->VTy->getElementType());

    // Iterators pointing to the lines of the candidate pattern.
    std::vector<std::list<VectorInfo>::iterator> Res(Factor, Candidates.end());

    for (auto C = Candidates.begin(), E = Candidates.end(); C != E; C++) {
      if (C->VTy != C0->VTy)
        continue;
      if (C->BB != C0->BB)
        continue;
      if (C->PV != C0->PV)
        continue;

      // Check if C is line i of the pattern, i.e. its first element sits
      // i * ElementSize bytes after C0's first element.
      for (i = 1; i < Factor; i++) {
        if (C->EI[0].Ofs.isProvenEqualTo(C0->EI[0].Ofs + i * Size)) {
          Res[i] = C;
        }
      }

      // Stop as soon as all lines are found.
      for (i = 1; i < Factor; i++) {
        if (Res[i] == Candidates.end())
          break;
      }
      if (i == Factor) {
        Res[0] = C0;
        break;
      }
    }

    if (Res[0] != Candidates.end()) {
      // Move the detected pattern into the output list.
      for (unsigned i = 0; i < Factor; i++) {
        InterleavedLoad.splice(InterleavedLoad.end(), Candidates, Res[i]);
      }
      return true;
    }
  }
  return false;
}
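// Example (illustrative): with Factor = 4 and 4-byte elements, four
// candidates based on the same pointer whose first-element offsets are
// provably B+0, B+4, B+8 and B+12 form one pattern; the candidate at B+4
// becomes line 1, the one at B+8 line 2, and so on.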
LoadInst *
InterleavedLoadCombineImpl::findFirstLoad(const std::set<LoadInst *> &LIs) {
  assert(!LIs.empty() && "No load instructions given.");

  // All load instructions are within the same basic block; return the one
  // that comes first in instruction order.
  BasicBlock *BB = (*LIs.begin())->getParent();
  BasicBlock::iterator FLI = llvm::find_if(
      *BB, [&LIs](Instruction &I) -> bool { return is_contained(LIs, &I); });
  assert(FLI != BB->end());

  return cast<LoadInst>(FLI);
}
bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
                                         OptimizationRemarkEmitter &ORE) {
  LLVM_DEBUG(dbgs() << "Checking interleaved load\n");

  // The insertion point is the load instruction of the first element. The
  // following checks prove that the combined load can be placed there.
  LoadInst *InsertionPoint = InterleavedLoad.front().EI[0].LI;
  if (!InsertionPoint)
    return false;

  std::set<LoadInst *> LIs;
  std::set<Instruction *> Is;
  std::set<Instruction *> SVIs;

  InstructionCost InterleavedCost;
  InstructionCost InstructionCost = 0;
  const TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency;

  // The interleave factor equals the number of pattern lines.
  unsigned Factor = InterleavedLoad.size();

  // Merge the instruction sets of all pattern lines.
  for (auto &VI : InterleavedLoad) {
    // All load instructions to be combined.
    LIs.insert(VI.LIs.begin(), VI.LIs.end());

    // All instructions taking part in the interleaved load.
    Is.insert(VI.Is.begin(), VI.Is.end());

    // The final shuffle-vector instructions.
    SVIs.insert(VI.SVI);
  }

  // There is nothing to combine.
  if (LIs.size() < 2)
    return false;

  // All participating instructions must be dead after the transformation;
  // if an intermediate result has users outside the set, no gain can be
  // expected. Also sum up the cost of the instructions left dead.
  for (const auto &I : Is) {
    InstructionCost += TTI.getInstructionCost(I, CostKind);

    // The final SVIs need not be dead; all their uses will be replaced.
    if (SVIs.find(I) != SVIs.end())
      continue;

    for (auto *U : I->users()) {
      if (Is.find(dyn_cast<Instruction>(U)) == Is.end())
        return false;
    }
  }

  // A valid cost estimate is required to proceed.
  if (!InstructionCost.isValid())
    return false;

  // All loads are in the same basic block; find the first one.
  LoadInst *First = findFirstLoad(LIs);

  // The defining access of every load must dominate the first load; this
  // guarantees there is no aliasing store in between the loads.
  auto FMA = MSSA.getMemoryAccess(First);
  for (auto *LI : LIs) {
    auto MADef = MSSA.getMemoryAccess(LI)->getDefiningAccess();
    if (!MSSA.dominates(MADef, FMA))
      return false;
  }
  assert(!LIs.empty() && "There are no LoadInst to combine");

  // The insertion point must dominate all final shuffle-vector instructions.
  for (auto &VI : InterleavedLoad) {
    if (!DT.dominates(InsertionPoint, VI.SVI))
      return false;
  }

  // All checks passed. Emit the wide load and the shuffles in the shape
  // detectable by InterleavedAccessPass; the old instructions are left dead.
  IRBuilder<> Builder(InsertionPoint);
  Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType();
  unsigned ElementsPerSVI =
      cast<FixedVectorType>(InterleavedLoad.front().SVI->getType())
          ->getNumElements();
  FixedVectorType *ILTy = FixedVectorType::get(ETy, Factor * ElementsPerSVI);

  // Compare the target's cost of the interleaved memory operation with the
  // summed cost of the instructions it replaces.
  auto Indices = llvm::to_vector<4>(llvm::seq<unsigned>(0, Factor));
  InterleavedCost = TTI.getInterleavedMemoryOpCost(
      Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlign(),
      InsertionPoint->getPointerAddressSpace(), CostKind);
  if (InterleavedCost >= InstructionCost)
    return false;

  // Create the wide load and register it with MemorySSA.
  auto Ptr = InsertionPoint->getPointerOperand();
  auto LI = Builder.CreateAlignedLoad(ILTy, Ptr, InsertionPoint->getAlign(),
                                      "interleaved.wide.load");
  auto MSSAU = MemorySSAUpdater(&MSSA);
  MemoryUse *MSSALoad = cast<MemoryUse>(MSSAU.createMemoryAccessBefore(
      LI, nullptr, MSSA.getMemoryAccess(InsertionPoint)));
  MSSAU.insertUse(MSSALoad, /*RenameUses=*/true);

  // Create one de-interleaving shuffle per pattern line and replace all uses
  // of the old shuffle-vector instructions.
  unsigned i = 0;
  for (auto &VI : InterleavedLoad) {
    SmallVector<int, 4> Mask;
    for (unsigned j = 0; j < ElementsPerSVI; j++)
      Mask.push_back(i + j * Factor);
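    // Illustrative: with Factor = 2 and ElementsPerSVI = 4 this loop builds
    // the masks <0, 2, 4, 6> for line 0 and <1, 3, 5, 7> for line 1, picking
    // the even and odd lanes of the wide load and thereby reproducing the
    // values of the original narrow shuffles.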
    Builder.SetInsertPoint(VI.SVI);
    auto SVI = Builder.CreateShuffleVector(LI, Mask, "interleaved.shuffle");
    VI.SVI->replaceAllUsesWith(SVI);
    i++;
  }

  NumInterleavedLoadCombine++;
  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "Combined", LI)
           << "Load interleaved combined with factor "
           << ore::NV("Factor", Factor);
  });

  return true;
}
bool InterleavedLoadCombineImpl::run() {
  OptimizationRemarkEmitter ORE(&F);
  bool changed = false;
  unsigned MaxFactor = TLI.getMaxSupportedInterleaveFactor();

  auto &DL = F.getDataLayout();

  // Start with the highest factor to avoid combining and recombining.
  for (unsigned Factor = MaxFactor; Factor >= 2; Factor--) {
    std::list<VectorInfo> Candidates;

    for (BasicBlock &BB : F) {
      for (Instruction &I : BB) {
        if (auto *SVI = dyn_cast<ShuffleVectorInst>(&I)) {
          // Scalable vectors are not supported.
          if (isa<ScalableVectorType>(SVI->getType()))
            continue;

          Candidates.emplace_back(cast<FixedVectorType>(SVI->getType()));

          if (!VectorInfo::computeFromSVI(SVI, Candidates.back(), DL)) {
            Candidates.pop_back();
            continue;
          }

          if (!Candidates.back().isInterleaved(Factor, DL)) {
            Candidates.pop_back();
          }
        }
      }
    }

    std::list<VectorInfo> InterleavedLoad;
    while (findPattern(Candidates, InterleavedLoad, Factor, DL)) {
      if (combine(InterleavedLoad, ORE)) {
        changed = true;
      } else {
        // Drop the first line of the detected pattern, put the rest back on
        // the candidate list, and continue searching.
        Candidates.splice(Candidates.begin(), InterleavedLoad,
                          std::next(InterleavedLoad.begin()),
                          InterleavedLoad.end());
      }
      InterleavedLoad.clear();
    }
  }

  return changed;
}
namespace {
/// Legacy pass wrapper around InterleavedLoadCombineImpl.
struct InterleavedLoadCombine : public FunctionPass {
  static char ID;

  InterleavedLoadCombine() : FunctionPass(ID) {
    initializeInterleavedLoadCombinePass(*PassRegistry::getPassRegistry());
  }

  StringRef getPassName() const override {
    return "Interleaved Load Combine Pass";
  }

  bool runOnFunction(Function &F) override {
    if (DisableInterleavedLoadCombine)
      return false;

    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      return false;

    return InterleavedLoadCombineImpl(
               F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
               getAnalysis<MemorySSAWrapperPass>().getMSSA(),
               getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
               TPC->getTM<TargetMachine>())
        .run();
  }
};
} // anonymous namespace

PreservedAnalyses
InterleavedLoadCombinePass::run(Function &F, FunctionAnalysisManager &FAM) {
  auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  auto &MemSSA = FAM.getResult<MemorySSAAnalysis>(F).getMSSA();
  auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
  bool Changed = InterleavedLoadCombineImpl(F, DT, MemSSA, TTI, *TM).run();
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
char InterleavedLoadCombine::ID = 0;

INITIALIZE_PASS_BEGIN(
    InterleavedLoadCombine, DEBUG_TYPE,
    "Combine interleaved loads into wide loads and shufflevector instructions",
    false, false)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(
    InterleavedLoadCombine, DEBUG_TYPE,
    "Combine interleaved loads into wide loads and shufflevector instructions",
    false, false)

FunctionPass *llvm::createInterleavedLoadCombinePass() {
  auto P = new InterleavedLoadCombine();
  return P;
}