#define DEBUG_TYPE "interleaved-load-combine"

namespace {

/// Statistic counter.
STATISTIC(NumInterleavedLoadCombine, "Number of combined loads");

/// Option to disable the pass.
static cl::opt<bool> DisableInterleavedLoadCombine(
    "disable-" DEBUG_TYPE, cl::init(false), cl::Hidden,
    cl::desc("Disable combining of interleaved loads"));

struct VectorInfo;
struct InterleavedLoadCombineImpl {
public:
  InterleavedLoadCombineImpl(Function &F, DominatorTree &DT, MemorySSA &MSSA,
                             TargetMachine &TM)
      : F(F), DT(DT), MSSA(MSSA),
        TLI(*TM.getSubtargetImpl(F)->getTargetLowering()),
        TTI(TM.getTargetTransformInfo(F)) {}

  /// Scan the function for interleaved-load candidates and combine them.
  bool run();

private:
  Function &F;                   // Function this pass works on
  DominatorTree &DT;             // Dominator tree analysis
  MemorySSA &MSSA;               // Memory alias analysis
  const TargetLowering &TLI;     // Target lowering information
  const TargetTransformInfo TTI; // Target transform information

  /// Find the load in LIs that comes first in its basic block.
  LoadInst *findFirstLoad(const std::set<LoadInst *> &LIs);

  /// Combine one interleaved-load candidate set; returns true on success.
  bool combine(std::list<VectorInfo> &InterleavedLoad,
               OptimizationRemarkEmitter &ORE);

  /// Search Candidates for a set of VectorInfos that forms a 'Factor'-way
  /// interleaved load; on success the set is moved into InterleavedLoad.
  bool findPattern(std::list<VectorInfo> &Candidates,
                   std::list<VectorInfo> &InterleavedLoad, unsigned Factor,
                   const DataLayout &DL);
};
/// A first-order polynomial over an n-bit integer value: a first-order term
/// (a Value V with a recorded chain of operations B) plus a constant A, with
/// the top ErrorMSBs bits of the result considered undefined.
class Polynomial {
  /// Operations that may be applied to the first-order coefficient.
  enum BOps { LShr, Mul, SExt, Trunc };

  unsigned ErrorMSBs = (unsigned)-1;        // Number of undefined MSBs
  Value *V = nullptr;                       // First-order term, if any
  SmallVector<std::pair<BOps, APInt>, 4> B; // Operation chain applied to V
  APInt A;                                  // Constant coefficient

public:
  Polynomial(Value *V) : V(V) {
    IntegerType *Ty = dyn_cast<IntegerType>(V->getType());
    if (Ty) {
      ErrorMSBs = 0;
      A = APInt(Ty->getBitWidth(), 0);
    }
  }

  Polynomial(const APInt &A, unsigned ErrorMSBs = 0)
      : ErrorMSBs(ErrorMSBs), A(A) {}

  Polynomial(unsigned BitWidth, uint64_t A, unsigned ErrorMSBs = 0)
      : ErrorMSBs(ErrorMSBs), A(BitWidth, A) {}

  Polynomial() = default;
  /// Increment the number of undefined MSBs, saturating at the bit width.
  void incErrorMSBs(unsigned amt) {
    if (ErrorMSBs == (unsigned)-1)
      return;

    ErrorMSBs += amt;
    if (ErrorMSBs > A.getBitWidth())
      ErrorMSBs = A.getBitWidth();
  }

  /// Decrement the number of undefined MSBs, clamping at zero.
  void decErrorMSBs(unsigned amt) {
    if (ErrorMSBs == (unsigned)-1)
      return;

    if (ErrorMSBs > amt)
      ErrorMSBs -= amt;
    else
      ErrorMSBs = 0;
  }
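  // Usage sketch (illustrative, not part of the original pass): the error
  // counter saturates in both directions rather than wrapping.
  //   Polynomial P(APInt(8, 0), /*ErrorMSBs=*/2);
  //   P.incErrorMSBs(10); // clamped to the bit width: ErrorMSBs == 8
  //   P.decErrorMSBs(9);  // clamped at zero: ErrorMSBs == 0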
  /// Apply the += operator; mixing bit widths invalidates the polynomial.
  Polynomial &add(const APInt &C) {
    if (C.getBitWidth() != A.getBitWidth()) {
      ErrorMSBs = (unsigned)-1;
      return *this;
    }

    A += C;
    return *this;
  }
  /// Apply the *= operator.
  Polynomial &mul(const APInt &C) {
    if (C.getBitWidth() != A.getBitWidth()) {
      ErrorMSBs = (unsigned)-1;
      return *this;
    }

    // Trailing zero bits of C shift defined bits into the undefined MSBs.
    decErrorMSBs(C.countr_zero());

    pushBOperation(Mul, C);
    A *= C;
    return *this;
  }
  /// Apply the logical-shift-right operator.
  Polynomial &lshr(const APInt &C) {
    if (C.getBitWidth() != A.getBitWidth()) {
      ErrorMSBs = (unsigned)-1;
      return *this;
    }

    unsigned shiftAmt = C.getZExtValue();
    if (shiftAmt >= C.getBitWidth())
      return mul(APInt(C.getBitWidth(), 0));

    // If a set bit of A is shifted out, the shift does not distribute over
    // the sum and the result is completely undefined.
    if (A.countr_zero() < shiftAmt)
      ErrorMSBs = A.getBitWidth();

    incErrorMSBs(shiftAmt);

    pushBOperation(LShr, C);
    A = A.lshr(shiftAmt);
    return *this;
  }
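  // Worked example (illustrative): (x + A) >> s only equals (x >> s) + (A >> s)
  // when no set bit of A is shifted out. With A == 3 and s == 1 the low set
  // bit is lost, so the polynomial becomes fully undefined; with A == 4 the
  // shift is exact and only the top s bits become undefined.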
  /// Truncate or sign-extend the polynomial to n bits.
  Polynomial &sextOrTrunc(unsigned n) {
    if (n < A.getBitWidth()) {
      // Truncation drops undefined MSBs if there are any.
      decErrorMSBs(A.getBitWidth() - n);
      A = A.trunc(n);
      pushBOperation(Trunc, APInt(sizeof(n) * 8, n));
    }
    if (n > A.getBitWidth()) {
      // The newly created MSBs are undefined.
      incErrorMSBs(n - A.getBitWidth());
      A = A.sext(n);
      pushBOperation(SExt, APInt(sizeof(n) * 8, n));
    }

    return *this;
  }
  /// A polynomial is first order if it has a non-constant term.
  bool isFirstOrder() const { return V != nullptr; }

  /// Two polynomials can only be compared if they have the same bit width
  /// and the same first-order term (same V and operation chain B).
  bool isCompatibleTo(const Polynomial &o) const {
    if (A.getBitWidth() != o.A.getBitWidth())
      return false;

    // Two constant polynomials are always compatible.
    if (!isFirstOrder() && !o.isFirstOrder())
      return true;

    // Otherwise both must be first order over the same value.
    if (!isFirstOrder() || !o.isFirstOrder())
      return false;
    if (V != o.V)
      return false;
    if (B.size() != o.B.size())
      return false;

    auto *ob = o.B.begin();
    for (const auto &b : B) {
      if (b != *ob)
        return false;
      ob++;
    }

    return true;
  }
  /// Subtract two polynomials; the result is undefined if they are not
  /// compatible.
  Polynomial operator-(const Polynomial &o) const {
    if (!isCompatibleTo(o))
      return Polynomial();

    // The first-order terms cancel out; only constants and errors remain.
    return Polynomial(A - o.A, std::max(ErrorMSBs, o.ErrorMSBs));
  }

  /// Add a constant to the polynomial.
  Polynomial operator+(uint64_t C) const {
    Polynomial Result(*this);
    Result.add(APInt(Result.A.getBitWidth(), C));
    return Result;
  }

  /// Multiply the polynomial with a constant.
  Polynomial operator*(uint64_t C) const {
    Polynomial Result(*this);
    Result.mul(APInt(Result.A.getBitWidth(), C));
    return Result;
  }

  /// Two polynomials are provably equal if their difference is a fully
  /// defined constant zero.
  bool isProvenEqualTo(const Polynomial &o) {
    Polynomial r = *this - o;
    return (r.ErrorMSBs == 0) && (!r.isFirstOrder()) && (r.A.isZero());
  }
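  // Usage sketch (illustrative, not part of the original pass; Idx is assumed
  // to be an integer-typed Value*): proving that two offsets built from the
  // same index value are equal.
  //   Polynomial P(Idx);                      // first-order term: Idx
  //   Polynomial Q = P * 8 + 4;               // 8*Idx + 4
  //   bool Eq = Q.isProvenEqualTo(P * 8 + 4); // true: the difference is zero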
  /// Print the polynomial into a stream.
  void print(raw_ostream &OS) const {
    OS << "[{#ErrBits:" << ErrorMSBs << "} ";

    if (V) {
      for (auto b : B)
        OS << "(";
      OS << "(" << *V << ") ";

      for (auto b : B) {
        // (printing of the operation name elided in this listing)
        OS << b.second << ") ";
      }
    }

    OS << "+ " << A << "]";
  }
private:
  /// Record an operation applied to the first-order coefficient; constant
  /// polynomials carry no operation chain.
  void pushBOperation(const BOps Op, const APInt &C) {
    if (isFirstOrder()) {
      B.push_back(std::make_pair(Op, C));
      return;
    }
  }
};
/// VectorInfo records the polynomial byte offset of every vector element
/// relative to a common base pointer.
class VectorInfo {
  VectorInfo(const VectorInfo &c) : VTy(c.VTy) {
    llvm_unreachable(
        "Copying VectorInfo is neither implemented nor necessary,");
  }

public:
  /// Offset polynomial of one element, plus the LoadInst whose pointer
  /// addresses it directly (nullptr otherwise).
  struct ElementInfo {
    Polynomial Ofs;
    LoadInst *LI;
    ElementInfo(Polynomial Offset = Polynomial(), LoadInst *LI = nullptr)
        : Ofs(Offset), LI(LI) {}
  };

  BasicBlock *BB = nullptr;         // Block containing the loads
  Value *PV = nullptr;              // Common base pointer
  std::set<LoadInst *> LIs;         // Participating load instructions
  std::set<Instruction *> Is;       // All participating instructions
  ShuffleVectorInst *SVI = nullptr; // Final shuffle-vector instruction
  ElementInfo *EI;                  // Per-element offset information
  FixedVectorType *const VTy;       // Vector type

  VectorInfo(FixedVectorType *VTy) : VTy(VTy) {
    EI = new ElementInfo[VTy->getNumElements()];
  }
  VectorInfo &operator=(const VectorInfo &other) = delete;
  virtual ~VectorInfo() { delete[] EI; }

  unsigned getDimension() const { return VTy->getNumElements(); }
  /// Test whether this vector is a 'Factor'-way interleaved access: element i
  /// must be proven to lie at offset EI[0].Ofs + i * Factor * ElementSize.
  bool isInterleaved(unsigned Factor, const DataLayout &DL) const {
    unsigned Size = DL.getTypeAllocSize(VTy->getElementType());
    for (unsigned i = 1; i < getDimension(); i++) {
      if (!EI[i].Ofs.isProvenEqualTo(EI[0].Ofs + i * Factor * Size)) {
        return false;
      }
    }
    return true;
  }
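  // Example (illustrative): a <4 x i32> whose element offsets are proven to
  // be x, x+8, x+16, x+24 is interleaved with Factor == 2, since the element
  // size is 4 bytes and the expected stride is Factor * Size == 8.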
  /// Recursively compute the vector information stored in V.
  static bool compute(Value *V, VectorInfo &Result, const DataLayout &DL) {
    ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V);
    if (SVI)
      return computeFromSVI(SVI, Result, DL);
    LoadInst *LI = dyn_cast<LoadInst>(V);
    if (LI)
      return computeFromLI(LI, Result, DL);
    BitCastInst *BCI = dyn_cast<BitCastInst>(V);
    if (BCI)
      return computeFromBCI(BCI, Result, DL);
    return false;
  }
  /// BitCastInst specialization to compute the vector information.
  static bool computeFromBCI(BitCastInst *BCI, VectorInfo &Result,
                             const DataLayout &DL) {
    Instruction *Op = dyn_cast<Instruction>(BCI->getOperand(0));
    if (!Op)
      return false;

    FixedVectorType *VTy = dyn_cast<FixedVectorType>(Op->getType());
    if (!VTy)
      return false;

    // We can only cast from larger to smaller element types.
    if (Result.VTy->getNumElements() % VTy->getNumElements())
      return false;

    unsigned Factor = Result.VTy->getNumElements() / VTy->getNumElements();
    unsigned NewSize = DL.getTypeAllocSize(Result.VTy->getElementType());
    unsigned OldSize = DL.getTypeAllocSize(VTy->getElementType());

    if (NewSize * Factor != OldSize)
      return false;

    VectorInfo Old(VTy);
    if (!compute(Op, Old, DL))
      return false;

    // Each old element splits into Factor new elements, NewSize bytes apart;
    // only the first of each group inherits the LoadInst.
    for (unsigned i = 0; i < Result.VTy->getNumElements(); i += Factor) {
      for (unsigned j = 0; j < Factor; j++) {
        Result.EI[i + j] =
            ElementInfo(Old.EI[i / Factor].Ofs + j * NewSize,
                        j == 0 ? Old.EI[i / Factor].LI : nullptr);
      }
    }

    Result.BB = Old.BB;
    Result.PV = Old.PV;
    Result.LIs.insert(Old.LIs.begin(), Old.LIs.end());
    Result.Is.insert(Old.Is.begin(), Old.Is.end());
    Result.Is.insert(BCI);
    Result.SVI = nullptr;

    return true;
  }
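  // Example (illustrative): a bitcast of <2 x i64> to <4 x i32> gives
  // Factor == 2, NewSize == 4 and OldSize == 8; an old element at offset x
  // becomes two new elements at offsets x and x+4.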
  /// ShuffleVectorInst specialization to compute vector information.
  static bool computeFromSVI(ShuffleVectorInst *SVI, VectorInfo &Result,
                             const DataLayout &DL) {
    FixedVectorType *ArgTy =
        cast<FixedVectorType>(SVI->getOperand(0)->getType());

    // Compute the vector information of both operands. An operand whose
    // information cannot be computed only contributes undefined elements.
    VectorInfo LHS(ArgTy);
    bool LHSValid = compute(SVI->getOperand(0), LHS, DL);

    VectorInfo RHS(ArgTy);
    bool RHSValid = compute(SVI->getOperand(1), RHS, DL);

    // Merge the participating instruction sets of both operands.
    if (LHSValid) {
      Result.LIs.insert(LHS.LIs.begin(), LHS.LIs.end());
      Result.Is.insert(LHS.Is.begin(), LHS.Is.end());
    }
    if (RHSValid) {
      Result.LIs.insert(RHS.LIs.begin(), RHS.LIs.end());
      Result.Is.insert(RHS.Is.begin(), RHS.Is.end());
    }
    Result.Is.insert(SVI);
    Result.SVI = SVI;
    // (basic-block and base-pointer merging elided in this listing)

    // Map each mask index to the ElementInfo of the element it selects;
    // undef mask entries and unavailable operands yield an undefined entry.
    unsigned j = 0;
    for (int i : SVI->getShuffleMask()) {
      assert(i < 2 * (int)ArgTy->getNumElements() &&
             "Invalid ShuffleVectorInst (index out of bounds)");
      if (i < 0)
        Result.EI[j] = ElementInfo();
      else if (i < (int)ArgTy->getNumElements())
        Result.EI[j] = LHSValid ? LHS.EI[i] : ElementInfo();
      else
        Result.EI[j] = RHSValid ? RHS.EI[i - ArgTy->getNumElements()]
                                : ElementInfo();
      j++;
    }

    return true;
  }
  /// LoadInst specialization to compute vector information.
  static bool computeFromLI(LoadInst *LI, VectorInfo &Result,
                            const DataLayout &DL) {
    if (LI->isVolatile() || LI->isAtomic())
      return false;

    // Split the load's pointer into a base pointer and an offset polynomial.
    Value *BasePtr;
    Polynomial Offset;
    computePolynomialFromPointer(*LI->getPointerOperand(), Offset, BasePtr,
                                 DL);

    Result.BB = LI->getParent();
    Result.PV = BasePtr;
    Result.LIs.insert(LI);
    Result.Is.insert(LI);

    // Element i lies at the base offset plus the in-type offset of {0, i}.
    for (unsigned i = 0; i < Result.getDimension(); i++) {
      Value *Idx[2] = {
          ConstantInt::get(Type::getInt32Ty(LI->getContext()), 0),
          ConstantInt::get(Type::getInt32Ty(LI->getContext()), i),
      };
      int64_t Ofs = DL.getIndexedOffsetInType(Result.VTy, ArrayRef(Idx, 2));
      Result.EI[i] = ElementInfo(Offset + Ofs, i == 0 ? LI : nullptr);
    }

    return true;
  }
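  // Example (illustrative): a load of <4 x i32> whose pointer decomposes as
  // BasePtr + Offset yields element offsets Offset+0, Offset+4, Offset+8 and
  // Offset+12; only EI[0] carries the LoadInst itself.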
  /// BinaryOperator specialization to compute the polynomial.
  static void computePolynomialBinOp(BinaryOperator &BO, Polynomial &Result) {
    Value *LHS = BO.getOperand(0);
    Value *RHS = BO.getOperand(1);

    // Find the constant operand, swapping the operands of a commutative
    // operation if necessary.
    ConstantInt *C = dyn_cast<ConstantInt>(RHS);
    if (!C && BO.isCommutative()) {
      C = dyn_cast<ConstantInt>(LHS);
      if (C)
        std::swap(LHS, RHS);
    }

    if (C) {
      switch (BO.getOpcode()) {
      case Instruction::Add:
        // (additional operand checks elided in this listing)
        computePolynomial(*LHS, Result);
        Result.add(C->getValue());
        return;

      case Instruction::LShr:
        computePolynomial(*LHS, Result);
        Result.lshr(C->getValue());
        return;

      default:
        break;
      }
    }

    Result = Polynomial(&BO);
  }
  /// Compute the polynomial representation of Value V.
  static void computePolynomial(Value &V, Polynomial &Result) {
    if (auto *BO = dyn_cast<BinaryOperator>(&V))
      computePolynomialBinOp(*BO, Result);
    else
      Result = Polynomial(&V);
  }
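  // Example (illustrative): for %y = add nuw nsw i64 %x, 16, the result is
  // the polynomial x + 16. Any unsupported operation falls back to a fresh
  // first-order polynomial over the value itself.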
  /// Compute the polynomial representation of a pointer, splitting it into a
  /// base pointer and a byte-offset polynomial.
  static void computePolynomialFromPointer(Value &Ptr, Polynomial &Result,
                                           Value *&BasePtr,
                                           const DataLayout &DL) {
    // Not a pointer type? Return an undefined polynomial.
    PointerType *PtrTy = dyn_cast<PointerType>(Ptr.getType());
    if (!PtrTy) {
      Result = Polynomial();
      BasePtr = nullptr;
      return;
    }
    unsigned PointerBits =
        DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace());

    // Look through bitcasts; any other cast ends the recursion here.
    if (isa<CastInst>(&Ptr)) {
      CastInst &CI = *cast<CastInst>(&Ptr);
      switch (CI.getOpcode()) {
      case Instruction::BitCast:
        computePolynomialFromPointer(*CI.getOperand(0), Result, BasePtr, DL);
        break;
      default:
        BasePtr = &Ptr;
        Result = Polynomial(PointerBits, 0);
        break;
      }
    }
    // Resolve a GetElementPtrInst.
    else if (isa<GetElementPtrInst>(&Ptr)) {
      GetElementPtrInst &GEP = *cast<GetElementPtrInst>(&Ptr);

      APInt BaseOffset(PointerBits, 0);

      // The easy case: all indices are constant.
      if (GEP.accumulateConstantOffset(DL, BaseOffset)) {
        Result = Polynomial(BaseOffset);
        BasePtr = GEP.getPointerOperand();
        return;
      }

      // Otherwise collect the leading constant indices.
      unsigned idxOperand, e;
      SmallVector<Value *, 4> Indices;
      for (idxOperand = 1, e = GEP.getNumOperands(); idxOperand < e;
           idxOperand++) {
        ConstantInt *IDX = dyn_cast<ConstantInt>(GEP.getOperand(idxOperand));
        if (!IDX)
          break;
        Indices.push_back(IDX);
      }

      // The non-constant index must be the last operand.
      if (idxOperand + 1 != e) {
        Result = Polynomial();
        BasePtr = nullptr;
        return;
      }

      // Compute the polynomial of the variable index ...
      computePolynomial(*GEP.getOperand(idxOperand), Result);

      // ... and the constant byte offset of the leading indices.
      BaseOffset =
          DL.getIndexedOffsetInType(GEP.getSourceElementType(), Indices);

      // Scale the variable index by the element size and add the constant.
      unsigned ResultSize = DL.getTypeAllocSize(GEP.getResultElementType());
      Result.sextOrTrunc(PointerBits);
      Result.mul(APInt(PointerBits, ResultSize));
      Result.add(BaseOffset);
      BasePtr = GEP.getPointerOperand();
    }
    // Any other value becomes the base pointer with a zero offset.
    else {
      BasePtr = &Ptr;
      Result = Polynomial(PointerBits, 0);
    }
  }
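  // Example (illustrative): for %p = getelementptr i32, ptr %base, i64 %i
  // the single variable index is scaled by the 4-byte element size, giving
  // BasePtr == %base and Result == 4 * sext_or_trunc(%i) as a byte offset.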
  /// Print the per-element offset polynomials into a stream.
  void print(raw_ostream &OS) const {
    for (unsigned i = 0; i < getDimension(); i++)
      OS << ((i == 0) ? "[" : ", ") << EI[i].Ofs;
    OS << "]";
  }
};
bool InterleavedLoadCombineImpl::findPattern(
    std::list<VectorInfo> &Candidates, std::list<VectorInfo> &InterleavedLoad,
    unsigned Factor, const DataLayout &DL) {
  for (auto C0 = Candidates.begin(), E0 = Candidates.end(); C0 != E0; ++C0) {
    unsigned i;
    // Try to build an interleaved load with C0 as its first line.
    unsigned Size = DL.getTypeAllocSize(C0->VTy->getElementType());

    // Iterators to the candidates chosen for each line of the interleave.
    std::vector<std::list<VectorInfo>::iterator> Res(Factor, Candidates.end());

    for (auto C = Candidates.begin(), E = Candidates.end(); C != E; C++) {
      if (C->VTy != C0->VTy)
        continue;
      if (C->BB != C0->BB)
        continue;
      if (C->PV != C0->PV)
        continue;

      // Check whether C matches one of the Factor-1 remaining lines.
      for (i = 1; i < Factor; i++) {
        if (C->EI[0].Ofs.isProvenEqualTo(C0->EI[0].Ofs + i * Size)) {
          Res[i] = C;
        }
      }

      // Stop searching as soon as all lines are filled.
      for (i = 1; i < Factor; i++) {
        if (Res[i] == Candidates.end())
          break;
      }
      if (i == Factor) {
        Res[0] = C0;
        break;
      }
    }

    if (Res[0] != Candidates.end()) {
      // Move the selected candidates into the output list, in line order.
      for (unsigned i = 0; i < Factor; i++) {
        InterleavedLoad.splice(InterleavedLoad.end(), Candidates, Res[i]);
      }
      return true;
    }
  }
  return false;
}
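// Example (illustrative): with Factor == 2 and 4-byte elements, a candidate
// whose first element sits at offset x serves as line 0; a second candidate
// with the same vector type, block and base pointer starting at x+4 fills
// line 1, and both are spliced into InterleavedLoad in line order.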
LoadInst *
InterleavedLoadCombineImpl::findFirstLoad(const std::set<LoadInst *> &LIs) {
  assert(!LIs.empty() && "No load instructions given.");

  // All LIs are in the same basic block; return the first in program order.
  BasicBlock *BB = (*LIs.begin())->getParent();
  BasicBlock::iterator FLI = llvm::find_if(
      *BB, [&LIs](Instruction &I) { return is_contained(LIs, &I); });

  return cast<LoadInst>(FLI);
}
bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
                                         OptimizationRemarkEmitter &ORE) {
  LLVM_DEBUG(dbgs() << "Checking interleaved load\n");

  // The insertion point is the load that produces the first elements. The
  // checks below prove that the combined load may be placed right before it.
  LoadInst *InsertionPoint = InterleavedLoad.front().EI[0].LI;
  if (!InsertionPoint)
    return false;

  std::set<LoadInst *> LIs;
  std::set<Instruction *> Is;
  std::set<Instruction *> SVIs;

  InstructionCost InterleavedCost;
  InstructionCost InstructionCost = 0;
  const TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency;

  // Get the interleave factor.
  unsigned Factor = InterleavedLoad.size();

  // Merge the input sets used by the analysis.
  for (auto &VI : InterleavedLoad) {
    // All load instructions to be combined.
    LIs.insert(VI.LIs.begin(), VI.LIs.end());

    // All instructions taking part in the interleaved load; they are
    // expected to be dead after the transformation.
    Is.insert(VI.Is.begin(), VI.Is.end());

    // The final ShuffleVectorInsts, whose uses get replaced.
    SVIs.insert(VI.SVI);
  }

  // Check that all participating instructions will be dead afterwards; if an
  // intermediate result has outside users, no gain can be expected. Also sum
  // up the cost of the instructions left dead.
  for (const auto &I : Is) {
    InstructionCost += TTI.getInstructionCost(I, CostKind);

    // The final SVIs may keep their users; all their uses are replaced.
    if (SVIs.find(I) != SVIs.end())
      continue;

    for (auto *U : I->users()) {
      if (Is.find(dyn_cast<Instruction>(U)) == Is.end())
        return false;
    }
  }

  // All loads are in the same block, so either all or none execute. Verify
  // with MemorySSA that every load's defining access dominates the first
  // load, i.e. no aliasing store sits between the loads.
  LoadInst *First = findFirstLoad(LIs);
  auto FMA = MSSA.getMemoryAccess(First);
  for (auto *LI : LIs) {
    auto MADef = MSSA.getMemoryAccess(LI)->getDefiningAccess();
    if (!MSSA.dominates(MADef, FMA))
      return false;
  }
  assert(!LIs.empty() && "There are no LoadInst to combine");

  // The insertion point must dominate every final shuffle.
  for (auto &VI : InterleavedLoad) {
    if (!DT.dominates(InsertionPoint, VI.SVI))
      return false;
  }

  // All checks passed. Emit instructions detectable by the
  // InterleavedAccessPass; the old instructions are left dead.
  IRBuilder<> Builder(InsertionPoint);
  Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType();
  unsigned ElementsPerSVI =
      cast<FixedVectorType>(InterleavedLoad.front().SVI->getType())
          ->getNumElements();
  FixedVectorType *ILTy = FixedVectorType::get(ETy, Factor * ElementsPerSVI);

  // Proceed only if the combined form is cheaper.
  auto Indices = llvm::to_vector<4>(llvm::seq<unsigned>(0, Factor));
  InterleavedCost = TTI.getInterleavedMemoryOpCost(
      Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlign(),
      InsertionPoint->getPointerAddressSpace(), CostKind);
  if (InterleavedCost >= InstructionCost)
    return false;

  // Create the wide load and update MemorySSA.
  auto Ptr = InsertionPoint->getPointerOperand();
  auto LI = Builder.CreateAlignedLoad(ILTy, Ptr, InsertionPoint->getAlign(),
                                      "interleaved.wide.load");
  auto MSSAU = MemorySSAUpdater(&MSSA);
  MemoryUse *MSSALoad = cast<MemoryUse>(MSSAU.createMemoryAccessBefore(
      LI, nullptr, MSSA.getMemoryAccess(InsertionPoint)));
  MSSAU.insertUse(MSSALoad, /*RenameUses=*/true);

  // Create the final shuffles, one per candidate, and replace all uses.
  unsigned i = 0;
  for (auto &VI : InterleavedLoad) {
    SmallVector<int, 4> Mask;
    for (unsigned j = 0; j < ElementsPerSVI; j++)
      Mask.push_back(i + j * Factor);

    Builder.SetInsertPoint(VI.SVI);
    auto SVI = Builder.CreateShuffleVector(LI, Mask, "interleaved.shuffle");
    VI.SVI->replaceAllUsesWith(SVI);
    i++;
  }

  NumInterleavedLoadCombine++;
  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "Combined", InsertionPoint)
           << "Load interleaved combined with factor "
           << ore::NV("Factor", Factor);
  });

  return true;
}
bool InterleavedLoadCombineImpl::run() {
  OptimizationRemarkEmitter ORE(&F);
  bool changed = false;
  unsigned MaxFactor = TLI.getMaxSupportedInterleaveFactor();

  auto &DL = F.getParent()->getDataLayout();

  // Start with the highest factor to avoid combining and recombining.
  for (unsigned Factor = MaxFactor; Factor >= 2; Factor--) {
    std::list<VectorInfo> Candidates;

    for (BasicBlock &BB : F) {
      for (Instruction &I : BB) {
        if (auto SVI = dyn_cast<ShuffleVectorInst>(&I)) {
          // Scalable vectors are not supported by this pass.
          if (isa<ScalableVectorType>(SVI->getType()))
            continue;

          Candidates.emplace_back(cast<FixedVectorType>(SVI->getType()));

          if (!VectorInfo::computeFromSVI(SVI, Candidates.back(), DL)) {
            Candidates.pop_back();
            continue;
          }

          if (!Candidates.back().isInterleaved(Factor, DL)) {
            Candidates.pop_back();
          }
        }
      }
    }

    std::list<VectorInfo> InterleavedLoad;
    while (findPattern(Candidates, InterleavedLoad, Factor, DL)) {
      if (combine(InterleavedLoad, ORE)) {
        changed = true;
      } else {
        // Drop the first line but put the other candidates back on the list
        // and continue searching.
        Candidates.splice(Candidates.begin(), InterleavedLoad,
                          std::next(InterleavedLoad.begin()),
                          InterleavedLoad.end());
      }
      InterleavedLoad.clear();
    }
  }

  return changed;
}
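// Illustrative sketch of the overall effect (added for exposition; the value
// names are hypothetical). For two factor-2 candidates over <4 x i32>, a
// successful run rewrites
//   %lo = load <4 x i32>, ptr %p
//   %hi = load <4 x i32>, ptr %p.4
//   ... shufflevectors extracting the even/odd lanes ...
// into
//   %wide = load <8 x i32>, ptr %p        ; "interleaved.wide.load"
//   %even = shufflevector %wide, poison, <i32 0, i32 2, i32 4, i32 6>
//   %odd  = shufflevector %wide, poison, <i32 1, i32 3, i32 5, i32 7>
// which the target's InterleavedAccessPass can then lower to a dedicated
// interleaved-load instruction (e.g. AArch64 ld2).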
/// Legacy pass wrapping InterleavedLoadCombineImpl.
struct InterleavedLoadCombine : public FunctionPass {
  static char ID;

  InterleavedLoadCombine() : FunctionPass(ID) {
    initializeInterleavedLoadCombinePass(*PassRegistry::getPassRegistry());
  }

  StringRef getPassName() const override {
    return "Interleaved Load Combine Pass";
  }

  bool runOnFunction(Function &F) override {
    if (DisableInterleavedLoadCombine)
      return false;

    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      return false;

    return InterleavedLoadCombineImpl(
               F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
               getAnalysis<MemorySSAWrapperPass>().getMSSA(),
               TPC->getTM<TargetMachine>())
        .run();
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MemorySSAWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    FunctionPass::getAnalysisUsage(AU);
  }
};
} // anonymous namespace

char InterleavedLoadCombine::ID = 0;

INITIALIZE_PASS_BEGIN(
    InterleavedLoadCombine, DEBUG_TYPE,
    "Combine interleaved loads into wide loads and shufflevector instructions",
    false, false)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(
    InterleavedLoadCombine, DEBUG_TYPE,
    "Combine interleaved loads into wide loads and shufflevector instructions",
    false, false)

FunctionPass *llvm::createInterleavedLoadCombinePass() {
  auto P = new InterleavedLoadCombine();
  return P;
}