46class X86InterleavedAccessGroup {
58 const unsigned Factor;
89 unsigned NumSubVecElems);
94 unsigned NumSubVecElems);
97 unsigned NumSubVecElems);
111 : Inst(
I), Shuffles(Shuffs), Indices(Ind), Factor(
F), Subtarget(STarget),
112 DL(Inst->getDataLayout()), Builder(
B) {}
116 bool isSupported()
const;
120 bool lowerIntoOptimizedSequence();
125bool X86InterleavedAccessGroup::isSupported()
const {
126 VectorType *ShuffleVecTy = Shuffles[0]->getType();
127 Type *ShuffleEltTy = ShuffleVecTy->getElementType();
128 unsigned ShuffleElemSize =
DL.getTypeSizeInBits(ShuffleEltTy);
129 unsigned WideInstSize;
137 if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
140 if (isa<LoadInst>(Inst)) {
141 WideInstSize =
DL.getTypeSizeInBits(Inst->getType());
145 WideInstSize =
DL.getTypeSizeInBits(Shuffles[0]->
getType());
149 if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
152 if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
153 (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
154 WideInstSize == 2048))
157 if (ShuffleElemSize == 8 && Factor == 3 &&
158 (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
164void X86InterleavedAccessGroup::decompose(
167 assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
168 "Expected Load or Shuffle");
173 DL.getTypeSizeInBits(VecWidth) >=
174 DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
175 "Invalid Inst-size!!!");
177 if (
auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
178 Value *Op0 = SVI->getOperand(0);
179 Value *Op1 = SVI->getOperand(1);
182 for (
unsigned i = 0; i < NumSubVectors; ++i)
184 cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
192 LoadInst *LI = cast<LoadInst>(VecInst);
194 unsigned int NumLoads = NumSubVectors;
198 unsigned VecLength =
DL.getTypeSizeInBits(VecWidth);
200 if (VecLength == 768 || VecLength == 1536) {
202 NumLoads = NumSubVectors * (VecLength / 384);
204 VecBaseTy = SubVecTy;
208 "VecBaseTy's size must be a multiple of 8");
212 Align Alignment = FirstAlignment;
213 for (
unsigned i = 0; i < NumLoads; i++) {
216 Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
218 Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
220 Alignment = SubsequentAlignment;
233 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
234 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
235 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
236 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
258 "This function doesn't accept width smaller then 256");
286 unsigned VecElems,
unsigned Stride,
289 if (VecElems == 16) {
290 for (
unsigned i = 0; i < Stride; i++)
298 for (
unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
300 (i + 1) / Stride * 16);
302 Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
303 OptimizeShuf.
clear();
306 if (VecElems == 32) {
307 std::copy(Temp, Temp + Stride, TransposedMatrix.
begin());
310 for (
unsigned i = 0; i < Stride; i++)
311 TransposedMatrix[i] =
315void X86InterleavedAccessGroup::interleave8bitStride4VF8(
325 TransposedMatrix.
resize(2);
330 for (
unsigned i = 0; i < 8; ++i) {
342 Builder.CreateShuffleVector(
Matrix[0],
Matrix[1], MaskLow);
344 Builder.CreateShuffleVector(
Matrix[2],
Matrix[3], MaskLow);
349 TransposedMatrix[0] =
350 Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
351 TransposedMatrix[1] =
352 Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
355void X86InterleavedAccessGroup::interleave8bitStride4(
367 TransposedMatrix.
resize(4);
394 IntrVec[0] = Builder.CreateShuffleVector(
Matrix[0],
Matrix[1], MaskLow);
395 IntrVec[1] = Builder.CreateShuffleVector(
Matrix[0],
Matrix[1], MaskHigh);
396 IntrVec[2] = Builder.CreateShuffleVector(
Matrix[2],
Matrix[3], MaskLow);
397 IntrVec[3] = Builder.CreateShuffleVector(
Matrix[2],
Matrix[3], MaskHigh);
405 for (
int i = 0; i < 4; i++)
406 VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
414 if (VT == MVT::v16i8) {
415 std::copy(VecOut, VecOut + 4, TransposedMatrix.
begin());
437 int LaneCount = std::max(VectorSize / 128, 1);
438 for (
int Lane = 0; Lane < LaneCount; Lane++)
439 for (
int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
440 Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
450 for (
int i = 0, FirstGroupElement = 0; i < 3; i++) {
451 int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
453 FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
472 bool AlignDirection =
true,
bool Unary =
false) {
474 unsigned NumLanes = std::max((
int)VT.
getSizeInBits() / 128, 1);
475 unsigned NumLaneElts = NumElts / NumLanes;
477 Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
480 for (
unsigned l = 0; l != NumElts; l += NumLaneElts) {
481 for (
unsigned i = 0; i != NumLaneElts; ++i) {
485 if (
Base >= NumLaneElts)
486 Base = Unary ?
Base % NumLaneElts :
Base + NumElts - NumLaneElts;
521 if (VecElems == 16) {
522 for (
int i = 0; i < 3; i++)
527 for (
unsigned j = 0; j < VecElems / 32; j++)
528 for (
int i = 0; i < 3; i++)
535 for (
int i = 0; i < 3; i++)
539void X86InterleavedAccessGroup::deinterleave8bitStride3(
547 TransposedMatrix.
resize(3);
553 Value *Vec[6], *TempVector[3];
560 for (
int i = 0; i < 2; i++)
571 for (
int i = 0; i < 3; i++)
572 Vec[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);
578 for (
int i = 0; i < 3; i++)
580 Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);
586 for (
int i = 0; i < 3; i++)
587 Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
594 Value *TempVec = Builder.CreateShuffleVector(Vec[1], VPAlign3);
595 TransposedMatrix[0] = Builder.CreateShuffleVector(Vec[0], VPAlign2);
596 TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
597 TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
605 int IndexGroup[3] = {0, 0, 0};
610 int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
611 for (
int i = 0; i < 3; i++) {
612 IndexGroup[(Index * 3) % (VF / Lane)] = Index;
616 for (
int i = 0; i < VF / Lane; i++) {
622void X86InterleavedAccessGroup::interleave8bitStride3(
630 TransposedMatrix.
resize(3);
637 Value *Vec[3], *TempVector[3];
642 for (
int i = 0; i < 3; i++)
652 Vec[0] = Builder.CreateShuffleVector(InVec[0], VPAlign2);
653 Vec[1] = Builder.CreateShuffleVector(InVec[1], VPAlign3);
660 for (
int i = 0; i < 3; i++)
662 Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);
668 for (
int i = 0; i < 3; i++)
669 Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
681void X86InterleavedAccessGroup::transpose_4x4(
685 TransposedMatrix.
resize(4);
688 static constexpr int IntMask1[] = {0, 1, 4, 5};
694 static constexpr int IntMask2[] = {2, 3, 6, 7};
700 static constexpr int IntMask3[] = {0, 4, 2, 6};
702 TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
703 TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
706 static constexpr int IntMask4[] = {1, 5, 3, 7};
708 TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
709 TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
714bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
717 auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->
getType());
719 if (isa<LoadInst>(Inst)) {
720 auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
721 unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
722 switch (NumSubVecElems) {
730 if (ShuffleTy->getNumElements() != NumSubVecElems)
736 decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
742 if (NumSubVecElems == 4)
743 transpose_4x4(DecomposedVectors, TransposedVectors);
745 deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
750 for (
unsigned i = 0, e = Shuffles.size(); i < e; ++i)
751 Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
756 Type *ShuffleEltTy = ShuffleTy->getElementType();
757 unsigned NumSubVecElems = ShuffleTy->getNumElements() / Factor;
768 switch (NumSubVecElems) {
770 transpose_4x4(DecomposedVectors, TransposedVectors);
773 interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
779 interleave8bitStride4(DecomposedVectors, TransposedVectors,
782 interleave8bitStride3(DecomposedVectors, TransposedVectors,
794 Builder.CreateAlignedStore(WideVec,
SI->getPointerOperand(),
SI->getAlign());
807 "Invalid interleave factor");
808 assert(!Shuffles.
empty() &&
"Empty shufflevector input");
810 "Unmatched number of shufflevectors and indices");
814 X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
817 return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
822 unsigned Factor)
const {
824 "Invalid interleave factor");
826 assert(cast<FixedVectorType>(SVI->
getType())->getNumElements() % Factor ==
828 "Invalid interleaved store");
834 for (
unsigned i = 0; i < Factor; i++)
841 X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
844 return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static Decomposition decompose(Value *V, SmallVectorImpl< ConditionTy > &Preconditions, bool IsSigned, const DataLayout &DL)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallVector class.
static SymbolRef::Type getType(const Symbol *Sym)
static void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl< int > &ShuffleMask, bool AlignDirection=true, bool Unary=false)
static void genShuffleBland(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &Out, int LowOffset, int HighOffset)
static MVT scaleVectorType(MVT VT)
static constexpr int Concat[]
static void group2Shuffle(MVT VT, SmallVectorImpl< int > &Mask, SmallVectorImpl< int > &Output)
static void createShuffleStride(MVT VT, int Stride, SmallVectorImpl< int > &Mask)
static void concatSubVector(Value **Vec, ArrayRef< Instruction * > InVec, unsigned VecElems, IRBuilder<> &Builder)
static void setGroupSize(MVT VT, SmallVectorImpl< int > &SizeInfo)
static void reorderSubVector(MVT VT, SmallVectorImpl< Value * > &TransposedMatrix, ArrayRef< Value * > Vec, ArrayRef< int > VPShuf, unsigned VecElems, unsigned Stride, IRBuilder<> &Builder)
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
A parsed version of the target data layout string, and methods for querying it.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
An instruction for reading from memory.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
The instances of the Type class are immutable: once they are created, they are never changed.
bool isVectorTy() const
True if this is an instance of VectorType.
static IntegerType * getInt8Ty(LLVMContext &C)
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
bool lowerInterleavedLoad(LoadInst *LI, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor) const override
Lower interleaved load(s) into target specific instructions/intrinsics.
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override
Lower interleaved store(s) into target specific instructions/intrinsics.
constexpr bool isKnownMultipleOf(ScalarTy RHS) const
This function tells the caller whether the element count is known at compile time to be a multiple of the scalar value RHS.
constexpr ScalarTy getFixedValue() const
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
This is an optimization pass for GlobalISel generic memory operations.
unsigned getPointerAddressSpace(const Type *T)
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
This struct is a compact representation of a valid (non-zero power of two) alignment.